claude final draft

2025-12-29 23:44:30 -03:00
parent 116d4032e2
commit e5aafd5097
22 changed files with 2815 additions and 32 deletions
--- a/ctlptl.yaml
+++ b/ctlptl.yaml
@@ -1,32 +0,0 @@
 # ctlptl configuration for Kind cluster
 # Usage: ctlptl apply -f ctlptl.yaml
 apiVersion: ctlptl.dev/v1alpha1
 kind: Registry
 name: sysmonstm-registry
 port: 5005
 ---
 apiVersion: ctlptl.dev/v1alpha1
 kind: Cluster
 product: kind
 registry: sysmonstm-registry
 kindV1Alpha4Cluster:
  name: sysmonstm
  nodes:
    - role: control-plane
      extraPortMappings:
        # Gateway HTTP
        - containerPort: 30080
          hostPort: 8080
          protocol: TCP
        # Aggregator gRPC
        - containerPort: 30051
          hostPort: 50051
          protocol: TCP
  # Resource limits for t2.small compatibility
  kubeadmConfigPatches:
    - |
      kind: InitConfiguration
      nodeRegistration:
        kubeletExtraArgs:
          system-reserved: memory=256Mi
--- a/scripts/generate-proto.sh
+++ b/scripts/generate-proto.sh
@@ -0,0 +1,24 @@
 #!/bin/bash
 # Generate Python gRPC code from proto definitions
 set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$SCRIPT_DIR/.."
 cd "$PROJECT_ROOT"
 echo "Generating Python gRPC code from proto/metrics.proto..."
 python -m grpc_tools.protoc \
    -I./proto \
    --python_out=./shared \
    --grpc_python_out=./shared \
    ./proto/metrics.proto
 # Fix imports in generated files (grpc_tools generates incorrect imports)
 sed -i 's/import metrics_pb2/from shared import metrics_pb2/' shared/metrics_pb2_grpc.py
 echo "Generated:"
 echo "  - shared/metrics_pb2.py"
 echo "  - shared/metrics_pb2_grpc.py"
--- a/services/aggregator/init.py
+++ b/services/aggregator/init.py
@@ -0,0 +1 @@
 """Aggregator service."""
--- a/services/aggregator/main.py
+++ b/services/aggregator/main.py
@@ -0,0 +1,361 @@
 """Aggregator service - gRPC server that receives metrics and stores them."""
 import asyncio
 import signal
 import sys
 from pathlib import Path
 import grpc
 from grpc_health.v1 import health, health_pb2, health_pb2_grpc
 # Add project root to path for imports
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from services.aggregator.storage import RedisStorage, TimescaleStorage
 from shared import metrics_pb2, metrics_pb2_grpc
 from shared.config import get_aggregator_config
 from shared.events import get_publisher
 from shared.logging import setup_logging
 class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
    """gRPC servicer for metrics ingestion."""
    def __init__(
        self,
        redis_storage: RedisStorage,
        timescale_storage: TimescaleStorage,
        event_publisher,
        logger,
    ):
        self.redis = redis_storage
        self.timescale = timescale_storage
        self.publisher = event_publisher
        self.logger = logger
    async def StreamMetrics(self, request_iterator, context):
        """Receive streaming metrics from a collector."""
        metrics_received = 0
        current_machine = None
        current_batch: list[tuple[str, float, dict]] = []
        batch_timestamp = 0
        batch_hostname = ""
        try:
            async for metric in request_iterator:
                metrics_received += 1
                # Track current machine
                if current_machine != metric.machine_id:
                    # Flush previous batch if switching machines
                    if current_machine and current_batch:
                        await self._flush_batch(
                            current_machine,
                            batch_hostname,
                            batch_timestamp,
                            current_batch,
                        )
                        current_batch = []
                    current_machine = metric.machine_id
                    self.logger.info(
                        "collector_connected",
                        machine_id=metric.machine_id,
                        hostname=metric.hostname,
                    )
                # Get metric type name
                metric_type = metrics_pb2.MetricType.Name(metric.type)
                # Add to batch
                current_batch.append(
                    (
                        metric_type,
                        metric.value,
                        dict(metric.labels),
                    )
                )
                batch_timestamp = metric.timestamp_ms
                batch_hostname = metric.hostname
                # Flush batch every 20 metrics or if timestamp changes significantly
                if len(current_batch) >= 20:
                    await self._flush_batch(
                        current_machine, batch_hostname, batch_timestamp, current_batch
                    )
                    current_batch = []
            # Flush remaining
            if current_machine and current_batch:
                await self._flush_batch(
                    current_machine, batch_hostname, batch_timestamp, current_batch
                )
            self.logger.info(
                "stream_completed",
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=True,
                metrics_received=metrics_received,
                message="OK",
            )
        except Exception as e:
            self.logger.error(
                "stream_error",
                error=str(e),
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=False,
                metrics_received=metrics_received,
                message=str(e),
            )
    async def _flush_batch(
        self,
        machine_id: str,
        hostname: str,
        timestamp_ms: int,
        batch: list[tuple[str, float, dict]],
    ) -> None:
        """Flush a batch of metrics to storage and events."""
        # Aggregate metrics for Redis state
        metrics_dict = {}
        for metric_type, value, labels in batch:
            key = metric_type
            if labels:
                key = f"{metric_type}:{','.join(f'{k}={v}' for k, v in labels.items())}"
            metrics_dict[key] = value
        # Update Redis (current state)
        await self.redis.update_machine_state(
            machine_id=machine_id,
            hostname=hostname,
            metrics=metrics_dict,
            timestamp_ms=timestamp_ms,
        )
        # Insert into TimescaleDB (historical)
        try:
            await self.timescale.insert_metrics(
                machine_id=machine_id,
                hostname=hostname,
                timestamp_ms=timestamp_ms,
                metrics=batch,
            )
        except Exception as e:
            self.logger.warning("timescale_insert_failed", error=str(e))
        # Update machine registry
        try:
            await self.timescale.update_machine_registry(
                machine_id=machine_id,
                hostname=hostname,
            )
        except Exception as e:
            self.logger.warning("machine_registry_update_failed", error=str(e))
        # Publish event for subscribers (alerts, gateway)
        await self.publisher.publish(
            topic="metrics.raw",
            payload={
                "machine_id": machine_id,
                "hostname": hostname,
                "timestamp_ms": timestamp_ms,
                "metrics": metrics_dict,
            },
        )
        self.logger.debug(
            "batch_flushed",
            machine_id=machine_id,
            count=len(batch),
        )
    async def GetCurrentState(self, request, context):
        """Get current state for a single machine."""
        state = await self.redis.get_machine_state(request.machine_id)
        if not state:
            context.set_code(grpc.StatusCode.NOT_FOUND)
            context.set_details(f"Machine {request.machine_id} not found")
            return metrics_pb2.MachineState()
        # Convert state to proto
        metrics = []
        for key, value in state.get("metrics", {}).items():
            parts = key.split(":")
            metric_type_str = parts[0]
            labels = {}
            if len(parts) > 1:
                for pair in parts[1].split(","):
                    k, v = pair.split("=")
                    labels[k] = v
            metric_type = getattr(metrics_pb2, metric_type_str, 0)
            metrics.append(
                metrics_pb2.Metric(
                    machine_id=state["machine_id"],
                    hostname=state["hostname"],
                    timestamp_ms=state["last_seen_ms"],
                    type=metric_type,
                    value=value,
                    labels=labels,
                )
            )
        return metrics_pb2.MachineState(
            machine_id=state["machine_id"],
            hostname=state["hostname"],
            last_seen_ms=state["last_seen_ms"],
            current_metrics=metrics,
            health=metrics_pb2.HEALTHY,
        )
    async def GetAllStates(self, request, context):
        """Get current state for all machines."""
        states = await self.redis.get_all_machines()
        machine_states = []
        for state in states:
            metrics = []
            for key, value in state.get("metrics", {}).items():
                parts = key.split(":")
                metric_type_str = parts[0]
                metric_type = getattr(metrics_pb2, metric_type_str, 0)
                metrics.append(
                    metrics_pb2.Metric(
                        machine_id=state["machine_id"],
                        hostname=state["hostname"],
                        timestamp_ms=state["last_seen_ms"],
                        type=metric_type,
                        value=value,
                    )
                )
            machine_states.append(
                metrics_pb2.MachineState(
                    machine_id=state["machine_id"],
                    hostname=state["hostname"],
                    last_seen_ms=state["last_seen_ms"],
                    current_metrics=metrics,
                    health=metrics_pb2.HEALTHY,
                )
            )
        return metrics_pb2.AllMachinesState(machines=machine_states)
 class AggregatorService:
    """Main aggregator service."""
    def __init__(self):
        self.config = get_aggregator_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.redis = RedisStorage(self.config.redis_url)
        self.timescale = TimescaleStorage(self.config.timescale_url)
        self.publisher = get_publisher(source="aggregator")
        self.server: grpc.aio.Server | None = None
        self.running = False
    async def start(self) -> None:
        """Start the gRPC server."""
        self.running = True
        # Connect to storage
        await self.redis.connect()
        try:
            await self.timescale.connect()
        except Exception as e:
            self.logger.warning(
                "timescale_connection_failed",
                error=str(e),
                message="Continuing without TimescaleDB - metrics won't be persisted",
            )
        # Connect to event publisher
        await self.publisher.connect()
        # Create gRPC server
        self.server = grpc.aio.server()
        # Add metrics servicer
        servicer = MetricsServicer(
            redis_storage=self.redis,
            timescale_storage=self.timescale,
            event_publisher=self.publisher,
            logger=self.logger,
        )
        metrics_pb2_grpc.add_MetricsServiceServicer_to_server(servicer, self.server)
        # Add health check servicer
        health_servicer = health.HealthServicer()
        health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
        health_servicer.set("MetricsService", health_pb2.HealthCheckResponse.SERVING)
        health_pb2_grpc.add_HealthServicer_to_server(health_servicer, self.server)
        # Start server
        listen_addr = f"[::]:{self.config.grpc_port}"
        self.server.add_insecure_port(listen_addr)
        await self.server.start()
        self.logger.info(
            "aggregator_started",
            port=self.config.grpc_port,
            listen_addr=listen_addr,
        )
    async def stop(self) -> None:
        """Stop the gRPC server."""
        self.running = False
        if self.server:
            await self.server.stop(grace=5)
            self.server = None
        await self.publisher.disconnect()
        await self.timescale.disconnect()
        await self.redis.disconnect()
        self.logger.info("aggregator_stopped")
    async def wait(self) -> None:
        """Wait for the server to terminate."""
        if self.server:
            await self.server.wait_for_termination()
 async def main():
    """Main entry point."""
    service = AggregatorService()
    # Handle shutdown signals
    loop = asyncio.get_event_loop()
    async def shutdown():
        service.logger.info("shutdown_signal_received")
        await service.stop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, lambda: asyncio.create_task(shutdown()))
    await service.start()
    await service.wait()
 if __name__ == "__main__":
    asyncio.run(main())
--- a/services/aggregator/storage.py
+++ b/services/aggregator/storage.py
@@ -0,0 +1,245 @@
 """Storage layer for metrics - Redis (current state) and TimescaleDB (historical)."""
 import json
 import time
 from datetime import datetime
 from typing import Any
 import asyncpg
 import redis.asyncio as redis
 from shared.logging import get_logger
 logger = get_logger("storage")
 class RedisStorage:
    """Redis storage for current machine state."""
    def __init__(self, redis_url: str):
        self.redis_url = redis_url
        self._client: redis.Redis | None = None
    async def connect(self) -> None:
        self._client = redis.from_url(self.redis_url, decode_responses=True)
        await self._client.ping()
        logger.info("redis_connected", url=self.redis_url)
    async def disconnect(self) -> None:
        if self._client:
            await self._client.close()
            self._client = None
            logger.info("redis_disconnected")
    async def update_machine_state(
        self,
        machine_id: str,
        hostname: str,
        metrics: dict[str, float],
        timestamp_ms: int,
    ) -> None:
        """Update the current state for a machine."""
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        state = {
            "machine_id": machine_id,
            "hostname": hostname,
            "last_seen_ms": timestamp_ms,
            "metrics": metrics,
            "updated_at": datetime.utcnow().isoformat(),
        }
        # Store as hash for efficient partial reads
        key = f"machine:{machine_id}"
        await self._client.hset(
            key,
            mapping={
                "state": json.dumps(state),
                "last_seen": str(timestamp_ms),
            },
        )
        # Set expiry - if no updates for 5 minutes, consider stale
        await self._client.expire(key, 300)
        # Add to active machines set
        await self._client.sadd("machines:active", machine_id)
    async def get_machine_state(self, machine_id: str) -> dict[str, Any] | None:
        """Get current state for a machine."""
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        key = f"machine:{machine_id}"
        data = await self._client.hget(key, "state")
        if data:
            return json.loads(data)
        return None
    async def get_all_machines(self) -> list[dict[str, Any]]:
        """Get current state for all active machines."""
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        machine_ids = await self._client.smembers("machines:active")
        states = []
        for machine_id in machine_ids:
            state = await self.get_machine_state(machine_id)
            if state:
                states.append(state)
            else:
                # Remove stale machine from active set
                await self._client.srem("machines:active", machine_id)
        return states
 class TimescaleStorage:
    """TimescaleDB storage for historical metrics."""
    def __init__(self, connection_url: str):
        self.connection_url = connection_url
        self._pool: asyncpg.Pool | None = None
    async def connect(self) -> None:
        self._pool = await asyncpg.create_pool(
            self.connection_url,
            min_size=2,
            max_size=10,
        )
        logger.info("timescaledb_connected")
    async def disconnect(self) -> None:
        if self._pool:
            await self._pool.close()
            self._pool = None
            logger.info("timescaledb_disconnected")
    async def insert_metrics(
        self,
        machine_id: str,
        hostname: str,
        timestamp_ms: int,
        metrics: list[tuple[str, float, dict[str, str]]],
    ) -> int:
        """
        Insert a batch of metrics.
        Args:
            machine_id: Machine identifier
            hostname: Machine hostname
            timestamp_ms: Timestamp in milliseconds
            metrics: List of (metric_type, value, labels) tuples
        Returns:
            Number of rows inserted
        """
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        timestamp = datetime.utcfromtimestamp(timestamp_ms / 1000)
        # Prepare batch insert
        rows = [
            (timestamp, machine_id, hostname, metric_type, value, json.dumps(labels))
            for metric_type, value, labels in metrics
        ]
        async with self._pool.acquire() as conn:
            await conn.executemany(
                """
                INSERT INTO metrics_raw (time, machine_id, hostname, metric_type, value, labels)
                VALUES ($1, $2, $3, $4, $5, $6)
                """,
                rows,
            )
        return len(rows)
    async def update_machine_registry(
        self,
        machine_id: str,
        hostname: str,
        health: str = "HEALTHY",
    ) -> None:
        """Update the machines registry with last seen time."""
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        async with self._pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO machines (machine_id, hostname, last_seen, health)
                VALUES ($1, $2, NOW(), $3)
                ON CONFLICT (machine_id) DO UPDATE
                SET hostname = $2, last_seen = NOW(), health = $3
                """,
                machine_id,
                hostname,
                health,
            )
    async def get_metrics(
        self,
        machine_id: str | None = None,
        metric_type: str | None = None,
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        limit: int = 1000,
    ) -> list[dict[str, Any]]:
        """Query historical metrics."""
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        conditions = []
        params = []
        param_idx = 1
        if machine_id:
            conditions.append(f"machine_id = ${param_idx}")
            params.append(machine_id)
            param_idx += 1
        if metric_type:
            conditions.append(f"metric_type = ${param_idx}")
            params.append(metric_type)
            param_idx += 1
        if start_time:
            conditions.append(f"time >= ${param_idx}")
            params.append(start_time)
            param_idx += 1
        if end_time:
            conditions.append(f"time <= ${param_idx}")
            params.append(end_time)
            param_idx += 1
        where_clause = " AND ".join(conditions) if conditions else "TRUE"
        query = f"""
            SELECT time, machine_id, hostname, metric_type, value, labels
            FROM metrics_raw
            WHERE {where_clause}
            ORDER BY time DESC
            LIMIT ${param_idx}
        """
        params.append(limit)
        async with self._pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [
            {
                "time": row["time"].isoformat(),
                "machine_id": row["machine_id"],
                "hostname": row["hostname"],
                "metric_type": row["metric_type"],
                "value": row["value"],
                "labels": json.loads(row["labels"]) if row["labels"] else {},
            }
            for row in rows
        ]
--- a/services/alerts/Dockerfile
+++ b/services/alerts/Dockerfile
@@ -14,6 +14,12 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY shared /app/shared
 COPY proto /app/proto
 RUN python -m grpc_tools.protoc \
    -I/app/proto \
    --python_out=/app/shared \
    --grpc_python_out=/app/shared \
    /app/proto/metrics.proto
 COPY services/alerts /app/services/alerts
 ENV PYTHONPATH=/app
--- a/services/alerts/init.py
+++ b/services/alerts/init.py
@@ -0,0 +1 @@
 """Alerts service."""
--- a/services/alerts/main.py
+++ b/services/alerts/main.py
@@ -0,0 +1,317 @@
 """Alerts service - subscribes to metrics events and evaluates thresholds."""
 import asyncio
 import signal
 import sys
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
 from typing import Any
 import asyncpg
 # Add project root to path for imports
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from shared.config import get_alerts_config
 from shared.events import get_publisher, get_subscriber
 from shared.logging import setup_logging
@dataclass
 class AlertRule:
    """An alert rule configuration."""
    id: int
    name: str
    metric_type: str
    operator: str  # gt, lt, gte, lte, eq
    threshold: float
    severity: str  # warning, critical
    enabled: bool
@dataclass
 class Alert:
    """A triggered alert."""
    rule: AlertRule
    machine_id: str
    value: float
    triggered_at: datetime
 class AlertEvaluator:
    """Evaluates metrics against alert rules."""
    OPERATORS = {
        "gt": lambda v, t: v > t,
        "lt": lambda v, t: v < t,
        "gte": lambda v, t: v >= t,
        "lte": lambda v, t: v <= t,
        "eq": lambda v, t: v == t,
    }
    def __init__(self, rules: list[AlertRule]):
        self.rules = {r.metric_type: r for r in rules if r.enabled}
        # Track active alerts to avoid duplicates
        self.active_alerts: dict[str, Alert] = {}  # key: f"{machine_id}:{rule_name}"
    def evaluate(self, machine_id: str, metrics: dict[str, float]) -> list[Alert]:
        """Evaluate metrics against rules and return new alerts."""
        new_alerts = []
        for metric_type, value in metrics.items():
            rule = self.rules.get(metric_type)
            if not rule:
                continue
            op_func = self.OPERATORS.get(rule.operator)
            if not op_func:
                continue
            alert_key = f"{machine_id}:{rule.name}"
            if op_func(value, rule.threshold):
                # Threshold exceeded
                if alert_key not in self.active_alerts:
                    alert = Alert(
                        rule=rule,
                        machine_id=machine_id,
                        value=value,
                        triggered_at=datetime.utcnow(),
                    )
                    self.active_alerts[alert_key] = alert
                    new_alerts.append(alert)
            else:
                # Threshold no longer exceeded - resolve alert
                if alert_key in self.active_alerts:
                    del self.active_alerts[alert_key]
        return new_alerts
    def update_rules(self, rules: list[AlertRule]) -> None:
        """Update the rules being evaluated."""
        self.rules = {r.metric_type: r for r in rules if r.enabled}
 class AlertsService:
    """Main alerts service."""
    def __init__(self):
        self.config = get_alerts_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.running = False
        self.db_pool: asyncpg.Pool | None = None
        self.evaluator: AlertEvaluator | None = None
        self.subscriber = get_subscriber(topics=["metrics.raw"])
        self.publisher = get_publisher(source="alerts")
    async def connect_db(self) -> None:
        """Connect to TimescaleDB for rules and alert storage."""
        try:
            self.db_pool = await asyncpg.create_pool(
                self.config.timescale_url,
                min_size=1,
                max_size=5,
            )
            self.logger.info("database_connected")
        except Exception as e:
            self.logger.warning("database_connection_failed", error=str(e))
            self.db_pool = None
    async def load_rules(self) -> list[AlertRule]:
        """Load alert rules from database."""
        if not self.db_pool:
            # Return default rules if no database
            return [
                AlertRule(
                    1, "High CPU Usage", "CPU_PERCENT", "gt", 80.0, "warning", True
                ),
                AlertRule(
                    2, "Critical CPU Usage", "CPU_PERCENT", "gt", 95.0, "critical", True
                ),
                AlertRule(
                    3,
                    "High Memory Usage",
                    "MEMORY_PERCENT",
                    "gt",
                    85.0,
                    "warning",
                    True,
                ),
                AlertRule(
                    4,
                    "Critical Memory Usage",
                    "MEMORY_PERCENT",
                    "gt",
                    95.0,
                    "critical",
                    True,
                ),
                AlertRule(
                    5, "High Disk Usage", "DISK_PERCENT", "gt", 80.0, "warning", True
                ),
                AlertRule(
                    6,
                    "Critical Disk Usage",
                    "DISK_PERCENT",
                    "gt",
                    90.0,
                    "critical",
                    True,
                ),
            ]
        async with self.db_pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT id, name, metric_type, operator, threshold, severity, enabled FROM alert_rules"
            )
        return [
            AlertRule(
                id=row["id"],
                name=row["name"],
                metric_type=row["metric_type"],
                operator=row["operator"],
                threshold=row["threshold"],
                severity=row["severity"],
                enabled=row["enabled"],
            )
            for row in rows
        ]
    async def store_alert(self, alert: Alert) -> None:
        """Store triggered alert in database."""
        if not self.db_pool:
            return
        try:
            async with self.db_pool.acquire() as conn:
                await conn.execute(
                    """
                    INSERT INTO alerts (time, machine_id, rule_id, rule_name, metric_type, value, threshold, severity)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    """,
                    alert.triggered_at,
                    alert.machine_id,
                    alert.rule.id,
                    alert.rule.name,
                    alert.rule.metric_type,
                    alert.value,
                    alert.rule.threshold,
                    alert.rule.severity,
                )
        except Exception as e:
            self.logger.warning("alert_storage_failed", error=str(e))
    async def publish_alert(self, alert: Alert) -> None:
        """Publish alert event for other services (e.g., notifications)."""
        await self.publisher.publish(
            topic=f"alerts.{alert.rule.severity}",
            payload={
                "rule_name": alert.rule.name,
                "machine_id": alert.machine_id,
                "metric_type": alert.rule.metric_type,
                "value": alert.value,
                "threshold": alert.rule.threshold,
                "severity": alert.rule.severity,
                "triggered_at": alert.triggered_at.isoformat(),
            },
        )
    async def process_metrics(self, event_data: dict[str, Any]) -> None:
        """Process incoming metrics and evaluate alerts."""
        if not self.evaluator:
            return
        machine_id = event_data.get("machine_id", "unknown")
        metrics = event_data.get("metrics", {})
        alerts = self.evaluator.evaluate(machine_id, metrics)
        for alert in alerts:
            self.logger.warning(
                "alert_triggered",
                rule=alert.rule.name,
                machine_id=alert.machine_id,
                value=alert.value,
                threshold=alert.rule.threshold,
                severity=alert.rule.severity,
            )
            await self.store_alert(alert)
            await self.publish_alert(alert)
    async def run(self) -> None:
        """Main service loop."""
        self.running = True
        self.logger.info("alerts_service_starting")
        # Connect to database
        await self.connect_db()
        # Load rules
        rules = await self.load_rules()
        self.evaluator = AlertEvaluator(rules)
        self.logger.info("rules_loaded", count=len(rules))
        # Connect to event bus
        await self.subscriber.connect()
        await self.publisher.connect()
        self.logger.info("alerts_service_started")
        try:
            # Process events
            async for event in self.subscriber.consume():
                if not self.running:
                    break
                try:
                    await self.process_metrics(event.payload)
                except Exception as e:
                    self.logger.error("event_processing_error", error=str(e))
        except asyncio.CancelledError:
            self.logger.info("alerts_service_cancelled")
        finally:
            await self.subscriber.disconnect()
            await self.publisher.disconnect()
            if self.db_pool:
                await self.db_pool.close()
            self.logger.info("alerts_service_stopped")
    def stop(self) -> None:
        """Signal the service to stop."""
        self.running = False
 async def main():
    """Main entry point."""
    service = AlertsService()
    # Handle shutdown signals
    loop = asyncio.get_event_loop()
    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)
    await service.run()
 if __name__ == "__main__":
    asyncio.run(main())
--- a/services/alerts/requirements.txt
+++ b/services/alerts/requirements.txt
@@ -1,3 +1,5 @@
 grpcio>=1.60.0
 grpcio-tools>=1.60.0
 redis>=5.0.0
 asyncpg>=0.29.0
 structlog>=23.2.0
--- a/services/collector/init.py
+++ b/services/collector/init.py
@@ -0,0 +1 @@
 """Collector service."""
--- a/services/collector/main.py
+++ b/services/collector/main.py
@@ -0,0 +1,209 @@
 """Collector service - streams system metrics to the aggregator via gRPC."""
 import asyncio
 import signal
 import sys
 from pathlib import Path
 import grpc
 # Add project root to path for imports
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from services.collector.metrics import MetricsCollector
 from shared import metrics_pb2, metrics_pb2_grpc
 from shared.config import get_collector_config
 from shared.logging import setup_logging
 class CollectorService:
    """Main collector service that streams metrics to the aggregator."""
    def __init__(self):
        self.config = get_collector_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.running = False
        self.channel: grpc.aio.Channel | None = None
        self.stub: metrics_pb2_grpc.MetricsServiceStub | None = None
        self.collector = MetricsCollector(
            machine_id=self.config.machine_id,
            collect_cpu=self.config.collect_cpu,
            collect_memory=self.config.collect_memory,
            collect_disk=self.config.collect_disk,
            collect_network=self.config.collect_network,
            collect_load=self.config.collect_load,
        )
    async def connect(self) -> None:
        """Establish connection to the aggregator."""
        self.logger.info(
            "connecting_to_aggregator",
            aggregator_url=self.config.aggregator_url,
        )
        self.channel = grpc.aio.insecure_channel(
            self.config.aggregator_url,
            options=[
                ("grpc.keepalive_time_ms", 10000),
                ("grpc.keepalive_timeout_ms", 5000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )
        self.stub = metrics_pb2_grpc.MetricsServiceStub(self.channel)
        # Wait for channel to be ready
        try:
            await asyncio.wait_for(
                self.channel.channel_ready(),
                timeout=10.0,
            )
            self.logger.info("connected_to_aggregator")
        except asyncio.TimeoutError:
            self.logger.error("connection_timeout")
            raise
    async def disconnect(self) -> None:
        """Close connection to the aggregator."""
        if self.channel:
            await self.channel.close()
            self.channel = None
            self.stub = None
            self.logger.info("disconnected_from_aggregator")
    def _batch_to_proto(self, batch) -> list[metrics_pb2.Metric]:
        """Convert a MetricsBatch to protobuf messages."""
        protos = []
        for metric in batch.metrics:
            proto = metrics_pb2.Metric(
                machine_id=batch.machine_id,
                hostname=batch.hostname,
                timestamp_ms=batch.timestamp_ms,
                type=getattr(metrics_pb2, metric.metric_type, 0),
                value=metric.value,
                labels=metric.labels,
            )
            protos.append(proto)
        return protos
    async def _metric_generator(self):
        """Async generator that yields metrics at the configured interval."""
        while self.running:
            batch = self.collector.collect()
            protos = self._batch_to_proto(batch)
            for proto in protos:
                yield proto
            self.logger.debug(
                "collected_metrics",
                count=len(protos),
                machine_id=batch.machine_id,
            )
            await asyncio.sleep(self.config.collection_interval)
    async def stream_metrics(self) -> None:
        """Stream metrics to the aggregator."""
        if not self.stub:
            raise RuntimeError("Not connected to aggregator")
        retry_count = 0
        max_retries = 10
        base_delay = 1.0
        while self.running:
            try:
                self.logger.info("starting_metric_stream")
                response = await self.stub.StreamMetrics(self._metric_generator())
                self.logger.info(
                    "stream_completed",
                    success=response.success,
                    metrics_received=response.metrics_received,
                    message=response.message,
                )
                retry_count = 0
            except grpc.aio.AioRpcError as e:
                retry_count += 1
                delay = min(base_delay * (2**retry_count), 60.0)
                self.logger.warning(
                    "stream_error",
                    code=e.code().name,
                    details=e.details(),
                    retry_count=retry_count,
                    retry_delay=delay,
                )
                if retry_count >= max_retries:
                    self.logger.error("max_retries_exceeded")
                    raise
                await asyncio.sleep(delay)
                # Reconnect
                try:
                    await self.disconnect()
                    await self.connect()
                except Exception as conn_err:
                    self.logger.error("reconnect_failed", error=str(conn_err))
            except asyncio.CancelledError:
                self.logger.info("stream_cancelled")
                break
    async def run(self) -> None:
        """Main entry point for the collector service."""
        self.running = True
        self.logger.info(
            "collector_starting",
            machine_id=self.config.machine_id,
            interval=self.config.collection_interval,
        )
        # Initial CPU percent call to initialize (first call always returns 0)
        import psutil
        psutil.cpu_percent()
        await self.connect()
        try:
            await self.stream_metrics()
        finally:
            await self.disconnect()
            self.logger.info("collector_stopped")
    def stop(self) -> None:
        """Signal the collector to stop."""
        self.running = False
 async def main():
    """Main entry point."""
    service = CollectorService()
    # Handle shutdown signals
    loop = asyncio.get_event_loop()
    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)
    await service.run()
 if __name__ == "__main__":
    asyncio.run(main())
--- a/services/collector/metrics.py
+++ b/services/collector/metrics.py
@@ -0,0 +1,233 @@
 """System metrics collection using psutil."""
 import socket
 import time
 from dataclasses import dataclass, field
 import psutil
@dataclass
 class MetricPoint:
    """A single metric data point."""
    metric_type: str
    value: float
    labels: dict[str, str] = field(default_factory=dict)
@dataclass
 class MetricsBatch:
    """A batch of metrics from a single collection cycle."""
    machine_id: str
    hostname: str
    timestamp_ms: int
    metrics: list[MetricPoint]
 class MetricsCollector:
    """Collects system metrics using psutil."""
    def __init__(
        self,
        machine_id: str,
        collect_cpu: bool = True,
        collect_memory: bool = True,
        collect_disk: bool = True,
        collect_network: bool = True,
        collect_load: bool = True,
    ):
        self.machine_id = machine_id
        self.hostname = socket.gethostname()
        self.collect_cpu = collect_cpu
        self.collect_memory = collect_memory
        self.collect_disk = collect_disk
        self.collect_network = collect_network
        self.collect_load = collect_load
        # Track previous network counters for rate calculation
        self._prev_net_io: psutil._common.snetio | None = None
        self._prev_net_time: float | None = None
    def collect(self) -> MetricsBatch:
        """Collect all enabled metrics and return as a batch."""
        metrics: list[MetricPoint] = []
        if self.collect_cpu:
            metrics.extend(self._collect_cpu())
        if self.collect_memory:
            metrics.extend(self._collect_memory())
        if self.collect_disk:
            metrics.extend(self._collect_disk())
        if self.collect_network:
            metrics.extend(self._collect_network())
        if self.collect_load:
            metrics.extend(self._collect_load())
        return MetricsBatch(
            machine_id=self.machine_id,
            hostname=self.hostname,
            timestamp_ms=int(time.time() * 1000),
            metrics=metrics,
        )
    def _collect_cpu(self) -> list[MetricPoint]:
        """Collect CPU metrics."""
        metrics = []
        # Overall CPU percent
        cpu_percent = psutil.cpu_percent(interval=None)
        metrics.append(
            MetricPoint(
                metric_type="CPU_PERCENT",
                value=cpu_percent,
            )
        )
        # Per-core CPU percent
        per_cpu = psutil.cpu_percent(interval=None, percpu=True)
        for i, pct in enumerate(per_cpu):
            metrics.append(
                MetricPoint(
                    metric_type="CPU_PERCENT_PER_CORE",
                    value=pct,
                    labels={"core": str(i)},
                )
            )
        return metrics
    def _collect_memory(self) -> list[MetricPoint]:
        """Collect memory metrics."""
        mem = psutil.virtual_memory()
        return [
            MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
            MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
            MetricPoint(
                metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
            ),
        ]
    def _collect_disk(self) -> list[MetricPoint]:
        """Collect disk metrics."""
        metrics = []
        # Disk usage for root partition
        try:
            disk = psutil.disk_usage("/")
            metrics.append(
                MetricPoint(
                    metric_type="DISK_PERCENT",
                    value=disk.percent,
                    labels={"mount": "/"},
                )
            )
            metrics.append(
                MetricPoint(
                    metric_type="DISK_USED_BYTES",
                    value=float(disk.used),
                    labels={"mount": "/"},
                )
            )
        except (PermissionError, FileNotFoundError):
            pass
        # Disk I/O rates
        try:
            io = psutil.disk_io_counters()
            if io:
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_READ_BYTES_SEC",
                        value=float(
                            io.read_bytes
                        ),  # Will be converted to rate by aggregator
                    )
                )
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_WRITE_BYTES_SEC",
                        value=float(io.write_bytes),
                    )
                )
        except (PermissionError, AttributeError):
            pass
        return metrics
    def _collect_network(self) -> list[MetricPoint]:
        """Collect network metrics with rate calculation."""
        metrics = []
        try:
            net_io = psutil.net_io_counters()
            current_time = time.time()
            if self._prev_net_io is not None and self._prev_net_time is not None:
                time_delta = current_time - self._prev_net_time
                if time_delta > 0:
                    bytes_sent_rate = (
                        net_io.bytes_sent - self._prev_net_io.bytes_sent
                    ) / time_delta
                    bytes_recv_rate = (
                        net_io.bytes_recv - self._prev_net_io.bytes_recv
                    ) / time_delta
                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_SENT_BYTES_SEC",
                            value=bytes_sent_rate,
                        )
                    )
                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_RECV_BYTES_SEC",
                            value=bytes_recv_rate,
                        )
                    )
            self._prev_net_io = net_io
            self._prev_net_time = current_time
            # Connection count
            connections = len(psutil.net_connections(kind="inet"))
            metrics.append(
                MetricPoint(
                    metric_type="NETWORK_CONNECTIONS",
                    value=float(connections),
                )
            )
        except (PermissionError, psutil.AccessDenied):
            pass
        return metrics
    def _collect_load(self) -> list[MetricPoint]:
        """Collect load average metrics (Unix only)."""
        metrics = []
        try:
            load1, load5, load15 = psutil.getloadavg()
            metrics.append(MetricPoint(metric_type="LOAD_AVG_1M", value=load1))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_5M", value=load5))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_15M", value=load15))
        except (AttributeError, OSError):
            # Windows doesn't have getloadavg
            pass
        # Process count
        metrics.append(
            MetricPoint(
                metric_type="PROCESS_COUNT",
                value=float(len(psutil.pids())),
            )
        )
        return metrics
--- a/services/gateway/Dockerfile
+++ b/services/gateway/Dockerfile
@@ -21,6 +21,8 @@ RUN python -m grpc_tools.protoc \
    /app/proto/metrics.proto
 COPY services/gateway /app/services/gateway
 COPY services/aggregator/__init__.py /app/services/aggregator/__init__.py
 COPY services/aggregator/storage.py /app/services/aggregator/storage.py
 COPY web /app/web
 ENV PYTHONPATH=/app
--- a/services/gateway/init.py
+++ b/services/gateway/init.py
@@ -0,0 +1 @@
 """Gateway service."""
--- a/services/gateway/main.py
+++ b/services/gateway/main.py
@@ -0,0 +1,393 @@
 """Gateway service - FastAPI with WebSocket for real-time dashboard."""
 import asyncio
 import json
 import sys
 from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Any
 import grpc
 from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
 from fastapi.requests import Request
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 # Add project root to path for imports
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from services.aggregator.storage import TimescaleStorage
 from shared import metrics_pb2, metrics_pb2_grpc
 from shared.config import get_gateway_config
 from shared.events import get_subscriber
 from shared.logging import setup_logging
 # Global state
 config = get_gateway_config()
 logger = setup_logging(
    service_name=config.service_name,
    log_level=config.log_level,
    log_format=config.log_format,
 )
 # WebSocket connection manager
 class ConnectionManager:
    """Manages WebSocket connections for real-time updates."""
    def __init__(self):
        self.active_connections: list[WebSocket] = []
    async def connect(self, websocket: WebSocket) -> None:
        await websocket.accept()
        self.active_connections.append(websocket)
        logger.info("websocket_connected", total=len(self.active_connections))
    def disconnect(self, websocket: WebSocket) -> None:
        self.active_connections.remove(websocket)
        logger.info("websocket_disconnected", total=len(self.active_connections))
    async def broadcast(self, message: dict) -> None:
        """Broadcast message to all connected clients."""
        if not self.active_connections:
            return
        data = json.dumps(message)
        disconnected = []
        for connection in self.active_connections:
            try:
                await connection.send_text(data)
            except Exception:
                disconnected.append(connection)
        # Clean up disconnected
        for conn in disconnected:
            try:
                self.active_connections.remove(conn)
            except ValueError:
                pass
 manager = ConnectionManager()
 timescale: TimescaleStorage | None = None
 grpc_channel: grpc.aio.Channel | None = None
 grpc_stub: metrics_pb2_grpc.MetricsServiceStub | None = None
 async def event_listener():
    """Background task that listens for metric events and broadcasts to WebSocket clients."""
    logger.info("event_listener_starting")
    async with get_subscriber(topics=["metrics.raw"]) as subscriber:
        async for event in subscriber.consume():
            try:
                await manager.broadcast(
                    {
                        "type": "metrics",
                        "data": event.payload,
                        "timestamp": event.timestamp.isoformat(),
                    }
                )
            except Exception as e:
                logger.warning("broadcast_error", error=str(e))
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan manager."""
    global timescale, grpc_channel, grpc_stub
    logger.info("gateway_starting", port=config.http_port)
    # Connect to TimescaleDB for historical queries
    timescale = TimescaleStorage(config.timescale_url)
    try:
        await timescale.connect()
    except Exception as e:
        logger.warning("timescale_connection_failed", error=str(e))
        timescale = None
    # Connect to aggregator via gRPC
    grpc_channel = grpc.aio.insecure_channel(config.aggregator_url)
    grpc_stub = metrics_pb2_grpc.MetricsServiceStub(grpc_channel)
    # Start event listener in background
    listener_task = asyncio.create_task(event_listener())
    logger.info("gateway_started")
    yield
    # Cleanup
    listener_task.cancel()
    try:
        await listener_task
    except asyncio.CancelledError:
        pass
    if grpc_channel:
        await grpc_channel.close()
    if timescale:
        await timescale.disconnect()
    logger.info("gateway_stopped")
 # Create FastAPI app
 app = FastAPI(
    title="System Monitor Gateway",
    description="Real-time system monitoring dashboard",
    version="0.1.0",
    lifespan=lifespan,
 )
 # Mount static files
 static_path = Path(__file__).parent.parent.parent / "web" / "static"
 if static_path.exists():
    app.mount("/static", StaticFiles(directory=str(static_path)), name="static")
 # Templates
 templates_path = Path(__file__).parent.parent.parent / "web" / "templates"
 templates = (
    Jinja2Templates(directory=str(templates_path)) if templates_path.exists() else None
 )
 # ============================================================================
 # Health endpoints
 # ============================================================================
@app.get("/health")
 async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "service": "gateway"}
@app.get("/ready")
 async def readiness_check():
    """Readiness check - verifies dependencies."""
    checks = {"gateway": "ok"}
    # Check gRPC connection
    try:
        if grpc_stub:
            await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=2.0)
            checks["aggregator"] = "ok"
    except Exception as e:
        checks["aggregator"] = f"error: {str(e)}"
    # Check TimescaleDB
    if timescale and timescale._pool:
        checks["timescaledb"] = "ok"
    else:
        checks["timescaledb"] = "not connected"
    return {"status": "ready", "checks": checks}
 # ============================================================================
 # REST API endpoints
 # ============================================================================
@app.get("/api/machines")
 async def get_machines():
    """Get current state of all machines."""
    if not grpc_stub:
        raise HTTPException(status_code=503, detail="Aggregator not connected")
    try:
        response = await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=5.0)
        machines = []
        for state in response.machines:
            metrics = {}
            for m in state.current_metrics:
                metric_type = metrics_pb2.MetricType.Name(m.type)
                metrics[metric_type] = m.value
            machines.append(
                {
                    "machine_id": state.machine_id,
                    "hostname": state.hostname,
                    "last_seen_ms": state.last_seen_ms,
                    "health": metrics_pb2.HealthStatus.Name(state.health),
                    "metrics": metrics,
                }
            )
        return {"machines": machines}
    except grpc.aio.AioRpcError as e:
        raise HTTPException(status_code=503, detail=f"Aggregator error: {e.details()}")
@app.get("/api/machines/{machine_id}")
 async def get_machine(machine_id: str):
    """Get current state of a specific machine."""
    if not grpc_stub:
        raise HTTPException(status_code=503, detail="Aggregator not connected")
    try:
        response = await grpc_stub.GetCurrentState(
            metrics_pb2.StateRequest(machine_id=machine_id),
            timeout=5.0,
        )
        if not response.machine_id:
            raise HTTPException(status_code=404, detail="Machine not found")
        metrics = {}
        for m in response.current_metrics:
            metric_type = metrics_pb2.MetricType.Name(m.type)
            metrics[metric_type] = m.value
        return {
            "machine_id": response.machine_id,
            "hostname": response.hostname,
            "last_seen_ms": response.last_seen_ms,
            "health": metrics_pb2.HealthStatus.Name(response.health),
            "metrics": metrics,
        }
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.NOT_FOUND:
            raise HTTPException(status_code=404, detail="Machine not found")
        raise HTTPException(status_code=503, detail=f"Aggregator error: {e.details()}")
@app.get("/api/metrics")
 async def get_metrics(
    machine_id: str | None = Query(None),
    metric_type: str | None = Query(None),
    minutes: int = Query(60, ge=1, le=1440),
    limit: int = Query(1000, ge=1, le=10000),
 ):
    """Get historical metrics."""
    if not timescale:
        raise HTTPException(status_code=503, detail="TimescaleDB not connected")
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(minutes=minutes)
    try:
        metrics = await timescale.get_metrics(
            machine_id=machine_id,
            metric_type=metric_type,
            start_time=start_time,
            end_time=end_time,
            limit=limit,
        )
        return {"metrics": metrics, "count": len(metrics)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 # ============================================================================
 # WebSocket endpoint
 # ============================================================================
@app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time metric updates."""
    await manager.connect(websocket)
    try:
        # Send initial state
        if grpc_stub:
            try:
                response = await grpc_stub.GetAllStates(
                    metrics_pb2.Empty(), timeout=5.0
                )
                for state in response.machines:
                    metrics = {}
                    for m in state.current_metrics:
                        metric_type = metrics_pb2.MetricType.Name(m.type)
                        metrics[metric_type] = m.value
                    await websocket.send_json(
                        {
                            "type": "initial",
                            "data": {
                                "machine_id": state.machine_id,
                                "hostname": state.hostname,
                                "metrics": metrics,
                            },
                        }
                    )
            except Exception as e:
                logger.warning("initial_state_error", error=str(e))
        # Keep connection alive and handle incoming messages
        while True:
            try:
                data = await websocket.receive_text()
                # Handle ping/pong or commands from client
                if data == "ping":
                    await websocket.send_text("pong")
            except WebSocketDisconnect:
                break
    finally:
        manager.disconnect(websocket)
 # ============================================================================
 # Dashboard (HTML)
 # ============================================================================
@app.get("/", response_class=HTMLResponse)
 async def dashboard(request: Request):
    """Serve the dashboard HTML."""
    if templates:
        return templates.TemplateResponse("dashboard.html", {"request": request})
    # Fallback if templates not found
    return HTMLResponse("""
    <!DOCTYPE html>
    <html>
    <head>
        <title>System Monitor</title>
        <style>
            body { font-family: system-ui; background: #1a1a2e; color: #eee; padding: 2rem; }
            h1 { color: #e94560; }
            pre { background: #16213e; padding: 1rem; border-radius: 8px; overflow: auto; }
        </style>
    </head>
    <body>
        <h1>System Monitor</h1>
        <p>Dashboard template not found. API endpoints:</p>
        <ul>
            <li><a href="/api/machines">/api/machines</a> - Current state of all machines</li>
            <li><a href="/api/metrics">/api/metrics</a> - Historical metrics</li>
            <li><a href="/docs">/docs</a> - API documentation</li>
        </ul>
        <h2>Live Metrics</h2>
        <pre id="output">Connecting...</pre>
        <script>
            const ws = new WebSocket(`ws://${location.host}/ws`);
            const output = document.getElementById('output');
            ws.onmessage = (e) => {
                output.textContent = JSON.stringify(JSON.parse(e.data), null, 2);
            };
            ws.onclose = () => { output.textContent = 'Disconnected'; };
        </script>
    </body>
    </html>
    """)
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=config.http_port)
--- a/shared/init.py
+++ b/shared/init.py
@@ -0,0 +1,5 @@
 """Shared utilities and generated protobuf modules."""
 from . import metrics_pb2, metrics_pb2_grpc
 __all__ = ["metrics_pb2", "metrics_pb2_grpc"]
--- a/shared/config.py
+++ b/shared/config.py
@@ -0,0 +1,104 @@
 """Shared configuration management using Pydantic Settings."""
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class BaseConfig(BaseSettings):
    """Base configuration shared across all services."""
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
    # Service identification
    service_name: str = "unknown"
    machine_id: str = "local"
    # Logging
    log_level: str = "INFO"
    log_format: str = "json"  # "json" or "console"
    # Redis
    redis_url: str = "redis://localhost:6379"
    # Events
    events_backend: str = "redis_pubsub"
 class CollectorConfig(BaseConfig):
    """Collector service configuration."""
    service_name: str = "collector"
    # Aggregator connection
    aggregator_url: str = "localhost:50051"
    # Collection settings
    collection_interval: int = 5  # seconds
    # Metrics to collect
    collect_cpu: bool = True
    collect_memory: bool = True
    collect_disk: bool = True
    collect_network: bool = True
    collect_load: bool = True
 class AggregatorConfig(BaseConfig):
    """Aggregator service configuration."""
    service_name: str = "aggregator"
    # gRPC server
    grpc_port: int = 50051
    # TimescaleDB - can be set directly via TIMESCALE_URL
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
 class GatewayConfig(BaseConfig):
    """Gateway service configuration."""
    service_name: str = "gateway"
    # HTTP server
    http_port: int = 8000
    # Aggregator connection
    aggregator_url: str = "localhost:50051"
    # TimescaleDB - can be set directly via TIMESCALE_URL
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
 class AlertsConfig(BaseConfig):
    """Alerts service configuration."""
    service_name: str = "alerts"
    # TimescaleDB - can be set directly via TIMESCALE_URL or built from components
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
@lru_cache
 def get_collector_config() -> CollectorConfig:
    return CollectorConfig()
@lru_cache
 def get_aggregator_config() -> AggregatorConfig:
    return AggregatorConfig()
@lru_cache
 def get_gateway_config() -> GatewayConfig:
    return GatewayConfig()
@lru_cache
 def get_alerts_config() -> AlertsConfig:
    return AlertsConfig()
--- a/shared/logging.py
+++ b/shared/logging.py
@@ -0,0 +1,74 @@
 """Structured logging configuration."""
 import logging
 import sys
 from typing import Any
 import structlog
 def setup_logging(
    service_name: str,
    log_level: str = "INFO",
    log_format: str = "json",
 ) -> structlog.BoundLogger:
    """
    Configure structured logging for a service.
    Args:
        service_name: Name of the service for log context
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        log_format: Output format ("json" or "console")
    Returns:
        Configured structlog logger
    """
    # Shared processors
    shared_processors: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
    ]
    if log_format == "json":
        # JSON format for production
        processors = shared_processors + [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Console format for development
        processors = shared_processors + [
            structlog.dev.ConsoleRenderer(colors=True),
        ]
    structlog.configure(
        processors=processors,
        wrapper_class=structlog.make_filtering_bound_logger(
            getattr(logging, log_level.upper(), logging.INFO)
        ),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
    # Also configure standard library logging
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=getattr(logging, log_level.upper(), logging.INFO),
    )
    # Get logger with service context
    logger = structlog.get_logger(service=service_name)
    return logger
 def get_logger(name: str | None = None) -> structlog.BoundLogger:
    """Get a logger instance, optionally with a specific name."""
    if name:
        return structlog.get_logger(component=name)
    return structlog.get_logger()
--- a/shared/metrics_pb2.py
+++ b/shared/metrics_pb2.py
@@ -0,0 +1,93 @@
 # -*- coding: utf-8 -*-
 # Generated by the protocol buffer compiler.  DO NOT EDIT!
 # NO CHECKED-IN PROTOBUF GENCODE
 # source: metrics.proto
 # Protobuf Python Version: 6.31.1
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
 from google.protobuf import runtime_version as _runtime_version
 from google.protobuf import symbol_database as _symbol_database
 from google.protobuf.internal import builder as _builder
 _runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    31,
    1,
    '',
    'metrics.proto'
 )
 # @@protoc_insertion_point(imports)
 _sym_db = _symbol_database.Default()
 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rmetrics.proto\x12\nmonitoring\"\x07\n\x05\x45mpty\"\xd8\x01\n\x06Metric\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12$\n\x04type\x18\x04 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x05 \x01(\x01\x12.\n\x06labels\x18\x06 \x03(\x0b\x32\x1e.monitoring.Metric.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"s\n\x0bMetricBatch\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12(\n\x07metrics\x18\x04 \x03(\x0b\x32\x17.monitoring.MetricPoint\"\xa6\x01\n\x0bMetricPoint\x12$\n\x04type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x02 \x01(\x01\x12\x33\n\x06labels\x18\x03 \x03(\x0b\x32#.monitoring.MetricPoint.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"G\n\tStreamAck\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x18\n\x10metrics_received\x18\x02 \x01(\x03\x12\x0f\n\x07message\x18\x03 \x01(\t\"\"\n\x0cStateRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x8c\x02\n\x0cMachineState\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0clast_seen_ms\x18\x03 \x01(\x03\x12+\n\x0f\x63urrent_metrics\x18\x04 \x03(\x0b\x32\x12.monitoring.Metric\x12(\n\x06health\x18\x05 \x01(\x0e\x32\x18.monitoring.HealthStatus\x12\x38\n\x08metadata\x18\x06 \x03(\x0b\x32&.monitoring.MachineState.MetadataEntry\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\">\n\x10\x41llMachinesState\x12*\n\x08machines\x18\x01 \x03(\x0b\x32\x18.monitoring.MachineState\"\xd7\x01\n\x0e\x43ontrolCommand\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12<\n\x0fupdate_interval\x18\x02 \x01(\x0b\x32!.monitoring.UpdateIntervalCommandH\x00\x12\x37\n\x07restart\x18\x03 \x01(\x0b\x32$.monitoring.RestartCollectionCommandH\x00\x12/\n\x08shutdown\x18\x04 \x01(\x0b\x32\x1b.monitoring.ShutdownCommandH\x00\x42\t\n\x07\x63ommand\"1\n\x15UpdateIntervalCommand\x12\x18\n\x10interval_seconds\x18\x01 \x01(\x05\"\x1a\n\x18RestartCollectionCommand\"#\n\x0fShutdownCommand\x12\x10\n\x08graceful\x18\x01 \x01(\x08\"G\n\x0f\x43ontrolResponse\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07message\x18\x03 \x01(\t\"#\n\rConfigRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x80\x02\n\x0f\x43ollectorConfig\x12#\n\x1b\x63ollection_interval_seconds\x18\x01 \x01(\x05\x12/\n\x0f\x65nabled_metrics\x18\x02 \x03(\x0e\x32\x16.monitoring.MetricType\x12\x37\n\x06labels\x18\x03 \x03(\x0b\x32\'.monitoring.CollectorConfig.LabelsEntry\x12/\n\nthresholds\x18\x04 \x03(\x0b\x32\x1b.monitoring.ThresholdConfig\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"u\n\x0fThresholdConfig\x12+\n\x0bmetric_type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\x19\n\x11warning_threshold\x18\x02 \x01(\x01\x12\x1a\n\x12\x63ritical_threshold\x18\x03 \x01(\x01*\x8d\x03\n\nMetricType\x12\x1b\n\x17METRIC_TYPE_UNSPECIFIED\x10\x00\x12\x0f\n\x0b\x43PU_PERCENT\x10\x01\x12\x18\n\x14\x43PU_PERCENT_PER_CORE\x10\x02\x12\x12\n\x0eMEMORY_PERCENT\x10\x03\x12\x15\n\x11MEMORY_USED_BYTES\x10\x04\x12\x1a\n\x16MEMORY_AVAILABLE_BYTES\x10\x05\x12\x10\n\x0c\x44ISK_PERCENT\x10\x06\x12\x13\n\x0f\x44ISK_USED_BYTES\x10\x07\x12\x17\n\x13\x44ISK_READ_BYTES_SEC\x10\x08\x12\x18\n\x14\x44ISK_WRITE_BYTES_SEC\x10\t\x12\x1a\n\x16NETWORK_SENT_BYTES_SEC\x10\n\x12\x1a\n\x16NETWORK_RECV_BYTES_SEC\x10\x0b\x12\x17\n\x13NETWORK_CONNECTIONS\x10\x0c\x12\x11\n\rPROCESS_COUNT\x10\r\x12\x0f\n\x0bLOAD_AVG_1M\x10\x0e\x12\x0f\n\x0bLOAD_AVG_5M\x10\x0f\x12\x10\n\x0cLOAD_AVG_15M\x10\x10*o\n\x0cHealthStatus\x12\x1d\n\x19HEALTH_STATUS_UNSPECIFIED\x10\x00\x12\x0b\n\x07HEALTHY\x10\x01\x12\x0b\n\x07WARNING\x10\x02\x12\x0c\n\x08\x43RITICAL\x10\x03\x12\x0b\n\x07UNKNOWN\x10\x04\x12\x0b\n\x07OFFLINE\x10\x05\x32\xdc\x01\n\x0eMetricsService\x12>\n\rStreamMetrics\x12\x12.monitoring.Metric\x1a\x15.monitoring.StreamAck\"\x00(\x01\x12G\n\x0fGetCurrentState\x12\x18.monitoring.StateRequest\x1a\x18.monitoring.MachineState\"\x00\x12\x41\n\x0cGetAllStates\x12\x11.monitoring.Empty\x1a\x1c.monitoring.AllMachinesState\"\x00\x32Z\n\x0e\x43ontrolService\x12H\n\x07\x43ontrol\x12\x1a.monitoring.ControlCommand\x1a\x1b.monitoring.ControlResponse\"\x00(\x01\x30\x01\x32\xa1\x01\n\rConfigService\x12\x45\n\tGetConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x12I\n\x0bWatchConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x30\x01\x42%Z#github.com/your-org/sysmonstm/protob\x06proto3')
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'metrics_pb2', _globals)
 if not _descriptor._USE_C_DESCRIPTORS:
  _globals['DESCRIPTOR']._loaded_options = None
  _globals['DESCRIPTOR']._serialized_options = b'Z#github.com/your-org/sysmonstm/proto'
  _globals['_METRIC_LABELSENTRY']._loaded_options = None
  _globals['_METRIC_LABELSENTRY']._serialized_options = b'8\001'
  _globals['_METRICPOINT_LABELSENTRY']._loaded_options = None
  _globals['_METRICPOINT_LABELSENTRY']._serialized_options = b'8\001'
  _globals['_MACHINESTATE_METADATAENTRY']._loaded_options = None
  _globals['_MACHINESTATE_METADATAENTRY']._serialized_options = b'8\001'
  _globals['_COLLECTORCONFIG_LABELSENTRY']._loaded_options = None
  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_options = b'8\001'
  _globals['_METRICTYPE']._serialized_start=1810
  _globals['_METRICTYPE']._serialized_end=2207
  _globals['_HEALTHSTATUS']._serialized_start=2209
  _globals['_HEALTHSTATUS']._serialized_end=2320
  _globals['_EMPTY']._serialized_start=29
  _globals['_EMPTY']._serialized_end=36
  _globals['_METRIC']._serialized_start=39
  _globals['_METRIC']._serialized_end=255
  _globals['_METRIC_LABELSENTRY']._serialized_start=210
  _globals['_METRIC_LABELSENTRY']._serialized_end=255
  _globals['_METRICBATCH']._serialized_start=257
  _globals['_METRICBATCH']._serialized_end=372
  _globals['_METRICPOINT']._serialized_start=375
  _globals['_METRICPOINT']._serialized_end=541
  _globals['_METRICPOINT_LABELSENTRY']._serialized_start=210
  _globals['_METRICPOINT_LABELSENTRY']._serialized_end=255
  _globals['_STREAMACK']._serialized_start=543
  _globals['_STREAMACK']._serialized_end=614
  _globals['_STATEREQUEST']._serialized_start=616
  _globals['_STATEREQUEST']._serialized_end=650
  _globals['_MACHINESTATE']._serialized_start=653
  _globals['_MACHINESTATE']._serialized_end=921
  _globals['_MACHINESTATE_METADATAENTRY']._serialized_start=874
  _globals['_MACHINESTATE_METADATAENTRY']._serialized_end=921
  _globals['_ALLMACHINESSTATE']._serialized_start=923
  _globals['_ALLMACHINESSTATE']._serialized_end=985
  _globals['_CONTROLCOMMAND']._serialized_start=988
  _globals['_CONTROLCOMMAND']._serialized_end=1203
  _globals['_UPDATEINTERVALCOMMAND']._serialized_start=1205
  _globals['_UPDATEINTERVALCOMMAND']._serialized_end=1254
  _globals['_RESTARTCOLLECTIONCOMMAND']._serialized_start=1256
  _globals['_RESTARTCOLLECTIONCOMMAND']._serialized_end=1282
  _globals['_SHUTDOWNCOMMAND']._serialized_start=1284
  _globals['_SHUTDOWNCOMMAND']._serialized_end=1319
  _globals['_CONTROLRESPONSE']._serialized_start=1321
  _globals['_CONTROLRESPONSE']._serialized_end=1392
  _globals['_CONFIGREQUEST']._serialized_start=1394
  _globals['_CONFIGREQUEST']._serialized_end=1429
  _globals['_COLLECTORCONFIG']._serialized_start=1432
  _globals['_COLLECTORCONFIG']._serialized_end=1688
  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_start=210
  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_end=255
  _globals['_THRESHOLDCONFIG']._serialized_start=1690
  _globals['_THRESHOLDCONFIG']._serialized_end=1807
  _globals['_METRICSSERVICE']._serialized_start=2323
  _globals['_METRICSSERVICE']._serialized_end=2543
  _globals['_CONTROLSERVICE']._serialized_start=2545
  _globals['_CONTROLSERVICE']._serialized_end=2635
  _globals['_CONFIGSERVICE']._serialized_start=2638
  _globals['_CONFIGSERVICE']._serialized_end=2799
 # @@protoc_insertion_point(module_scope)
--- a/shared/metrics_pb2_grpc.py
+++ b/shared/metrics_pb2_grpc.py
@@ -0,0 +1,385 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import warnings
 from shared import metrics_pb2 as metrics__pb2
 GRPC_GENERATED_VERSION = '1.76.0'
 GRPC_VERSION = grpc.__version__
 _version_not_supported = False
 try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
 except ImportError:
    _version_not_supported = True
 if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + ' but the generated code in metrics_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )
 class MetricsServiceStub(object):
    """MetricsService handles streaming metrics from collectors to aggregator
    """
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.StreamMetrics = channel.stream_unary(
                '/monitoring.MetricsService/StreamMetrics',
                request_serializer=metrics__pb2.Metric.SerializeToString,
                response_deserializer=metrics__pb2.StreamAck.FromString,
                _registered_method=True)
        self.GetCurrentState = channel.unary_unary(
                '/monitoring.MetricsService/GetCurrentState',
                request_serializer=metrics__pb2.StateRequest.SerializeToString,
                response_deserializer=metrics__pb2.MachineState.FromString,
                _registered_method=True)
        self.GetAllStates = channel.unary_unary(
                '/monitoring.MetricsService/GetAllStates',
                request_serializer=metrics__pb2.Empty.SerializeToString,
                response_deserializer=metrics__pb2.AllMachinesState.FromString,
                _registered_method=True)
 class MetricsServiceServicer(object):
    """MetricsService handles streaming metrics from collectors to aggregator
    """
    def StreamMetrics(self, request_iterator, context):
        """Client-side streaming: collector streams metrics to aggregator
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GetCurrentState(self, request, context):
        """Get current state of a machine
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GetAllStates(self, request, context):
        """Get current state of all machines
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_MetricsServiceServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'StreamMetrics': grpc.stream_unary_rpc_method_handler(
                    servicer.StreamMetrics,
                    request_deserializer=metrics__pb2.Metric.FromString,
                    response_serializer=metrics__pb2.StreamAck.SerializeToString,
            ),
            'GetCurrentState': grpc.unary_unary_rpc_method_handler(
                    servicer.GetCurrentState,
                    request_deserializer=metrics__pb2.StateRequest.FromString,
                    response_serializer=metrics__pb2.MachineState.SerializeToString,
            ),
            'GetAllStates': grpc.unary_unary_rpc_method_handler(
                    servicer.GetAllStates,
                    request_deserializer=metrics__pb2.Empty.FromString,
                    response_serializer=metrics__pb2.AllMachinesState.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'monitoring.MetricsService', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('monitoring.MetricsService', rpc_method_handlers)
 # This class is part of an EXPERIMENTAL API.
 class MetricsService(object):
    """MetricsService handles streaming metrics from collectors to aggregator
    """
    @staticmethod
    def StreamMetrics(request_iterator,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.stream_unary(
            request_iterator,
            target,
            '/monitoring.MetricsService/StreamMetrics',
            metrics__pb2.Metric.SerializeToString,
            metrics__pb2.StreamAck.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
    @staticmethod
    def GetCurrentState(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/monitoring.MetricsService/GetCurrentState',
            metrics__pb2.StateRequest.SerializeToString,
            metrics__pb2.MachineState.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
    @staticmethod
    def GetAllStates(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/monitoring.MetricsService/GetAllStates',
            metrics__pb2.Empty.SerializeToString,
            metrics__pb2.AllMachinesState.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
 class ControlServiceStub(object):
    """ControlService handles bidirectional control commands
    """
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Control = channel.stream_stream(
                '/monitoring.ControlService/Control',
                request_serializer=metrics__pb2.ControlCommand.SerializeToString,
                response_deserializer=metrics__pb2.ControlResponse.FromString,
                _registered_method=True)
 class ControlServiceServicer(object):
    """ControlService handles bidirectional control commands
    """
    def Control(self, request_iterator, context):
        """Bidirectional streaming for commands and responses
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_ControlServiceServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Control': grpc.stream_stream_rpc_method_handler(
                    servicer.Control,
                    request_deserializer=metrics__pb2.ControlCommand.FromString,
                    response_serializer=metrics__pb2.ControlResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'monitoring.ControlService', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('monitoring.ControlService', rpc_method_handlers)
 # This class is part of an EXPERIMENTAL API.
 class ControlService(object):
    """ControlService handles bidirectional control commands
    """
    @staticmethod
    def Control(request_iterator,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.stream_stream(
            request_iterator,
            target,
            '/monitoring.ControlService/Control',
            metrics__pb2.ControlCommand.SerializeToString,
            metrics__pb2.ControlResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
 class ConfigServiceStub(object):
    """ConfigService handles dynamic configuration
    """
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.GetConfig = channel.unary_unary(
                '/monitoring.ConfigService/GetConfig',
                request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
                response_deserializer=metrics__pb2.CollectorConfig.FromString,
                _registered_method=True)
        self.WatchConfig = channel.unary_stream(
                '/monitoring.ConfigService/WatchConfig',
                request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
                response_deserializer=metrics__pb2.CollectorConfig.FromString,
                _registered_method=True)
 class ConfigServiceServicer(object):
    """ConfigService handles dynamic configuration
    """
    def GetConfig(self, request, context):
        """Get current configuration for a collector
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def WatchConfig(self, request, context):
        """Stream configuration updates
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_ConfigServiceServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'GetConfig': grpc.unary_unary_rpc_method_handler(
                    servicer.GetConfig,
                    request_deserializer=metrics__pb2.ConfigRequest.FromString,
                    response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
            ),
            'WatchConfig': grpc.unary_stream_rpc_method_handler(
                    servicer.WatchConfig,
                    request_deserializer=metrics__pb2.ConfigRequest.FromString,
                    response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'monitoring.ConfigService', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('monitoring.ConfigService', rpc_method_handlers)
 # This class is part of an EXPERIMENTAL API.
 class ConfigService(object):
    """ConfigService handles dynamic configuration
    """
    @staticmethod
    def GetConfig(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/monitoring.ConfigService/GetConfig',
            metrics__pb2.ConfigRequest.SerializeToString,
            metrics__pb2.CollectorConfig.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
    @staticmethod
    def WatchConfig(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(
            request,
            target,
            '/monitoring.ConfigService/WatchConfig',
            metrics__pb2.ConfigRequest.SerializeToString,
            metrics__pb2.CollectorConfig.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
--- a/web/static/.gitkeep
+++ b/web/static/.gitkeep
--- a/web/templates/dashboard.html
+++ b/web/templates/dashboard.html
@@ -0,0 +1,358 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>System Monitor Dashboard</title>
    <style>
        :root {
            --bg-primary: #1a1a2e;
            --bg-secondary: #16213e;
            --bg-card: #0f3460;
            --text-primary: #eee;
            --text-secondary: #a0a0a0;
            --accent: #e94560;
            --success: #4ade80;
            --warning: #fbbf24;
            --danger: #ef4444;
            --border: #2a2a4a;
        }
        * { box-sizing: border-box; margin: 0; padding: 0; }
        body {
            font-family: system-ui, -apple-system, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            min-height: 100vh;
        }
        header {
            background: var(--bg-secondary);
            padding: 1rem 2rem;
            border-bottom: 2px solid var(--accent);
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        header h1 { font-size: 1.5rem; }
        .status {
            display: flex;
            align-items: center;
            gap: 0.5rem;
            font-size: 0.875rem;
        }
        .status-dot {
            width: 10px;
            height: 10px;
            border-radius: 50%;
            background: var(--danger);
        }
        .status-dot.connected { background: var(--success); }
        main {
            padding: 1.5rem;
            max-width: 1600px;
            margin: 0 auto;
        }
        .machines-grid {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(400px, 1fr));
            gap: 1.5rem;
        }
        .machine-card {
            background: var(--bg-secondary);
            border-radius: 8px;
            padding: 1.25rem;
            border: 1px solid var(--border);
        }
        .machine-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 1rem;
            padding-bottom: 0.75rem;
            border-bottom: 1px solid var(--border);
        }
        .machine-name {
            font-weight: 600;
            color: var(--accent);
        }
        .machine-id {
            font-size: 0.75rem;
            color: var(--text-secondary);
        }
        .machine-status {
            font-size: 0.75rem;
            padding: 0.25rem 0.5rem;
            border-radius: 4px;
            background: var(--success);
            color: #000;
        }
        .machine-status.warning { background: var(--warning); }
        .machine-status.critical { background: var(--danger); color: #fff; }
        .machine-status.offline { background: var(--text-secondary); }
        .metrics-grid {
            display: grid;
            grid-template-columns: repeat(2, 1fr);
            gap: 0.75rem;
        }
        .metric {
            background: var(--bg-card);
            padding: 0.75rem;
            border-radius: 6px;
        }
        .metric-label {
            font-size: 0.75rem;
            color: var(--text-secondary);
            margin-bottom: 0.25rem;
        }
        .metric-value {
            font-size: 1.5rem;
            font-weight: 600;
        }
        .metric-bar {
            height: 4px;
            background: var(--border);
            border-radius: 2px;
            margin-top: 0.5rem;
            overflow: hidden;
        }
        .metric-bar-fill {
            height: 100%;
            background: var(--success);
            transition: width 0.3s ease;
        }
        .metric-bar-fill.warning { background: var(--warning); }
        .metric-bar-fill.critical { background: var(--danger); }
        .last-seen {
            font-size: 0.75rem;
            color: var(--text-secondary);
            margin-top: 1rem;
            text-align: right;
        }
        .no-machines {
            text-align: center;
            padding: 3rem;
            color: var(--text-secondary);
        }
        .no-machines h2 {
            color: var(--text-primary);
            margin-bottom: 0.5rem;
        }
        @media (max-width: 600px) {
            .machines-grid {
                grid-template-columns: 1fr;
            }
            .metrics-grid {
                grid-template-columns: 1fr;
            }
        }
    </style>
 </head>
 <body>
    <header>
        <h1>System Monitor</h1>
        <div class="status">
            <span class="status-dot" id="status-dot"></span>
            <span id="status-text">Connecting...</span>
        </div>
    </header>
    <main>
        <div class="machines-grid" id="machines-grid">
            <div class="no-machines">
                <h2>No machines connected</h2>
                <p>Waiting for collectors to send metrics...</p>
            </div>
        </div>
    </main>
    <script>
        const machinesGrid = document.getElementById('machines-grid');
        const statusDot = document.getElementById('status-dot');
        const statusText = document.getElementById('status-text');
        const machines = new Map();
        function formatBytes(bytes) {
            if (bytes === 0) return '0 B';
            const k = 1024;
            const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
            const i = Math.floor(Math.log(bytes) / Math.log(k));
            return parseFloat((bytes / Math.pow(k, i)).toFixed(1)) + ' ' + sizes[i];
        }
        function formatRate(bytesPerSec) {
            return formatBytes(bytesPerSec) + '/s';
        }
        function getBarClass(value, warning = 80, critical = 95) {
            if (value >= critical) return 'critical';
            if (value >= warning) return 'warning';
            return '';
        }
        function getStatusClass(metrics) {
            const cpu = metrics.CPU_PERCENT || 0;
            const mem = metrics.MEMORY_PERCENT || 0;
            const disk = metrics.DISK_PERCENT || 0;
            if (cpu > 95 || mem > 95 || disk > 90) return 'critical';
            if (cpu > 80 || mem > 85 || disk > 80) return 'warning';
            return '';
        }
        function timeSince(timestampMs) {
            const seconds = Math.floor((Date.now() - timestampMs) / 1000);
            if (seconds < 5) return 'just now';
            if (seconds < 60) return `${seconds}s ago`;
            const minutes = Math.floor(seconds / 60);
            if (minutes < 60) return `${minutes}m ago`;
            return `${Math.floor(minutes / 60)}h ago`;
        }
        function renderMachine(data) {
            const m = data.metrics || {};
            const statusClass = getStatusClass(m);
            return `
                <div class="machine-card" data-machine="${data.machine_id}">
                    <div class="machine-header">
                        <div>
                            <div class="machine-name">${data.hostname || data.machine_id}</div>
                            <div class="machine-id">${data.machine_id}</div>
                        </div>
                        <span class="machine-status ${statusClass}">${statusClass || 'healthy'}</span>
                    </div>
                    <div class="metrics-grid">
                        <div class="metric">
                            <div class="metric-label">CPU</div>
                            <div class="metric-value">${(m.CPU_PERCENT || 0).toFixed(1)}%</div>
                            <div class="metric-bar">
                                <div class="metric-bar-fill ${getBarClass(m.CPU_PERCENT || 0)}"
                                     style="width: ${m.CPU_PERCENT || 0}%"></div>
                            </div>
                        </div>
                        <div class="metric">
                            <div class="metric-label">Memory</div>
                            <div class="metric-value">${(m.MEMORY_PERCENT || 0).toFixed(1)}%</div>
                            <div class="metric-bar">
                                <div class="metric-bar-fill ${getBarClass(m.MEMORY_PERCENT || 0, 85, 95)}"
                                     style="width: ${m.MEMORY_PERCENT || 0}%"></div>
                            </div>
                        </div>
                        <div class="metric">
                            <div class="metric-label">Disk</div>
                            <div class="metric-value">${(m.DISK_PERCENT || 0).toFixed(1)}%</div>
                            <div class="metric-bar">
                                <div class="metric-bar-fill ${getBarClass(m.DISK_PERCENT || 0, 80, 90)}"
                                     style="width: ${m.DISK_PERCENT || 0}%"></div>
                            </div>
                        </div>
                        <div class="metric">
                            <div class="metric-label">Load (1m)</div>
                            <div class="metric-value">${(m.LOAD_AVG_1M || 0).toFixed(2)}</div>
                        </div>
                        <div class="metric">
                            <div class="metric-label">Network In</div>
                            <div class="metric-value">${formatRate(m.NETWORK_RECV_BYTES_SEC || 0)}</div>
                        </div>
                        <div class="metric">
                            <div class="metric-label">Network Out</div>
                            <div class="metric-value">${formatRate(m.NETWORK_SENT_BYTES_SEC || 0)}</div>
                        </div>
                    </div>
                    <div class="last-seen">Last seen: ${timeSince(data.timestamp_ms || Date.now())}</div>
                </div>
            `;
        }
        function updateUI() {
            if (machines.size === 0) {
                machinesGrid.innerHTML = `
                    <div class="no-machines">
                        <h2>No machines connected</h2>
                        <p>Waiting for collectors to send metrics...</p>
                    </div>
                `;
                return;
            }
            machinesGrid.innerHTML = Array.from(machines.values())
                .map(renderMachine)
                .join('');
        }
        function connect() {
            const ws = new WebSocket(`ws://${location.host}/ws`);
            ws.onopen = () => {
                statusDot.classList.add('connected');
                statusText.textContent = 'Connected';
            };
            ws.onclose = () => {
                statusDot.classList.remove('connected');
                statusText.textContent = 'Disconnected - Reconnecting...';
                setTimeout(connect, 3000);
            };
            ws.onerror = () => {
                statusDot.classList.remove('connected');
                statusText.textContent = 'Connection error';
            };
            ws.onmessage = (event) => {
                try {
                    const msg = JSON.parse(event.data);
                    if (msg.type === 'initial' || msg.type === 'metrics') {
                        const data = msg.data;
                        data.timestamp_ms = data.timestamp_ms || Date.now();
                        machines.set(data.machine_id, data);
                        updateUI();
                    }
                } catch (e) {
                    console.error('Failed to parse message:', e);
                }
            };
            // Send periodic pings
            setInterval(() => {
                if (ws.readyState === WebSocket.OPEN) {
                    ws.send('ping');
                }
            }, 30000);
        }
        // Update "last seen" timestamps periodically
        setInterval(updateUI, 5000);
        // Start connection
        connect();
    </script>
 </body>
 </html>