claude final draft

2025-12-29 23:44:30 -03:00
parent 116d4032e2
commit e5aafd5097
22 changed files with 2815 additions and 32 deletions
--- a/ctlptl.yaml
+++ b/ctlptl.yaml
@@ -1,32 +0,0 @@
-# ctlptl configuration for Kind cluster
-# Usage: ctlptl apply -f ctlptl.yaml
-
-apiVersion: ctlptl.dev/v1alpha1
-kind: Registry
-name: sysmonstm-registry
-port: 5005
---
-apiVersion: ctlptl.dev/v1alpha1
-kind: Cluster
-product: kind
-registry: sysmonstm-registry
-kindV1Alpha4Cluster:
-  name: sysmonstm
-  nodes:
-    - role: control-plane
-      extraPortMappings:
-        # Gateway HTTP
-        - containerPort: 30080
-          hostPort: 8080
-          protocol: TCP
-        # Aggregator gRPC
-        - containerPort: 30051
-          hostPort: 50051
-          protocol: TCP
-  # Resource limits for t2.small compatibility
-  kubeadmConfigPatches:
-    - |
-      kind: InitConfiguration
-      nodeRegistration:
-        kubeletExtraArgs:
-          system-reserved: memory=256Mi
--- a/scripts/generate-proto.sh
+++ b/scripts/generate-proto.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Generate Python gRPC code from proto definitions.
+#
+# Runs grpc_tools.protoc on proto/metrics.proto, writes the generated modules
+# into shared/, then rewrites the generated import so it resolves when the
+# code is imported as part of the `shared` package.
+
+# Abort on command failure, on use of unset variables, and on failures
+# inside pipelines.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$SCRIPT_DIR/.."
+
+cd "$PROJECT_ROOT"
+
+echo "Generating Python gRPC code from proto/metrics.proto..."
+
+python -m grpc_tools.protoc \
+    -I./proto \
+    --python_out=./shared \
+    --grpc_python_out=./shared \
+    ./proto/metrics.proto
+
+# Fix imports in generated files (grpc_tools generates incorrect imports).
+# The pattern is anchored to the start of the line so only the top-level
+# import statement is rewritten. A backup suffix is passed to -i (and the
+# backup removed) because a bare `-i` is accepted by GNU sed but not by
+# BSD/macOS sed.
+sed -i.bak 's/^import metrics_pb2/from shared import metrics_pb2/' shared/metrics_pb2_grpc.py
+rm -f shared/metrics_pb2_grpc.py.bak
+
+echo "Generated:"
+echo "  - shared/metrics_pb2.py"
+echo "  - shared/metrics_pb2_grpc.py"
--- a/services/aggregator/init.py
+++ b/services/aggregator/init.py
@@ -0,0 +1 @@
+"""Aggregator service."""
--- a/services/aggregator/main.py
+++ b/services/aggregator/main.py
@@ -0,0 +1,361 @@
+"""Aggregator service - gRPC server that receives metrics and stores them."""
+
+import asyncio
+import signal
+import sys
+from pathlib import Path
+
+import grpc
+from grpc_health.v1 import health, health_pb2, health_pb2_grpc
+
+# Add project root to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from services.aggregator.storage import RedisStorage, TimescaleStorage
+from shared import metrics_pb2, metrics_pb2_grpc
+from shared.config import get_aggregator_config
+from shared.events import get_publisher
+from shared.logging import setup_logging
+
+
+class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
+    """gRPC servicer for metrics ingestion and current-state queries.
+
+    Streamed metrics are buffered into small batches. Each batch is written
+    to Redis (current state) and TimescaleDB (history) and then published on
+    the ``metrics.raw`` topic. State queries are answered from Redis.
+    """
+
+    # Number of buffered metrics that triggers a flush to storage.
+    BATCH_SIZE = 20
+
+    def __init__(
+        self,
+        redis_storage: RedisStorage,
+        timescale_storage: TimescaleStorage,
+        event_publisher,
+        logger,
+    ):
+        """Keep references to the storage backends, publisher and logger."""
+        self.redis = redis_storage
+        self.timescale = timescale_storage
+        self.publisher = event_publisher
+        self.logger = logger
+
+    async def StreamMetrics(self, request_iterator, context):
+        """Receive streaming metrics from a collector.
+
+        Metrics are buffered and flushed when the buffer reaches
+        ``BATCH_SIZE``, when the stream switches to a different machine id,
+        and once more when the stream ends.
+
+        Returns:
+            A ``StreamAck`` with the number of metrics received. ``success``
+            is False and ``message`` carries the error text when an
+            exception interrupted the stream.
+        """
+        metrics_received = 0
+        current_machine = None
+        current_batch: list[tuple[str, float, dict]] = []
+        batch_timestamp = 0
+        batch_hostname = ""
+
+        try:
+            async for metric in request_iterator:
+                metrics_received += 1
+
+                # Track current machine
+                if current_machine != metric.machine_id:
+                    # Flush previous batch if switching machines
+                    if current_machine and current_batch:
+                        await self._flush_batch(
+                            current_machine,
+                            batch_hostname,
+                            batch_timestamp,
+                            current_batch,
+                        )
+                        current_batch = []
+
+                    current_machine = metric.machine_id
+                    self.logger.info(
+                        "collector_connected",
+                        machine_id=metric.machine_id,
+                        hostname=metric.hostname,
+                    )
+
+                # Get metric type name
+                metric_type = metrics_pb2.MetricType.Name(metric.type)
+
+                # Add to batch
+                current_batch.append(
+                    (
+                        metric_type,
+                        metric.value,
+                        dict(metric.labels),
+                    )
+                )
+                # The whole batch is stamped with the timestamp and hostname
+                # of the most recently received metric.
+                batch_timestamp = metric.timestamp_ms
+                batch_hostname = metric.hostname
+
+                # Flush once the batch is full
+                if len(current_batch) >= self.BATCH_SIZE:
+                    await self._flush_batch(
+                        current_machine, batch_hostname, batch_timestamp, current_batch
+                    )
+                    current_batch = []
+
+            # Flush remaining
+            if current_machine and current_batch:
+                await self._flush_batch(
+                    current_machine, batch_hostname, batch_timestamp, current_batch
+                )
+
+            self.logger.info(
+                "stream_completed",
+                machine_id=current_machine,
+                metrics_received=metrics_received,
+            )
+
+            return metrics_pb2.StreamAck(
+                success=True,
+                metrics_received=metrics_received,
+                message="OK",
+            )
+
+        except Exception as e:
+            # Report the failure to the collector in the ack instead of
+            # letting the exception escape the handler.
+            self.logger.error(
+                "stream_error",
+                error=str(e),
+                machine_id=current_machine,
+                metrics_received=metrics_received,
+            )
+            return metrics_pb2.StreamAck(
+                success=False,
+                metrics_received=metrics_received,
+                message=str(e),
+            )
+
+    async def _flush_batch(
+        self,
+        machine_id: str,
+        hostname: str,
+        timestamp_ms: int,
+        batch: list[tuple[str, float, dict]],
+    ) -> None:
+        """Flush a batch of metrics to storage and events.
+
+        The batch is collapsed into a ``{key: value}`` dict where ``key`` is
+        the metric type name, or ``TYPE:k1=v1,k2=v2`` when the metric has
+        labels. That dict is written to Redis and published on
+        ``metrics.raw``; the raw batch is inserted into TimescaleDB.
+
+        TimescaleDB failures are logged as warnings and do not abort the
+        flush. Errors from Redis or the publisher propagate to the caller.
+        """
+
+        # Aggregate metrics for Redis state (later entries with the same key
+        # overwrite earlier ones)
+        metrics_dict = {}
+        for metric_type, value, labels in batch:
+            key = metric_type
+            if labels:
+                key = f"{metric_type}:{','.join(f'{k}={v}' for k, v in labels.items())}"
+            metrics_dict[key] = value
+
+        # Update Redis (current state)
+        await self.redis.update_machine_state(
+            machine_id=machine_id,
+            hostname=hostname,
+            metrics=metrics_dict,
+            timestamp_ms=timestamp_ms,
+        )
+
+        # Insert into TimescaleDB (historical)
+        try:
+            await self.timescale.insert_metrics(
+                machine_id=machine_id,
+                hostname=hostname,
+                timestamp_ms=timestamp_ms,
+                metrics=batch,
+            )
+        except Exception as e:
+            self.logger.warning("timescale_insert_failed", error=str(e))
+
+        # Update machine registry
+        try:
+            await self.timescale.update_machine_registry(
+                machine_id=machine_id,
+                hostname=hostname,
+            )
+        except Exception as e:
+            self.logger.warning("machine_registry_update_failed", error=str(e))
+
+        # Publish event for subscribers (alerts, gateway)
+        await self.publisher.publish(
+            topic="metrics.raw",
+            payload={
+                "machine_id": machine_id,
+                "hostname": hostname,
+                "timestamp_ms": timestamp_ms,
+                "metrics": metrics_dict,
+            },
+        )
+
+        self.logger.debug(
+            "batch_flushed",
+            machine_id=machine_id,
+            count=len(batch),
+        )
+
+    @staticmethod
+    def _parse_metric_key(key: str) -> tuple[str, dict[str, str]]:
+        """Split a stored metric key into ``(metric type name, labels)``.
+
+        Inverse of the key format built in ``_flush_batch``: ``TYPE`` or
+        ``TYPE:k1=v1,k2=v2``. Only the first ``:`` of the key and the first
+        ``=`` of each pair are treated as separators, so label values that
+        contain ``:`` or ``=`` survive the round trip. Values containing
+        ``,`` cannot be recovered exactly because the key format does not
+        escape its separators.
+        """
+        metric_type_str, _, label_part = key.partition(":")
+        labels: dict[str, str] = {}
+        if label_part:
+            for pair in label_part.split(","):
+                label_name, _, label_value = pair.partition("=")
+                labels[label_name] = label_value
+        return metric_type_str, labels
+
+    def _state_to_proto(self, state: dict):
+        """Convert a stored machine-state dict into a ``MachineState`` proto.
+
+        Every entry of ``state["metrics"]`` becomes one ``Metric`` carrying
+        the machine's id, hostname and last-seen timestamp. Metric type names
+        that are not attributes of ``metrics_pb2`` map to enum value 0.
+        """
+        metrics = []
+        for key, value in state.get("metrics", {}).items():
+            metric_type_str, labels = self._parse_metric_key(key)
+            metric_type = getattr(metrics_pb2, metric_type_str, 0)
+            metrics.append(
+                metrics_pb2.Metric(
+                    machine_id=state["machine_id"],
+                    hostname=state["hostname"],
+                    timestamp_ms=state["last_seen_ms"],
+                    type=metric_type,
+                    value=value,
+                    labels=labels,
+                )
+            )
+
+        return metrics_pb2.MachineState(
+            machine_id=state["machine_id"],
+            hostname=state["hostname"],
+            last_seen_ms=state["last_seen_ms"],
+            current_metrics=metrics,
+            health=metrics_pb2.HEALTHY,
+        )
+
+    async def GetCurrentState(self, request, context):
+        """Get current state for a single machine.
+
+        Sets ``NOT_FOUND`` on the context and returns an empty
+        ``MachineState`` when Redis holds no state for the machine id.
+        """
+        state = await self.redis.get_machine_state(request.machine_id)
+
+        if not state:
+            context.set_code(grpc.StatusCode.NOT_FOUND)
+            context.set_details(f"Machine {request.machine_id} not found")
+            return metrics_pb2.MachineState()
+
+        return self._state_to_proto(state)
+
+    async def GetAllStates(self, request, context):
+        """Get current state for all machines known to Redis."""
+        states = await self.redis.get_all_machines()
+
+        # Same conversion as GetCurrentState, so labels are included here too.
+        machine_states = [self._state_to_proto(state) for state in states]
+
+        return metrics_pb2.AllMachinesState(machines=machine_states)
+
+
+class AggregatorService:
+    """Owns the aggregator's storage clients, event publisher and gRPC server."""
+
+    def __init__(self):
+        """Load configuration, set up logging and create unconnected clients."""
+        cfg = get_aggregator_config()
+        self.config = cfg
+        self.logger = setup_logging(
+            service_name=cfg.service_name,
+            log_level=cfg.log_level,
+            log_format=cfg.log_format,
+        )
+
+        self.redis = RedisStorage(cfg.redis_url)
+        self.timescale = TimescaleStorage(cfg.timescale_url)
+        self.publisher = get_publisher(source="aggregator")
+
+        self.server: grpc.aio.Server | None = None
+        self.running = False
+
+    def _register_servicers(self, server) -> None:
+        """Attach the metrics servicer and the health servicer to ``server``."""
+        metrics_pb2_grpc.add_MetricsServiceServicer_to_server(
+            MetricsServicer(
+                redis_storage=self.redis,
+                timescale_storage=self.timescale,
+                event_publisher=self.publisher,
+                logger=self.logger,
+            ),
+            server,
+        )
+
+        # Both the overall server ("") and the named service report SERVING.
+        health_servicer = health.HealthServicer()
+        for service_name in ("", "MetricsService"):
+            health_servicer.set(service_name, health_pb2.HealthCheckResponse.SERVING)
+        health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
+
+    async def start(self) -> None:
+        """Connect the backends, then build and start the gRPC server.
+
+        A TimescaleDB connection failure is logged and tolerated; failures
+        connecting to Redis or the event publisher propagate.
+        """
+        self.running = True
+
+        await self.redis.connect()
+
+        try:
+            await self.timescale.connect()
+        except Exception as exc:
+            self.logger.warning(
+                "timescale_connection_failed",
+                error=str(exc),
+                message="Continuing without TimescaleDB - metrics won't be persisted",
+            )
+
+        await self.publisher.connect()
+
+        server = grpc.aio.server()
+        self.server = server
+        self._register_servicers(server)
+
+        listen_addr = f"[::]:{self.config.grpc_port}"
+        server.add_insecure_port(listen_addr)
+
+        await server.start()
+
+        self.logger.info(
+            "aggregator_started",
+            port=self.config.grpc_port,
+            listen_addr=listen_addr,
+        )
+
+    async def stop(self) -> None:
+        """Stop the gRPC server (5 second grace) and disconnect all clients."""
+        self.running = False
+
+        if self.server:
+            await self.server.stop(grace=5)
+            self.server = None
+
+        # Disconnect in the order: publisher, TimescaleDB, Redis.
+        for client in (self.publisher, self.timescale, self.redis):
+            await client.disconnect()
+
+        self.logger.info("aggregator_stopped")
+
+    async def wait(self) -> None:
+        """Block until the gRPC server terminates; no-op if none is running."""
+        server = self.server
+        if server:
+            await server.wait_for_termination()
+
+
+async def main():
+    """Run the aggregator until SIGTERM or SIGINT, then shut it down.
+
+    The signal handler only sets an event. ``service.stop()`` is awaited by
+    this coroutine itself, so the disconnect steps run to completion instead
+    of living in a background task that ``asyncio.run`` would cancel as soon
+    as this coroutine returns.
+    """
+    service = AggregatorService()
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    shutdown_requested = asyncio.Event()
+
+    def request_shutdown() -> None:
+        service.logger.info("shutdown_signal_received")
+        shutdown_requested.set()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, request_shutdown)
+
+    await service.start()
+    try:
+        await shutdown_requested.wait()
+    finally:
+        # Runs on normal shutdown and if this coroutine is cancelled.
+        await service.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/services/aggregator/storage.py
+++ b/services/aggregator/storage.py
@@ -0,0 +1,245 @@
+"""Storage layer for metrics - Redis (current state) and TimescaleDB (historical)."""
+
+import json
+import time
+from datetime import datetime
+from typing import Any
+
+import asyncpg
+import redis.asyncio as redis
+
+from shared.logging import get_logger
+
+logger = get_logger("storage")
+
+
+class RedisStorage:
+    """Redis storage for current machine state.
+
+    Each machine is stored as a hash at ``machine:<id>`` with a JSON ``state``
+    field and a ``last_seen`` field; ids are also tracked in the
+    ``machines:active`` set.
+    """
+
+    # Seconds without updates after which a machine's key expires.
+    STATE_TTL_SECONDS = 300
+
+    def __init__(self, redis_url: str):
+        """Remember the URL; no connection is made until ``connect()``."""
+        self.redis_url = redis_url
+        self._client: redis.Redis | None = None
+
+    async def connect(self) -> None:
+        """Create the client and verify the connection with a PING."""
+        self._client = redis.from_url(self.redis_url, decode_responses=True)
+        await self._client.ping()
+        # NOTE(review): the full URL is logged; confirm it never carries
+        # credentials.
+        logger.info("redis_connected", url=self.redis_url)
+
+    async def disconnect(self) -> None:
+        """Close the client if one is open."""
+        if self._client:
+            await self._client.close()
+            self._client = None
+            logger.info("redis_disconnected")
+
+    async def update_machine_state(
+        self,
+        machine_id: str,
+        hostname: str,
+        metrics: dict[str, float],
+        timestamp_ms: int,
+    ) -> None:
+        """Update the current state for a machine.
+
+        Writes the state hash, refreshes its expiry and adds the machine to
+        the active set in a single MULTI/EXEC pipeline, so a failure cannot
+        leave a state key behind without a TTL.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not been called.
+        """
+        if not self._client:
+            raise RuntimeError("Not connected to Redis")
+
+        state = {
+            "machine_id": machine_id,
+            "hostname": hostname,
+            "last_seen_ms": timestamp_ms,
+            "metrics": metrics,
+            "updated_at": datetime.utcnow().isoformat(),
+        }
+
+        key = f"machine:{machine_id}"
+        async with self._client.pipeline(transaction=True) as pipe:
+            # Commands are only queued here; they are sent by execute().
+            pipe.hset(
+                key,
+                mapping={
+                    "state": json.dumps(state),
+                    "last_seen": str(timestamp_ms),
+                },
+            )
+            # Set expiry - if no updates for 5 minutes, consider stale
+            pipe.expire(key, self.STATE_TTL_SECONDS)
+            # Add to active machines set
+            pipe.sadd("machines:active", machine_id)
+            await pipe.execute()
+
+    async def get_machine_state(self, machine_id: str) -> dict[str, Any] | None:
+        """Get current state for a machine, or None if no state is stored.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not been called.
+        """
+        if not self._client:
+            raise RuntimeError("Not connected to Redis")
+
+        key = f"machine:{machine_id}"
+        data = await self._client.hget(key, "state")
+
+        if data:
+            return json.loads(data)
+        return None
+
+    async def get_all_machines(self) -> list[dict[str, Any]]:
+        """Get current state for all active machines.
+
+        Ids in the active set whose state key no longer exists are removed
+        from the set as a side effect.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not been called.
+        """
+        if not self._client:
+            raise RuntimeError("Not connected to Redis")
+
+        machine_ids = await self._client.smembers("machines:active")
+        states = []
+
+        for machine_id in machine_ids:
+            state = await self.get_machine_state(machine_id)
+            if state:
+                states.append(state)
+            else:
+                # Remove stale machine from active set
+                await self._client.srem("machines:active", machine_id)
+
+        return states
+
+
+class TimescaleStorage:
+    """Historical metrics store backed by an asyncpg connection pool."""
+
+    def __init__(self, connection_url: str):
+        """Remember the URL; the pool is created by ``connect()``."""
+        self.connection_url = connection_url
+        self._pool: asyncpg.Pool | None = None
+
+    async def connect(self) -> None:
+        """Create the connection pool (2 to 10 connections)."""
+        self._pool = await asyncpg.create_pool(
+            self.connection_url,
+            min_size=2,
+            max_size=10,
+        )
+        logger.info("timescaledb_connected")
+
+    async def disconnect(self) -> None:
+        """Close the pool if one is open."""
+        if not self._pool:
+            return
+        await self._pool.close()
+        self._pool = None
+        logger.info("timescaledb_disconnected")
+
+    async def insert_metrics(
+        self,
+        machine_id: str,
+        hostname: str,
+        timestamp_ms: int,
+        metrics: list[tuple[str, float, dict[str, str]]],
+    ) -> int:
+        """
+        Write one batch of metrics into ``metrics_raw``.
+
+        Every row of the batch shares the same machine, hostname and
+        timestamp; labels are stored as a JSON string.
+
+        Args:
+            machine_id: Machine identifier
+            hostname: Machine hostname
+            timestamp_ms: Timestamp in milliseconds
+            metrics: List of (metric_type, value, labels) tuples
+
+        Returns:
+            Number of rows inserted
+
+        Raises:
+            RuntimeError: If ``connect()`` has not succeeded.
+        """
+        if not self._pool:
+            raise RuntimeError("Not connected to TimescaleDB")
+
+        # NOTE(review): this is a naive datetime holding UTC wall-clock time;
+        # confirm that matches the column type of metrics_raw.time.
+        timestamp = datetime.utcfromtimestamp(timestamp_ms / 1000)
+
+        rows = []
+        for metric_type, value, labels in metrics:
+            rows.append(
+                (timestamp, machine_id, hostname, metric_type, value, json.dumps(labels))
+            )
+
+        async with self._pool.acquire() as conn:
+            await conn.executemany(
+                """
+                INSERT INTO metrics_raw (time, machine_id, hostname, metric_type, value, labels)
+                VALUES ($1, $2, $3, $4, $5, $6)
+                """,
+                rows,
+            )
+
+        return len(rows)
+
+    async def update_machine_registry(
+        self,
+        machine_id: str,
+        hostname: str,
+        health: str = "HEALTHY",
+    ) -> None:
+        """Upsert the machine's row in ``machines`` with ``last_seen = NOW()``.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not succeeded.
+        """
+        if not self._pool:
+            raise RuntimeError("Not connected to TimescaleDB")
+
+        upsert_args = (machine_id, hostname, health)
+        async with self._pool.acquire() as conn:
+            await conn.execute(
+                """
+                INSERT INTO machines (machine_id, hostname, last_seen, health)
+                VALUES ($1, $2, NOW(), $3)
+                ON CONFLICT (machine_id) DO UPDATE
+                SET hostname = $2, last_seen = NOW(), health = $3
+                """,
+                *upsert_args,
+            )
+
+    async def get_metrics(
+        self,
+        machine_id: str | None = None,
+        metric_type: str | None = None,
+        start_time: datetime | None = None,
+        end_time: datetime | None = None,
+        limit: int = 1000,
+    ) -> list[dict[str, Any]]:
+        """Query historical metrics, newest first.
+
+        Each filter that is given (truthy) adds one parameterised condition;
+        with no filters every row is eligible. At most ``limit`` rows are
+        returned, each as a dict with an ISO-formatted ``time`` and decoded
+        ``labels``.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not succeeded.
+        """
+        if not self._pool:
+            raise RuntimeError("Not connected to TimescaleDB")
+
+        # (SQL comparison, value) pairs in placeholder order.
+        candidate_filters = (
+            ("machine_id =", machine_id),
+            ("metric_type =", metric_type),
+            ("time >=", start_time),
+            ("time <=", end_time),
+        )
+
+        conditions = []
+        params = []
+        for comparison, filter_value in candidate_filters:
+            if filter_value:
+                params.append(filter_value)
+                # Placeholder number equals the value's 1-based position.
+                conditions.append(f"{comparison} ${len(params)}")
+
+        where_clause = " AND ".join(conditions) or "TRUE"
+        limit_slot = len(params) + 1
+
+        query = f"""
+            SELECT time, machine_id, hostname, metric_type, value, labels
+            FROM metrics_raw
+            WHERE {where_clause}
+            ORDER BY time DESC
+            LIMIT ${limit_slot}
+        """
+        params.append(limit)
+
+        async with self._pool.acquire() as conn:
+            rows = await conn.fetch(query, *params)
+
+        results = []
+        for row in rows:
+            raw_labels = row["labels"]
+            results.append(
+                {
+                    "time": row["time"].isoformat(),
+                    "machine_id": row["machine_id"],
+                    "hostname": row["hostname"],
+                    "metric_type": row["metric_type"],
+                    "value": row["value"],
+                    "labels": json.loads(raw_labels) if raw_labels else {},
+                }
+            )
+        return results
--- a/services/alerts/Dockerfile
+++ b/services/alerts/Dockerfile
@@ -14,6 +14,12 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY shared /app/shared
 COPY proto /app/proto

+# Generate the gRPC modules into the shared package, then rewrite the
+# generated top-level import so it resolves as `shared.metrics_pb2`
+# (same fix as scripts/generate-proto.sh applies).
+RUN python -m grpc_tools.protoc \
+    -I/app/proto \
+    --python_out=/app/shared \
+    --grpc_python_out=/app/shared \
+    /app/proto/metrics.proto \
+    && sed -i 's/^import metrics_pb2/from shared import metrics_pb2/' /app/shared/metrics_pb2_grpc.py
+
 COPY services/alerts /app/services/alerts

 ENV PYTHONPATH=/app
--- a/services/alerts/init.py
+++ b/services/alerts/init.py
@@ -0,0 +1 @@
+"""Alerts service."""
--- a/services/alerts/main.py
+++ b/services/alerts/main.py
@@ -0,0 +1,317 @@
+"""Alerts service - subscribes to metrics events and evaluates thresholds."""
+
+import asyncio
+import signal
+import sys
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import asyncpg
+
+# Add project root to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from shared.config import get_alerts_config
+from shared.events import get_publisher, get_subscriber
+from shared.logging import setup_logging
+
+
+@dataclass
+class AlertRule:
+    """An alert rule configuration."""
+
+    id: int
+    name: str
+    metric_type: str
+    operator: str  # gt, lt, gte, lte, eq
+    threshold: float
+    severity: str  # warning, critical
+    enabled: bool
+
+
+@dataclass
+class Alert:
+    """A triggered alert."""
+
+    rule: AlertRule
+    machine_id: str
+    value: float
+    triggered_at: datetime
+
+
+class AlertEvaluator:
+    """Evaluates metrics against alert rules.
+
+    Several rules may target the same metric type (for example a warning and
+    a critical threshold); every enabled rule is evaluated independently.
+    """
+
+    OPERATORS = {
+        "gt": lambda v, t: v > t,
+        "lt": lambda v, t: v < t,
+        "gte": lambda v, t: v >= t,
+        "lte": lambda v, t: v <= t,
+        "eq": lambda v, t: v == t,
+    }
+
+    def __init__(self, rules: list[AlertRule]):
+        """Index the enabled rules and start with no active alerts."""
+        # metric_type -> all enabled rules for that type, in input order
+        self.rules: dict[str, list[AlertRule]] = self._index_rules(rules)
+        # Track active alerts to avoid duplicates
+        # key: f"{machine_id}:{rule_name}", plus f":{labels}" for labelled metrics
+        self.active_alerts: dict[str, Alert] = {}
+
+    @staticmethod
+    def _index_rules(rules: list[AlertRule]) -> dict[str, list[AlertRule]]:
+        """Group enabled rules by metric type, keeping every rule."""
+        indexed: dict[str, list[AlertRule]] = {}
+        for rule in rules:
+            if rule.enabled:
+                indexed.setdefault(rule.metric_type, []).append(rule)
+        return indexed
+
+    def evaluate(self, machine_id: str, metrics: dict[str, float]) -> list[Alert]:
+        """Evaluate metrics against rules and return new alerts.
+
+        Args:
+            machine_id: Machine the metrics belong to.
+            metrics: Mapping of metric key to value. A key is either a bare
+                metric type or ``TYPE:labels``; rules are matched on the part
+                before the first ``:``.
+
+        Returns:
+            Alerts that became active during this call. A rule that is
+            already active for the same machine (and label set) is not
+            reported again until its condition has cleared.
+        """
+        new_alerts = []
+
+        for metric_key, value in metrics.items():
+            metric_type, _, label_part = metric_key.partition(":")
+
+            for rule in self.rules.get(metric_type, ()):
+                op_func = self.OPERATORS.get(rule.operator)
+                if not op_func:
+                    # Unknown operator: the rule cannot be evaluated.
+                    continue
+
+                alert_key = f"{machine_id}:{rule.name}"
+                if label_part:
+                    # Track each labelled series separately so one series
+                    # recovering does not resolve another's alert.
+                    alert_key = f"{alert_key}:{label_part}"
+
+                if op_func(value, rule.threshold):
+                    # Threshold exceeded
+                    if alert_key not in self.active_alerts:
+                        alert = Alert(
+                            rule=rule,
+                            machine_id=machine_id,
+                            value=value,
+                            triggered_at=datetime.utcnow(),
+                        )
+                        self.active_alerts[alert_key] = alert
+                        new_alerts.append(alert)
+                else:
+                    # Threshold no longer exceeded - resolve alert
+                    self.active_alerts.pop(alert_key, None)
+
+        return new_alerts
+
+    def update_rules(self, rules: list[AlertRule]) -> None:
+        """Replace the rules being evaluated; active alerts are kept."""
+        self.rules = self._index_rules(rules)
+
+
+class AlertsService:
+    """Consumes ``metrics.raw`` events, evaluates rules, stores and publishes alerts."""
+
+    def __init__(self):
+        """Load configuration, set up logging and create unconnected clients."""
+        cfg = get_alerts_config()
+        self.config = cfg
+        self.logger = setup_logging(
+            service_name=cfg.service_name,
+            log_level=cfg.log_level,
+            log_format=cfg.log_format,
+        )
+
+        self.running = False
+        self.db_pool: asyncpg.Pool | None = None
+        self.evaluator: AlertEvaluator | None = None
+        self.subscriber = get_subscriber(topics=["metrics.raw"])
+        self.publisher = get_publisher(source="alerts")
+
+    async def connect_db(self) -> None:
+        """Open the database pool; on failure log a warning and run without one."""
+        try:
+            self.db_pool = await asyncpg.create_pool(
+                self.config.timescale_url,
+                min_size=1,
+                max_size=5,
+            )
+        except Exception as exc:
+            self.logger.warning("database_connection_failed", error=str(exc))
+            self.db_pool = None
+        else:
+            self.logger.info("database_connected")
+
+    async def load_rules(self) -> list[AlertRule]:
+        """Return the rules from ``alert_rules``, or built-in defaults without a DB."""
+        if not self.db_pool:
+            # (name, metric_type, threshold, severity); ids are assigned 1..6
+            # in this order and every default is an enabled "gt" rule.
+            default_table = (
+                ("High CPU Usage", "CPU_PERCENT", 80.0, "warning"),
+                ("Critical CPU Usage", "CPU_PERCENT", 95.0, "critical"),
+                ("High Memory Usage", "MEMORY_PERCENT", 85.0, "warning"),
+                ("Critical Memory Usage", "MEMORY_PERCENT", 95.0, "critical"),
+                ("High Disk Usage", "DISK_PERCENT", 80.0, "warning"),
+                ("Critical Disk Usage", "DISK_PERCENT", 90.0, "critical"),
+            )
+            return [
+                AlertRule(rule_id, name, metric_type, "gt", threshold, severity, True)
+                for rule_id, (name, metric_type, threshold, severity) in enumerate(
+                    default_table, start=1
+                )
+            ]
+
+        async with self.db_pool.acquire() as conn:
+            rows = await conn.fetch(
+                "SELECT id, name, metric_type, operator, threshold, severity, enabled FROM alert_rules"
+            )
+
+        columns = (
+            "id",
+            "name",
+            "metric_type",
+            "operator",
+            "threshold",
+            "severity",
+            "enabled",
+        )
+        loaded = []
+        for row in rows:
+            loaded.append(AlertRule(**{column: row[column] for column in columns}))
+        return loaded
+
+    async def store_alert(self, alert: Alert) -> None:
+        """Insert the alert into ``alerts``; a no-op without a DB, failures are logged."""
+        if not self.db_pool:
+            return
+
+        rule = alert.rule
+        try:
+            async with self.db_pool.acquire() as conn:
+                await conn.execute(
+                    """
+                    INSERT INTO alerts (time, machine_id, rule_id, rule_name, metric_type, value, threshold, severity)
+                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+                    """,
+                    alert.triggered_at,
+                    alert.machine_id,
+                    rule.id,
+                    rule.name,
+                    rule.metric_type,
+                    alert.value,
+                    rule.threshold,
+                    rule.severity,
+                )
+        except Exception as exc:
+            self.logger.warning("alert_storage_failed", error=str(exc))
+
+    async def publish_alert(self, alert: Alert) -> None:
+        """Publish the alert on the ``alerts.<severity>`` topic."""
+        rule = alert.rule
+        payload = {
+            "rule_name": rule.name,
+            "machine_id": alert.machine_id,
+            "metric_type": rule.metric_type,
+            "value": alert.value,
+            "threshold": rule.threshold,
+            "severity": rule.severity,
+            "triggered_at": alert.triggered_at.isoformat(),
+        }
+        await self.publisher.publish(topic=f"alerts.{rule.severity}", payload=payload)
+
+    async def process_metrics(self, event_data: dict[str, Any]) -> None:
+        """Evaluate one metrics event; log, store and publish each new alert."""
+        evaluator = self.evaluator
+        if not evaluator:
+            return
+
+        triggered = evaluator.evaluate(
+            event_data.get("machine_id", "unknown"),
+            event_data.get("metrics", {}),
+        )
+
+        for alert in triggered:
+            self.logger.warning(
+                "alert_triggered",
+                rule=alert.rule.name,
+                machine_id=alert.machine_id,
+                value=alert.value,
+                threshold=alert.rule.threshold,
+                severity=alert.rule.severity,
+            )
+
+            await self.store_alert(alert)
+            await self.publish_alert(alert)
+
+    async def run(self) -> None:
+        """Connect everything, then process events until stopped or cancelled.
+
+        The subscriber, publisher and database pool are released when the
+        event loop ends, whether normally, by ``stop()`` or by cancellation.
+        """
+        self.running = True
+        log = self.logger
+
+        log.info("alerts_service_starting")
+
+        # Database first: it decides where the rules come from.
+        await self.connect_db()
+
+        rules = await self.load_rules()
+        self.evaluator = AlertEvaluator(rules)
+        log.info("rules_loaded", count=len(rules))
+
+        # Event bus connections
+        await self.subscriber.connect()
+        await self.publisher.connect()
+
+        log.info("alerts_service_started")
+
+        try:
+            async for event in self.subscriber.consume():
+                # The stop flag is only observed when an event arrives.
+                if not self.running:
+                    break
+
+                try:
+                    await self.process_metrics(event.payload)
+                except Exception as exc:
+                    # One bad event must not end the consume loop.
+                    log.error("event_processing_error", error=str(exc))
+
+        except asyncio.CancelledError:
+            log.info("alerts_service_cancelled")
+
+        finally:
+            await self.subscriber.disconnect()
+            await self.publisher.disconnect()
+
+            if self.db_pool:
+                await self.db_pool.close()
+
+            log.info("alerts_service_stopped")
+
+    def stop(self) -> None:
+        """Clear the running flag so the event loop in ``run()`` exits."""
+        self.running = False
+
+
+async def main():
+    """Run the alerts service until SIGTERM or SIGINT.
+
+    ``service.stop()`` alone is only noticed when the next event arrives, so
+    the signal handler also cancels the run task. ``run()`` handles that
+    cancellation and releases its connections, which lets an idle service
+    shut down promptly.
+    """
+    service = AlertsService()
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    run_task = asyncio.create_task(service.run())
+
+    def signal_handler():
+        service.logger.info("shutdown_signal_received")
+        service.stop()
+        run_task.cancel()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    try:
+        await run_task
+    except asyncio.CancelledError:
+        # Expected when the signal arrived before run() reached its own
+        # cancellation handling. Anything else (this coroutine being
+        # cancelled while the service is still running) is re-raised.
+        if service.running:
+            raise
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/services/alerts/requirements.txt
+++ b/services/alerts/requirements.txt
@@ -1,3 +1,5 @@
+grpcio>=1.60.0
+grpcio-tools>=1.60.0
 redis>=5.0.0
 asyncpg>=0.29.0
 structlog>=23.2.0
--- a/services/collector/__init__.py
+++ b/services/collector/__init__.py
@@ -0,0 +1 @@
+"""Collector service."""
--- a/services/collector/main.py
+++ b/services/collector/main.py
@@ -0,0 +1,209 @@
+"""Collector service - streams system metrics to the aggregator via gRPC."""
+
+import asyncio
+import signal
+import sys
+from pathlib import Path
+
+import grpc
+
+# Add project root to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from services.collector.metrics import MetricsCollector
+from shared import metrics_pb2, metrics_pb2_grpc
+from shared.config import get_collector_config
+from shared.logging import setup_logging
+
+
+class CollectorService:
+    """Main collector service that streams metrics to the aggregator.
+
+    Lifecycle: ``run()`` connects, streams until ``stop()`` is called or the
+    retry budget is exhausted, and always closes the gRPC channel on exit.
+    """
+
+    def __init__(self):
+        """Load configuration and build the logger and metrics collector.
+
+        No network connection is opened here; see ``connect()``.
+        """
+        self.config = get_collector_config()
+        self.logger = setup_logging(
+            service_name=self.config.service_name,
+            log_level=self.config.log_level,
+            log_format=self.config.log_format,
+        )
+        self.running = False
+        self.channel: grpc.aio.Channel | None = None
+        self.stub: metrics_pb2_grpc.MetricsServiceStub | None = None
+
+        self.collector = MetricsCollector(
+            machine_id=self.config.machine_id,
+            collect_cpu=self.config.collect_cpu,
+            collect_memory=self.config.collect_memory,
+            collect_disk=self.config.collect_disk,
+            collect_network=self.config.collect_network,
+            collect_load=self.config.collect_load,
+        )
+
+    async def connect(self) -> None:
+        """Open an insecure gRPC channel to the aggregator and wait for it.
+
+        Raises:
+            asyncio.TimeoutError: the channel was not ready within 10 seconds.
+                The channel and stub stay assigned in that case, so the
+                caller is expected to call ``disconnect()``.
+        """
+        self.logger.info(
+            "connecting_to_aggregator",
+            aggregator_url=self.config.aggregator_url,
+        )
+
+        self.channel = grpc.aio.insecure_channel(
+            self.config.aggregator_url,
+            options=[
+                ("grpc.keepalive_time_ms", 10000),
+                ("grpc.keepalive_timeout_ms", 5000),
+                ("grpc.keepalive_permit_without_calls", True),
+            ],
+        )
+        self.stub = metrics_pb2_grpc.MetricsServiceStub(self.channel)
+
+        # Wait for channel to be ready
+        try:
+            await asyncio.wait_for(
+                self.channel.channel_ready(),
+                timeout=10.0,
+            )
+            self.logger.info("connected_to_aggregator")
+        except asyncio.TimeoutError:
+            self.logger.error("connection_timeout")
+            raise
+
+    async def disconnect(self) -> None:
+        """Close the channel if one is open; safe to call when not connected."""
+        if self.channel:
+            await self.channel.close()
+            self.channel = None
+            self.stub = None
+            self.logger.info("disconnected_from_aggregator")
+
+    def _batch_to_proto(self, batch) -> list[metrics_pb2.Metric]:
+        """Convert a MetricsBatch to one protobuf ``Metric`` per data point.
+
+        The batch-level machine id, hostname and timestamp are copied onto
+        every message. A metric type name with no matching attribute on
+        ``metrics_pb2`` is sent as enum value 0.
+        """
+        return [
+            metrics_pb2.Metric(
+                machine_id=batch.machine_id,
+                hostname=batch.hostname,
+                timestamp_ms=batch.timestamp_ms,
+                type=getattr(metrics_pb2, metric.metric_type, 0),
+                value=metric.value,
+                labels=metric.labels,
+            )
+            for metric in batch.metrics
+        ]
+
+    async def _metric_generator(self):
+        """Yield protobuf metrics, one collection cycle per configured interval.
+
+        The generator ends once ``self.running`` is false, which is checked
+        after each sleep.
+        """
+        while self.running:
+            batch = self.collector.collect()
+            protos = self._batch_to_proto(batch)
+
+            for proto in protos:
+                yield proto
+
+            self.logger.debug(
+                "collected_metrics",
+                count=len(protos),
+                machine_id=batch.machine_id,
+            )
+
+            await asyncio.sleep(self.config.collection_interval)
+
+    async def stream_metrics(self) -> None:
+        """Stream metrics to the aggregator, retrying with exponential backoff.
+
+        Raises:
+            RuntimeError: called before ``connect()``.
+            grpc.aio.AioRpcError: the stream failed 10 times in a row.
+        """
+        if not self.stub:
+            raise RuntimeError("Not connected to aggregator")
+
+        retry_count = 0
+        max_retries = 10
+        base_delay = 1.0
+
+        while self.running:
+            try:
+                self.logger.info("starting_metric_stream")
+
+                response = await self.stub.StreamMetrics(self._metric_generator())
+
+                self.logger.info(
+                    "stream_completed",
+                    success=response.success,
+                    metrics_received=response.metrics_received,
+                    message=response.message,
+                )
+
+                # A completed stream resets the consecutive-failure budget.
+                retry_count = 0
+
+            except grpc.aio.AioRpcError as e:
+                retry_count += 1
+                # Doubles per failure (2s, 4s, 8s, ...) and is capped at 60s.
+                delay = min(base_delay * (2**retry_count), 60.0)
+
+                self.logger.warning(
+                    "stream_error",
+                    code=e.code().name,
+                    details=e.details(),
+                    retry_count=retry_count,
+                    retry_delay=delay,
+                )
+
+                if retry_count >= max_retries:
+                    self.logger.error("max_retries_exceeded")
+                    raise
+
+                await asyncio.sleep(delay)
+
+                # stop() may have been called during the backoff sleep; do not
+                # open a fresh connection just to tear it down again.
+                if not self.running:
+                    break
+
+                # Reconnect
+                try:
+                    await self.disconnect()
+                    await self.connect()
+                except Exception as conn_err:
+                    # Best effort: the next loop iteration retries the stream.
+                    self.logger.error("reconnect_failed", error=str(conn_err))
+
+            except asyncio.CancelledError:
+                self.logger.info("stream_cancelled")
+                break
+
+    async def run(self) -> None:
+        """Main entry point: connect, stream until stopped, always disconnect."""
+        self.running = True
+
+        self.logger.info(
+            "collector_starting",
+            machine_id=self.config.machine_id,
+            interval=self.config.collection_interval,
+        )
+
+        # Initial CPU percent calls to initialize (first call always returns 0).
+        # The collector reads both the overall and the per-core value, so
+        # both are primed here.
+        import psutil
+
+        psutil.cpu_percent()
+        psutil.cpu_percent(percpu=True)
+
+        try:
+            # connect() assigns the channel before waiting for readiness, so
+            # it must sit inside the try for a timeout to still close it.
+            await self.connect()
+            await self.stream_metrics()
+        finally:
+            await self.disconnect()
+            self.logger.info("collector_stopped")
+
+    def stop(self) -> None:
+        """Signal the collector to stop by clearing the ``running`` flag."""
+        self.running = False
+
+
+async def main():
+    """Create the collector service, install signal handlers, and run it.
+
+    SIGTERM and SIGINT both log the request and call ``service.stop()``;
+    the coroutine returns when ``service.run()`` returns.
+    """
+    service = CollectorService()
+
+    # Handle shutdown signals. Inside a coroutine the running loop is the one
+    # we want; get_running_loop() states that explicitly and never creates one.
+    loop = asyncio.get_running_loop()
+
+    def signal_handler():
+        service.logger.info("shutdown_signal_received")
+        service.stop()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    await service.run()
+
+
+# Start the collector only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/services/collector/metrics.py
+++ b/services/collector/metrics.py
@@ -0,0 +1,233 @@
+"""System metrics collection using psutil."""
+
+import socket
+import time
+from dataclasses import dataclass, field
+
+import psutil
+
+
+@dataclass
+class MetricPoint:
+    """A single metric data point."""
+
+    # Metric name as a string, e.g. "CPU_PERCENT" (values used in this file).
+    metric_type: str
+    # Measured value for this metric.
+    value: float
+    # Optional string labels, e.g. {"core": "0"}; each instance gets its own dict.
+    labels: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass
+class MetricsBatch:
+    """A batch of metrics from a single collection cycle."""
+
+    # Identifier of the machine the batch was collected on.
+    machine_id: str
+    # Host name of that machine.
+    hostname: str
+    # Collection time in milliseconds.
+    timestamp_ms: int
+    # Data points gathered in this cycle.
+    metrics: list[MetricPoint]
+
+
+class MetricsCollector:
+    """Collects system metrics using psutil.
+
+    Each ``collect()`` call gathers the enabled metric groups into one
+    ``MetricsBatch``. Network throughput is a rate computed from the
+    difference between two consecutive calls, so the first call emits no
+    network rate metrics.
+    """
+
+    def __init__(
+        self,
+        machine_id: str,
+        collect_cpu: bool = True,
+        collect_memory: bool = True,
+        collect_disk: bool = True,
+        collect_network: bool = True,
+        collect_load: bool = True,
+    ):
+        """Store the machine id and which metric groups are enabled.
+
+        Args:
+            machine_id: Identifier copied onto every batch.
+            collect_cpu: Include overall and per-core CPU percentages.
+            collect_memory: Include memory usage metrics.
+            collect_disk: Include root-partition usage and disk I/O counters.
+            collect_network: Include network rates and connection count.
+            collect_load: Include load averages and process count.
+        """
+        self.machine_id = machine_id
+        self.hostname = socket.gethostname()
+
+        self.collect_cpu = collect_cpu
+        self.collect_memory = collect_memory
+        self.collect_disk = collect_disk
+        self.collect_network = collect_network
+        self.collect_load = collect_load
+
+        # Track previous network counters for rate calculation
+        self._prev_net_io: psutil._common.snetio | None = None
+        # Monotonic clock reading taken together with _prev_net_io.
+        self._prev_net_time: float | None = None
+
+    def collect(self) -> MetricsBatch:
+        """Collect all enabled metrics and return as a batch.
+
+        Groups are gathered in a fixed order: CPU, memory, disk, network,
+        load. The batch timestamp is wall-clock time in milliseconds.
+        """
+        groups = (
+            (self.collect_cpu, self._collect_cpu),
+            (self.collect_memory, self._collect_memory),
+            (self.collect_disk, self._collect_disk),
+            (self.collect_network, self._collect_network),
+            (self.collect_load, self._collect_load),
+        )
+
+        metrics: list[MetricPoint] = []
+        for enabled, gather in groups:
+            if enabled:
+                metrics.extend(gather())
+
+        return MetricsBatch(
+            machine_id=self.machine_id,
+            hostname=self.hostname,
+            timestamp_ms=int(time.time() * 1000),
+            metrics=metrics,
+        )
+
+    def _collect_cpu(self) -> list[MetricPoint]:
+        """Collect overall CPU percent plus one labelled point per core."""
+        # interval=None is non-blocking: the value covers the time since the
+        # previous call.
+        metrics = [
+            MetricPoint(
+                metric_type="CPU_PERCENT",
+                value=psutil.cpu_percent(interval=None),
+            )
+        ]
+
+        # Per-core CPU percent
+        per_cpu = psutil.cpu_percent(interval=None, percpu=True)
+        metrics.extend(
+            MetricPoint(
+                metric_type="CPU_PERCENT_PER_CORE",
+                value=pct,
+                labels={"core": str(i)},
+            )
+            for i, pct in enumerate(per_cpu)
+        )
+
+        return metrics
+
+    def _collect_memory(self) -> list[MetricPoint]:
+        """Collect memory usage percent, used bytes and available bytes."""
+        mem = psutil.virtual_memory()
+
+        return [
+            MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
+            MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
+            MetricPoint(
+                metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
+            ),
+        ]
+
+    def _collect_disk(self) -> list[MetricPoint]:
+        """Collect root-partition usage and system-wide disk I/O counters.
+
+        Either part is skipped silently when the platform denies access or
+        does not provide it.
+        """
+        metrics = []
+
+        # Disk usage for root partition
+        try:
+            disk = psutil.disk_usage("/")
+            metrics.append(
+                MetricPoint(
+                    metric_type="DISK_PERCENT",
+                    value=disk.percent,
+                    labels={"mount": "/"},
+                )
+            )
+            metrics.append(
+                MetricPoint(
+                    metric_type="DISK_USED_BYTES",
+                    value=float(disk.used),
+                    labels={"mount": "/"},
+                )
+            )
+        except (PermissionError, FileNotFoundError):
+            pass
+
+        # Disk I/O. Despite the *_BYTES_SEC names these are the cumulative
+        # byte counters, not rates.
+        # NOTE(review): the aggregator is expected to convert them to rates;
+        # confirm against the aggregator code.
+        try:
+            io = psutil.disk_io_counters()
+            if io:
+                metrics.append(
+                    MetricPoint(
+                        metric_type="DISK_READ_BYTES_SEC",
+                        value=float(io.read_bytes),
+                    )
+                )
+                metrics.append(
+                    MetricPoint(
+                        metric_type="DISK_WRITE_BYTES_SEC",
+                        value=float(io.write_bytes),
+                    )
+                )
+        except (PermissionError, AttributeError):
+            pass
+
+        return metrics
+
+    def _collect_network(self) -> list[MetricPoint]:
+        """Collect network send/receive rates and the inet connection count.
+
+        Rates are bytes per second since the previous call, so the first
+        call returns no rate metrics.
+        """
+        metrics = []
+
+        try:
+            net_io = psutil.net_io_counters()
+            # Monotonic clock: a wall-clock adjustment between two samples
+            # must not distort (or turn negative) the measured interval.
+            current_time = time.monotonic()
+
+            if self._prev_net_io is not None and self._prev_net_time is not None:
+                time_delta = current_time - self._prev_net_time
+                if time_delta > 0:
+                    bytes_sent_rate = (
+                        net_io.bytes_sent - self._prev_net_io.bytes_sent
+                    ) / time_delta
+                    bytes_recv_rate = (
+                        net_io.bytes_recv - self._prev_net_io.bytes_recv
+                    ) / time_delta
+
+                    metrics.append(
+                        MetricPoint(
+                            metric_type="NETWORK_SENT_BYTES_SEC",
+                            value=bytes_sent_rate,
+                        )
+                    )
+                    metrics.append(
+                        MetricPoint(
+                            metric_type="NETWORK_RECV_BYTES_SEC",
+                            value=bytes_recv_rate,
+                        )
+                    )
+
+            self._prev_net_io = net_io
+            self._prev_net_time = current_time
+
+            # Connection count. This is the last step in the try, so a denied
+            # call here still keeps the rates appended above.
+            connections = len(psutil.net_connections(kind="inet"))
+            metrics.append(
+                MetricPoint(
+                    metric_type="NETWORK_CONNECTIONS",
+                    value=float(connections),
+                )
+            )
+        except (PermissionError, psutil.AccessDenied):
+            pass
+
+        return metrics
+
+    def _collect_load(self) -> list[MetricPoint]:
+        """Collect load averages (when available) and the process count."""
+        metrics = []
+
+        try:
+            load1, load5, load15 = psutil.getloadavg()
+            metrics.append(MetricPoint(metric_type="LOAD_AVG_1M", value=load1))
+            metrics.append(MetricPoint(metric_type="LOAD_AVG_5M", value=load5))
+            metrics.append(MetricPoint(metric_type="LOAD_AVG_15M", value=load15))
+        except (AttributeError, OSError):
+            # getloadavg is not available everywhere (the original note
+            # cites Windows); the process count below is still reported.
+            pass
+
+        # Process count
+        metrics.append(
+            MetricPoint(
+                metric_type="PROCESS_COUNT",
+                value=float(len(psutil.pids())),
+            )
+        )
+
+        return metrics
--- a/services/gateway/Dockerfile
+++ b/services/gateway/Dockerfile
@@ -21,6 +21,8 @@ RUN python -m grpc_tools.protoc \
    /app/proto/metrics.proto

 COPY services/gateway /app/services/gateway
+COPY services/aggregator/__init__.py /app/services/aggregator/__init__.py
+COPY services/aggregator/storage.py /app/services/aggregator/storage.py
 COPY web /app/web

 ENV PYTHONPATH=/app
--- a/services/gateway/__init__.py
+++ b/services/gateway/__init__.py
@@ -0,0 +1 @@
+"""Gateway service."""
--- a/services/gateway/main.py
+++ b/services/gateway/main.py
@@ -0,0 +1,393 @@
+"""Gateway service - FastAPI with WebSocket for real-time dashboard."""
+
+import asyncio
+import json
+import sys
+from contextlib import asynccontextmanager
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any
+
+import grpc
+from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
+from fastapi.requests import Request
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+
+# Add project root to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from services.aggregator.storage import TimescaleStorage
+from shared import metrics_pb2, metrics_pb2_grpc
+from shared.config import get_gateway_config
+from shared.events import get_subscriber
+from shared.logging import setup_logging
+
+# Global state: configuration and logger are created once, at import time,
+# and shared by every handler in this module.
+config = get_gateway_config()
+logger = setup_logging(
+    service_name=config.service_name,
+    log_level=config.log_level,
+    log_format=config.log_format,
+)
+
+
+# WebSocket connection manager
+class ConnectionManager:
+    """Manages WebSocket connections for real-time updates.
+
+    Connections are held in a plain list. ``broadcast()`` drops any
+    connection whose send fails, so ``disconnect()`` tolerates a connection
+    that has already been removed.
+    """
+
+    def __init__(self):
+        self.active_connections: list[WebSocket] = []
+
+    async def connect(self, websocket: WebSocket) -> None:
+        """Accept the WebSocket handshake and start tracking the connection."""
+        await websocket.accept()
+        self.active_connections.append(websocket)
+        logger.info("websocket_connected", total=len(self.active_connections))
+
+    def disconnect(self, websocket: WebSocket) -> None:
+        """Stop tracking a connection; a no-op if it was already removed."""
+        # broadcast() may have pruned this socket after a failed send; an
+        # unconditional remove() would then raise ValueError in the
+        # endpoint's ``finally`` block.
+        if websocket in self.active_connections:
+            self.active_connections.remove(websocket)
+        logger.info("websocket_disconnected", total=len(self.active_connections))
+
+    async def broadcast(self, message: dict) -> None:
+        """Broadcast message to all connected clients.
+
+        The message is JSON-encoded once. Connections whose send raises are
+        removed from the active list.
+        """
+        if not self.active_connections:
+            return
+
+        data = json.dumps(message)
+        disconnected = []
+
+        # Iterate over a snapshot: each send awaits, and other tasks may
+        # connect or disconnect clients in the meantime.
+        for connection in list(self.active_connections):
+            try:
+                await connection.send_text(data)
+            except Exception:
+                disconnected.append(connection)
+
+        # Clean up disconnected
+        for conn in disconnected:
+            if conn in self.active_connections:
+                self.active_connections.remove(conn)
+
+
+# Shared WebSocket registry used by the event listener and the /ws endpoint.
+manager = ConnectionManager()
+# The three handles below are assigned by lifespan() at startup and are None
+# before that; timescale is reset to None when its connection attempt fails.
+timescale: TimescaleStorage | None = None
+grpc_channel: grpc.aio.Channel | None = None
+grpc_stub: metrics_pb2_grpc.MetricsServiceStub | None = None
+
+
+async def event_listener():
+    """Forward every "metrics.raw" event to all connected WebSocket clients.
+
+    Runs as a background task. A failure while building or broadcasting one
+    message is logged as a warning and the loop continues with the next event.
+    """
+    logger.info("event_listener_starting")
+
+    async with get_subscriber(topics=["metrics.raw"]) as bus:
+        async for evt in bus.consume():
+            try:
+                outgoing = {
+                    "type": "metrics",
+                    "data": evt.payload,
+                    "timestamp": evt.timestamp.isoformat(),
+                }
+                await manager.broadcast(outgoing)
+            except Exception as exc:
+                logger.warning("broadcast_error", error=str(exc))
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application lifespan manager.
+
+    Startup: connect to TimescaleDB (optional; a failure is logged and the
+    handle is left as None), create the gRPC channel and stub for the
+    aggregator, and start the event listener task.
+
+    Shutdown: cancel the listener, then close the gRPC channel and the
+    TimescaleDB connection. Shutdown runs in a ``finally`` so resources are
+    released even when the application exits with an error.
+    """
+    global timescale, grpc_channel, grpc_stub
+
+    logger.info("gateway_starting", port=config.http_port)
+
+    # Connect to TimescaleDB for historical queries
+    timescale = TimescaleStorage(config.timescale_url)
+    try:
+        await timescale.connect()
+    except Exception as e:
+        logger.warning("timescale_connection_failed", error=str(e))
+        timescale = None
+
+    # Connect to aggregator via gRPC
+    grpc_channel = grpc.aio.insecure_channel(config.aggregator_url)
+    grpc_stub = metrics_pb2_grpc.MetricsServiceStub(grpc_channel)
+
+    # Start event listener in background
+    listener_task = asyncio.create_task(event_listener())
+
+    logger.info("gateway_started")
+
+    try:
+        yield
+    finally:
+        # Cleanup
+        listener_task.cancel()
+        try:
+            await listener_task
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            # The listener had already died with its own error; awaiting the
+            # task re-raises it. Log it so the remaining cleanup still runs.
+            logger.warning("event_listener_failed", error=str(e))
+
+        if grpc_channel:
+            await grpc_channel.close()
+
+        if timescale:
+            await timescale.disconnect()
+
+        logger.info("gateway_stopped")
+
+
+# Create FastAPI app; startup and shutdown are handled by lifespan().
+app = FastAPI(
+    title="System Monitor Gateway",
+    description="Real-time system monitoring dashboard",
+    version="0.1.0",
+    lifespan=lifespan,
+)
+
+# Mount static files from <project root>/web/static, only if that directory exists.
+static_path = Path(__file__).parent.parent.parent / "web" / "static"
+if static_path.exists():
+    app.mount("/static", StaticFiles(directory=str(static_path)), name="static")
+
+# Templates from <project root>/web/templates; None when the directory is
+# missing, in which case the dashboard route serves its built-in fallback page.
+templates_path = Path(__file__).parent.parent.parent / "web" / "templates"
+templates = (
+    Jinja2Templates(directory=str(templates_path)) if templates_path.exists() else None
+)
+
+
+# ============================================================================
+# Health endpoints
+# ============================================================================
+
+
+@app.get("/health")
+async def health_check():
+    """Liveness probe: always reports the gateway as healthy."""
+    payload = {"status": "healthy", "service": "gateway"}
+    return payload
+
+
+@app.get("/ready")
+async def readiness_check():
+    """Readiness check - reports the state of each dependency.
+
+    Returns a ``checks`` mapping with one entry per dependency: "ok",
+    "not connected", or "error: <details>".
+
+    NOTE(review): the top-level ``status`` is always "ready" and the HTTP
+    status is always 200, even when a dependency check fails. Confirm
+    whether probes are expected to inspect ``checks`` instead.
+    """
+    checks = {"gateway": "ok"}
+
+    # Check gRPC connection. Report a missing stub explicitly instead of
+    # leaving the aggregator out of the result.
+    if grpc_stub:
+        try:
+            await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=2.0)
+            checks["aggregator"] = "ok"
+        except Exception as e:
+            checks["aggregator"] = f"error: {str(e)}"
+    else:
+        checks["aggregator"] = "not connected"
+
+    # Check TimescaleDB
+    if timescale and timescale._pool:
+        checks["timescaledb"] = "ok"
+    else:
+        checks["timescaledb"] = "not connected"
+
+    return {"status": "ready", "checks": checks}
+
+
+# ============================================================================
+# REST API endpoints
+# ============================================================================
+
+
+@app.get("/api/machines")
+async def get_machines():
+    """Get current state of all machines.
+
+    Returns:
+        ``{"machines": [...]}`` with one entry per machine reported by the
+        aggregator. ``metrics`` maps metric type name to value; when several
+        metrics share a type, the last one in the response wins.
+
+    Raises:
+        HTTPException: 503 when no gRPC stub exists or the aggregator call fails.
+    """
+    if not grpc_stub:
+        raise HTTPException(status_code=503, detail="Aggregator not connected")
+
+    try:
+        response = await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=5.0)
+
+        machines = [
+            {
+                "machine_id": state.machine_id,
+                "hostname": state.hostname,
+                "last_seen_ms": state.last_seen_ms,
+                "health": metrics_pb2.HealthStatus.Name(state.health),
+                "metrics": {
+                    metrics_pb2.MetricType.Name(m.type): m.value
+                    for m in state.current_metrics
+                },
+            }
+            for state in response.machines
+        ]
+
+        return {"machines": machines}
+
+    except grpc.aio.AioRpcError as e:
+        # Chain the cause so the original gRPC error stays in the traceback.
+        raise HTTPException(
+            status_code=503, detail=f"Aggregator error: {e.details()}"
+        ) from e
+
+
+@app.get("/api/machines/{machine_id}")
+async def get_machine(machine_id: str):
+    """Get current state of a specific machine.
+
+    Returns:
+        A dict with the machine id, hostname, last-seen time, health name
+        and a ``metrics`` mapping of metric type name to value (when
+        several metrics share a type, the last one wins).
+
+    Raises:
+        HTTPException: 404 when the aggregator returns an empty machine id
+            or answers NOT_FOUND; 503 when no stub exists or the call
+            fails for another reason.
+    """
+    if not grpc_stub:
+        raise HTTPException(status_code=503, detail="Aggregator not connected")
+
+    try:
+        response = await grpc_stub.GetCurrentState(
+            metrics_pb2.StateRequest(machine_id=machine_id),
+            timeout=5.0,
+        )
+
+        if not response.machine_id:
+            raise HTTPException(status_code=404, detail="Machine not found")
+
+        metrics = {
+            metrics_pb2.MetricType.Name(m.type): m.value
+            for m in response.current_metrics
+        }
+
+        return {
+            "machine_id": response.machine_id,
+            "hostname": response.hostname,
+            "last_seen_ms": response.last_seen_ms,
+            "health": metrics_pb2.HealthStatus.Name(response.health),
+            "metrics": metrics,
+        }
+
+    except grpc.aio.AioRpcError as e:
+        # Chain the cause so the original gRPC error stays in the traceback.
+        if e.code() == grpc.StatusCode.NOT_FOUND:
+            raise HTTPException(status_code=404, detail="Machine not found") from e
+        raise HTTPException(
+            status_code=503, detail=f"Aggregator error: {e.details()}"
+        ) from e
+
+
+@app.get("/api/metrics")
+async def get_metrics(
+    machine_id: str | None = Query(None),
+    metric_type: str | None = Query(None),
+    minutes: int = Query(60, ge=1, le=1440),
+    limit: int = Query(1000, ge=1, le=10000),
+):
+    """Get historical metrics for the last ``minutes`` minutes.
+
+    Args:
+        machine_id: Optional machine filter, passed through to storage.
+        metric_type: Optional metric type filter, passed through to storage.
+        minutes: Size of the look-back window (1-1440).
+        limit: Maximum number of rows requested from storage (1-10000).
+
+    Returns:
+        ``{"metrics": [...], "count": n}``.
+
+    Raises:
+        HTTPException: 503 when TimescaleDB is not connected; 500 when the
+            query fails.
+    """
+    if not timescale:
+        raise HTTPException(status_code=503, detail="TimescaleDB not connected")
+
+    # NOTE(review): utcnow() yields a naive datetime and is deprecated since
+    # Python 3.12. Switching to an aware datetime depends on what the storage
+    # layer and column type expect - confirm before changing.
+    end_time = datetime.utcnow()
+    start_time = end_time - timedelta(minutes=minutes)
+
+    try:
+        metrics = await timescale.get_metrics(
+            machine_id=machine_id,
+            metric_type=metric_type,
+            start_time=start_time,
+            end_time=end_time,
+            limit=limit,
+        )
+
+        return {"metrics": metrics, "count": len(metrics)}
+
+    except Exception as e:
+        # Record the failure server-side; previously it was only visible to
+        # the client through the response body.
+        logger.error("metrics_query_failed", error=str(e))
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+# ============================================================================
+# WebSocket endpoint
+# ============================================================================
+
+
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    """Serve one WebSocket client with live metric updates.
+
+    After registering the connection, sends one "initial" message per
+    machine known to the aggregator (best effort: a failure is only logged),
+    then answers "ping" with "pong" until the client disconnects. The
+    connection is always unregistered on exit.
+    """
+    await manager.connect(websocket)
+
+    try:
+        # Send initial state
+        if grpc_stub:
+            try:
+                snapshot = await grpc_stub.GetAllStates(
+                    metrics_pb2.Empty(), timeout=5.0
+                )
+
+                for machine in snapshot.machines:
+                    readings = {
+                        metrics_pb2.MetricType.Name(item.type): item.value
+                        for item in machine.current_metrics
+                    }
+                    initial_message = {
+                        "type": "initial",
+                        "data": {
+                            "machine_id": machine.machine_id,
+                            "hostname": machine.hostname,
+                            "metrics": readings,
+                        },
+                    }
+                    await websocket.send_json(initial_message)
+            except Exception as exc:
+                logger.warning("initial_state_error", error=str(exc))
+
+        # Keep connection alive and handle incoming messages
+        try:
+            while True:
+                incoming = await websocket.receive_text()
+                # Handle ping/pong or commands from client
+                if incoming == "ping":
+                    await websocket.send_text("pong")
+        except WebSocketDisconnect:
+            pass
+
+    finally:
+        manager.disconnect(websocket)
+
+
+# ============================================================================
+# Dashboard (HTML)
+# ============================================================================
+
+
+@app.get("/", response_class=HTMLResponse)
+async def dashboard(request: Request):
+    """Serve the dashboard HTML.
+
+    Renders ``dashboard.html`` when the templates directory was found at
+    import time; otherwise returns a small built-in page that lists the API
+    endpoints and shows the latest WebSocket message.
+    """
+    if templates:
+        return templates.TemplateResponse("dashboard.html", {"request": request})
+
+    # Fallback if templates not found. The script picks wss:// when the page
+    # itself was loaded over HTTPS; browsers block ws:// from secure pages.
+    return HTMLResponse("""
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>System Monitor</title>
+        <style>
+            body { font-family: system-ui; background: #1a1a2e; color: #eee; padding: 2rem; }
+            h1 { color: #e94560; }
+            pre { background: #16213e; padding: 1rem; border-radius: 8px; overflow: auto; }
+        </style>
+    </head>
+    <body>
+        <h1>System Monitor</h1>
+        <p>Dashboard template not found. API endpoints:</p>
+        <ul>
+            <li><a href="/api/machines">/api/machines</a> - Current state of all machines</li>
+            <li><a href="/api/metrics">/api/metrics</a> - Historical metrics</li>
+            <li><a href="/docs">/docs</a> - API documentation</li>
+        </ul>
+        <h2>Live Metrics</h2>
+        <pre id="output">Connecting...</pre>
+        <script>
+            const scheme = location.protocol === 'https:' ? 'wss' : 'ws';
+            const ws = new WebSocket(`${scheme}://${location.host}/ws`);
+            const output = document.getElementById('output');
+            ws.onmessage = (e) => {
+                output.textContent = JSON.stringify(JSON.parse(e.data), null, 2);
+            };
+            ws.onclose = () => { output.textContent = 'Disconnected'; };
+        </script>
+    </body>
+    </html>
+    """)
+
+
+# Direct execution: serve the app with uvicorn on all interfaces (0.0.0.0)
+# at the configured HTTP port.
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=config.http_port)
--- a/shared/__init__.py
+++ b/shared/__init__.py
@@ -0,0 +1,5 @@
+"""Shared utilities and generated protobuf modules."""
+
+from . import metrics_pb2, metrics_pb2_grpc
+
+__all__ = ["metrics_pb2", "metrics_pb2_grpc"]
--- a/shared/config.py
+++ b/shared/config.py
@@ -0,0 +1,104 @@
+"""Shared configuration management using Pydantic Settings."""
+
+from functools import lru_cache
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class BaseConfig(BaseSettings):
+    """Base configuration shared across all services.
+
+    Fields below are defaults. The settings source is configured to use a
+    UTF-8 ``.env`` file and to ignore keys that match no field.
+    """
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+    )
+
+    # Service identification (service_name is overridden by each subclass)
+    service_name: str = "unknown"
+    machine_id: str = "local"
+
+    # Logging
+    log_level: str = "INFO"
+    log_format: str = "json"  # "json" or "console"
+
+    # Redis
+    redis_url: str = "redis://localhost:6379"
+
+    # Events
+    events_backend: str = "redis_pubsub"
+
+
+class CollectorConfig(BaseConfig):
+    """Collector service configuration."""
+
+    service_name: str = "collector"
+
+    # Aggregator connection (host:port of the gRPC endpoint)
+    aggregator_url: str = "localhost:50051"
+
+    # Collection settings
+    collection_interval: int = 5  # seconds
+
+    # Metrics to collect (one switch per metric group)
+    collect_cpu: bool = True
+    collect_memory: bool = True
+    collect_disk: bool = True
+    collect_network: bool = True
+    collect_load: bool = True
+
+
+class AggregatorConfig(BaseConfig):
+    """Aggregator service configuration."""
+
+    service_name: str = "aggregator"
+
+    # gRPC server port
+    grpc_port: int = 50051
+
+    # TimescaleDB connection URL - can be set directly via TIMESCALE_URL
+    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
+
+
+class GatewayConfig(BaseConfig):
+    """Gateway service configuration."""
+
+    service_name: str = "gateway"
+
+    # HTTP server port
+    http_port: int = 8000
+
+    # Aggregator connection (host:port of the gRPC endpoint)
+    aggregator_url: str = "localhost:50051"
+
+    # TimescaleDB connection URL - can be set directly via TIMESCALE_URL
+    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
+
+
+class AlertsConfig(BaseConfig):
+    """Alerts service configuration."""
+
+    service_name: str = "alerts"
+
+    # TimescaleDB connection URL - can be set directly via TIMESCALE_URL.
+    # NOTE(review): an earlier comment also mentioned building the URL from
+    # components, but no component fields are defined on this class.
+    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
+
+
+@lru_cache
+def get_collector_config() -> CollectorConfig:
+    """Return the collector configuration; built on first call, then cached."""
+    return CollectorConfig()
+
+
+@lru_cache
+def get_aggregator_config() -> AggregatorConfig:
+    """Return the aggregator configuration; built on first call, then cached."""
+    return AggregatorConfig()
+
+
+@lru_cache
+def get_gateway_config() -> GatewayConfig:
+    """Return the gateway configuration; built on first call, then cached."""
+    return GatewayConfig()
+
+
+@lru_cache
+def get_alerts_config() -> AlertsConfig:
+    """Return the alerts configuration; built on first call, then cached."""
+    return AlertsConfig()
--- a/shared/logging.py
+++ b/shared/logging.py
@@ -0,0 +1,74 @@
+"""Structured logging configuration."""
+
+import logging
+import sys
+from typing import Any
+
+import structlog
+
+
+def setup_logging(
+    service_name: str,
+    log_level: str = "INFO",
+    log_format: str = "json",
+) -> structlog.BoundLogger:
+    """
+    Configure structured logging for a service.
+
+    Configures structlog globally and the standard library root logger,
+    then returns a logger bound with ``service=<service_name>``.
+
+    Args:
+        service_name: Name of the service for log context
+        log_level: Logging level (DEBUG, INFO, WARNING, ERROR); matched
+            case-insensitively, unknown names fall back to INFO
+        log_format: Output format ("json"; any other value selects the
+            console renderer)
+
+    Returns:
+        Configured structlog logger
+    """
+
+    # Resolve the level name once. Fall back to INFO for unknown names and
+    # for names that hit a non-level attribute of ``logging``
+    # (e.g. "basic_format" -> logging.BASIC_FORMAT, which is a string).
+    level = getattr(logging, log_level.upper(), logging.INFO)
+    if not isinstance(level, int):
+        level = logging.INFO
+
+    # Shared processors
+    shared_processors: list[Any] = [
+        structlog.contextvars.merge_contextvars,
+        structlog.processors.add_log_level,
+        structlog.processors.TimeStamper(fmt="iso"),
+        structlog.processors.StackInfoRenderer(),
+    ]
+
+    if log_format == "json":
+        # JSON format for production
+        processors = shared_processors + [
+            structlog.processors.format_exc_info,
+            structlog.processors.JSONRenderer(),
+        ]
+    else:
+        # Console format for development
+        processors = shared_processors + [
+            structlog.dev.ConsoleRenderer(colors=True),
+        ]
+
+    structlog.configure(
+        processors=processors,
+        wrapper_class=structlog.make_filtering_bound_logger(level),
+        context_class=dict,
+        logger_factory=structlog.PrintLoggerFactory(),
+        cache_logger_on_first_use=True,
+    )
+
+    # Also configure standard library logging
+    logging.basicConfig(
+        format="%(message)s",
+        stream=sys.stdout,
+        level=level,
+    )
+
+    # Get logger with service context
+    return structlog.get_logger(service=service_name)
+
+
+def get_logger(name: str | None = None) -> structlog.BoundLogger:
+    """Return a structlog logger, bound with ``component=name`` when a name is given."""
+    if not name:
+        return structlog.get_logger()
+    return structlog.get_logger(component=name)
--- a/shared/metrics_pb2.py
+++ b/shared/metrics_pb2.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# NO CHECKED-IN PROTOBUF GENCODE
+# source: metrics.proto
+# Protobuf Python Version: 6.31.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import runtime_version as _runtime_version
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+_runtime_version.ValidateProtobufRuntimeVersion(
+    _runtime_version.Domain.PUBLIC,
+    6,
+    31,
+    1,
+    '',
+    'metrics.proto'
+)
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rmetrics.proto\x12\nmonitoring\"\x07\n\x05\x45mpty\"\xd8\x01\n\x06Metric\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12$\n\x04type\x18\x04 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x05 \x01(\x01\x12.\n\x06labels\x18\x06 \x03(\x0b\x32\x1e.monitoring.Metric.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"s\n\x0bMetricBatch\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12(\n\x07metrics\x18\x04 \x03(\x0b\x32\x17.monitoring.MetricPoint\"\xa6\x01\n\x0bMetricPoint\x12$\n\x04type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x02 \x01(\x01\x12\x33\n\x06labels\x18\x03 \x03(\x0b\x32#.monitoring.MetricPoint.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"G\n\tStreamAck\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x18\n\x10metrics_received\x18\x02 \x01(\x03\x12\x0f\n\x07message\x18\x03 \x01(\t\"\"\n\x0cStateRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x8c\x02\n\x0cMachineState\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0clast_seen_ms\x18\x03 \x01(\x03\x12+\n\x0f\x63urrent_metrics\x18\x04 \x03(\x0b\x32\x12.monitoring.Metric\x12(\n\x06health\x18\x05 \x01(\x0e\x32\x18.monitoring.HealthStatus\x12\x38\n\x08metadata\x18\x06 \x03(\x0b\x32&.monitoring.MachineState.MetadataEntry\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\">\n\x10\x41llMachinesState\x12*\n\x08machines\x18\x01 \x03(\x0b\x32\x18.monitoring.MachineState\"\xd7\x01\n\x0e\x43ontrolCommand\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12<\n\x0fupdate_interval\x18\x02 \x01(\x0b\x32!.monitoring.UpdateIntervalCommandH\x00\x12\x37\n\x07restart\x18\x03 
\x01(\x0b\x32$.monitoring.RestartCollectionCommandH\x00\x12/\n\x08shutdown\x18\x04 \x01(\x0b\x32\x1b.monitoring.ShutdownCommandH\x00\x42\t\n\x07\x63ommand\"1\n\x15UpdateIntervalCommand\x12\x18\n\x10interval_seconds\x18\x01 \x01(\x05\"\x1a\n\x18RestartCollectionCommand\"#\n\x0fShutdownCommand\x12\x10\n\x08graceful\x18\x01 \x01(\x08\"G\n\x0f\x43ontrolResponse\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07message\x18\x03 \x01(\t\"#\n\rConfigRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x80\x02\n\x0f\x43ollectorConfig\x12#\n\x1b\x63ollection_interval_seconds\x18\x01 \x01(\x05\x12/\n\x0f\x65nabled_metrics\x18\x02 \x03(\x0e\x32\x16.monitoring.MetricType\x12\x37\n\x06labels\x18\x03 \x03(\x0b\x32\'.monitoring.CollectorConfig.LabelsEntry\x12/\n\nthresholds\x18\x04 \x03(\x0b\x32\x1b.monitoring.ThresholdConfig\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"u\n\x0fThresholdConfig\x12+\n\x0bmetric_type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\x19\n\x11warning_threshold\x18\x02 \x01(\x01\x12\x1a\n\x12\x63ritical_threshold\x18\x03 
\x01(\x01*\x8d\x03\n\nMetricType\x12\x1b\n\x17METRIC_TYPE_UNSPECIFIED\x10\x00\x12\x0f\n\x0b\x43PU_PERCENT\x10\x01\x12\x18\n\x14\x43PU_PERCENT_PER_CORE\x10\x02\x12\x12\n\x0eMEMORY_PERCENT\x10\x03\x12\x15\n\x11MEMORY_USED_BYTES\x10\x04\x12\x1a\n\x16MEMORY_AVAILABLE_BYTES\x10\x05\x12\x10\n\x0c\x44ISK_PERCENT\x10\x06\x12\x13\n\x0f\x44ISK_USED_BYTES\x10\x07\x12\x17\n\x13\x44ISK_READ_BYTES_SEC\x10\x08\x12\x18\n\x14\x44ISK_WRITE_BYTES_SEC\x10\t\x12\x1a\n\x16NETWORK_SENT_BYTES_SEC\x10\n\x12\x1a\n\x16NETWORK_RECV_BYTES_SEC\x10\x0b\x12\x17\n\x13NETWORK_CONNECTIONS\x10\x0c\x12\x11\n\rPROCESS_COUNT\x10\r\x12\x0f\n\x0bLOAD_AVG_1M\x10\x0e\x12\x0f\n\x0bLOAD_AVG_5M\x10\x0f\x12\x10\n\x0cLOAD_AVG_15M\x10\x10*o\n\x0cHealthStatus\x12\x1d\n\x19HEALTH_STATUS_UNSPECIFIED\x10\x00\x12\x0b\n\x07HEALTHY\x10\x01\x12\x0b\n\x07WARNING\x10\x02\x12\x0c\n\x08\x43RITICAL\x10\x03\x12\x0b\n\x07UNKNOWN\x10\x04\x12\x0b\n\x07OFFLINE\x10\x05\x32\xdc\x01\n\x0eMetricsService\x12>\n\rStreamMetrics\x12\x12.monitoring.Metric\x1a\x15.monitoring.StreamAck\"\x00(\x01\x12G\n\x0fGetCurrentState\x12\x18.monitoring.StateRequest\x1a\x18.monitoring.MachineState\"\x00\x12\x41\n\x0cGetAllStates\x12\x11.monitoring.Empty\x1a\x1c.monitoring.AllMachinesState\"\x00\x32Z\n\x0e\x43ontrolService\x12H\n\x07\x43ontrol\x12\x1a.monitoring.ControlCommand\x1a\x1b.monitoring.ControlResponse\"\x00(\x01\x30\x01\x32\xa1\x01\n\rConfigService\x12\x45\n\tGetConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x12I\n\x0bWatchConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x30\x01\x42%Z#github.com/your-org/sysmonstm/protob\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'metrics_pb2', _globals)
+if not _descriptor._USE_C_DESCRIPTORS:
+  _globals['DESCRIPTOR']._loaded_options = None
+  _globals['DESCRIPTOR']._serialized_options = b'Z#github.com/your-org/sysmonstm/proto'
+  _globals['_METRIC_LABELSENTRY']._loaded_options = None
+  _globals['_METRIC_LABELSENTRY']._serialized_options = b'8\001'
+  _globals['_METRICPOINT_LABELSENTRY']._loaded_options = None
+  _globals['_METRICPOINT_LABELSENTRY']._serialized_options = b'8\001'
+  _globals['_MACHINESTATE_METADATAENTRY']._loaded_options = None
+  _globals['_MACHINESTATE_METADATAENTRY']._serialized_options = b'8\001'
+  _globals['_COLLECTORCONFIG_LABELSENTRY']._loaded_options = None
+  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_options = b'8\001'
+  _globals['_METRICTYPE']._serialized_start=1810
+  _globals['_METRICTYPE']._serialized_end=2207
+  _globals['_HEALTHSTATUS']._serialized_start=2209
+  _globals['_HEALTHSTATUS']._serialized_end=2320
+  _globals['_EMPTY']._serialized_start=29
+  _globals['_EMPTY']._serialized_end=36
+  _globals['_METRIC']._serialized_start=39
+  _globals['_METRIC']._serialized_end=255
+  _globals['_METRIC_LABELSENTRY']._serialized_start=210
+  _globals['_METRIC_LABELSENTRY']._serialized_end=255
+  _globals['_METRICBATCH']._serialized_start=257
+  _globals['_METRICBATCH']._serialized_end=372
+  _globals['_METRICPOINT']._serialized_start=375
+  _globals['_METRICPOINT']._serialized_end=541
+  _globals['_METRICPOINT_LABELSENTRY']._serialized_start=210
+  _globals['_METRICPOINT_LABELSENTRY']._serialized_end=255
+  _globals['_STREAMACK']._serialized_start=543
+  _globals['_STREAMACK']._serialized_end=614
+  _globals['_STATEREQUEST']._serialized_start=616
+  _globals['_STATEREQUEST']._serialized_end=650
+  _globals['_MACHINESTATE']._serialized_start=653
+  _globals['_MACHINESTATE']._serialized_end=921
+  _globals['_MACHINESTATE_METADATAENTRY']._serialized_start=874
+  _globals['_MACHINESTATE_METADATAENTRY']._serialized_end=921
+  _globals['_ALLMACHINESSTATE']._serialized_start=923
+  _globals['_ALLMACHINESSTATE']._serialized_end=985
+  _globals['_CONTROLCOMMAND']._serialized_start=988
+  _globals['_CONTROLCOMMAND']._serialized_end=1203
+  _globals['_UPDATEINTERVALCOMMAND']._serialized_start=1205
+  _globals['_UPDATEINTERVALCOMMAND']._serialized_end=1254
+  _globals['_RESTARTCOLLECTIONCOMMAND']._serialized_start=1256
+  _globals['_RESTARTCOLLECTIONCOMMAND']._serialized_end=1282
+  _globals['_SHUTDOWNCOMMAND']._serialized_start=1284
+  _globals['_SHUTDOWNCOMMAND']._serialized_end=1319
+  _globals['_CONTROLRESPONSE']._serialized_start=1321
+  _globals['_CONTROLRESPONSE']._serialized_end=1392
+  _globals['_CONFIGREQUEST']._serialized_start=1394
+  _globals['_CONFIGREQUEST']._serialized_end=1429
+  _globals['_COLLECTORCONFIG']._serialized_start=1432
+  _globals['_COLLECTORCONFIG']._serialized_end=1688
+  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_start=210
+  _globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_end=255
+  _globals['_THRESHOLDCONFIG']._serialized_start=1690
+  _globals['_THRESHOLDCONFIG']._serialized_end=1807
+  _globals['_METRICSSERVICE']._serialized_start=2323
+  _globals['_METRICSSERVICE']._serialized_end=2543
+  _globals['_CONTROLSERVICE']._serialized_start=2545
+  _globals['_CONTROLSERVICE']._serialized_end=2635
+  _globals['_CONFIGSERVICE']._serialized_start=2638
+  _globals['_CONFIGSERVICE']._serialized_end=2799
+# @@protoc_insertion_point(module_scope)
--- a/shared/metrics_pb2_grpc.py
+++ b/shared/metrics_pb2_grpc.py
@@ -0,0 +1,385 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from shared import metrics_pb2 as metrics__pb2
+
+GRPC_GENERATED_VERSION = '1.76.0'
+GRPC_VERSION = grpc.__version__
+_version_not_supported = False
+
+try:
+    from grpc._utilities import first_version_is_lower
+    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+    _version_not_supported = True
+
+if _version_not_supported:
+    raise RuntimeError(
+        f'The grpc package installed is at version {GRPC_VERSION},'
+        + ' but the generated code in metrics_pb2_grpc.py depends on'
+        + f' grpcio>={GRPC_GENERATED_VERSION}.'
+        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+    )
+
+
+class MetricsServiceStub(object):
+    """MetricsService handles streaming metrics from collectors to aggregator
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.StreamMetrics = channel.stream_unary(
+                '/monitoring.MetricsService/StreamMetrics',
+                request_serializer=metrics__pb2.Metric.SerializeToString,
+                response_deserializer=metrics__pb2.StreamAck.FromString,
+                _registered_method=True)
+        self.GetCurrentState = channel.unary_unary(
+                '/monitoring.MetricsService/GetCurrentState',
+                request_serializer=metrics__pb2.StateRequest.SerializeToString,
+                response_deserializer=metrics__pb2.MachineState.FromString,
+                _registered_method=True)
+        self.GetAllStates = channel.unary_unary(
+                '/monitoring.MetricsService/GetAllStates',
+                request_serializer=metrics__pb2.Empty.SerializeToString,
+                response_deserializer=metrics__pb2.AllMachinesState.FromString,
+                _registered_method=True)
+
+
+class MetricsServiceServicer(object):
+    """MetricsService handles streaming metrics from collectors to aggregator
+    """
+
+    def StreamMetrics(self, request_iterator, context):
+        """Client-side streaming: collector streams metrics to aggregator
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GetCurrentState(self, request, context):
+        """Get current state of a machine
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GetAllStates(self, request, context):
+        """Get current state of all machines
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_MetricsServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'StreamMetrics': grpc.stream_unary_rpc_method_handler(
+                    servicer.StreamMetrics,
+                    request_deserializer=metrics__pb2.Metric.FromString,
+                    response_serializer=metrics__pb2.StreamAck.SerializeToString,
+            ),
+            'GetCurrentState': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetCurrentState,
+                    request_deserializer=metrics__pb2.StateRequest.FromString,
+                    response_serializer=metrics__pb2.MachineState.SerializeToString,
+            ),
+            'GetAllStates': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetAllStates,
+                    request_deserializer=metrics__pb2.Empty.FromString,
+                    response_serializer=metrics__pb2.AllMachinesState.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'monitoring.MetricsService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+    server.add_registered_method_handlers('monitoring.MetricsService', rpc_method_handlers)
+
+
+ # This class is part of an EXPERIMENTAL API.
+class MetricsService(object):
+    """MetricsService handles streaming metrics from collectors to aggregator
+    """
+
+    @staticmethod
+    def StreamMetrics(request_iterator,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.stream_unary(
+            request_iterator,
+            target,
+            '/monitoring.MetricsService/StreamMetrics',
+            metrics__pb2.Metric.SerializeToString,
+            metrics__pb2.StreamAck.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def GetCurrentState(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/monitoring.MetricsService/GetCurrentState',
+            metrics__pb2.StateRequest.SerializeToString,
+            metrics__pb2.MachineState.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def GetAllStates(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/monitoring.MetricsService/GetAllStates',
+            metrics__pb2.Empty.SerializeToString,
+            metrics__pb2.AllMachinesState.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+
+class ControlServiceStub(object):
+    """ControlService handles bidirectional control commands
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Control = channel.stream_stream(
+                '/monitoring.ControlService/Control',
+                request_serializer=metrics__pb2.ControlCommand.SerializeToString,
+                response_deserializer=metrics__pb2.ControlResponse.FromString,
+                _registered_method=True)
+
+
+class ControlServiceServicer(object):
+    """ControlService handles bidirectional control commands
+    """
+
+    def Control(self, request_iterator, context):
+        """Bidirectional streaming for commands and responses
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_ControlServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Control': grpc.stream_stream_rpc_method_handler(
+                    servicer.Control,
+                    request_deserializer=metrics__pb2.ControlCommand.FromString,
+                    response_serializer=metrics__pb2.ControlResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'monitoring.ControlService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+    server.add_registered_method_handlers('monitoring.ControlService', rpc_method_handlers)
+
+
+ # This class is part of an EXPERIMENTAL API.
+class ControlService(object):
+    """ControlService handles bidirectional control commands
+    """
+
+    @staticmethod
+    def Control(request_iterator,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.stream_stream(
+            request_iterator,
+            target,
+            '/monitoring.ControlService/Control',
+            metrics__pb2.ControlCommand.SerializeToString,
+            metrics__pb2.ControlResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+
+class ConfigServiceStub(object):
+    """ConfigService handles dynamic configuration
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.GetConfig = channel.unary_unary(
+                '/monitoring.ConfigService/GetConfig',
+                request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
+                response_deserializer=metrics__pb2.CollectorConfig.FromString,
+                _registered_method=True)
+        self.WatchConfig = channel.unary_stream(
+                '/monitoring.ConfigService/WatchConfig',
+                request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
+                response_deserializer=metrics__pb2.CollectorConfig.FromString,
+                _registered_method=True)
+
+
+class ConfigServiceServicer(object):
+    """ConfigService handles dynamic configuration
+    """
+
+    def GetConfig(self, request, context):
+        """Get current configuration for a collector
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def WatchConfig(self, request, context):
+        """Stream configuration updates
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_ConfigServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'GetConfig': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetConfig,
+                    request_deserializer=metrics__pb2.ConfigRequest.FromString,
+                    response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
+            ),
+            'WatchConfig': grpc.unary_stream_rpc_method_handler(
+                    servicer.WatchConfig,
+                    request_deserializer=metrics__pb2.ConfigRequest.FromString,
+                    response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'monitoring.ConfigService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+    server.add_registered_method_handlers('monitoring.ConfigService', rpc_method_handlers)
+
+
+ # This class is part of an EXPERIMENTAL API.
+class ConfigService(object):
+    """ConfigService handles dynamic configuration
+    """
+
+    @staticmethod
+    def GetConfig(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/monitoring.ConfigService/GetConfig',
+            metrics__pb2.ConfigRequest.SerializeToString,
+            metrics__pb2.CollectorConfig.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def WatchConfig(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(
+            request,
+            target,
+            '/monitoring.ConfigService/WatchConfig',
+            metrics__pb2.ConfigRequest.SerializeToString,
+            metrics__pb2.CollectorConfig.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
--- a/web/static/.gitkeep
+++ b/web/static/.gitkeep
--- a/web/templates/dashboard.html
+++ b/web/templates/dashboard.html
@@ -0,0 +1,358 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>System Monitor Dashboard</title>
+    <style>
+        :root {
+            --bg-primary: #1a1a2e;
+            --bg-secondary: #16213e;
+            --bg-card: #0f3460;
+            --text-primary: #eee;
+            --text-secondary: #a0a0a0;
+            --accent: #e94560;
+            --success: #4ade80;
+            --warning: #fbbf24;
+            --danger: #ef4444;
+            --border: #2a2a4a;
+        }
+
+        * { box-sizing: border-box; margin: 0; padding: 0; }
+
+        body {
+            font-family: system-ui, -apple-system, sans-serif;
+            background: var(--bg-primary);
+            color: var(--text-primary);
+            min-height: 100vh;
+        }
+
+        header {
+            background: var(--bg-secondary);
+            padding: 1rem 2rem;
+            border-bottom: 2px solid var(--accent);
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+        }
+
+        header h1 { font-size: 1.5rem; }
+
+        .status {
+            display: flex;
+            align-items: center;
+            gap: 0.5rem;
+            font-size: 0.875rem;
+        }
+
+        .status-dot {
+            width: 10px;
+            height: 10px;
+            border-radius: 50%;
+            background: var(--danger);
+        }
+
+        .status-dot.connected { background: var(--success); }
+
+        main {
+            padding: 1.5rem;
+            max-width: 1600px;
+            margin: 0 auto;
+        }
+
+        .machines-grid {
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(400px, 1fr));
+            gap: 1.5rem;
+        }
+
+        .machine-card {
+            background: var(--bg-secondary);
+            border-radius: 8px;
+            padding: 1.25rem;
+            border: 1px solid var(--border);
+        }
+
+        .machine-header {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            margin-bottom: 1rem;
+            padding-bottom: 0.75rem;
+            border-bottom: 1px solid var(--border);
+        }
+
+        .machine-name {
+            font-weight: 600;
+            color: var(--accent);
+        }
+
+        .machine-id {
+            font-size: 0.75rem;
+            color: var(--text-secondary);
+        }
+
+        .machine-status {
+            font-size: 0.75rem;
+            padding: 0.25rem 0.5rem;
+            border-radius: 4px;
+            background: var(--success);
+            color: #000;
+        }
+
+        .machine-status.warning { background: var(--warning); }
+        .machine-status.critical { background: var(--danger); color: #fff; }
+        .machine-status.offline { background: var(--text-secondary); }
+
+        .metrics-grid {
+            display: grid;
+            grid-template-columns: repeat(2, 1fr);
+            gap: 0.75rem;
+        }
+
+        .metric {
+            background: var(--bg-card);
+            padding: 0.75rem;
+            border-radius: 6px;
+        }
+
+        .metric-label {
+            font-size: 0.75rem;
+            color: var(--text-secondary);
+            margin-bottom: 0.25rem;
+        }
+
+        .metric-value {
+            font-size: 1.5rem;
+            font-weight: 600;
+        }
+
+        .metric-bar {
+            height: 4px;
+            background: var(--border);
+            border-radius: 2px;
+            margin-top: 0.5rem;
+            overflow: hidden;
+        }
+
+        .metric-bar-fill {
+            height: 100%;
+            background: var(--success);
+            transition: width 0.3s ease;
+        }
+
+        .metric-bar-fill.warning { background: var(--warning); }
+        .metric-bar-fill.critical { background: var(--danger); }
+
+        .last-seen {
+            font-size: 0.75rem;
+            color: var(--text-secondary);
+            margin-top: 1rem;
+            text-align: right;
+        }
+
+        .no-machines {
+            text-align: center;
+            padding: 3rem;
+            color: var(--text-secondary);
+        }
+
+        .no-machines h2 {
+            color: var(--text-primary);
+            margin-bottom: 0.5rem;
+        }
+
+        @media (max-width: 600px) {
+            .machines-grid {
+                grid-template-columns: 1fr;
+            }
+            .metrics-grid {
+                grid-template-columns: 1fr;
+            }
+        }
+    </style>
+</head>
+<body>
+    <header>
+        <h1>System Monitor</h1>
+        <div class="status">
+            <span class="status-dot" id="status-dot"></span>
+            <span id="status-text">Connecting...</span>
+        </div>
+    </header>
+
+    <main>
+        <div class="machines-grid" id="machines-grid">
+            <div class="no-machines">
+                <h2>No machines connected</h2>
+                <p>Waiting for collectors to send metrics...</p>
+            </div>
+        </div>
+    </main>
+
+    <script>
+        // Page elements, looked up once by id.
+        const machinesGrid = document.getElementById('machines-grid');
+        const statusDot = document.getElementById('status-dot');
+        const statusText = document.getElementById('status-text');
+
+        // Machine records rendered by updateUI(), one card per Map value.
+        // NOTE(review): presumably keyed by machine_id; it is populated outside this view — confirm.
+        const machines = new Map();
+
+        /**
+         * Format a byte count as a short human-readable string, e.g. "1.5 KB".
+         *
+         * @param {number} bytes - Size in bytes; may be fractional (e.g. a rate).
+         * @returns {string} Value with at most one decimal place and a unit from
+         *     B to TB. Zero, negative, and non-finite input all render as "0 B".
+         */
+        function formatBytes(bytes) {
+            // The logarithm below is only defined for positive finite numbers.
+            if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';
+            const k = 1024;
+            const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
+            const exponent = Math.floor(Math.log(bytes) / Math.log(k));
+            // Clamp the unit index: values below 1 give a negative exponent and
+            // values >= 1024^5 would otherwise index past the end of `sizes`.
+            const i = Math.min(Math.max(exponent, 0), sizes.length - 1);
+            return parseFloat((bytes / Math.pow(k, i)).toFixed(1)) + ' ' + sizes[i];
+        }
+
+        // Format a bytes-per-second rate: the formatBytes() text with a "/s" suffix.
+        function formatRate(bytesPerSec) {
+            const size = formatBytes(bytesPerSec);
+            return `${size}/s`;
+        }
+
+        // Map a value to a severity CSS class: 'critical' at or above `critical`,
+        // 'warning' at or above `warning`, otherwise the empty string.
+        function getBarClass(value, warning = 80, critical = 95) {
+            // Ordered most severe first so the first matching limit wins.
+            const levels = [[critical, 'critical'], [warning, 'warning']];
+            const hit = levels.find(([limit]) => value >= limit);
+            return hit ? hit[1] : '';
+        }
+
+        /**
+         * Overall severity class for a machine: the worst of its CPU, memory
+         * and disk levels.
+         *
+         * Each level comes from getBarClass() with the same limits renderMachine
+         * passes for the bars (CPU 80/95, memory 85/95, disk 80/90), so the
+         * badge and the bars agree even when a value sits exactly on a limit.
+         *
+         * @param {Object} metrics - Metric values keyed by metric type name;
+         *     missing entries count as 0.
+         * @returns {string} 'critical', 'warning', or '' when healthy.
+         */
+        function getStatusClass(metrics) {
+            const levels = [
+                getBarClass(metrics.CPU_PERCENT || 0, 80, 95),
+                getBarClass(metrics.MEMORY_PERCENT || 0, 85, 95),
+                getBarClass(metrics.DISK_PERCENT || 0, 80, 90),
+            ];
+            if (levels.includes('critical')) return 'critical';
+            if (levels.includes('warning')) return 'warning';
+            return '';
+        }
+
+        // Describe how long ago a millisecond timestamp was: "just now" under 5s,
+        // then whole seconds, minutes, or hours.
+        function timeSince(timestampMs) {
+            const elapsedSec = Math.floor((Date.now() - timestampMs) / 1000);
+            if (elapsedSec < 5) return 'just now';
+            if (elapsedSec < 60) return `${elapsedSec}s ago`;
+            const elapsedMin = Math.floor(elapsedSec / 60);
+            return elapsedMin < 60
+                ? `${elapsedMin}m ago`
+                : `${Math.floor(elapsedMin / 60)}h ago`;
+        }
+
+        /**
+         * Build the HTML for one machine card.
+         *
+         * The returned markup is assigned to innerHTML by updateUI(), so text
+         * fields are HTML-escaped and metric values are coerced to finite
+         * numbers before interpolation.
+         *
+         * @param {Object} data - Machine record; reads machine_id, hostname,
+         *     timestamp_ms and metrics (values keyed by metric type name).
+         * @returns {string} HTML for a single `.machine-card` element.
+         */
+        function renderMachine(data) {
+            const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
+            const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
+            // Missing or non-numeric metrics render as 0 rather than throwing in
+            // .toFixed() or leaking arbitrary text into the style attribute.
+            const num = (value) => {
+                const n = Number(value);
+                return Number.isFinite(n) ? n : 0;
+            };
+
+            const m = data.metrics || {};
+            const statusClass = getStatusClass(m);
+            const machineId = escapeHtml(data.machine_id);
+            const displayName = escapeHtml(data.hostname || data.machine_id);
+            const cpu = num(m.CPU_PERCENT);
+            const mem = num(m.MEMORY_PERCENT);
+            const disk = num(m.DISK_PERCENT);
+
+            return `
+                <div class="machine-card" data-machine="${machineId}">
+                    <div class="machine-header">
+                        <div>
+                            <div class="machine-name">${displayName}</div>
+                            <div class="machine-id">${machineId}</div>
+                        </div>
+                        <span class="machine-status ${statusClass}">${statusClass || 'healthy'}</span>
+                    </div>
+                    <div class="metrics-grid">
+                        <div class="metric">
+                            <div class="metric-label">CPU</div>
+                            <div class="metric-value">${cpu.toFixed(1)}%</div>
+                            <div class="metric-bar">
+                                <div class="metric-bar-fill ${getBarClass(cpu)}"
+                                     style="width: ${cpu}%"></div>
+                            </div>
+                        </div>
+                        <div class="metric">
+                            <div class="metric-label">Memory</div>
+                            <div class="metric-value">${mem.toFixed(1)}%</div>
+                            <div class="metric-bar">
+                                <div class="metric-bar-fill ${getBarClass(mem, 85, 95)}"
+                                     style="width: ${mem}%"></div>
+                            </div>
+                        </div>
+                        <div class="metric">
+                            <div class="metric-label">Disk</div>
+                            <div class="metric-value">${disk.toFixed(1)}%</div>
+                            <div class="metric-bar">
+                                <div class="metric-bar-fill ${getBarClass(disk, 80, 90)}"
+                                     style="width: ${disk}%"></div>
+                            </div>
+                        </div>
+                        <div class="metric">
+                            <div class="metric-label">Load (1m)</div>
+                            <div class="metric-value">${num(m.LOAD_AVG_1M).toFixed(2)}</div>
+                        </div>
+                        <div class="metric">
+                            <div class="metric-label">Network In</div>
+                            <div class="metric-value">${formatRate(num(m.NETWORK_RECV_BYTES_SEC))}</div>
+                        </div>
+                        <div class="metric">
+                            <div class="metric-label">Network Out</div>
+                            <div class="metric-value">${formatRate(num(m.NETWORK_SENT_BYTES_SEC))}</div>
+                        </div>
+                    </div>
+                    <div class="last-seen">Last seen: ${timeSince(data.timestamp_ms || Date.now())}</div>
+                </div>
+            `;
+        }
+
+        function updateUI() {
+            if (machines.size === 0) {
+                machinesGrid.innerHTML = `
+                    <div class="no-machines">
+                        <h2>No machines connected</h2>
+                        <p>Waiting for collectors to send metrics...</p>
+                    </div>
+                `;
+                return;
+            }
+
+            machinesGrid.innerHTML = Array.from(machines.values())
+                .map(renderMachine)
+                .join('');
+        }
+
+        function connect() {
+            const ws = new WebSocket(`ws://${location.host}/ws`);
+
+            ws.onopen = () => {
+                statusDot.classList.add('connected');
+                statusText.textContent = 'Connected';
+            };
+
+            ws.onclose = () => {
+                statusDot.classList.remove('connected');
+                statusText.textContent = 'Disconnected - Reconnecting...';
+                setTimeout(connect, 3000);
+            };
+
+            ws.onerror = () => {
+                statusDot.classList.remove('connected');
+                statusText.textContent = 'Connection error';
+            };
+
+            ws.onmessage = (event) => {
+                try {
+                    const msg = JSON.parse(event.data);
+
+                    if (msg.type === 'initial' || msg.type === 'metrics') {
+                        const data = msg.data;
+                        data.timestamp_ms = data.timestamp_ms || Date.now();
+                        machines.set(data.machine_id, data);
+                        updateUI();
+                    }
+                } catch (e) {
+                    console.error('Failed to parse message:', e);
+                }
+            };
+
+            // Send periodic pings
+            setInterval(() => {
+                if (ws.readyState === WebSocket.OPEN) {
+                    ws.send('ping');
+                }
+            }, 30000);
+        }
+
+        // Re-render every 5 seconds so the relative "last seen" labels stay
+        // current between incoming messages.
+        setInterval(() => updateUI(), 5000);
+
+        // Open the WebSocket connection.
+        connect();
+    </script>
+</body>
+</html>