claude final draft

This commit is contained in:
buenosairesam
2025-12-29 23:44:30 -03:00
parent 116d4032e2
commit e5aafd5097
22 changed files with 2815 additions and 32 deletions

View File

@@ -0,0 +1 @@
"""Collector service."""

209
services/collector/main.py Normal file
View File

@@ -0,0 +1,209 @@
"""Collector service - streams system metrics to the aggregator via gRPC."""
import asyncio
import signal
import sys
from pathlib import Path
import grpc
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from services.collector.metrics import MetricsCollector
from shared import metrics_pb2, metrics_pb2_grpc
from shared.config import get_collector_config
from shared.logging import setup_logging
class CollectorService:
    """Main collector service that streams metrics to the aggregator.

    Owns the gRPC channel/stub, a MetricsCollector for sampling, and the
    retry loop that keeps the client-streaming RPC alive until stop().
    """

    def __init__(self):
        # Configuration and structured logger come from the shared package.
        self.config = get_collector_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        # Flipped to True in run(); polled by the stream generator and retry loop.
        self.running = False
        self.channel: grpc.aio.Channel | None = None
        self.stub: metrics_pb2_grpc.MetricsServiceStub | None = None
        # Sampler; which metric families are collected is config-driven.
        self.collector = MetricsCollector(
            machine_id=self.config.machine_id,
            collect_cpu=self.config.collect_cpu,
            collect_memory=self.config.collect_memory,
            collect_disk=self.config.collect_disk,
            collect_network=self.config.collect_network,
            collect_load=self.config.collect_load,
        )

    async def connect(self) -> None:
        """Establish connection to the aggregator.

        Raises:
            asyncio.TimeoutError: if the channel is not ready within 10 seconds.
        """
        self.logger.info(
            "connecting_to_aggregator",
            aggregator_url=self.config.aggregator_url,
        )
        # Keepalive pings so dead connections are detected even while idle.
        self.channel = grpc.aio.insecure_channel(
            self.config.aggregator_url,
            options=[
                ("grpc.keepalive_time_ms", 10000),
                ("grpc.keepalive_timeout_ms", 5000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )
        self.stub = metrics_pb2_grpc.MetricsServiceStub(self.channel)
        # Wait for channel to be ready
        try:
            await asyncio.wait_for(
                self.channel.channel_ready(),
                timeout=10.0,
            )
            self.logger.info("connected_to_aggregator")
        except asyncio.TimeoutError:
            self.logger.error("connection_timeout")
            raise

    async def disconnect(self) -> None:
        """Close connection to the aggregator (no-op when not connected)."""
        if self.channel:
            await self.channel.close()
            self.channel = None
            self.stub = None
            self.logger.info("disconnected_from_aggregator")

    def _batch_to_proto(self, batch) -> list[metrics_pb2.Metric]:
        """Convert a MetricsBatch to protobuf messages.

        Each MetricPoint becomes one Metric proto; batch-level fields
        (machine_id, hostname, timestamp) are stamped onto every message.
        """
        protos = []
        for metric in batch.metrics:
            proto = metrics_pb2.Metric(
                machine_id=batch.machine_id,
                hostname=batch.hostname,
                timestamp_ms=batch.timestamp_ms,
                # metric_type holds the enum member name; fall back to 0
                # (the default/unspecified value) if the name is unknown.
                type=getattr(metrics_pb2, metric.metric_type, 0),
                value=metric.value,
                labels=metric.labels,
            )
            protos.append(proto)
        return protos

    async def _metric_generator(self):
        """Async generator that yields metrics at the configured interval."""
        while self.running:
            batch = self.collector.collect()
            protos = self._batch_to_proto(batch)
            for proto in protos:
                yield proto
            self.logger.debug(
                "collected_metrics",
                count=len(protos),
                machine_id=batch.machine_id,
            )
            await asyncio.sleep(self.config.collection_interval)

    async def stream_metrics(self) -> None:
        """Stream metrics to the aggregator, retrying with backoff on RPC errors.

        Raises:
            RuntimeError: when called before connect().
            grpc.aio.AioRpcError: re-raised after max_retries consecutive failures.
        """
        if not self.stub:
            raise RuntimeError("Not connected to aggregator")
        retry_count = 0
        max_retries = 10
        base_delay = 1.0
        while self.running:
            try:
                self.logger.info("starting_metric_stream")
                # Client-streaming RPC: runs until the generator stops
                # (self.running flips) or the server ends the stream.
                response = await self.stub.StreamMetrics(self._metric_generator())
                self.logger.info(
                    "stream_completed",
                    success=response.success,
                    metrics_received=response.metrics_received,
                    message=response.message,
                )
                # A completed stream resets the backoff counter.
                retry_count = 0
            except grpc.aio.AioRpcError as e:
                retry_count += 1
                # Exponential backoff capped at 60s (first retry waits ~2s).
                delay = min(base_delay * (2**retry_count), 60.0)
                self.logger.warning(
                    "stream_error",
                    code=e.code().name,
                    details=e.details(),
                    retry_count=retry_count,
                    retry_delay=delay,
                )
                if retry_count >= max_retries:
                    self.logger.error("max_retries_exceeded")
                    raise
                await asyncio.sleep(delay)
                # Reconnect
                # NOTE(review): a failed reconnect is only logged; the loop then
                # retries with whatever channel/stub state remains — confirm intended.
                try:
                    await self.disconnect()
                    await self.connect()
                except Exception as conn_err:
                    self.logger.error("reconnect_failed", error=str(conn_err))
            except asyncio.CancelledError:
                self.logger.info("stream_cancelled")
                break

    async def run(self) -> None:
        """Main entry point for the collector service."""
        self.running = True
        self.logger.info(
            "collector_starting",
            machine_id=self.config.machine_id,
            interval=self.config.collection_interval,
        )
        # Initial CPU percent call to initialize (first call always returns 0)
        import psutil

        psutil.cpu_percent()
        await self.connect()
        try:
            await self.stream_metrics()
        finally:
            await self.disconnect()
            self.logger.info("collector_stopped")

    def stop(self) -> None:
        """Signal the collector to stop."""
        self.running = False
async def main():
    """Main entry point: wire up signal handlers and run the collector.

    Installs SIGTERM/SIGINT handlers that flip the service's running flag,
    allowing the streaming loop to exit cleanly, then runs until stopped.
    """
    service = CollectorService()

    # Handle shutdown signals. Use get_running_loop() — we are inside a
    # coroutine, and get_event_loop() is deprecated in that context (3.10+).
    loop = asyncio.get_running_loop()

    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    await service.run()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,233 @@
"""System metrics collection using psutil."""
import socket
import time
from dataclasses import dataclass, field
import psutil
@dataclass
class MetricPoint:
    """A single metric data point."""

    # Metric name; matches a protobuf enum member name (e.g. "CPU_PERCENT").
    metric_type: str
    # Sampled value; both gauges and raw counters are carried as float.
    value: float
    # Optional dimension labels, e.g. {"core": "0"} or {"mount": "/"}.
    labels: dict[str, str] = field(default_factory=dict)
@dataclass
class MetricsBatch:
    """A batch of metrics from a single collection cycle."""

    # Stable identifier of the reporting machine (configured, not derived).
    machine_id: str
    # Hostname of the reporting machine at collector construction time.
    hostname: str
    # Collection timestamp in milliseconds since the Unix epoch.
    timestamp_ms: int
    # All points sampled in this cycle; they share the batch timestamp.
    metrics: list[MetricPoint]
class MetricsCollector:
    """Collects system metrics using psutil.

    Each call to collect() samples every enabled metric family and returns
    the points as one timestamped MetricsBatch.
    """

    def __init__(
        self,
        machine_id: str,
        collect_cpu: bool = True,
        collect_memory: bool = True,
        collect_disk: bool = True,
        collect_network: bool = True,
        collect_load: bool = True,
    ):
        """Configure which metric families this collector samples.

        Args:
            machine_id: Stable identifier stamped on every batch.
            collect_cpu / collect_memory / collect_disk / collect_network /
            collect_load: Per-family toggles; disabled families are skipped.
        """
        self.machine_id = machine_id
        self.hostname = socket.gethostname()
        self.collect_cpu = collect_cpu
        self.collect_memory = collect_memory
        self.collect_disk = collect_disk
        self.collect_network = collect_network
        self.collect_load = collect_load
        # Track previous network counters for rate calculation. Deliberately
        # left unannotated: the counter type (snetio) lives in psutil's
        # private `psutil._common` module, which is not a stable public API.
        self._prev_net_io = None
        self._prev_net_time: float | None = None

    def collect(self) -> MetricsBatch:
        """Collect all enabled metrics and return as a batch."""
        metrics: list[MetricPoint] = []
        if self.collect_cpu:
            metrics.extend(self._collect_cpu())
        if self.collect_memory:
            metrics.extend(self._collect_memory())
        if self.collect_disk:
            metrics.extend(self._collect_disk())
        if self.collect_network:
            metrics.extend(self._collect_network())
        if self.collect_load:
            metrics.extend(self._collect_load())
        return MetricsBatch(
            machine_id=self.machine_id,
            hostname=self.hostname,
            timestamp_ms=int(time.time() * 1000),
            metrics=metrics,
        )

    def _collect_cpu(self) -> list[MetricPoint]:
        """Collect overall and per-core CPU utilization percentages."""
        metrics = []
        # interval=None: non-blocking; percent since the previous call.
        cpu_percent = psutil.cpu_percent(interval=None)
        metrics.append(
            MetricPoint(
                metric_type="CPU_PERCENT",
                value=cpu_percent,
            )
        )
        # Per-core CPU percent, one point per core labeled by core index.
        per_cpu = psutil.cpu_percent(interval=None, percpu=True)
        for i, pct in enumerate(per_cpu):
            metrics.append(
                MetricPoint(
                    metric_type="CPU_PERCENT_PER_CORE",
                    value=pct,
                    labels={"core": str(i)},
                )
            )
        return metrics

    def _collect_memory(self) -> list[MetricPoint]:
        """Collect virtual-memory usage metrics."""
        mem = psutil.virtual_memory()
        return [
            MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
            MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
            MetricPoint(
                metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
            ),
        ]

    def _collect_disk(self) -> list[MetricPoint]:
        """Collect disk usage (root mount) and cumulative I/O counters."""
        metrics = []
        # Disk usage for root partition; skipped quietly if inaccessible.
        try:
            disk = psutil.disk_usage("/")
            metrics.append(
                MetricPoint(
                    metric_type="DISK_PERCENT",
                    value=disk.percent,
                    labels={"mount": "/"},
                )
            )
            metrics.append(
                MetricPoint(
                    metric_type="DISK_USED_BYTES",
                    value=float(disk.used),
                    labels={"mount": "/"},
                )
            )
        except (PermissionError, FileNotFoundError):
            pass
        # Disk I/O: raw cumulative byte counters, despite the *_SEC names.
        try:
            io = psutil.disk_io_counters()
            if io:
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_READ_BYTES_SEC",
                        value=float(
                            io.read_bytes
                        ),  # Will be converted to rate by aggregator
                    )
                )
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_WRITE_BYTES_SEC",
                        value=float(io.write_bytes),
                    )
                )
        except (PermissionError, AttributeError):
            pass
        return metrics

    def _collect_network(self) -> list[MetricPoint]:
        """Collect network throughput (bytes/sec) and connection count.

        Rates are derived from the delta against the previous collection
        cycle, so no rate points are emitted on the first call.
        """
        metrics = []
        try:
            net_io = psutil.net_io_counters()
            current_time = time.time()
            if self._prev_net_io is not None and self._prev_net_time is not None:
                time_delta = current_time - self._prev_net_time
                if time_delta > 0:
                    bytes_sent_rate = (
                        net_io.bytes_sent - self._prev_net_io.bytes_sent
                    ) / time_delta
                    bytes_recv_rate = (
                        net_io.bytes_recv - self._prev_net_io.bytes_recv
                    ) / time_delta
                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_SENT_BYTES_SEC",
                            value=bytes_sent_rate,
                        )
                    )
                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_RECV_BYTES_SEC",
                            value=bytes_recv_rate,
                        )
                    )
            self._prev_net_io = net_io
            self._prev_net_time = current_time
            # Connection count (inet sockets); may need elevated privileges.
            connections = len(psutil.net_connections(kind="inet"))
            metrics.append(
                MetricPoint(
                    metric_type="NETWORK_CONNECTIONS",
                    value=float(connections),
                )
            )
        except (PermissionError, psutil.AccessDenied):
            pass
        return metrics

    def _collect_load(self) -> list[MetricPoint]:
        """Collect load average metrics (Unix only) and process count."""
        metrics = []
        try:
            load1, load5, load15 = psutil.getloadavg()
            metrics.append(MetricPoint(metric_type="LOAD_AVG_1M", value=load1))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_5M", value=load5))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_15M", value=load15))
        except (AttributeError, OSError):
            # Windows doesn't have getloadavg
            pass
        # Process count
        metrics.append(
            MetricPoint(
                metric_type="PROCESS_COUNT",
                value=float(len(psutil.pids())),
            )
        )
        return metrics