claude final draft
This commit is contained in:
1
services/collector/__init__.py
Normal file
1
services/collector/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Collector service."""
|
||||
209
services/collector/main.py
Normal file
209
services/collector/main.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""Collector service - streams system metrics to the aggregator via gRPC."""
|
||||
|
||||
import asyncio
|
||||
import signal
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import grpc
|
||||
|
||||
# Add project root to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from services.collector.metrics import MetricsCollector
|
||||
from shared import metrics_pb2, metrics_pb2_grpc
|
||||
from shared.config import get_collector_config
|
||||
from shared.logging import setup_logging
|
||||
|
||||
|
||||
class CollectorService:
    """Main collector service that streams metrics to the aggregator.

    Owns the gRPC channel/stub, a MetricsCollector for sampling, and the
    retry/reconnect loop around the client-streaming StreamMetrics RPC.
    """

    def __init__(self):
        # Config and structured logger come from shared project helpers.
        self.config = get_collector_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        # running gates both the streaming loop and the metric generator;
        # stop() flips it to request a graceful shutdown.
        self.running = False
        # Channel/stub are None until connect() succeeds.
        self.channel: grpc.aio.Channel | None = None
        self.stub: metrics_pb2_grpc.MetricsServiceStub | None = None

        # Sampler; each collection family can be toggled via config.
        self.collector = MetricsCollector(
            machine_id=self.config.machine_id,
            collect_cpu=self.config.collect_cpu,
            collect_memory=self.config.collect_memory,
            collect_disk=self.config.collect_disk,
            collect_network=self.config.collect_network,
            collect_load=self.config.collect_load,
        )

    async def connect(self) -> None:
        """Establish connection to the aggregator.

        Raises:
            asyncio.TimeoutError: if the channel is not ready within 10s.
        """
        self.logger.info(
            "connecting_to_aggregator",
            aggregator_url=self.config.aggregator_url,
        )

        # Keepalive options keep the long-lived stream from being dropped
        # by idle timeouts between collection intervals.
        self.channel = grpc.aio.insecure_channel(
            self.config.aggregator_url,
            options=[
                ("grpc.keepalive_time_ms", 10000),
                ("grpc.keepalive_timeout_ms", 5000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )
        self.stub = metrics_pb2_grpc.MetricsServiceStub(self.channel)

        # Wait for channel to be ready
        try:
            await asyncio.wait_for(
                self.channel.channel_ready(),
                timeout=10.0,
            )
            self.logger.info("connected_to_aggregator")
        except asyncio.TimeoutError:
            self.logger.error("connection_timeout")
            raise

    async def disconnect(self) -> None:
        """Close connection to the aggregator. Safe to call when not connected."""
        if self.channel:
            await self.channel.close()
            self.channel = None
            self.stub = None
            self.logger.info("disconnected_from_aggregator")

    def _batch_to_proto(self, batch) -> list[metrics_pb2.Metric]:
        """Convert a MetricsBatch to protobuf messages.

        The string metric_type (e.g. "CPU_PERCENT") is resolved against the
        generated enum constants on metrics_pb2; unknown names fall back to 0.
        """
        protos = []
        for metric in batch.metrics:
            proto = metrics_pb2.Metric(
                machine_id=batch.machine_id,
                hostname=batch.hostname,
                # All points in a batch share the batch's collection timestamp.
                timestamp_ms=batch.timestamp_ms,
                type=getattr(metrics_pb2, metric.metric_type, 0),
                value=metric.value,
                labels=metric.labels,
            )
            protos.append(proto)
        return protos

    async def _metric_generator(self):
        """Async generator that yields metrics at the configured interval.

        Terminates (ending the client stream) when self.running goes False;
        shutdown is therefore delayed by at most one collection interval.
        """
        while self.running:
            batch = self.collector.collect()
            protos = self._batch_to_proto(batch)

            for proto in protos:
                yield proto

            self.logger.debug(
                "collected_metrics",
                count=len(protos),
                machine_id=batch.machine_id,
            )

            await asyncio.sleep(self.config.collection_interval)

    async def stream_metrics(self) -> None:
        """Stream metrics to the aggregator with exponential-backoff retries.

        Raises:
            RuntimeError: if called before connect().
            grpc.aio.AioRpcError: re-raised after max_retries consecutive failures.
        """
        if not self.stub:
            raise RuntimeError("Not connected to aggregator")

        retry_count = 0
        max_retries = 10
        base_delay = 1.0

        while self.running:
            try:
                self.logger.info("starting_metric_stream")

                # Client-streaming RPC: feeds the generator until it ends
                # (running goes False) or the RPC fails.
                response = await self.stub.StreamMetrics(self._metric_generator())

                self.logger.info(
                    "stream_completed",
                    success=response.success,
                    metrics_received=response.metrics_received,
                    message=response.message,
                )

                # A successful stream resets the backoff window.
                retry_count = 0

            except grpc.aio.AioRpcError as e:
                retry_count += 1
                # Exponential backoff capped at 60s; first retry waits
                # base_delay * 2, not base_delay.
                delay = min(base_delay * (2**retry_count), 60.0)

                self.logger.warning(
                    "stream_error",
                    code=e.code().name,
                    details=e.details(),
                    retry_count=retry_count,
                    retry_delay=delay,
                )

                if retry_count >= max_retries:
                    self.logger.error("max_retries_exceeded")
                    raise

                await asyncio.sleep(delay)

                # Reconnect; failures here are logged and retried on the
                # next loop iteration rather than escalated.
                try:
                    await self.disconnect()
                    await self.connect()
                except Exception as conn_err:
                    self.logger.error("reconnect_failed", error=str(conn_err))

            except asyncio.CancelledError:
                # Task cancellation (e.g. event-loop shutdown) ends the loop.
                self.logger.info("stream_cancelled")
                break

    async def run(self) -> None:
        """Main entry point: connect, stream until stopped, then clean up."""
        self.running = True

        self.logger.info(
            "collector_starting",
            machine_id=self.config.machine_id,
            interval=self.config.collection_interval,
        )

        # Initial CPU percent call to initialize (first call always returns 0)
        import psutil

        psutil.cpu_percent()

        await self.connect()

        try:
            await self.stream_metrics()
        finally:
            # Always release the channel, even if streaming raised.
            await self.disconnect()
            self.logger.info("collector_stopped")

    def stop(self) -> None:
        """Signal the collector to stop; streaming winds down gracefully."""
        self.running = False
|
||||
|
||||
|
||||
async def main():
    """Main entry point: run the collector until a shutdown signal arrives."""
    service = CollectorService()

    # Handle shutdown signals. Use the currently running loop:
    # asyncio.get_event_loop() is deprecated inside coroutines (Python 3.10+)
    # and may not return the loop this coroutine is running on.
    loop = asyncio.get_running_loop()

    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()

    for sig in (signal.SIGTERM, signal.SIGINT):
        try:
            loop.add_signal_handler(sig, signal_handler)
        except NotImplementedError:
            # add_signal_handler is unavailable on Windows event loops;
            # fall back to a plain signal handler there.
            signal.signal(sig, lambda *_args: signal_handler())

    await service.run()
|
||||
|
||||
|
||||
# Script entry point: asyncio.run creates the event loop, runs main() to
# completion, and closes the loop on exit.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
233
services/collector/metrics.py
Normal file
233
services/collector/metrics.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""System metrics collection using psutil."""
|
||||
|
||||
import socket
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
@dataclass
class MetricPoint:
    """A single metric data point."""

    # Metric type name, e.g. "CPU_PERCENT"; the collector service resolves
    # this string against the generated protobuf enum constants.
    metric_type: str
    # Observed value; counters and gauges are both carried as floats.
    value: float
    # Optional dimension labels, e.g. {"core": "0"} or {"mount": "/"}.
    # default_factory avoids a shared mutable default across instances.
    labels: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class MetricsBatch:
    """A batch of metrics from a single collection cycle."""

    # Configured identifier of the machine the batch was sampled on.
    machine_id: str
    # Hostname reported by the OS at collector construction time.
    hostname: str
    # Collection timestamp in milliseconds since the Unix epoch; shared by
    # every point in the batch.
    timestamp_ms: int
    # All points sampled in this cycle.
    metrics: list[MetricPoint]
|
||||
|
||||
|
||||
class MetricsCollector:
    """Collects system metrics using psutil.

    Each collect() call takes one snapshot of the enabled metric families
    and returns them as a single MetricsBatch. Network throughput is
    computed as a rate from counters retained between calls, so the
    collector is stateful and not safe to share across threads without
    external coordination.
    """

    def __init__(
        self,
        machine_id: str,
        collect_cpu: bool = True,
        collect_memory: bool = True,
        collect_disk: bool = True,
        collect_network: bool = True,
        collect_load: bool = True,
    ):
        self.machine_id = machine_id
        self.hostname = socket.gethostname()

        # Per-family collection toggles.
        self.collect_cpu = collect_cpu
        self.collect_memory = collect_memory
        self.collect_disk = collect_disk
        self.collect_network = collect_network
        self.collect_load = collect_load

        # Track previous network counters for rate calculation.
        # NOTE(review): psutil._common.snetio is a private psutil name used
        # here only as a type annotation — confirm it is stable across the
        # pinned psutil version.
        self._prev_net_io: psutil._common.snetio | None = None
        self._prev_net_time: float | None = None

    def collect(self) -> MetricsBatch:
        """Collect all enabled metrics and return as a batch."""
        metrics: list[MetricPoint] = []

        if self.collect_cpu:
            metrics.extend(self._collect_cpu())

        if self.collect_memory:
            metrics.extend(self._collect_memory())

        if self.collect_disk:
            metrics.extend(self._collect_disk())

        if self.collect_network:
            metrics.extend(self._collect_network())

        if self.collect_load:
            metrics.extend(self._collect_load())

        return MetricsBatch(
            machine_id=self.machine_id,
            hostname=self.hostname,
            # Single timestamp for the whole cycle, in epoch milliseconds.
            timestamp_ms=int(time.time() * 1000),
            metrics=metrics,
        )

    def _collect_cpu(self) -> list[MetricPoint]:
        """Collect CPU metrics (overall and per-core utilization).

        interval=None makes psutil return usage since the previous call
        without blocking; the very first call in a process returns 0.0
        (the service primes this at startup).
        """
        metrics = []

        # Overall CPU percent
        cpu_percent = psutil.cpu_percent(interval=None)
        metrics.append(
            MetricPoint(
                metric_type="CPU_PERCENT",
                value=cpu_percent,
            )
        )

        # Per-core CPU percent, labeled by core index.
        per_cpu = psutil.cpu_percent(interval=None, percpu=True)
        for i, pct in enumerate(per_cpu):
            metrics.append(
                MetricPoint(
                    metric_type="CPU_PERCENT_PER_CORE",
                    value=pct,
                    labels={"core": str(i)},
                )
            )

        return metrics

    def _collect_memory(self) -> list[MetricPoint]:
        """Collect virtual-memory metrics: percent used, bytes used/available."""
        mem = psutil.virtual_memory()

        return [
            MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
            MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
            MetricPoint(
                metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
            ),
        ]

    def _collect_disk(self) -> list[MetricPoint]:
        """Collect disk metrics; inaccessible mounts/counters are skipped."""
        metrics = []

        # Disk usage for root partition
        try:
            disk = psutil.disk_usage("/")
            metrics.append(
                MetricPoint(
                    metric_type="DISK_PERCENT",
                    value=disk.percent,
                    labels={"mount": "/"},
                )
            )
            metrics.append(
                MetricPoint(
                    metric_type="DISK_USED_BYTES",
                    value=float(disk.used),
                    labels={"mount": "/"},
                )
            )
        except (PermissionError, FileNotFoundError):
            # Best-effort: e.g. "/" may be unreadable in some containers.
            pass

        # Disk I/O rates. Despite the *_SEC metric names, cumulative byte
        # counters are sent here; the aggregator derives the rate.
        try:
            io = psutil.disk_io_counters()
            if io:  # disk_io_counters() can return None on some platforms
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_READ_BYTES_SEC",
                        value=float(
                            io.read_bytes
                        ),  # Will be converted to rate by aggregator
                    )
                )
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_WRITE_BYTES_SEC",
                        value=float(io.write_bytes),
                    )
                )
        except (PermissionError, AttributeError):
            pass

        return metrics

    def _collect_network(self) -> list[MetricPoint]:
        """Collect network metrics with rate calculation.

        Byte rates are emitted only from the second call onward, once a
        previous counter snapshot exists to diff against.
        """
        metrics = []

        try:
            net_io = psutil.net_io_counters()
            current_time = time.time()

            if self._prev_net_io is not None and self._prev_net_time is not None:
                time_delta = current_time - self._prev_net_time
                if time_delta > 0:  # guard against zero/negative clock deltas
                    bytes_sent_rate = (
                        net_io.bytes_sent - self._prev_net_io.bytes_sent
                    ) / time_delta
                    bytes_recv_rate = (
                        net_io.bytes_recv - self._prev_net_io.bytes_recv
                    ) / time_delta

                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_SENT_BYTES_SEC",
                            value=bytes_sent_rate,
                        )
                    )
                    metrics.append(
                        MetricPoint(
                            metric_type="NETWORK_RECV_BYTES_SEC",
                            value=bytes_recv_rate,
                        )
                    )

            # Save this snapshot for the next cycle's rate calculation.
            self._prev_net_io = net_io
            self._prev_net_time = current_time

            # Connection count (inet sockets; may need elevated privileges).
            connections = len(psutil.net_connections(kind="inet"))
            metrics.append(
                MetricPoint(
                    metric_type="NETWORK_CONNECTIONS",
                    value=float(connections),
                )
            )
        except (PermissionError, psutil.AccessDenied):
            # Best-effort: skip network metrics when the OS denies access.
            pass

        return metrics

    def _collect_load(self) -> list[MetricPoint]:
        """Collect load average metrics (Unix only) plus process count."""
        metrics = []

        try:
            load1, load5, load15 = psutil.getloadavg()
            metrics.append(MetricPoint(metric_type="LOAD_AVG_1M", value=load1))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_5M", value=load5))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_15M", value=load15))
        except (AttributeError, OSError):
            # Windows doesn't have getloadavg
            pass

        # Process count (always collected, regardless of platform).
        metrics.append(
            MetricPoint(
                metric_type="PROCESS_COUNT",
                value=float(len(psutil.pids())),
            )
        )

        return metrics
|
||||
Reference in New Issue
Block a user