234 lines
6.9 KiB
Python
234 lines
6.9 KiB
Python
"""System metrics collection using psutil."""
|
|
|
|
import logging
import socket
import time
from dataclasses import dataclass, field

import psutil
|
|
|
|
|
|
@dataclass
class MetricPoint:
    """A single metric data point.

    Attributes:
        metric_type: Name of the metric, e.g. ``"CPU_PERCENT"``.
        value: Numeric reading for the metric.
        labels: Optional string key/value pairs that qualify the reading
            (for example ``{"core": "0"}``). Defaults to a fresh empty dict
            for every instance.
    """

    metric_type: str
    value: float
    # default_factory gives each instance its own dict, so mutating one
    # point's labels never leaks into another point.
    labels: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class MetricsBatch:
    """A batch of metrics from a single collection cycle.

    Attributes:
        machine_id: Identifier of the machine the batch was collected on.
        hostname: Hostname of that machine.
        timestamp_ms: Time of the collection cycle as an integer number of
            milliseconds.
        metrics: The metric points gathered during the cycle.
    """

    machine_id: str
    hostname: str
    timestamp_ms: int
    metrics: list[MetricPoint]
|
|
|
|
|
|
class MetricsCollector:
    """Collects system metrics using psutil.

    Each call to :meth:`collect` samples every enabled metric group and
    returns the readings as one :class:`MetricsBatch` stamped with the
    machine id, the local hostname and the current wall-clock time in
    milliseconds.

    Network throughput is reported as a rate computed from two consecutive
    counter samples, so the collector keeps the previous sample between
    calls and the first call emits no network rate metrics.
    """

    # Collection is best-effort: failures of individual probes are logged at
    # debug level instead of being raised or silently discarded.
    _log = logging.getLogger(__name__)

    def __init__(
        self,
        machine_id: str,
        collect_cpu: bool = True,
        collect_memory: bool = True,
        collect_disk: bool = True,
        collect_network: bool = True,
        collect_load: bool = True,
    ):
        """Create a collector.

        Args:
            machine_id: Identifier copied into every batch.
            collect_cpu: Include overall and per-core CPU percentages.
            collect_memory: Include virtual memory metrics.
            collect_disk: Include root-partition usage and disk I/O counters.
            collect_network: Include network rates and connection count.
            collect_load: Include load averages and the process count.
        """
        self.machine_id = machine_id
        self.hostname = socket.gethostname()

        self.collect_cpu = collect_cpu
        self.collect_memory = collect_memory
        self.collect_disk = collect_disk
        self.collect_network = collect_network
        self.collect_load = collect_load

        # Previous network counter sample and the monotonic time it was
        # taken at; both are needed to turn counters into a rate.
        self._prev_net_io: psutil._common.snetio | None = None
        self._prev_net_time: float | None = None

        if self.collect_cpu:
            # psutil documents that the first cpu_percent(interval=None) call
            # returns a meaningless 0.0 because there is no earlier sample to
            # compare against. Take that throwaway sample here so the first
            # collect() reports usage since construction instead.
            psutil.cpu_percent(interval=None)
            psutil.cpu_percent(interval=None, percpu=True)

    def collect(self) -> MetricsBatch:
        """Collect all enabled metrics and return them as a batch.

        Returns:
            A ``MetricsBatch`` holding the readings of every enabled group,
            in the order CPU, memory, disk, network, load. The timestamp is
            taken after all groups have been sampled.
        """
        groups = (
            (self.collect_cpu, self._collect_cpu),
            (self.collect_memory, self._collect_memory),
            (self.collect_disk, self._collect_disk),
            (self.collect_network, self._collect_network),
            (self.collect_load, self._collect_load),
        )

        metrics: list[MetricPoint] = []
        for enabled, gather in groups:
            if enabled:
                metrics.extend(gather())

        return MetricsBatch(
            machine_id=self.machine_id,
            hostname=self.hostname,
            timestamp_ms=int(time.time() * 1000),
            metrics=metrics,
        )

    def _collect_cpu(self) -> list[MetricPoint]:
        """Collect the overall CPU percentage and one reading per core.

        Both readings are non-blocking (``interval=None``): psutil reports
        utilisation since the previous call of the same kind.
        """
        metrics = [
            MetricPoint(
                metric_type="CPU_PERCENT",
                value=psutil.cpu_percent(interval=None),
            )
        ]

        per_core = psutil.cpu_percent(interval=None, percpu=True)
        metrics.extend(
            MetricPoint(
                metric_type="CPU_PERCENT_PER_CORE",
                value=pct,
                labels={"core": str(index)},
            )
            for index, pct in enumerate(per_core)
        )

        return metrics

    def _collect_memory(self) -> list[MetricPoint]:
        """Collect virtual memory usage: percent, used bytes, available bytes."""
        mem = psutil.virtual_memory()

        return [
            MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
            MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
            MetricPoint(
                metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
            ),
        ]

    def _collect_disk(self) -> list[MetricPoint]:
        """Collect root-partition usage and system-wide disk I/O counters.

        Either part is skipped (and the reason logged at debug level) when
        the underlying psutil call is not permitted or not available.
        """
        metrics: list[MetricPoint] = []

        # Disk usage for the root partition.
        try:
            disk = psutil.disk_usage("/")
        except (PermissionError, FileNotFoundError) as exc:
            self._log.debug("Skipping disk usage for '/': %s", exc)
        else:
            metrics.append(
                MetricPoint(
                    metric_type="DISK_PERCENT",
                    value=disk.percent,
                    labels={"mount": "/"},
                )
            )
            metrics.append(
                MetricPoint(
                    metric_type="DISK_USED_BYTES",
                    value=float(disk.used),
                    labels={"mount": "/"},
                )
            )

        # Disk I/O. NOTE: despite the *_BYTES_SEC names these values are the
        # raw counters returned by psutil, not per-second rates; the
        # aggregator is expected to convert them to rates.
        try:
            io = psutil.disk_io_counters()
            if io:
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_READ_BYTES_SEC",
                        value=float(io.read_bytes),
                    )
                )
                metrics.append(
                    MetricPoint(
                        metric_type="DISK_WRITE_BYTES_SEC",
                        value=float(io.write_bytes),
                    )
                )
        except (PermissionError, AttributeError) as exc:
            self._log.debug("Skipping disk I/O counters: %s", exc)

        return metrics

    def _collect_network(self) -> list[MetricPoint]:
        """Collect network send/receive rates and the inet connection count.

        Rates are the difference between the current and the previous
        counter sample divided by the elapsed time, so nothing is emitted
        for them on the first call. The connection count is gathered
        independently of the rates, so a failure in one does not suppress
        the other.
        """
        metrics: list[MetricPoint] = []

        try:
            net_io = psutil.net_io_counters()
        except (PermissionError, psutil.AccessDenied) as exc:
            self._log.debug("Skipping network I/O counters: %s", exc)
        else:
            # Monotonic clock: the interval must not be distorted by
            # wall-clock adjustments between two samples.
            now = time.monotonic()

            if net_io is None:
                # psutil returns None when the machine has no network
                # interfaces. Drop the baseline so the next real sample
                # starts a fresh interval instead of crashing here.
                self._prev_net_io = None
                self._prev_net_time = None
            else:
                prev_io = self._prev_net_io
                prev_time = self._prev_net_time

                if prev_io is not None and prev_time is not None:
                    elapsed = now - prev_time
                    if elapsed > 0:
                        sent_rate = (net_io.bytes_sent - prev_io.bytes_sent) / elapsed
                        recv_rate = (net_io.bytes_recv - prev_io.bytes_recv) / elapsed

                        metrics.append(
                            MetricPoint(
                                metric_type="NETWORK_SENT_BYTES_SEC",
                                value=sent_rate,
                            )
                        )
                        metrics.append(
                            MetricPoint(
                                metric_type="NETWORK_RECV_BYTES_SEC",
                                value=recv_rate,
                            )
                        )

                self._prev_net_io = net_io
                self._prev_net_time = now

        # Connection count.
        try:
            connections = len(psutil.net_connections(kind="inet"))
        except (PermissionError, psutil.AccessDenied) as exc:
            self._log.debug("Skipping network connection count: %s", exc)
        else:
            metrics.append(
                MetricPoint(
                    metric_type="NETWORK_CONNECTIONS",
                    value=float(connections),
                )
            )

        return metrics

    def _collect_load(self) -> list[MetricPoint]:
        """Collect 1/5/15-minute load averages and the process count.

        Load averages are skipped (and the reason logged at debug level) on
        platforms where ``psutil.getloadavg`` is missing or fails; the
        process count is always reported.
        """
        metrics: list[MetricPoint] = []

        try:
            load1, load5, load15 = psutil.getloadavg()
        except (AttributeError, OSError) as exc:
            # getloadavg is not available on every platform / psutil version.
            self._log.debug("Skipping load averages: %s", exc)
        else:
            metrics.append(MetricPoint(metric_type="LOAD_AVG_1M", value=load1))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_5M", value=load5))
            metrics.append(MetricPoint(metric_type="LOAD_AVG_15M", value=load15))

        # Process count.
        metrics.append(
            MetricPoint(
                metric_type="PROCESS_COUNT",
                value=float(len(psutil.pids())),
            )
        )

        return metrics
|