claude final draft

This commit is contained in:
buenosairesam
2025-12-29 23:44:30 -03:00
parent 116d4032e2
commit e5aafd5097
22 changed files with 2815 additions and 32 deletions

361
services/aggregator/main.py Normal file
View File

@@ -0,0 +1,361 @@
"""Aggregator service - gRPC server that receives metrics and stores them."""
import asyncio
import signal
import sys
from pathlib import Path
import grpc
from grpc_health.v1 import health, health_pb2, health_pb2_grpc
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from services.aggregator.storage import RedisStorage, TimescaleStorage
from shared import metrics_pb2, metrics_pb2_grpc
from shared.config import get_aggregator_config
from shared.events import get_publisher
from shared.logging import setup_logging
class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
    """gRPC servicer for metrics ingestion.

    Streams metrics in from collectors, batches them per machine, and fans
    each batch out to Redis (current state), TimescaleDB (history, best
    effort), and the event publisher (alerts / gateway subscribers).
    """

    # Flush a machine's accumulated metrics to storage once this many
    # have been collected (also flushed on machine switch and stream end).
    BATCH_SIZE = 20

    def __init__(
        self,
        redis_storage: RedisStorage,
        timescale_storage: TimescaleStorage,
        event_publisher,
        logger,
    ):
        self.redis = redis_storage
        self.timescale = timescale_storage
        self.publisher = event_publisher
        self.logger = logger

    async def StreamMetrics(self, request_iterator, context):
        """Receive streaming metrics from a collector.

        Consecutive metrics for the same machine are batched; the batch is
        flushed every BATCH_SIZE metrics, when the stream switches machines,
        and when the stream ends.  Errors are reported via a failed
        StreamAck (with the count accepted so far) rather than raised.
        """
        metrics_received = 0
        current_machine = None
        current_batch: list[tuple[str, float, dict]] = []
        batch_timestamp = 0
        batch_hostname = ""
        try:
            async for metric in request_iterator:
                metrics_received += 1
                if current_machine != metric.machine_id:
                    # Flush whatever belongs to the previous machine before
                    # starting to accumulate for the new one.
                    if current_machine and current_batch:
                        await self._flush_batch(
                            current_machine,
                            batch_hostname,
                            batch_timestamp,
                            current_batch,
                        )
                    current_batch = []
                    current_machine = metric.machine_id
                    self.logger.info(
                        "collector_connected",
                        machine_id=metric.machine_id,
                        hostname=metric.hostname,
                    )
                # Store the enum's symbolic name (e.g. "CPU_PERCENT"), not
                # its integer value, so storage keys stay human-readable.
                metric_type = metrics_pb2.MetricType.Name(metric.type)
                current_batch.append(
                    (metric_type, metric.value, dict(metric.labels))
                )
                batch_timestamp = metric.timestamp_ms
                batch_hostname = metric.hostname
                if len(current_batch) >= self.BATCH_SIZE:
                    await self._flush_batch(
                        current_machine, batch_hostname, batch_timestamp, current_batch
                    )
                    current_batch = []
            # Flush the tail batch when the collector closes the stream.
            if current_machine and current_batch:
                await self._flush_batch(
                    current_machine, batch_hostname, batch_timestamp, current_batch
                )
            self.logger.info(
                "stream_completed",
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=True,
                metrics_received=metrics_received,
                message="OK",
            )
        except Exception as e:
            self.logger.error(
                "stream_error",
                error=str(e),
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=False,
                metrics_received=metrics_received,
                message=str(e),
            )

    async def _flush_batch(
        self,
        machine_id: str,
        hostname: str,
        timestamp_ms: int,
        batch: list[tuple[str, float, dict]],
    ) -> None:
        """Flush a batch of metrics to storage and events.

        Redis and the event publisher are required (failures propagate to
        the caller); TimescaleDB writes are best-effort and only logged.
        """
        # Flatten each (type, labels) pair into a single state key:
        # "TYPE" or "TYPE:k=v,k2=v2" — _parse_metric_key reverses this.
        metrics_dict = {}
        for metric_type, value, labels in batch:
            key = metric_type
            if labels:
                key = f"{metric_type}:{','.join(f'{k}={v}' for k, v in labels.items())}"
            metrics_dict[key] = value
        # Update Redis (current state)
        await self.redis.update_machine_state(
            machine_id=machine_id,
            hostname=hostname,
            metrics=metrics_dict,
            timestamp_ms=timestamp_ms,
        )
        # Insert into TimescaleDB (historical)
        try:
            await self.timescale.insert_metrics(
                machine_id=machine_id,
                hostname=hostname,
                timestamp_ms=timestamp_ms,
                metrics=batch,
            )
        except Exception as e:
            self.logger.warning("timescale_insert_failed", error=str(e))
        # Update machine registry
        try:
            await self.timescale.update_machine_registry(
                machine_id=machine_id,
                hostname=hostname,
            )
        except Exception as e:
            self.logger.warning("machine_registry_update_failed", error=str(e))
        # Publish event for subscribers (alerts, gateway)
        await self.publisher.publish(
            topic="metrics.raw",
            payload={
                "machine_id": machine_id,
                "hostname": hostname,
                "timestamp_ms": timestamp_ms,
                "metrics": metrics_dict,
            },
        )
        self.logger.debug(
            "batch_flushed",
            machine_id=machine_id,
            count=len(batch),
        )

    @staticmethod
    def _parse_metric_key(key: str) -> tuple[str, dict[str, str]]:
        """Split a flattened state key back into (type_name, labels).

        Keys are built by _flush_batch as "TYPE" or "TYPE:k=v,k2=v2".
        partition() is used instead of split() so label values containing
        ":" or "=" don't lose data or raise; malformed pairs (no "=") are
        skipped rather than crashing the RPC.
        """
        type_str, sep, label_str = key.partition(":")
        labels: dict[str, str] = {}
        if sep:
            for pair in label_str.split(","):
                k, eq, v = pair.partition("=")
                if eq:
                    labels[k] = v
        return type_str, labels

    def _metrics_from_state(self, state: dict) -> list:
        """Convert a stored state dict's metrics into Metric protos."""
        metrics = []
        for key, value in state.get("metrics", {}).items():
            type_str, labels = self._parse_metric_key(key)
            metrics.append(
                metrics_pb2.Metric(
                    machine_id=state["machine_id"],
                    hostname=state["hostname"],
                    timestamp_ms=state["last_seen_ms"],
                    # Proto enum members are exposed as module attributes;
                    # fall back to 0 (the default value) for unknown names.
                    type=getattr(metrics_pb2, type_str, 0),
                    value=value,
                    labels=labels,
                )
            )
        return metrics

    def _state_to_proto(self, state: dict):
        """Build a MachineState proto from a stored state dict."""
        return metrics_pb2.MachineState(
            machine_id=state["machine_id"],
            hostname=state["hostname"],
            last_seen_ms=state["last_seen_ms"],
            current_metrics=self._metrics_from_state(state),
            health=metrics_pb2.HEALTHY,
        )

    async def GetCurrentState(self, request, context):
        """Get current state for a single machine; NOT_FOUND if unknown."""
        state = await self.redis.get_machine_state(request.machine_id)
        if not state:
            context.set_code(grpc.StatusCode.NOT_FOUND)
            context.set_details(f"Machine {request.machine_id} not found")
            return metrics_pb2.MachineState()
        return self._state_to_proto(state)

    async def GetAllStates(self, request, context):
        """Get current state for all machines (labels included, matching
        GetCurrentState)."""
        states = await self.redis.get_all_machines()
        return metrics_pb2.AllMachinesState(
            machines=[self._state_to_proto(s) for s in states]
        )
class AggregatorService:
    """Main aggregator service.

    Owns the gRPC server, its storage backends (Redis for current state,
    TimescaleDB for history), and the event publisher, and manages their
    startup/shutdown lifecycle.
    """

    def __init__(self):
        # Config and logging come first so every later step can emit
        # structured log events.
        self.config = get_aggregator_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        # Storage/publisher objects are constructed here but not connected
        # until start().
        self.redis = RedisStorage(self.config.redis_url)
        self.timescale = TimescaleStorage(self.config.timescale_url)
        self.publisher = get_publisher(source="aggregator")
        self.server: grpc.aio.Server | None = None
        self.running = False

    async def start(self) -> None:
        """Start the gRPC server.

        Connects Redis (required) and TimescaleDB (best-effort), connects
        the event publisher, registers the metrics and health servicers,
        and begins serving on the configured port.
        """
        self.running = True
        # Connect to storage
        await self.redis.connect()
        try:
            await self.timescale.connect()
        except Exception as e:
            # TimescaleDB is optional at startup: live state is still served
            # from Redis; only historical persistence is lost.
            self.logger.warning(
                "timescale_connection_failed",
                error=str(e),
                message="Continuing without TimescaleDB - metrics won't be persisted",
            )
        # Connect to event publisher
        await self.publisher.connect()
        # Create gRPC server
        self.server = grpc.aio.server()
        # Add metrics servicer
        servicer = MetricsServicer(
            redis_storage=self.redis,
            timescale_storage=self.timescale,
            event_publisher=self.publisher,
            logger=self.logger,
        )
        metrics_pb2_grpc.add_MetricsServiceServicer_to_server(servicer, self.server)
        # Add health check servicer ("" is the server-wide health check).
        # NOTE(review): "MetricsService" is a plain name, not a fully
        # qualified package path — confirm health probes use this exact name.
        health_servicer = health.HealthServicer()
        health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
        health_servicer.set("MetricsService", health_pb2.HealthCheckResponse.SERVING)
        health_pb2_grpc.add_HealthServicer_to_server(health_servicer, self.server)
        # Start server ("[::]" binds all interfaces, IPv4 and IPv6).
        listen_addr = f"[::]:{self.config.grpc_port}"
        self.server.add_insecure_port(listen_addr)
        await self.server.start()
        self.logger.info(
            "aggregator_started",
            port=self.config.grpc_port,
            listen_addr=listen_addr,
        )

    async def stop(self) -> None:
        """Stop the gRPC server.

        Drains in-flight RPCs with a 5-second grace period, then disconnects
        the publisher and both storage backends in reverse startup order.
        """
        self.running = False
        if self.server:
            await self.server.stop(grace=5)
            self.server = None
        await self.publisher.disconnect()
        await self.timescale.disconnect()
        await self.redis.disconnect()
        self.logger.info("aggregator_stopped")

    async def wait(self) -> None:
        """Block until the server terminates (e.g. after stop())."""
        if self.server:
            await self.server.wait_for_termination()
async def main():
    """Main entry point: run the aggregator until a shutdown signal arrives."""
    service = AggregatorService()

    # get_running_loop() is the supported way to reach the loop from inside
    # a coroutine; get_event_loop() here is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()

    # The event loop only keeps weak references to tasks, so hold a strong
    # reference to the shutdown task until it finishes or it may be
    # garbage-collected mid-shutdown.
    shutdown_tasks: set[asyncio.Task] = set()

    async def shutdown():
        service.logger.info("shutdown_signal_received")
        await service.stop()

    def _on_signal() -> None:
        task = asyncio.create_task(shutdown())
        shutdown_tasks.add(task)
        task.add_done_callback(shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, _on_signal)

    await service.start()
    # stop() shuts the server down, which lets wait() return and main() exit.
    await service.wait()


if __name__ == "__main__":
    asyncio.run(main())