better redis error handling

This commit is contained in:
buenosairesam
2025-12-30 03:40:20 -03:00
parent e5aafd5097
commit ee9cbf73ec
2 changed files with 74 additions and 19 deletions

View File

@@ -133,13 +133,16 @@ class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
                 key = f"{metric_type}:{','.join(f'{k}={v}' for k, v in labels.items())}"
                 metrics_dict[key] = value

-            # Update Redis (current state)
-            await self.redis.update_machine_state(
-                machine_id=machine_id,
-                hostname=hostname,
-                metrics=metrics_dict,
-                timestamp_ms=timestamp_ms,
-            )
+            # Update Redis (current state) - don't fail stream if Redis is down
+            try:
+                await self.redis.update_machine_state(
+                    machine_id=machine_id,
+                    hostname=hostname,
+                    metrics=metrics_dict,
+                    timestamp_ms=timestamp_ms,
+                )
+            except Exception as e:
+                self.logger.warning("redis_update_failed", error=str(e))
             # Insert into TimescaleDB (historical)
             try:
@@ -161,16 +164,19 @@ class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
             except Exception as e:
                 self.logger.warning("machine_registry_update_failed", error=str(e))

-            # Publish event for subscribers (alerts, gateway)
-            await self.publisher.publish(
-                topic="metrics.raw",
-                payload={
-                    "machine_id": machine_id,
-                    "hostname": hostname,
-                    "timestamp_ms": timestamp_ms,
-                    "metrics": metrics_dict,
-                },
-            )
+            # Publish event for subscribers (alerts, gateway) - don't fail stream
+            try:
+                await self.publisher.publish(
+                    topic="metrics.raw",
+                    payload={
+                        "machine_id": machine_id,
+                        "hostname": hostname,
+                        "timestamp_ms": timestamp_ms,
+                        "metrics": metrics_dict,
+                    },
+                )
+            except Exception as e:
+                self.logger.warning("event_publish_failed", error=str(e))

             self.logger.debug(
                 "batch_flushed",

View File

@@ -106,6 +106,51 @@ class RedisPubSubSubscriber(EventSubscriber):
self._topics.extend(topics) self._topics.extend(topics)
async def _reconnect(self) -> None:
    """Attempt to reconnect to Redis with capped exponential backoff.

    Tears down any stale client/pubsub connections, establishes a fresh
    client (verified with a ``ping``), and re-subscribes to every topic
    previously registered in ``self._topics`` — topics containing ``*``
    are treated as glob patterns (``psubscribe``), the rest as plain
    channels (``subscribe``).

    Raises:
        RuntimeError: if the connection cannot be re-established after
            ``max_retries`` attempts; chained to the last underlying
            exception so the root cause is not lost.
    """
    max_retries = 10
    base_delay = 1.0
    last_error: Exception | None = None

    for attempt in range(max_retries):
        try:
            # Best-effort teardown of old connections; failures here are
            # expected since the connection is likely already dead.
            if self._pubsub:
                try:
                    await self._pubsub.close()
                except Exception:
                    pass
            if self._client:
                try:
                    await self._client.close()
                except Exception:
                    pass

            # Fresh connection; ping() forces an immediate round-trip so
            # a dead server fails fast here rather than on first use.
            self._client = redis.from_url(self.redis_url, decode_responses=True)
            await self._client.ping()
            self._pubsub = self._client.pubsub()

            # Re-subscribe to everything we were listening to before.
            if self._topics:
                patterns = [t for t in self._topics if "*" in t]
                channels = [t for t in self._topics if "*" not in t]
                if channels:
                    await self._pubsub.subscribe(*channels)
                if patterns:
                    await self._pubsub.psubscribe(*patterns)

            logger.info(f"Reconnected to Redis at {self.redis_url}")
            return
        except Exception as e:
            last_error = e
            # Exponential backoff capped at 30s. Skip the sleep after the
            # final attempt — otherwise the caller waits up to 30s for a
            # RuntimeError that is already inevitable.
            if attempt < max_retries - 1:
                delay = min(base_delay * (2**attempt), 30.0)
                logger.warning(
                    f"Reconnect attempt {attempt + 1} failed: {e}, retrying in {delay}s"
                )
                await asyncio.sleep(delay)
            else:
                logger.warning(f"Reconnect attempt {attempt + 1} failed: {e}")

    # Chain the last failure so callers/logs see the underlying cause.
    raise RuntimeError(
        f"Failed to reconnect to Redis after {max_retries} attempts"
    ) from last_error
     async def consume(self) -> AsyncIterator[Event]:
         if not self._pubsub:
             raise RuntimeError("Subscriber not connected")
@@ -138,5 +183,9 @@ class RedisPubSubSubscriber(EventSubscriber):
                     self._running = False
                     break
                 except Exception as e:
-                    logger.error(f"Error consuming events: {e}")
-                    await asyncio.sleep(1.0)
+                    logger.error(f"Error consuming events: {e}, attempting reconnect...")
+                    try:
+                        await self._reconnect()
+                    except Exception as reconnect_err:
+                        logger.error(f"Reconnect failed: {reconnect_err}")
+                        await asyncio.sleep(5.0)