Fix metrics flickering and improve internals page

- Fix dashboard metrics alternating to 0 by merging partial batches in the gateway before broadcasting to WebSocket clients. The aggregator sends metrics in batches of 20, causing partial updates that overwrote each other. The gateway now maintains a machine_metrics_cache that accumulates metrics across batches.
- Remove the misleading gRPC calls counter from the internals page (it was only incremented on health checks, not on actual metric flow). Replace it with a cached_machines counter showing the number of tracked machines.
- Update the internals.html stats panel to show Events, Broadcasts, Clients, and Machines instead of gRPC calls.
2025-12-31 02:15:57 -03:00
parent ee9cbf73ec
commit 00b1e663d9
3 changed files with 577 additions and 37 deletions
--- a/services/aggregator/storage.py
+++ b/services/aggregator/storage.py
@@ -38,10 +38,21 @@ class RedisStorage:
        metrics: dict[str, float],
        timestamp_ms: int,
    ) -> None:
-        """Update the current state for a machine."""
+        """Update the current state for a machine (merges metrics, doesn't replace)."""
        if not self._client:
            raise RuntimeError("Not connected to Redis")

+        key = f"machine:{machine_id}"
+
+        # Get existing state to merge metrics
+        existing_data = await self._client.hget(key, "state")
+        if existing_data:
+            existing_state = json.loads(existing_data)
+            existing_metrics = existing_state.get("metrics", {})
+            # Merge new metrics into existing (new values override old)
+            existing_metrics.update(metrics)
+            metrics = existing_metrics
+
        state = {
            "machine_id": machine_id,
            "hostname": hostname,
@@ -51,7 +62,6 @@ class RedisStorage:
        }

        # Store as hash for efficient partial reads
-        key = f"machine:{machine_id}"
        await self._client.hset(
            key,
            mapping={