Fix metrics flickering and improve internals page

- Fix dashboard metrics alternating to 0 by merging partial batches
  in the gateway before broadcasting to WebSocket clients. The
  aggregator sends metrics in batches of 20, so partial updates
  overwrote each other. The gateway now maintains a
  machine_metrics_cache that accumulates metrics across batches.

- Remove the misleading gRPC calls counter from the internals page (it
  was only incremented on health checks, not on actual metric flow).
  Replace it with a cached_machines counter showing tracked machines.

- Update internals.html stats panel to show Events, Broadcasts,
  Clients, and Machines instead of gRPC calls.
This commit is contained in:
buenosairesam
2025-12-31 02:15:57 -03:00
parent ee9cbf73ec
commit 00b1e663d9
3 changed files with 577 additions and 37 deletions

View File

@@ -38,10 +38,21 @@ class RedisStorage:
metrics: dict[str, float],
timestamp_ms: int,
) -> None:
"""Update the current state for a machine."""
"""Update the current state for a machine (merges metrics, doesn't replace)."""
if not self._client:
raise RuntimeError("Not connected to Redis")
key = f"machine:{machine_id}"
# Get existing state to merge metrics
existing_data = await self._client.hget(key, "state")
if existing_data:
existing_state = json.loads(existing_data)
existing_metrics = existing_state.get("metrics", {})
# Merge new metrics into existing (new values override old)
existing_metrics.update(metrics)
metrics = existing_metrics
state = {
"machine_id": machine_id,
"hostname": hostname,
@@ -51,7 +62,6 @@ class RedisStorage:
}
# Store as hash for efficient partial reads
key = f"machine:{machine_id}"
await self._client.hset(
key,
mapping={