Fix metrics flickering and improve internals page

- Fix dashboard metrics alternating to 0 by merging partial batches
  in gateway before broadcasting to WebSocket clients. The aggregator
  sends metrics in batches of 20, causing partial updates that
  overwrote each other. Gateway now maintains machine_metrics_cache
  that accumulates metrics across batches.

- Remove misleading gRPC calls counter from internals page (only
  incremented on health checks, not actual metric flow). Replace
  with cached_machines counter showing tracked machines.

- Update internals.html stats panel to show Events, Broadcasts,
  Clients, and Machines instead of gRPC calls.
This commit is contained in:
buenosairesam
2025-12-31 02:15:57 -03:00
parent ee9cbf73ec
commit 00b1e663d9
3 changed files with 577 additions and 37 deletions

View File

@@ -38,10 +38,21 @@ class RedisStorage:
metrics: dict[str, float], metrics: dict[str, float],
timestamp_ms: int, timestamp_ms: int,
) -> None: ) -> None:
"""Update the current state for a machine.""" """Update the current state for a machine (merges metrics, doesn't replace)."""
if not self._client: if not self._client:
raise RuntimeError("Not connected to Redis") raise RuntimeError("Not connected to Redis")
key = f"machine:{machine_id}"
# Get existing state to merge metrics
existing_data = await self._client.hget(key, "state")
if existing_data:
existing_state = json.loads(existing_data)
existing_metrics = existing_state.get("metrics", {})
# Merge new metrics into existing (new values override old)
existing_metrics.update(metrics)
metrics = existing_metrics
state = { state = {
"machine_id": machine_id, "machine_id": machine_id,
"hostname": hostname, "hostname": hostname,
@@ -51,7 +62,6 @@ class RedisStorage:
} }
# Store as hash for efficient partial reads # Store as hash for efficient partial reads
key = f"machine:{machine_id}"
await self._client.hset( await self._client.hset(
key, key,
mapping={ mapping={

View File

@@ -72,25 +72,84 @@ class ConnectionManager:
manager = ConnectionManager() manager = ConnectionManager()
internals_manager = ConnectionManager() # Separate manager for internals page
timescale: TimescaleStorage | None = None timescale: TimescaleStorage | None = None
grpc_channel: grpc.aio.Channel | None = None grpc_channel: grpc.aio.Channel | None = None
grpc_stub: metrics_pb2_grpc.MetricsServiceStub | None = None grpc_stub: metrics_pb2_grpc.MetricsServiceStub | None = None
# Track recent events for internals view.
# Newest entries sit at index 0: event_listener inserts at the front and
# pops from the tail once the list exceeds MAX_RECENT_EVENTS.
recent_events: list[dict] = []
# Upper bound on len(recent_events).
MAX_RECENT_EVENTS = 100
# Gateway-wide counters exposed by the internals endpoints.
service_stats = {
"events_received": 0,  # incremented once per event consumed by event_listener
"websocket_broadcasts": 0,  # incremented after each dashboard broadcast
"started_at": None,  # ISO-8601 string assigned during lifespan startup
}
# Cache of latest full metrics per machine (merges partial batches).
# Keyed by machine_id; each value is the accumulated metrics dict.
# NOTE(review): nothing in the visible code evicts entries — confirm whether
# machines that disappear should be dropped from this cache.
machine_metrics_cache: dict[str, dict] = {}
async def event_listener():
    """Consume bus events and fan them out to connected WebSocket clients.

    For each event received from the ``metrics.raw`` and ``alerts.*`` topics:

    1. bump ``service_stats["events_received"]``;
    2. record a short summary at the front of ``recent_events`` (capped at
       ``MAX_RECENT_EVENTS``);
    3. fold the event's metrics into ``machine_metrics_cache`` so partial
       batches accumulate instead of overwriting each other;
    4. broadcast the merged view to dashboard clients and the raw summary
       to internals clients.

    Any exception raised while handling a single event is logged as
    ``broadcast_error`` and the loop moves on to the next event.
    """
    logger.info("event_listener_starting")

    async with get_subscriber(topics=["metrics.raw", "alerts.*"]) as subscriber:
        async for event in subscriber.consume():
            try:
                service_stats["events_received"] += 1

                # Summary shown on the internals page (raw event, not merged).
                event_record = {
                    "id": event.event_id[:8],
                    "topic": event.topic,
                    "source": event.source,
                    "timestamp": event.timestamp.isoformat(),
                    "machine_id": event.payload.get("machine_id", ""),
                    "metrics_count": len(event.payload.get("metrics", {})),
                }
                recent_events.insert(0, event_record)
                if len(recent_events) > MAX_RECENT_EVENTS:
                    recent_events.pop()

                payload = event.payload
                machine_id = payload.get("machine_id", "")
                if not machine_id:
                    # Nothing to key the cache on: forward the payload untouched.
                    dashboard_data = payload
                else:
                    # Accumulate this batch into the machine's cached metrics;
                    # newer values replace older ones for the same metric name.
                    merged_metrics = machine_metrics_cache.setdefault(machine_id, {})
                    merged_metrics.update(payload.get("metrics", {}))
                    dashboard_data = {
                        "machine_id": machine_id,
                        "hostname": payload.get("hostname", ""),
                        "timestamp_ms": payload.get("timestamp_ms", 0),
                        "metrics": merged_metrics,
                    }

                # Dashboard clients always receive the merged view.
                dashboard_message = {
                    "type": "metrics",
                    "data": dashboard_data,
                    "timestamp": event.timestamp.isoformat(),
                }
                await manager.broadcast(dashboard_message)
                service_stats["websocket_broadcasts"] += 1

                # Internals clients receive the per-event summary.
                await internals_manager.broadcast(
                    {"type": "event", "data": event_record}
                )
            except Exception as e:
                logger.warning("broadcast_error", error=str(e))
@@ -117,6 +176,7 @@ async def lifespan(app: FastAPI):
# Start event listener in background # Start event listener in background
listener_task = asyncio.create_task(event_listener()) listener_task = asyncio.create_task(event_listener())
service_stats["started_at"] = datetime.utcnow().isoformat()
logger.info("gateway_started") logger.info("gateway_started")
yield yield
@@ -341,6 +401,85 @@ async def websocket_endpoint(websocket: WebSocket):
manager.disconnect(websocket) manager.disconnect(websocket)
# ============================================================================
# Internals / Debug endpoints
# ============================================================================
@app.get("/api/internals")
async def get_internals():
    """Return gateway counters, dependency health and the newest events.

    The response has three keys:

    * ``stats`` - event/broadcast counters, open WebSocket connection counts
      and the number of machines held in ``machine_metrics_cache``;
    * ``services`` - a status string per dependency (``healthy``,
      ``unknown``, ``not connected`` or ``error: <message>``);
    * ``recent_events`` - the first 20 entries of ``recent_events``.
    """
    gateway_started_at = service_stats["started_at"]

    # Aggregator: probed over gRPC only when a stub exists; otherwise the
    # status stays "unknown". The error text is capped at 50 characters.
    aggregator_status = "unknown"
    try:
        if grpc_stub:
            await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=2.0)
            aggregator_status = "healthy"
    except Exception as e:
        aggregator_status = f"error: {str(e)[:50]}"

    # TimescaleDB: considered healthy when the storage object holds a pool.
    if timescale and timescale._pool:
        timescale_status = "healthy"
    else:
        timescale_status = "not connected"

    # Redis: inferred from traffic - healthy once any event has arrived.
    redis_status = "healthy" if service_stats["events_received"] > 0 else "unknown"

    stats = {
        "events_received": service_stats["events_received"],
        "websocket_broadcasts": service_stats["websocket_broadcasts"],
        "dashboard_connections": len(manager.active_connections),
        "internals_connections": len(internals_manager.active_connections),
        "cached_machines": len(machine_metrics_cache),
    }
    services = {
        "gateway": {"status": "healthy", "started_at": gateway_started_at},
        "aggregator": {"status": aggregator_status},
        "redis": {"status": redis_status},
        "timescaledb": {"status": timescale_status},
    }
    return {
        "stats": stats,
        "services": services,
        "recent_events": recent_events[:20],
    }
@app.websocket("/ws/internals")
async def internals_websocket(websocket: WebSocket):
    """Serve the real-time internals feed over a WebSocket.

    After registering the socket with ``internals_manager`` the handler sends
    one ``init`` message containing the current counters and the 20 newest
    event summaries, then answers ``"ping"`` text frames with ``"pong"``
    until the client disconnects. The socket is always unregistered on exit.

    The ``init`` stats carry the same keys as ``/api/internals`` (including
    the connection counts) so a client that renders both sources does not
    see its client counter drop to zero on every (re)connect.
    """
    await internals_manager.connect(websocket)
    try:
        # Send initial state
        await websocket.send_json(
            {
                "type": "init",
                "data": {
                    "stats": {
                        "events_received": service_stats["events_received"],
                        "websocket_broadcasts": service_stats["websocket_broadcasts"],
                        "dashboard_connections": len(manager.active_connections),
                        "internals_connections": len(
                            internals_manager.active_connections
                        ),
                        "cached_machines": len(machine_metrics_cache),
                    },
                    "recent_events": recent_events[:20],
                },
            }
        )

        # Keep-alive loop: the only inbound message acted upon is "ping".
        while True:
            data = await websocket.receive_text()
            if data == "ping":
                await websocket.send_text("pong")
    except WebSocketDisconnect:
        # Normal termination: the client went away (possibly before or
        # during the initial send). Cleanup happens in ``finally``.
        pass
    finally:
        internals_manager.disconnect(websocket)
# ============================================================================ # ============================================================================
# Dashboard (HTML) # Dashboard (HTML)
# ============================================================================ # ============================================================================
@@ -351,40 +490,17 @@ async def dashboard(request: Request):
"""Serve the dashboard HTML.""" """Serve the dashboard HTML."""
if templates: if templates:
return templates.TemplateResponse("dashboard.html", {"request": request}) return templates.TemplateResponse("dashboard.html", {"request": request})
return HTMLResponse(
"<h1>Dashboard template not found</h1><p><a href='/internals'>Internals</a> | <a href='/docs'>API Docs</a></p>"
)
# Fallback if templates not found
return HTMLResponse(""" @app.get("/internals", response_class=HTMLResponse)
<!DOCTYPE html> async def internals_page(request: Request):
<html> """Serve the internals/debug HTML page."""
<head> if templates:
<title>System Monitor</title> return templates.TemplateResponse("internals.html", {"request": request})
<style> return HTMLResponse("<h1>Internals template not found</h1>")
body { font-family: system-ui; background: #1a1a2e; color: #eee; padding: 2rem; }
h1 { color: #e94560; }
pre { background: #16213e; padding: 1rem; border-radius: 8px; overflow: auto; }
</style>
</head>
<body>
<h1>System Monitor</h1>
<p>Dashboard template not found. API endpoints:</p>
<ul>
<li><a href="/api/machines">/api/machines</a> - Current state of all machines</li>
<li><a href="/api/metrics">/api/metrics</a> - Historical metrics</li>
<li><a href="/docs">/docs</a> - API documentation</li>
</ul>
<h2>Live Metrics</h2>
<pre id="output">Connecting...</pre>
<script>
const ws = new WebSocket(`ws://${location.host}/ws`);
const output = document.getElementById('output');
ws.onmessage = (e) => {
output.textContent = JSON.stringify(JSON.parse(e.data), null, 2);
};
ws.onclose = () => { output.textContent = 'Disconnected'; };
</script>
</body>
</html>
""")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,414 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>System Monitor - Internals</title>
<style>
:root {
--bg-primary: #0d1117;
--bg-secondary: #161b22;
--bg-tertiary: #21262d;
--border: #30363d;
--text-primary: #c9d1d9;
--text-secondary: #8b949e;
--accent-green: #3fb950;
--accent-red: #f85149;
--accent-yellow: #d29922;
--accent-blue: #58a6ff;
--accent-purple: #a371f7;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: "SF Mono", "Fira Code", monospace;
background: var(--bg-primary);
color: var(--text-primary);
font-size: 13px;
line-height: 1.5;
}
.container {
max-width: 1400px;
margin: 0 auto;
padding: 1rem;
}
header {
background: var(--bg-secondary);
border-bottom: 1px solid var(--border);
padding: 0.75rem 1rem;
display: flex;
justify-content: space-between;
align-items: center;
}
header h1 {
font-size: 1rem;
font-weight: 500;
}
header h1 span {
color: var(--accent-purple);
}
.nav-links a {
color: var(--text-secondary);
text-decoration: none;
margin-left: 1.5rem;
}
.nav-links a:hover {
color: var(--text-primary);
}
.grid {
display: grid;
grid-template-columns: 300px 1fr;
gap: 1rem;
margin-top: 1rem;
}
.panel {
background: var(--bg-secondary);
border: 1px solid var(--border);
border-radius: 6px;
overflow: hidden;
}
.panel-header {
background: var(--bg-tertiary);
padding: 0.5rem 0.75rem;
border-bottom: 1px solid var(--border);
font-weight: 500;
font-size: 0.75rem;
text-transform: uppercase;
letter-spacing: 0.05em;
color: var(--text-secondary);
}
.panel-body {
padding: 0.75rem;
}
.status-grid {
display: grid;
gap: 0.5rem;
}
.status-item {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.5rem;
background: var(--bg-tertiary);
border-radius: 4px;
}
.status-label {
color: var(--text-secondary);
}
.status-value {
font-weight: 500;
}
.status-dot {
width: 8px;
height: 8px;
border-radius: 50%;
display: inline-block;
margin-right: 0.5rem;
}
.status-dot.healthy {
background: var(--accent-green);
box-shadow: 0 0 6px var(--accent-green);
}
.status-dot.error {
background: var(--accent-red);
}
.status-dot.unknown {
background: var(--accent-yellow);
}
.stats-grid {
display: grid;
grid-template-columns: repeat(2, 1fr);
gap: 0.5rem;
}
.stat-box {
background: var(--bg-tertiary);
padding: 0.75rem;
border-radius: 4px;
text-align: center;
}
.stat-value {
font-size: 1.5rem;
font-weight: 600;
color: var(--accent-blue);
}
.stat-label {
font-size: 0.7rem;
color: var(--text-secondary);
text-transform: uppercase;
}
.event-stream {
height: calc(100vh - 200px);
overflow-y: auto;
}
.event-item {
display: grid;
grid-template-columns: 70px 90px 1fr 80px 60px;
gap: 0.5rem;
padding: 0.4rem 0.75rem;
border-bottom: 1px solid var(--border);
font-size: 0.8rem;
align-items: center;
}
.event-item:hover {
background: var(--bg-tertiary);
}
.event-time {
color: var(--text-secondary);
}
.event-topic {
padding: 0.15rem 0.4rem;
border-radius: 3px;
font-size: 0.7rem;
font-weight: 500;
}
.event-topic.metrics {
background: rgba(56, 139, 253, 0.15);
color: var(--accent-blue);
}
.event-topic.alerts {
background: rgba(248, 81, 73, 0.15);
color: var(--accent-red);
}
.event-source {
color: var(--text-secondary);
}
.event-machine {
color: var(--accent-purple);
}
.event-count {
color: var(--text-secondary);
text-align: right;
}
.pulse {
animation: pulse 2s ease-in-out infinite;
}
@keyframes pulse {
0%,
100% {
opacity: 1;
}
50% {
opacity: 0.5;
}
}
.connection-status {
display: flex;
align-items: center;
gap: 0.5rem;
font-size: 0.75rem;
}
.connection-status .status-dot {
margin-right: 0;
}
</style>
</head>
<body>
<header>
<h1><span>&gt;_</span> System Monitor Internals</h1>
<div style="display: flex; align-items: center; gap: 2rem">
<div class="connection-status">
<span class="status-dot" id="conn-status"></span>
<span id="conn-text">Connecting...</span>
</div>
<nav class="nav-links">
<a href="/">Dashboard</a>
<a href="/docs">API Docs</a>
</nav>
</div>
</header>
<div class="container">
<div class="grid">
<div class="sidebar">
<div class="panel" style="margin-bottom: 1rem">
<div class="panel-header">Services</div>
<div class="panel-body">
<div class="status-grid" id="services">
<div class="status-item">
<span class="status-label">Loading...</span>
</div>
</div>
</div>
</div>
<div class="panel">
<div class="panel-header">Statistics</div>
<div class="panel-body">
<div class="stats-grid">
<div class="stat-box">
<div class="stat-value" id="stat-events">
0
</div>
<div class="stat-label">Events</div>
</div>
<div class="stat-box">
<div class="stat-value" id="stat-ws">0</div>
<div class="stat-label">Broadcasts</div>
</div>
<div class="stat-box">
<div class="stat-value" id="stat-clients">
0
</div>
<div class="stat-label">Clients</div>
</div>
<div class="stat-box">
<div class="stat-value" id="stat-machines">
0
</div>
<div class="stat-label">Machines</div>
</div>
</div>
</div>
</div>
</div>
<div class="panel">
<div class="panel-header">
Event Stream
<span class="pulse" style="color: var(--accent-green)"
></span
>
</div>
<div class="event-stream" id="events">
<div
class="event-item"
style="color: var(--text-secondary)"
>
Waiting for events...
</div>
</div>
</div>
</div>
</div>
<script>
// Element references looked up once and reused by the functions below.
// Container that receives one row per event (newest first).
const eventsEl = document.getElementById("events");
// Container for the per-service status rows.
const servicesEl = document.getElementById("services");
// Header dot whose class reflects the WebSocket connection state.
const connStatus = document.getElementById("conn-status");
// Header text next to the dot ("Connecting...", "Connected", ...).
const connText = document.getElementById("conn-text");
/**
 * Render an ISO-8601 timestamp as a 24-hour clock time in the viewer's
 * local time zone, using en-US formatting.
 *
 * @param {string} isoString - Timestamp accepted by the Date constructor.
 * @returns {string} The formatted time, or "Invalid Date" when unparsable.
 */
function formatTime(isoString) {
    const clockOptions = { hour12: false };
    return new Date(isoString).toLocaleTimeString("en-US", clockOptions);
}
/**
 * Re-render the Services panel from a `{ name: { status } }` map.
 *
 * Rows are built with DOM APIs and `textContent` rather than an innerHTML
 * template: names and status strings come from the /api/internals response
 * and are not guaranteed to be HTML-safe, so they must never be parsed as
 * markup.
 *
 * @param {Object<string, {status?: string}>} services - Status per service.
 */
function updateServices(services) {
    const rows = Object.entries(services || {}).map(([name, info]) => {
        // A missing status is shown as "unknown" instead of throwing.
        const statusText = String(info?.status ?? "unknown");

        // Map the free-form status text onto one of the three dot styles.
        let dotClass = "unknown";
        if (statusText === "healthy") {
            dotClass = "healthy";
        } else if (statusText.includes("error")) {
            dotClass = "error";
        }

        const dot = document.createElement("span");
        dot.className = `status-dot ${dotClass}`;

        const label = document.createElement("span");
        label.append(dot, name); // strings are appended as text nodes

        const value = document.createElement("span");
        value.className = "status-value";
        value.textContent = statusText;

        const row = document.createElement("div");
        row.className = "status-item";
        row.append(label, value);
        return row;
    });

    servicesEl.replaceChildren(...rows);
}
/**
 * Refresh the Statistics panel from a stats object.
 *
 * Events, Broadcasts and Machines fall back to "0" when their field is
 * absent. Clients is only rewritten when the object actually carries a
 * connection count: some sources (e.g. the WebSocket "init" message) may
 * omit those fields, and treating "absent" as zero made the counter
 * flicker to 0 after it had already been populated.
 *
 * @param {Object} stats - Counters as returned by the internals endpoints.
 */
function updateStats(stats) {
    if (!stats) return;

    const setText = (id, text) => {
        document.getElementById(id).textContent = text;
    };

    setText("stat-events", stats.events_received?.toLocaleString() || "0");
    setText("stat-ws", stats.websocket_broadcasts?.toLocaleString() || "0");

    const hasConnectionCounts =
        stats.dashboard_connections != null ||
        stats.internals_connections != null;
    if (hasConnectionCounts) {
        const clients =
            (stats.dashboard_connections || 0) +
            (stats.internals_connections || 0);
        setText("stat-clients", clients.toString());
    }

    setText("stat-machines", stats.cached_machines?.toString() || "0");
}
/**
 * Prepend one event row to the event stream and trim it to 100 rows.
 *
 * Every field is written through `textContent`, never parsed as HTML:
 * topic, source and machine id originate from event payloads and must be
 * treated as untrusted text.
 *
 * @param {Object} event - Event summary with `timestamp`, `topic`,
 *     `source`, `machine_id` and `metrics_count` fields.
 */
function addEvent(event) {
    // Tolerate a missing topic instead of throwing on `.includes`.
    const topic = String(event.topic ?? "");
    const topicClass = topic.includes("alert") ? "alerts" : "metrics";

    const cell = (className, text) => {
        const span = document.createElement("span");
        span.className = className;
        span.textContent = text;
        return span;
    };

    const row = document.createElement("div");
    row.className = "event-item";
    row.append(
        cell("event-time", formatTime(event.timestamp)),
        cell(`event-topic ${topicClass}`, topic),
        cell("event-source", event.source || "unknown"),
        cell("event-machine", event.machine_id || "-"),
        cell("event-count", `${event.metrics_count || 0} metrics`),
    );
    eventsEl.prepend(row);

    // Keep max 100 events in DOM. Use lastElementChild so that stray text
    // nodes are never mistaken for rows.
    while (eventsEl.children.length > 100) {
        eventsEl.lastElementChild.remove();
    }
}
/**
 * Open the internals WebSocket and keep it alive, reconnecting after 3 s
 * whenever it closes.
 *
 * - The scheme follows the page (wss on https) so the socket is not
 *   blocked as mixed content behind TLS.
 * - Each connection owns exactly one ping timer, cleared on close, so
 *   reconnects do not pile up intervals.
 * - "init" messages reset the stats and event list; "event" messages add
 *   a row and bump the Events counter locally.
 */
function connect() {
    const scheme = location.protocol === "https:" ? "wss" : "ws";
    const ws = new WebSocket(`${scheme}://${location.host}/ws/internals`);

    // Ping to keep alive; tied to this socket's lifetime.
    const pingTimer = setInterval(() => {
        if (ws.readyState === WebSocket.OPEN) ws.send("ping");
    }, 30000);

    ws.onopen = () => {
        connStatus.className = "status-dot healthy";
        connText.textContent = "Connected";
    };

    ws.onclose = () => {
        clearInterval(pingTimer);
        connStatus.className = "status-dot error";
        connText.textContent = "Disconnected - Reconnecting...";
        setTimeout(connect, 3000);
    };

    ws.onmessage = (e) => {
        const msg = JSON.parse(e.data);
        if (msg.type === "init") {
            updateStats(msg.data.stats);
            eventsEl.innerHTML = "";
            // Events arrive newest-first; add oldest first so the newest
            // ends up on top. Copy before reversing to leave msg intact.
            [...msg.data.recent_events].reverse().forEach(addEvent);
        } else if (msg.type === "event") {
            addEvent(msg.data);
            // Increment local counter. The displayed text is locale
            // formatted, so drop every non-digit (",", ".", spaces, ...)
            // before parsing rather than assuming "," as the separator.
            const el = document.getElementById("stat-events");
            const shown = Number(el.textContent.replace(/\D/g, "")) || 0;
            el.textContent = (shown + 1).toLocaleString();
        }
    };
}
// Pull service health and counters from the HTTP endpoint.
// Both panels are refreshed on every poll: WebSocket "event" messages only
// advance the Events counter, so without this the Broadcasts, Clients and
// Machines figures would stay frozen at their page-load values.
function refreshInternals() {
    return fetch("/api/internals")
        .then((r) => r.json())
        .then((data) => {
            updateServices(data.services);
            updateStats(data.stats);
        })
        .catch((err) => {
            // A failed poll is not fatal; the next tick retries.
            console.warn("internals refresh failed", err);
        });
}

// Fetch initial service status
refreshInternals();

// Refresh service status periodically
setInterval(refreshInternals, 10000);

connect();
</script>
</body>
</html>