update docs
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
2026-04-16 18:48:39 -03:00
parent df43c58028
commit 1ae1456502
2 changed files with 232 additions and 14 deletions

View File

@@ -1,9 +1,16 @@
#!/bin/bash
# Deploy UNT (NOVA) to server
# Usage: ./ctrl/deploy.sh [rsync|sync|restart|push|edge]
#
# rsync   — sync source, rebuild images on server, restart (bypass CI)
# sync    — sync source only (no rebuild, no restart)
# restart — restart containers (no sync, no rebuild)
# push    — build images locally, push to registry, deploy (avoids OOM on server)
# edge    — pull latest images from registry and restart
#
# Note: code is baked into the image (no volume mounts), so code changes
# need a rebuild (rsync | push | edge). Config-only changes (docker-compose,
# .env already on server) can use `restart`.

# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Run from the repo root regardless of where the script was invoked.
cd "$(dirname "$0")/.."

readonly SERVER="mcrn.ar"
# Quoted '~' is intentional: it is passed literally to rsync/ssh and
# expanded by the *remote* shell, not locally.
readonly REMOTE_DIR="~/unt"
# Push the working tree to the server. Excludes build artifacts and the
# server's live ctrl/edge/.env (never clobber remote secrets); honors
# .gitignore via an rsync filter rule.
do_sync() {
  echo "=== Syncing source to $SERVER ==="
  local -a rsync_opts=(
    -avz
    --exclude='.git'
    --exclude='node_modules'
    --exclude='.venv'
    --exclude='ui/app/dist'
    --exclude='__pycache__'
    --exclude='ctrl/edge/.env'
    --filter=':- .gitignore'
  )
  rsync "${rsync_opts[@]}" . "$SERVER:$REMOTE_DIR/"
}
# Rebuild both images from the synced source on the server, then recreate
# the compose stack. Heavy: the build runs on the server itself (can OOM
# there — the `push` mode exists as the build-locally alternative).
do_rebuild_and_restart() {
  echo "=== Building and restarting on server ==="
  # 'set -e' on the remote side: without it, a failed docker build would
  # still fall through to 'compose up' and restart with stale images,
  # because ssh only reports the last command's exit status.
  ssh "$SERVER" << 'EOF'
set -e
cd ~/unt
docker build -t registry.mcrn.ar/unt/api:latest -f ctrl/Dockerfile.api .
docker build -t registry.mcrn.ar/unt/ui:latest -f ctrl/Dockerfile.ui .
cd ctrl/edge
[ -f .env ] || cp .env.example .env
docker compose up -d --remove-orphans --force-recreate
docker image prune -f
docker compose ps
EOF
}
# Recreate the containers in place — no sync, no rebuild. Only useful for
# config changes already present on the server (docker-compose, .env),
# since application code is baked into the images.
do_restart() {
  echo "=== Restarting containers on $SERVER ==="
  # Remote 'set -e': if the cd fails, abort instead of running compose
  # against whatever directory the login shell landed in.
  ssh "$SERVER" << 'EOF'
set -e
cd ~/unt/ctrl/edge
docker compose up -d --remove-orphans --force-recreate
docker compose ps
EOF
}
# Registry-push helper from the sibling `ppl` checkout. Overridable via
# env so the script works on machines where that checkout lives elsewhere.
PUSH_IMAGE_SH="${PUSH_IMAGE_SH:-/home/mariano/wdir/ppl/ctrl/push-image.sh}"

# Mode dispatch; defaults to the fast-iteration `rsync` mode.
case "${1:-rsync}" in
  rsync)
    do_sync
    do_rebuild_and_restart
    ;;
  sync)
    do_sync
    ;;
  restart)
    do_restart
    ;;
  push)
    # Build locally and push to the registry, then pull + restart on the
    # server — avoids OOM-ing the small server during docker build.
    echo "=== Building images locally ==="
    docker build -t registry.mcrn.ar/unt/api:latest -f ctrl/Dockerfile.api .
    docker build -t registry.mcrn.ar/unt/ui:latest -f ctrl/Dockerfile.ui .
    echo "=== Pushing to registry ==="
    "$PUSH_IMAGE_SH" unt/api latest
    "$PUSH_IMAGE_SH" unt/ui latest
    echo "=== Pulling and restarting on $SERVER ==="
    # Remote 'set -e': abort on a failed cd/pull rather than recreating
    # containers from whatever images happen to be cached.
    ssh "$SERVER" << 'EOF'
set -e
cd ~/unt/ctrl/edge
docker compose pull
docker compose up -d --remove-orphans --force-recreate
docker image prune -f
docker compose ps
EOF
    ;;
  edge)
    # Production path: deploy the images CI already pushed to the registry.
    echo "=== Pulling latest images on $SERVER ==="
    ssh "$SERVER" << 'EOF'
set -e
cd ~/unt/ctrl/edge
docker compose pull
docker compose up -d --remove-orphans --force-recreate
docker image prune -f
docker compose ps
EOF
    ;;
  *)
    # Diagnostics go to stderr so callers can distinguish them from output.
    echo "Usage: $0 [rsync|sync|restart|push|edge]" >&2
    exit 1
    ;;
esac

echo "=== Done ==="

View File

@@ -168,6 +168,66 @@
.t-pax { color: #00c853; font-weight: 500; } .t-pax { color: #00c853; font-weight: 500; }
.t-live { color: #00c853; } .t-live { color: #00c853; }
.t-comment { color: #4a5568; } .t-comment { color: #4a5568; }
/* Prose sections (walkthrough, design) */
.graph-section h3 {
font-family: 'JetBrains Mono', monospace;
font-size: 13px;
font-weight: 500;
color: #e8eaf0;
letter-spacing: 1px;
margin: 32px 0 10px;
text-transform: uppercase;
}
.prose { max-width: 820px; }
.prose p {
font-size: 14px;
color: #b4bccf;
margin-bottom: 14px;
line-height: 1.7;
}
.prose p b { color: #e8eaf0; font-weight: 600; }
.prose code {
font-family: 'JetBrains Mono', monospace;
font-size: 12px;
color: #7ab0ff;
background: #121829;
padding: 1px 5px;
border-radius: 3px;
}
.prose a { color: #0066ff; text-decoration: none; }
.prose a:hover { text-decoration: underline; }
.prose ul {
margin: 8px 0 16px 20px;
font-size: 14px;
color: #b4bccf;
line-height: 1.7;
}
.prose ul li { margin-bottom: 8px; }
.cmp-table {
width: 100%;
border-collapse: collapse;
font-size: 13px;
margin: 8px 0 20px;
border: 1px solid #1e2a4a;
}
.cmp-table th {
text-align: left;
background: #121829;
color: #8892a8;
font-family: 'JetBrains Mono', monospace;
font-size: 11px;
letter-spacing: 1px;
padding: 10px 14px;
border-bottom: 1px solid #1e2a4a;
}
.cmp-table td {
padding: 10px 14px;
color: #b4bccf;
border-bottom: 1px solid #1e2a4a;
vertical-align: top;
}
.cmp-table tr:last-child td { border-bottom: none; }
</style> </style>
</head> </head>
<body> <body>
@@ -180,18 +240,62 @@
<div class="layout"> <div class="layout">
<nav> <nav>
<a class="active" onclick="show('system')">System</a> <a class="active" onclick="show('walkthrough')">Walkthrough</a>
<a onclick="show('system')">System</a>
<a onclick="show('mcp')">MCP Servers</a> <a onclick="show('mcp')">MCP Servers</a>
<a onclick="show('efhas')">FCE Agent</a> <a onclick="show('efhas')">FCE Agent</a>
<a onclick="show('handover')">Handover Agent</a> <a onclick="show('handover')">Handover Agent</a>
<a onclick="show('data')">Data Flow</a> <a onclick="show('data')">Data Flow</a>
<a onclick="show('deploy')">Deployment</a> <a onclick="show('deploy')">Deployment</a>
<a onclick="show('repo')">Repository</a> <a onclick="show('repo')">Repository</a>
<a onclick="show('design')">Design</a>
</nav> </nav>
<main> <main>
<section id="system" class="graph-section active"> <section id="walkthrough" class="graph-section active">
<h2>WALKTHROUGH</h2>
<p>A guided tour of the platform — start here for a narrative entry point before diving into the diagrams.</p>
<div class="prose">
<h3>The problem</h3>
<p>Stellar Air's operations need two things from the same underlying data. Passenger-facing teams need clear notifications when a flight is disrupted. Ops teams need shift-handover briefs that categorise every open issue by urgency. Both views ride on the same feeds — flights, weather, crew, maintenance — but with different slices, tones, and audiences. This platform unifies them through a shared MCP tool infrastructure.</p>
<h3>Architecture at a glance</h3>
<p>Vue UI → Kong Konnect (optional gateway) → FastAPI → LangGraph agents → MCP clients → three domain-scoped MCP servers → live APIs (OpenMeteo, FAA) and scenario data. The <a onclick="show('system')">System</a> diagram shows the full picture.</p>
<h3>Data layer</h3>
<p>Domain models live in <code>mcp_servers/data/models.py</code> — Pydantic types with enums for flight status, delay causes, and crew roles. Four scenarios (<code>normal_ops</code>, <code>weather_disruption_ord</code>, <code>maintenance_delay_sfo</code>, <code>crew_swap_ewr</code>) are Python modules loaded lazily by <code>mcp_servers/data/scenarios/manager.py</code>; each is a complete, consistent dataset switchable from the UI at runtime. Weather comes live from OpenMeteo (<code>mcp_servers/data/real/openmeteo.py</code>) — real forecasts along calculated route waypoints. Airport status comes live from the FAA NASSTATUS feed (<code>mcp_servers/data/real/faa.py</code>). Neither live source requires an API key.</p>
<h3>MCP servers</h3>
<p>Three servers scoped by access domain. <code>shared</code> exposes the data both agents need — flight status/details, route weather, hub forecasts, airport status/congestion, maintenance flags, and a <code>delay_explainer</code> prompt template. <code>ops</code> adds crew duty, rebookings, a <code>handover-brief</code> prompt, and the handover narrative generator; only the Handover agent connects to it. <code>passenger</code> adds the notification generator and a <code>passenger-notification</code> prompt with selectable tone; only the FCE agent connects to it. Each server declares tools, resources, and prompts.</p>
<h3>MCP client</h3>
<p><code>agents/shared/mcp_client.py</code> defines <code>MCPMultiClient</code> plus a per-agent profile that declares which servers to connect to. Calls are namespaced by server name — <code>mcp.call_tool('shared', 'get_flight_status', &hellip;)</code>. Tool results, resource reads, and prompt gets share a common parser and a tool runner that wraps each call in a Langfuse span with timeout and error collection (<code>agents/shared/parser.py</code>, <code>agents/shared/tool_runner.py</code>).</p>
<h3>Agents</h3>
<p>The <b>FCE agent</b> (<code>agents/fce.py</code>) is a four-node LangGraph: triage → gather → synthesize → format. The gather node fires five MCP tool calls in parallel via <code>asyncio.gather</code> — route weather, airport status, airport congestion, flight details, and crew notes — each wrapped in <code>asyncio.wait_for</code> with a 15-second timeout. The synthesis node calls <code>generate_notification</code>; if any gather call failed, the prompt is told which sources are missing and omits them rather than hallucinating.</p>
<p>The <b>Handover agent</b> (<code>agents/handover.py</code>) scans every hub in parallel, scores each disruption with a weighted severity × time-sensitivity function (delay minutes, crew duty limits, passenger impact, connection risk), and categorises the results into IMMEDIATE / MONITOR / FYI.</p>
<h3>API layer</h3>
<p>FastAPI (<code>api/main.py</code>) runs agents asynchronously: POST to <code>/agents/fce</code> returns a <code>run_id</code> immediately and the client polls <code>/agents/runs/{run_id}</code>. An <code>EventHub</code> broadcasts lifecycle events over WebSocket — <code>agent_start</code>, <code>node_enter</code>/<code>node_exit</code>, <code>tool_call_end</code>/<code>tool_call_error</code>, <code>agent_end</code> — so the UI can render the agent's internals live. A background task prunes completed runs after one hour. Configuration is centralised in a Pydantic <code>Settings</code> class (<code>api/config.py</code>); HTTP errors surface as proper status codes, not as 200 responses with an error body.</p>
<h3>Kong Konnect</h3>
<p>Kong sits in front as an optional API gateway — rate limiting, request analytics, the path to authentication. The UI reads a gateway URL from local storage or <code>VITE_KONG_PROXY_URL</code>; when empty it falls back to direct FastAPI calls. Kong is additive, not required, so the app keeps working even if the gateway is offline.</p>
<h3>Frontend</h3>
<p>Vue 3 SPA built on the internal <code>soleprint-ui</code> framework. Four tabs: <i>Operations</i> (run agents, see results), <i>Internals</i> (live tool-call stream over WebSocket via <code>useAgentEvents</code>), <i>Data</i> (inspect and edit the active scenario), and <i>Settings</i> (LLM provider, gateway URL). The internals view is the most useful one for understanding what the agent does on each run.</p>
<h3>Testing</h3>
<p>69 tests with dual-mode transport (<code>tests/base.py</code>). Default mode runs against ASGI in-process — fast, no server needed. Set <code>CONTRACT_TEST_MODE=live</code> and <code>CONTRACT_TEST_URL=&hellip;</code> to run the same assertions over real HTTP against any deployed instance.</p>
<h3>Deployment &amp; CI</h3>
<p>Woodpecker CI (<code>.woodpecker/build.yml</code>) builds the API and UI images on push to main and pushes them to a private registry. <code>ctrl/deploy.sh</code> has five modes — <code>rsync</code> (sync source and rebuild on the server, for fast iteration), <code>sync</code> (copy source only), <code>restart</code> (recreate containers without rebuilding), <code>push</code> (build locally and push to the registry, avoiding OOM on the server), and <code>edge</code> (pull tagged images from the registry, for production). Production runs as docker-compose on EC2 (<code>ctrl/edge/docker-compose.yml</code>) behind nginx, optionally behind Kong. Langfuse runs in a separate Kind cluster and is shared across projects.</p>
</div>
</section>
<section id="system" class="graph-section">
<h2>SYSTEM ARCHITECTURE</h2> <h2>SYSTEM ARCHITECTURE</h2>
<p>End-to-end view: Vue UI → Kong gateway (optional) → FastAPI → MCP servers → live and scenario data sources. Langfuse (separate shared cluster) traces every agent run and tool call.</p> <p>End-to-end view: Vue UI → Kong gateway (optional) → FastAPI → MCP servers → live and scenario data sources. Langfuse (separate shared cluster) traces every agent run and tool call.</p>
<div class="graph-container"> <div class="graph-container">
@@ -260,7 +364,7 @@
<section id="repo" class="graph-section"> <section id="repo" class="graph-section">
<h2>REPOSITORY STRUCTURE</h2> <h2>REPOSITORY STRUCTURE</h2>
<p>Monorepo: MCP servers, agents, IRROP engine, API, Vue UI (with shared component framework), and deployment configs.</p> <p>Monorepo: MCP servers, agents, API, Vue UI (with shared component framework), and deployment configs.</p>
<div class="tree-container"> <div class="tree-container">
<pre class="repo-tree"><span class="t-root">stellar-ops/</span> <pre class="repo-tree"><span class="t-root">stellar-ops/</span>
├── <span class="t-dir">mcp_servers/</span> ├── <span class="t-dir">mcp_servers/</span>
@@ -308,6 +412,68 @@
</div> </div>
</section> </section>
<section id="design" class="graph-section">
<h2>DESIGN NOTES</h2>
<p>Rationale behind the non-obvious choices, and a roadmap of deferred improvements. Protocol references link to the MCP spec at <a href="https://modelcontextprotocol.io" target="_blank" rel="noopener">modelcontextprotocol.io</a>.</p>
<div class="prose">
<h3>Concurrency model</h3>
<p>Everything runs on one OS thread under asyncio — no GIL contention, no thread locks. Shared mutable state (<code>runs: dict</code>, <code>event_hub._clients: set</code>) is safe because mutations are atomic relative to the event loop scheduler, and disconnects happen between awaits so broadcast iteration is race-free. The FCE agent fires five <code>asyncio.create_task</code> calls then <code>asyncio.gather</code> — five MCP tool calls run concurrently but cooperatively. This only breaks once <code>runs</code> grows large enough to want sharding across processes, at which point the in-process guarantees evaporate and a Redis-backed store becomes necessary (see Roadmap).</p>
<h3>Stateless API, stateful MCP subprocesses</h3>
<p>Each agent run spawns three MCP server subprocesses over stdio. This is wasteful per-request (~500 ms cold-start) but has one decisive advantage: full isolation. No shared scenario state across runs, no mutex on the scenario manager, no "wait, whose data was this?". The path forward is Streamable HTTP transport with long-lived servers — same tool code, different transport — which is a config change rather than a rewrite.</p>
<h3>Domain-scoped MCP servers</h3>
<p>Three servers — <code>shared</code>, <code>ops</code>, <code>passenger</code> — not one with RBAC filtering. The passenger agent literally cannot call <code>get_crew_duty_status</code> because it never connects to the ops server; the capability isn't even discoverable. Security boundary by architecture, not by authorization. Filter bugs become security bugs; MCP is a capability protocol, so using its native scoping is cleaner than bolting auth on top. If ops tools ever move to a separate team or repo they just become a separately-deployed MCP server — agents update their profile, not their code.</p>
<h3>Tools, Resources, and Prompts</h3>
<p>All three MCP primitives are used. <b>Tools</b> are actions or queries with potential side effects: <code>get_flight_status</code>, <code>generate_notification</code>. <b>Resources</b> are read-only data with URIs: <code>ops://hubs/{code}</code>, <code>ops://handover/latest</code> — a dynamic resource (updated after each handover) is still a resource because reading it has no side effects. <b>Prompts</b> are server-versioned templates: <code>delay_explainer(cause_code, audience)</code>, <code>passenger-notification(tone)</code>. The split matters because it lets the server own prompt versioning — update the template on the server and every client picks it up without a redeploy.</p>
<h3>Why MCP over function calling, LangChain, or direct APIs</h3>
<p>MCP wins when there are multiple consumers of the same tools (here, both a LangGraph agent and Claude Code), when dynamic tool discovery matters, and when protocol-level contracts are worth having. Provider function calling (OpenAI, Anthropic) bakes tool definitions into prompts and locks to one vendor. LangChain tools couple to LangChain's abstractions. Direct API calls are the N×M integration problem. MCP doesn't replace function calling — the LLM still uses its native tool-calling mechanism — it standardises the execution layer underneath.</p>
<table class="cmp-table">
<thead><tr><th>Approach</th><th>Strengths</th><th>Weaknesses</th></tr></thead>
<tbody>
<tr><td>MCP</td><td>Standard, discoverable, client-agnostic, composable</td><td>Extra process, protocol overhead for simple cases</td></tr>
<tr><td>Function calling</td><td>Simple, no extra infrastructure</td><td>Provider-locked, no runtime discovery, definitions duplicated per call</td></tr>
<tr><td>LangChain tools</td><td>Tight framework integration</td><td>Coupled to LangChain, not usable outside</td></tr>
<tr><td>Direct API calls</td><td>No abstraction overhead</td><td>N×M integration problem, no standardisation</td></tr>
</tbody>
</table>
<h3>LLM provider abstraction</h3>
<p>One <code>generate(system_prompt, user_content)</code> function in <code>mcp_servers/shared_llm.py</code> with four backends: Groq (default, free), Anthropic, Bedrock, and any OpenAI-compatible endpoint. Selection happens at runtime via <code>LLM_PROVIDER</code>. LangChain's provider abstraction is heavier than needed here — string in, string out is enough — and switching providers touches one env var rather than the agent code.</p>
<p>Every narrative tool also has a structured template fallback. Response format is identical: <code>{"text": str, "provider": str}</code>. The UI surfaces the provider as a badge, so it's always visible whether a response came from an LLM or the template — honest about what mode the system is in. Tests pass without any API key; the demo works without any API key.</p>
<h3>Scenarios in memory, not a database</h3>
<p>Scenarios are Python modules, versioned with git, loaded lazily by the scenario manager. They are deliberately designed datasets, not user-generated content — git is more valuable than CRUD for them, and switching scenarios is a config change rather than a data migration. The reload-on-subprocess-spawn pattern sidesteps the cache-invalidation problem entirely. This would break once scenarios became per-tenant or grew beyond ~50 MB — then it's a database.</p>
<h3>Dual-mode tests</h3>
<p><code>tests/base.py</code> supports two transports with the same 69 assertions. Default (<code>inprocess</code>) uses <code>httpx.AsyncClient</code> over ASGI — no server needed. <code>live</code> mode runs real HTTP against any <code>CONTRACT_TEST_URL</code>, so the same tests validate a deployed instance. Contract tests are definitionally transport-agnostic; duplicating them into two files would be the bug factory every project eventually regrets.</p>
<h3>Kong as additive</h3>
<p>The app works with or without Kong. When <code>VITE_KONG_PROXY_URL</code> is empty the UI calls FastAPI directly; when set it routes through Kong Konnect for rate limiting, analytics, and the path to auth. Graceful degradation beats a broken demo — especially relevant when the gateway sits on a trial subscription with a finite lifetime.</p>
<h3>Langfuse in a shared cluster</h3>
<p>Langfuse runs in its own Kind cluster separate from the app cluster. The v3 stack needs ClickHouse, Redis, MinIO, and a worker — four extra pods that aren't project-specific. Putting it in a shared cluster means every project points <code>LANGFUSE_HOST</code> at the same instance: one dashboard, one set of keys, one upgrade path. That's how Langfuse belongs in production — shared infra, not per-service.</p>
<h3>Timeouts, TTL cleanup, error handling</h3>
<p>Every MCP tool call is wrapped in <code>asyncio.wait_for</code> with a 15-second timeout — long enough to catch real hangs without false positives from slow-but-alive APIs (OpenMeteo and FAA typically respond in under 2 s). On timeout the span is marked <code>ERROR</code> in Langfuse, the error is added to the run's error list, and the agent continues with partial data. The notification prompt is told which sources are missing and omits them rather than hallucinating.</p>
<p>The in-memory run store is pruned by a background task that removes completed or errored runs older than one hour. Errors surface with proper HTTP status codes — <code>HTTPException(404, &hellip;)</code> for missing resources, <code>400</code> for invalid requests — rather than <code>200</code> responses with an error body, so clients can distinguish failure without parsing the payload.</p>
<h3>Roadmap</h3>
<p>Items deferred intentionally — the system works without them, and each is a clean extension rather than a rewrite.</p>
<ul>
<li><b>MCP over Streamable HTTP.</b> Replace subprocess-per-run with long-lived server processes. Becomes worthwhile once cold-start latency matters in aggregate or once MCP needs to serve multiple API replicas.</li>
<li><b>Redis-backed run store and event bus.</b> Enables multi-instance WebSocket broadcast and survives API restarts. Necessary as soon as the API scales past a single process.</li>
<li><b>Database-backed scenarios.</b> Replace the in-memory modules with a datastore once scenarios need to be per-tenant or grow beyond what fits comfortably in git.</li>
<li><b>Circuit breakers on external APIs.</b> Exponential backoff and breakers on FAA and OpenMeteo via <code>tenacity</code>. Worth doing once those APIs have their first real outage.</li>
<li><b>Kong Key Auth.</b> Per-consumer access control and per-agent rate limits. Unlocks multi-tenant use and a formal API-key lifecycle.</li>
</ul>
</div>
</section>
</main> </main>
</div> </div>
// Switch the visible docs section and highlight the matching nav link.
// The nav link is located by its onclick attribute instead of the click
// event's target, so show() also works when called programmatically
// (e.g. from the in-page cross-links in the walkthrough prose).
function show(id) {
  for (const section of document.querySelectorAll('.graph-section')) {
    section.classList.remove('active');
  }
  for (const link of document.querySelectorAll('nav a')) {
    link.classList.remove('active');
  }
  document.getElementById(id).classList.add('active');
  const selector = 'nav a[onclick="show(\'' + id + '\')"]';
  const navLink = document.querySelector(selector);
  if (navLink) navLink.classList.add('active');
}
</script> </script>