""" Agent provider for OpenAI-compatible APIs (Groq, OpenAI, etc.). Sends frame images as base64. Requires GROQ_API_KEY or OPENAI_API_KEY env var. Auto-detects provider from available env keys. """ import base64 import logging import os from typing import Iterator from cht.agent.base import AgentProvider, SessionContext, FrameRef log = logging.getLogger(__name__) SYSTEM_PROMPT = """You are an assistant integrated into CHT, a screen recording and analysis tool. You help the user understand what happened during their recording session. Be concise and specific. Focus on what's visible in the provided frames.""" # Provider configs: (base_url, default_model, available_models) _PROVIDER_CONFIGS = { "groq": ( "https://api.groq.com/openai/v1", "meta-llama/llama-4-maverick-17b-128e-instruct", [ "meta-llama/llama-4-maverick-17b-128e-instruct", "meta-llama/llama-4-scout-17b-16e-instruct", "qwen/qwen-2.5-vl-72b-instruct", ], ), "openai": ( "https://api.openai.com/v1", "gpt-4o", ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini"], ), } def _detect_provider() -> tuple[str, str, str, list[str]] | None: """Returns (api_key, base_url, model, available_models) or None.""" if key := os.environ.get("GROQ_API_KEY"): base_url, default_model, models = _PROVIDER_CONFIGS["groq"] model = os.environ.get("CHT_MODEL", default_model) return key, base_url, model, models if key := os.environ.get("OPENAI_API_KEY"): base_url, default_model, models = _PROVIDER_CONFIGS["openai"] model = os.environ.get("CHT_MODEL", default_model) return key, base_url, model, models return None def _frame_to_image_content(frame: FrameRef) -> dict: with open(frame.path, "rb") as f: data = base64.standard_b64encode(f.read()).decode() return { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{data}"}, } class OpenAICompatProvider(AgentProvider): """Uses any OpenAI-compatible API. Auto-detects from env vars.""" def __init__(self): detected = _detect_provider() if not detected: raise RuntimeError( "No API key found. Set GROQ_API_KEY or OPENAI_API_KEY." ) self._api_key, self._base_url, self._model, self._models = detected @property def name(self) -> str: if "groq" in self._base_url: return f"groq/{self._model}" return f"openai-compat/{self._model}" @property def available_models(self) -> list[str]: return list(self._models) @property def model(self) -> str: return self._model @model.setter def model(self, value: str): self._model = value def stream(self, message: str, context: SessionContext) -> Iterator[str]: from openai import OpenAI client = OpenAI(api_key=self._api_key, base_url=self._base_url) # Build context header m, s = divmod(int(context.duration), 60) ctx_lines = [ f"Recording duration: {m:02d}:{s:02d}", f"Total frames: {len(context.frames)}", ] if context.transcript_segments: ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):") for t in context.transcript_segments: tm1, ts1 = divmod(int(t.start), 60) tm2, ts2 = divmod(int(t.end), 60) ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}") ctx_text = "\n".join(ctx_lines) + "\n" frames_to_send = context.mentioned_frames content: list[dict] = [{"type": "text", "text": ctx_text + message}] for frame in frames_to_send: fm, fs = divmod(int(frame.timestamp), 60) content.append({"type": "text", "text": f"{frame.id} at {fm:02d}:{fs:02d}:"}) try: content.append(_frame_to_image_content(frame)) except Exception as e: log.warning("Could not encode frame %s: %s", frame.id, e) stream = client.chat.completions.create( model=self._model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": content}, ], stream=True, ) for chunk in stream: delta = chunk.choices[0].delta.content if delta: yield delta