""" Agent provider for OpenAI-compatible APIs (Groq, OpenAI, etc.). Sends frame images as base64. Requires GROQ_API_KEY or OPENAI_API_KEY env var. Auto-detects provider from available env keys. """ import base64 import logging import os from typing import Iterator from cht.agent.base import AgentProvider, SessionContext, FrameRef log = logging.getLogger(__name__) SYSTEM_PROMPT = """You are an assistant integrated into CHT, a screen recording and analysis tool. You help the user understand what happened during their recording session. Be concise and specific. Focus on what's visible in the provided frames.""" # Default models per provider _PROVIDER_DEFAULTS = { "groq": ("https://api.groq.com/openai/v1", "meta-llama/llama-4-maverick-17b-128e-instruct"), "openai": ("https://api.openai.com/v1", "gpt-4o"), } def _detect_provider() -> tuple[str, str, str] | None: """Returns (api_key, base_url, model) or None if no key found.""" if key := os.environ.get("GROQ_API_KEY"): base_url, model = _PROVIDER_DEFAULTS["groq"] return key, base_url, os.environ.get("CHT_MODEL", model) if key := os.environ.get("OPENAI_API_KEY"): base_url, model = _PROVIDER_DEFAULTS["openai"] return key, base_url, os.environ.get("CHT_MODEL", model) return None def _frame_to_image_content(frame: FrameRef) -> dict: with open(frame.path, "rb") as f: data = base64.standard_b64encode(f.read()).decode() return { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{data}"}, } class OpenAICompatProvider(AgentProvider): """Uses any OpenAI-compatible API. Auto-detects from env vars.""" def __init__(self): detected = _detect_provider() if not detected: raise RuntimeError( "No API key found. Set GROQ_API_KEY or OPENAI_API_KEY." ) self._api_key, self._base_url, self._model = detected @property def name(self) -> str: if "groq" in self._base_url: return f"groq/{self._model}" return f"openai-compat/{self._model}" def stream(self, message: str, context: SessionContext) -> Iterator[str]: from openai import OpenAI client = OpenAI(api_key=self._api_key, base_url=self._base_url) # Build context header m, s = divmod(int(context.duration), 60) ctx_text = ( f"Recording duration: {m:02d}:{s:02d}\n" f"Total frames: {len(context.frames)}\n" ) frames_to_send = context.mentioned_frames content: list[dict] = [{"type": "text", "text": ctx_text + message}] for frame in frames_to_send: fm, fs = divmod(int(frame.timestamp), 60) content.append({"type": "text", "text": f"{frame.id} at {fm:02d}:{fs:02d}:"}) try: content.append(_frame_to_image_content(frame)) except Exception as e: log.warning("Could not encode frame %s: %s", frame.id, e) stream = client.chat.completions.create( model=self._model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": content}, ], stream=True, ) for chunk in stream: delta = chunk.choices[0].delta.content if delta: yield delta