diff --git a/pkg/create/templates.go b/pkg/create/templates.go index fb5845f..54a1ee4 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -19,6 +19,7 @@ const ( TemplateOpenAGIComputerUse = "openagi-computer-use" TemplateClaudeAgentSDK = "claude-agent-sdk" TemplateYutoriComputerUse = "yutori" + TemplateUnifiedCUA = "cua" ) type TemplateInfo struct { @@ -90,6 +91,11 @@ var Templates = map[string]TemplateInfo{ Description: "Implements a Yutori n1 computer use agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, + TemplateUnifiedCUA: { + Name: "Unified CUA", + Description: "Multi-provider computer use agent with Anthropic/OpenAI/Gemini fallback", + Languages: []string{LanguageTypeScript, LanguagePython}, + }, } // GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language @@ -213,6 +219,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, }, + TemplateUnifiedCUA: { + EntryPoint: "index.ts", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke ts-cua cua-task --payload '{"query": "Go to https://news.ycombinator.com and get the top 5 stories"}'`, + }, }, LanguagePython: { TemplateSampleApp: { @@ -260,6 +271,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, }, + TemplateUnifiedCUA: { + EntryPoint: "main.py", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke python-cua cua-task --payload '{"query": "Go to https://news.ycombinator.com and get the top 5 stories"}'`, + }, }, } diff --git a/pkg/templates/python/cua/.env.example b/pkg/templates/python/cua/.env.example new file mode 100644 index 0000000..d4bfcbd --- /dev/null +++ b/pkg/templates/python/cua/.env.example @@ -0,0 +1,26 
@@ +# Copy this file to .env and fill in your API keys. +# Only the key for your chosen provider is required. + +# Primary provider: "anthropic", "openai", or "gemini" +CUA_PROVIDER=anthropic + +# Comma-separated fallback order (optional). +# If the primary provider fails, these are tried in order. +# CUA_FALLBACK_PROVIDERS=openai,gemini + +# Provider API keys — set the one(s) you plan to use +ANTHROPIC_API_KEY=your_anthropic_api_key_here +OPENAI_API_KEY=your_openai_api_key_here +GOOGLE_API_KEY=your_google_api_key_here + +# Browser config (proxy, profile, extensions, timeout) is set per-request +# via the payload "browser" field, not here. Example: +# kernel invoke python-cua cua-task --payload '{ +# "query": "...", +# "browser": { +# "proxy_id": "proxy_abc123", +# "profile": { "name": "my-profile", "save_changes": true }, +# "extensions": [{ "name": "my-extension" }], +# "timeout_seconds": 600 +# } +# }' diff --git a/pkg/templates/python/cua/README.md b/pkg/templates/python/cua/README.md new file mode 100644 index 0000000..90bbdfa --- /dev/null +++ b/pkg/templates/python/cua/README.md @@ -0,0 +1,88 @@ +# Unified CUA Template + +A multi-provider Computer Use Agent (CUA) template for [Kernel](https://kernel.sh). Supports **Anthropic**, **OpenAI**, and **Google Gemini** as interchangeable backends with automatic fallback. + +## Quick start + +### 1. Install dependencies + +```bash +uv sync +``` + +### 2. Configure environment + +Copy the example env file and add your API keys: + +```bash +cp .env.example .env +``` + +Set `CUA_PROVIDER` to your preferred provider and add the matching API key: + + +| Provider | Env var for key | Model used | +| ----------- | ------------------- | ----------------------------------------- | +| `anthropic` | `ANTHROPIC_API_KEY` | `claude-sonnet-4-6` | +| `openai` | `OPENAI_API_KEY` | `gpt-5.4` | +| `gemini` | `GOOGLE_API_KEY` | `gemini-2.5-computer-use-preview-10-2025` | + + +### 3. 
Deploy to Kernel + +```bash +kernel deploy main.py --env-file .env +``` + +### 4. Invoke + +```bash +kernel invoke python-cua cua-task --payload '{"query": "Go to https://news.ycombinator.com and get the top 5 stories"}' +``` + +## Multi-provider fallback + +Set `CUA_FALLBACK_PROVIDERS` to automatically try another provider if the primary fails: + +```env +CUA_PROVIDER=anthropic +CUA_FALLBACK_PROVIDERS=openai,gemini +``` + +This will try Anthropic first, then OpenAI, then Gemini. Only providers whose API key is set are used; a provider with no key is skipped with a warning. + +## Replay recording + +Pass `record_replay: true` in the payload to capture a video replay of the browser session: + +```bash +kernel invoke python-cua cua-task --payload '{"query": "Navigate to example.com", "record_replay": true}' +``` + +The response will include a `replay_url` you can open in your browser. + +## Project structure + +``` +main.py — Kernel app entrypoint +session.py — Browser session lifecycle with replay support +providers/ + __init__.py — Provider factory and fallback logic + anthropic.py — Anthropic Claude adapter + openai.py — OpenAI GPT adapter + gemini.py — Google Gemini adapter +``` + +## Customization + +Each provider adapter is self-contained. To customize a provider's behavior (system prompt, model, tool handling), edit the corresponding file in `providers/`. + +To add a new provider, create a new file that implements the `CuaProvider` protocol and register it in `providers/__init__.py`. 
+ +## Resources + +- [Kernel Docs](https://docs.kernel.sh) +- [Anthropic Computer Use](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use) +- [OpenAI Computer Use](https://platform.openai.com/docs/guides/computer-use) +- [Google Gemini Computer Use](https://ai.google.dev/gemini-api/docs/computer-use) + diff --git a/pkg/templates/python/cua/_gitignore b/pkg/templates/python/cua/_gitignore new file mode 100644 index 0000000..db80737 --- /dev/null +++ b/pkg/templates/python/cua/_gitignore @@ -0,0 +1,31 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +dist/ +build/ + +# Virtual environments +.venv/ +venv/ +env/ + +# Environment +.env +.env.local +.env.*.local + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/pkg/templates/python/cua/main.py b/pkg/templates/python/cua/main.py new file mode 100644 index 0000000..097a470 --- /dev/null +++ b/pkg/templates/python/cua/main.py @@ -0,0 +1,151 @@ +""" +Unified CUA (Computer Use Agent) template with multi-provider support. + +Supports Anthropic, OpenAI, and Gemini as interchangeable providers. 
# Provider resolution is deferred to the action handler because env vars
# are not available during Hypeman's build/discovery phase.
_providers: list | None = None


def _get_providers():
    """Return the provider list, resolving it on first use and caching it."""
    global _providers
    if _providers is None:
        _providers = resolve_providers()
        print(f"Configured providers: {' -> '.join(p.name for p in _providers)}")
    return _providers


@app.action("cua-task")
async def cua_task(ctx: kernel.KernelContext, payload: CuaInput | None = None) -> CuaOutput:
    """Run one computer-use task and return the result.

    Args:
        ctx: Kernel invocation context; its ``invocation_id`` is attached to
            any browser session created here.
        payload: Request body. ``query`` is required. ``provider`` moves that
            provider to the front of the fallback order, ``session_id`` reuses
            an existing browser, and ``browser`` / ``record_replay`` configure
            a newly created one.

    Returns:
        A dict with ``result`` and ``provider``, plus ``replay_url`` when the
        stopped session reports one.

    Raises:
        ValueError: If the payload has no ``query``.
    """
    if not payload or not payload.get("query"):
        raise ValueError('Query is required. Payload must include: {"query": "your task description"}')

    providers = _get_providers()

    # Per-request provider override: move requested provider to front
    if payload.get("provider"):
        requested = next((p for p in providers if p.name == payload["provider"]), None)
        if requested:
            providers = [requested] + [p for p in providers if p is not requested]

    # Use an existing browser session (BYOB) or create a new one.
    # BYOB is useful for multi-turn CUA on a persistent browser, or HITL
    # where a human uses the live view between CUA calls.
    if payload.get("session_id"):
        browser = await asyncio.to_thread(
            kernel_client.browsers.retrieve, payload["session_id"],
        )
        vp = getattr(browser, "viewport", None)
        task_result = await run_with_fallback(
            providers,
            TaskOptions(
                query=payload["query"],
                kernel=kernel_client,
                session_id=payload["session_id"],
                model=payload.get("model"),
                # `or` also covers a viewport whose width/height is None.
                viewport_width=getattr(vp, "width", 1280) or 1280,
                viewport_height=getattr(vp, "height", 800) or 800,
            ),
        )
        # The caller owns a BYOB session, so it is deliberately left running.
        return {"result": task_result.result, "provider": task_result.provider}

    browser_cfg = payload.get("browser") or {}
    session = KernelBrowserSession(
        kernel_client,
        SessionOptions(
            invocation_id=ctx.invocation_id,
            stealth=True,
            record_replay=payload.get("record_replay", False),
            proxy_id=browser_cfg.get("proxy_id"),
            profile=browser_cfg.get("profile"),
            extensions=browser_cfg.get("extensions"),
            timeout_seconds=browser_cfg.get("timeout_seconds", 300),
        ),
    )

    await session.start()
    print(f"Live view: {session.live_view_url}")

    try:
        task_result = await run_with_fallback(
            providers,
            TaskOptions(
                query=payload["query"],
                kernel=kernel_client,
                session_id=session.session_id,
                model=payload.get("model"),
                viewport_width=session.opts.viewport_width,
                viewport_height=session.opts.viewport_height,
            ),
        )
    except BaseException:
        # Best-effort cleanup (also on cancellation). A failure while stopping
        # is logged rather than raised so it cannot hide the task error.
        try:
            await session.stop()
        except Exception as stop_exc:
            print(f"Failed to stop browser session: {stop_exc}")
        raise

    # Success path: stop exactly once. If this raises, the error propagates
    # without a second stop() attempt.
    session_info = await session.stop()

    output: CuaOutput = {
        "result": task_result.result,
        "provider": task_result.provider,
    }
    if session_info.replay_view_url:
        output["replay_url"] = session_info.replay_view_url

    return output
def _build_provider(name: str) -> CuaProvider | None:
    """Instantiate the adapter for ``name``, or return None if it is unknown.

    Adapters are imported lazily so only the SDK of a requested provider has
    to be importable.
    """
    if name == "anthropic":
        from .anthropic import AnthropicProvider
        return AnthropicProvider()
    if name == "openai":
        from .openai import OpenAIProvider
        return OpenAIProvider()
    if name == "gemini":
        from .gemini import GeminiProvider
        return GeminiProvider()
    return None


def resolve_providers() -> list[CuaProvider]:
    """Build the ordered list of providers to try.

    The order is ``CUA_PROVIDER`` followed by the comma-separated
    ``CUA_FALLBACK_PROVIDERS``. Names are lower-cased and de-duplicated.
    Unknown names and providers whose ``is_configured()`` is false are skipped
    with a printed warning.

    Raises:
        RuntimeError: If no usable provider remains.
    """
    primary = os.environ.get("CUA_PROVIDER", "").strip().lower()
    fallbacks = [
        s.strip().lower()
        for s in os.environ.get("CUA_FALLBACK_PROVIDERS", "").split(",")
        if s.strip()
    ]

    order = ([primary] if primary else []) + fallbacks

    seen: set[str] = set()
    providers: list[CuaProvider] = []

    for name in order:
        if name in seen:
            continue
        seen.add(name)

        provider = _build_provider(name)
        if provider is None:
            print(f'Warning: Unknown provider "{name}", skipping.')
            continue
        if not provider.is_configured():
            print(f'Warning: Provider "{name}" missing API key, skipping.')
            continue
        providers.append(provider)

    if not providers:
        raise RuntimeError(
            "No CUA provider is configured. "
            "Set CUA_PROVIDER to one of: anthropic, openai, gemini, "
            "and provide the matching API key."
        )

    return providers


async def run_with_fallback(
    providers: list[CuaProvider],
    options: TaskOptions,
) -> TaskResult:
    """Run a CUA task, trying each provider in order until one succeeds.

    Returns:
        The result of the first provider whose ``run_task`` does not raise.

    Raises:
        RuntimeError: If ``providers`` is empty, or if every provider raised.
            In the latter case the message lists each failure and the last
            provider's exception is attached as ``__cause__``.
    """
    if not providers:
        raise RuntimeError("No providers were supplied to run_with_fallback.")

    errors: list[tuple[str, Exception]] = []

    for provider in providers:
        try:
            print(f"Attempting provider: {provider.name}")
            return await provider.run_task(options)
        except Exception as exc:
            print(f'Provider "{provider.name}" failed: {exc}')
            errors.append((provider.name, exc))

    summary = "\n".join(f"  {name}: {exc}" for name, exc in errors)
    # Chain the most recent failure so its traceback survives in logs.
    raise RuntimeError(f"All providers failed:\n{summary}") from errors[-1][1]
# Translates key names used in model output to the keysym names passed to
# Kernel's press_key.
KEY_MAP = {
    "Return": "Return", "Enter": "Return", "Backspace": "BackSpace",
    "Tab": "Tab", "Escape": "Escape", "space": "space", "Space": "space",
    "Up": "Up", "Down": "Down", "Left": "Left", "Right": "Right",
    "Home": "Home", "End": "End", "Page_Up": "Prior", "Page_Down": "Next",
    "ctrl": "Control_L", "Control_L": "Control_L",
    "alt": "Alt_L", "Alt_L": "Alt_L",
    "shift": "Shift_L", "Shift_L": "Shift_L",
    "super": "Super_L", "Super_L": "Super_L",
}


def _map_key(key: str) -> str:
    """Translate a key or a ``+``-joined combination through ``KEY_MAP``.

    Names missing from the map are passed through unchanged.
    """
    if "+" in key:
        return "+".join(KEY_MAP.get(k.strip(), k.strip()) for k in key.split("+"))
    return KEY_MAP.get(key, key)


class AnthropicProvider:
    """CUA provider that drives a Kernel browser with Claude's computer tool."""

    name = "anthropic"

    def __init__(self) -> None:
        self._api_key = os.environ.get("ANTHROPIC_API_KEY", "")

    def is_configured(self) -> bool:
        """Return True when ANTHROPIC_API_KEY is set to a non-empty value."""
        return len(self._api_key) > 0

    async def run_task(self, options: TaskOptions) -> TaskResult:
        """Loop model call -> tool execution until the model stops using tools.

        Returns:
            The concatenated text blocks of the final response, or
            "(no response)" when a non-``end_turn`` response contains neither
            tool calls nor text.
        """
        client = anthropic.Anthropic(api_key=self._api_key, max_retries=4)
        model = options.model or "claude-sonnet-4-6"
        messages: list[dict] = [{"role": "user", "content": options.query}]

        date_str = datetime.now().strftime("%A, %B %d, %Y")
        system_prompt = SYSTEM_PROMPT.format(date=date_str)

        while True:
            # The SDK client is synchronous; run it off the event loop.
            response = await asyncio.to_thread(
                client.beta.messages.create,
                max_tokens=4096,
                messages=messages,
                model=model,
                system=[{"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}],
                tools=[{
                    "type": "computer_20251124",
                    "name": "computer",
                    "display_width_px": options.viewport_width,
                    "display_height_px": options.viewport_height,
                    "display_number": 1,
                }],
                betas=["computer-use-2025-11-24", "prompt-caching-2024-07-31"],
                thinking={"type": "enabled", "budget_tokens": 1024},
            )

            # Echo the assistant turn back as plain dicts for the next request.
            assistant_content = []
            for block in response.content:
                if block.type == "thinking":
                    assistant_content.append({
                        "type": "thinking",
                        "thinking": block.thinking,
                        "signature": block.signature,
                    })
                elif block.type == "text":
                    assistant_content.append({"type": "text", "text": block.text})
                elif block.type == "tool_use":
                    assistant_content.append({
                        "type": "tool_use",
                        "id": block.id,
                        "name": block.name,
                        "input": block.input,
                    })

            messages.append({"role": "assistant", "content": assistant_content})

            if response.stop_reason == "end_turn":
                text = " ".join(
                    b.text for b in response.content if b.type == "text"
                )
                return TaskResult(result=text, provider=self.name)

            # Process tool calls
            tool_results = []
            for block in response.content:
                if block.type != "tool_use":
                    continue
                action = block.input.get("action", "")
                try:
                    screenshot = await self._execute_action(
                        options, action, block.input,
                    )
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": [{
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot,
                            },
                        }],
                    })
                except Exception as exc:
                    # Report the failure to the model instead of aborting.
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": [{"type": "text", "text": f"Error: {exc}"}],
                        "is_error": True,
                    })

            if tool_results:
                messages.append({"role": "user", "content": tool_results})
            else:
                text = " ".join(
                    b.text for b in response.content if b.type == "text"
                )
                return TaskResult(result=text or "(no response)", provider=self.name)

    async def _execute_action(
        self, options: TaskOptions, action: str, params: dict
    ) -> str:
        """Execute one computer-tool action and return a fresh screenshot.

        Args:
            options: Supplies the Kernel client and the browser session id.
            action: The tool's ``action`` value.
            params: The full tool input.

        Returns:
            Base64-encoded screenshot bytes captured after the action.

        Raises:
            ValueError: If ``action`` is not handled here.
        """
        computer = options.kernel.browsers.computer

        if action == "screenshot":
            pass
        elif action == "key":
            # The computer tool sends the key name in `text`; `key` is kept
            # as a fallback for inputs shaped the way this code used to read.
            key = _map_key(params.get("text") or params.get("key", ""))
            await asyncio.to_thread(
                computer.press_key, options.session_id, keys=[key]
            )
        elif action == "hold_key":
            key = _map_key(params.get("text") or params.get("key", ""))
            # `duration` arrives in seconds; press_key is given milliseconds
            # (assumed from the former 500 default — TODO confirm with the SDK).
            seconds = params.get("duration", 0.5)
            await asyncio.to_thread(
                computer.press_key, options.session_id,
                keys=[key], duration=int(float(seconds) * 1000),
            )
        elif action == "type":
            text = params.get("text", "")
            await asyncio.to_thread(
                computer.type_text, options.session_id, text=text,
            )
        elif action in ("left_click", "right_click", "middle_click"):
            x, y = params.get("coordinate", [0, 0])
            button = {"left_click": "left", "right_click": "right", "middle_click": "middle"}[action]
            await asyncio.to_thread(
                computer.click_mouse, options.session_id, x=x, y=y, button=button,
            )
        elif action == "double_click":
            x, y = params.get("coordinate", [0, 0])
            await asyncio.to_thread(
                computer.click_mouse, options.session_id, x=x, y=y, num_clicks=2,
            )
        elif action == "triple_click":
            x, y = params.get("coordinate", [0, 0])
            await asyncio.to_thread(
                computer.click_mouse, options.session_id, x=x, y=y, num_clicks=3,
            )
        elif action == "mouse_move":
            x, y = params.get("coordinate", [0, 0])
            await asyncio.to_thread(
                computer.move_mouse, options.session_id, x=x, y=y,
            )
        elif action == "left_click_drag":
            sx, sy = params.get("start_coordinate", [0, 0])
            ex, ey = params.get("coordinate", [0, 0])
            await asyncio.to_thread(
                computer.drag_mouse, options.session_id,
                path=[[sx, sy], [ex, ey]],
            )
        elif action == "scroll":
            x, y = params.get("coordinate", [0, 0])
            # The tool's field names are scroll_direction / scroll_amount;
            # the un-prefixed names are accepted as a fallback.
            direction = params.get("scroll_direction") or params.get("direction", "down")
            amount = params.get("scroll_amount")
            if amount is None:
                amount = params.get("amount", 3)
            dx = -amount if direction == "left" else amount if direction == "right" else 0
            dy = -amount if direction == "up" else amount if direction == "down" else 0
            await asyncio.to_thread(
                computer.scroll, options.session_id,
                x=x, y=y, delta_x=dx, delta_y=dy,
            )
        elif action == "wait":
            # `duration` is in seconds.
            await asyncio.sleep(float(params.get("duration", 1)))
        elif action == "cursor_position":
            pass
        else:
            raise ValueError(f"Unknown action: {action}")

        # Screenshot after every action
        await asyncio.sleep(0.5)
        resp = await asyncio.to_thread(
            computer.capture_screenshot, options.session_id,
        )
        return base64.b64encode(resp.read()).decode()
# Coordinates from the model are divided by this scale before being
# multiplied by the viewport size (see GeminiProvider._denorm).
COORDINATE_SCALE = 1000
# Fallback viewport, aligned with the 1280x800 defaults used elsewhere in
# this template.
DEFAULT_WIDTH = 1280
DEFAULT_HEIGHT = 800
# Page opened for the argument-less `search` action.
DEFAULT_SEARCH_URL = "https://duckduckgo.com"

# Lower-cased key names -> keysym names passed to Kernel's press_key.
_KEYSYMS = {
    "control": "Control_L", "ctrl": "Control_L",
    "alt": "Alt_L", "shift": "Shift_L",
    "meta": "Super_L", "command": "Super_L", "cmd": "Super_L", "super": "Super_L",
    "enter": "Return", "return": "Return",
    "backspace": "BackSpace", "delete": "Delete",
    "tab": "Tab", "escape": "Escape", "esc": "Escape", "space": "space",
    "up": "Up", "down": "Down", "left": "Left", "right": "Right",
    "arrowup": "Up", "arrowdown": "Down", "arrowleft": "Left", "arrowright": "Right",
    "home": "Home", "end": "End",
    "pageup": "Prior", "pagedown": "Next",
}


def _to_keysym(key: str) -> str:
    """Map a key name case-insensitively; unknown names pass through."""
    key = key.strip()
    return _KEYSYMS.get(key.lower(), key)


def _system_prompt() -> str:
    """Build the system instruction, stamped with today's date."""
    date = datetime.now().strftime("%A, %B %d, %Y")
    return (
        "You are a helpful assistant that can use a web browser.\n"
        "You are operating a Chrome browser through computer use tools.\n"
        "The browser is already open and ready for use.\n"
        "When you need to navigate to a page, use the navigate action.\n"
        "After each action, carefully evaluate the screenshot.\n"
        f"Current date: {date}."
    )


class GeminiProvider:
    """CUA provider that drives a Kernel browser with Gemini computer use."""

    name = "gemini"

    def __init__(self) -> None:
        self._api_key = os.environ.get("GOOGLE_API_KEY", "")

    def is_configured(self) -> bool:
        """Return True when GOOGLE_API_KEY is set to a non-empty value."""
        return len(self._api_key) > 0

    async def run_task(self, options: TaskOptions) -> TaskResult:
        """Loop model call -> function execution for at most 50 turns.

        Returns:
            The model's text once it stops issuing function calls,
            "(no response)" if a turn has no candidate content, or
            "(max iterations reached)" after 50 turns.
        """
        width = options.viewport_width or DEFAULT_WIDTH
        height = options.viewport_height or DEFAULT_HEIGHT
        client = genai.Client(api_key=self._api_key)
        model = options.model or "gemini-2.5-computer-use-preview-10-2025"

        contents: list[Content] = [
            Content(role="user", parts=[Part(text=options.query)]),
        ]

        for _i in range(50):
            # The SDK call is synchronous; run it off the event loop.
            response = await asyncio.to_thread(
                client.models.generate_content,
                model=model,
                contents=contents,
                config=GenerateContentConfig(
                    temperature=1,
                    top_p=0.95,
                    top_k=40,
                    max_output_tokens=8192,
                    system_instruction=_system_prompt(),
                    tools=[Tool(computer_use=ComputerUse(environment=Environment.ENVIRONMENT_BROWSER))],
                    thinking_config=ThinkingConfig(include_thoughts=True),
                ),
            )

            if not response.candidates or not response.candidates[0].content:
                # Report an empty turn as such, not as an exhausted loop.
                return TaskResult(result="(no response)", provider=self.name)

            candidate = response.candidates[0]
            contents.append(candidate.content)

            # Extract text and function calls
            text_parts = [
                p.text for p in (candidate.content.parts or [])
                if hasattr(p, "text") and p.text
            ]
            function_calls = [
                p.function_call for p in (candidate.content.parts or [])
                if hasattr(p, "function_call") and p.function_call
            ]

            if not function_calls:
                return TaskResult(
                    result=" ".join(text_parts) or "(no response)",
                    provider=self.name,
                )

            # Execute function calls
            responses: list[Part] = []
            for fc in function_calls:
                if not fc.name:
                    continue
                args = dict(fc.args) if fc.args else {}

                # Safety decisions are only logged here, not acknowledged.
                safety = args.get("safety_decision", {})
                if isinstance(safety, dict) and safety.get("decision") == "require_confirmation":
                    print(f"Safety check: {safety.get('explanation', '')}")

                result = await self._execute_action(
                    options, fc.name, args, width, height,
                )

                if result.get("error"):
                    responses.append(Part.from_function_response(
                        name=fc.name,
                        response={"error": result["error"], "url": "about:blank"},
                    ))
                else:
                    responses.append(Part.from_function_response(
                        name=fc.name,
                        response={"url": result.get("url", "about:blank")},
                    ))
                    if result.get("screenshot"):
                        responses.append(Part(inline_data={
                            "mime_type": "image/png",
                            "data": result["screenshot"],
                        }))

            contents.append(Content(role="user", parts=responses))

        return TaskResult(result="(max iterations reached)", provider=self.name)

    def _denorm(self, value: float | None, dimension: int) -> int:
        """Scale a 0..COORDINATE_SCALE coordinate to pixels; None maps to 0."""
        if value is None:
            return 0
        return round((value / COORDINATE_SCALE) * dimension)

    async def _execute_action(
        self,
        options: TaskOptions,
        name: str,
        args: dict,
        width: int,
        height: int,
    ) -> dict:
        """Execute one function call against the Kernel browser.

        Args:
            options: Supplies the Kernel client and the browser session id.
            name: Function name from the model.
            args: Function arguments from the model.
            width: Viewport width in pixels used to de-normalise x values.
            height: Viewport height in pixels used to de-normalise y values.

        Returns:
            ``{"screenshot": <base64>, "url": "about:blank"}`` on success, or
            ``{"error": <message>}``. Exceptions are caught and returned in
            the error form so they can be sent back to the model.
        """
        computer = options.kernel.browsers.computer

        try:
            if name == "click_at":
                x = self._denorm(args.get("x"), width)
                y = self._denorm(args.get("y"), height)
                await asyncio.to_thread(computer.click_mouse, options.session_id, x=x, y=y)

            elif name == "hover_at":
                x = self._denorm(args.get("x"), width)
                y = self._denorm(args.get("y"), height)
                await asyncio.to_thread(computer.move_mouse, options.session_id, x=x, y=y)

            elif name == "type_text_at":
                x = self._denorm(args.get("x"), width)
                y = self._denorm(args.get("y"), height)
                await asyncio.to_thread(computer.click_mouse, options.session_id, x=x, y=y)
                # Both flags default to True in the Gemini action schema.
                if args.get("clear_before_typing", True):
                    await asyncio.to_thread(
                        computer.press_key, options.session_id,
                        keys=["a"], hold_keys=["Control_L"],
                    )
                    await asyncio.to_thread(
                        computer.press_key, options.session_id, keys=["BackSpace"],
                    )
                text = args.get("text", "")
                if text:
                    await asyncio.to_thread(computer.type_text, options.session_id, text=text)
                if args.get("press_enter", True):
                    await asyncio.to_thread(
                        computer.press_key, options.session_id, keys=["Return"],
                    )

            elif name in ("scroll_document", "scroll_at"):
                if name == "scroll_at":
                    x = self._denorm(args.get("x"), width)
                    y = self._denorm(args.get("y"), height)
                else:
                    # Whole-document scroll: aim at the viewport centre.
                    x, y = width // 2, height // 2
                magnitude = args.get("magnitude", 3)
                direction = args.get("direction", "down")
                dy = -magnitude if direction == "up" else magnitude if direction == "down" else 0
                dx = -magnitude if direction == "left" else magnitude if direction == "right" else 0
                await asyncio.to_thread(
                    computer.scroll, options.session_id, x=x, y=y, delta_x=dx, delta_y=dy,
                )

            elif name == "wait_5_seconds":
                await asyncio.sleep(5)

            elif name == "go_back":
                await asyncio.to_thread(
                    computer.press_key, options.session_id, keys=["Left"], hold_keys=["Alt_L"],
                )

            elif name == "go_forward":
                await asyncio.to_thread(
                    computer.press_key, options.session_id, keys=["Right"], hold_keys=["Alt_L"],
                )

            elif name in ("navigate", "search"):
                url = args.get("url") or args.get("text", "")
                if not url and name == "search":
                    # `search` carries no arguments; open a search page.
                    url = DEFAULT_SEARCH_URL
                if not url:
                    return {"error": "navigate requires a url"}
                # Focus the URL bar, select its contents, type, submit.
                await asyncio.to_thread(
                    computer.batch, options.session_id, actions=[
                        {"type": "press_key", "press_key": {"keys": ["l"], "hold_keys": ["Control_L"]}},
                        {"type": "sleep", "sleep": {"duration_ms": 200}},
                        {"type": "press_key", "press_key": {"keys": ["a"], "hold_keys": ["Control_L"]}},
                        {"type": "type_text", "type_text": {"text": url}},
                        {"type": "press_key", "press_key": {"keys": ["Return"]}},
                    ],
                )
                await asyncio.sleep(1.5)

            elif name == "key_combination":
                # The schema's argument is `keys`; the old name is a fallback.
                combo = args.get("keys") or args.get("key_combination", "")
                if isinstance(combo, (list, tuple)):
                    combo = "+".join(str(k) for k in combo)
                parts = [_to_keysym(k) for k in combo.split("+") if k.strip()]
                if not parts:
                    return {"error": "key_combination requires keys"}
                # Everything before the last key is held as a modifier.
                *hold_keys, key = parts
                kwargs: dict = {"keys": [key]}
                if hold_keys:
                    kwargs["hold_keys"] = hold_keys
                await asyncio.to_thread(
                    computer.press_key, options.session_id, **kwargs,
                )

            elif name == "drag_and_drop":
                sx = self._denorm(args.get("start_x"), width)
                sy = self._denorm(args.get("start_y"), height)
                ex = self._denorm(args.get("end_x"), width)
                ey = self._denorm(args.get("end_y"), height)
                await asyncio.to_thread(
                    computer.drag_mouse, options.session_id, path=[[sx, sy], [ex, ey]],
                )

            elif name == "open_web_browser":
                pass

            else:
                return {"error": f"Unknown action: {name}"}

            # Screenshot after every action
            await asyncio.sleep(0.5)
            resp = await asyncio.to_thread(
                computer.capture_screenshot, options.session_id,
            )
            screenshot = base64.b64encode(resp.read()).decode()
            # The current URL is not queried; a placeholder is reported.
            return {"screenshot": screenshot, "url": "about:blank"}

        except Exception as exc:
            return {"error": str(exc)}
MODIFIER_KEYSYMS] + primary = [k for k in translated if k not in MODIFIER_KEYSYMS] + + if not primary: + return translated, translated_hold + + merged = list(dict.fromkeys(translated_hold + hold_from_keys)) + return primary, merged + + +def _translate_action(action: dict) -> list[dict]: + action_type = action.get("type", "") + + if action_type == "click": + button = action.get("button", "left") + if button == "back": + return [{"type": "press_key", "press_key": {"keys": ["Left"], "hold_keys": ["Alt_L"]}}] + if button == "forward": + return [{"type": "press_key", "press_key": {"keys": ["Right"], "hold_keys": ["Alt_L"]}}] + if button == "wheel": + return [{"type": "scroll", "scroll": { + "x": action.get("x", 0), "y": action.get("y", 0), + "delta_x": action.get("scroll_x", 0), "delta_y": action.get("scroll_y", 0), + }}] + btn = "left" + if isinstance(button, int): + btn = {2: "middle", 3: "right"}.get(button, "left") + elif isinstance(button, str): + btn = button + return [{"type": "click_mouse", "click_mouse": {"x": action.get("x", 0), "y": action.get("y", 0), "button": btn}}] + + if action_type == "double_click": + return [{"type": "click_mouse", "click_mouse": {"x": action.get("x", 0), "y": action.get("y", 0), "num_clicks": 2}}] + + if action_type == "type": + return [{"type": "type_text", "type_text": {"text": action.get("text", "")}}] + + if action_type == "keypress": + primary, hold = _expand_and_translate(action.get("keys", []), action.get("hold_keys", [])) + result: dict = {"type": "press_key", "press_key": {"keys": primary}} + if hold: + result["press_key"]["hold_keys"] = hold + return [result] + + if action_type == "scroll": + return [{"type": "scroll", "scroll": { + "x": action.get("x", 0), "y": action.get("y", 0), + "delta_x": action.get("scroll_x", 0), "delta_y": action.get("scroll_y", 0), + }}] + + if action_type == "move": + return [{"type": "move_mouse", "move_mouse": {"x": action.get("x", 0), "y": action.get("y", 0)}}] + + if action_type == "drag": + 
path = action.get("path", []) + points = [] + for p in path: + if isinstance(p, dict): + points.append([p["x"], p["y"]]) + elif isinstance(p, (list, tuple)) and len(p) >= 2: + points.append([p[0], p[1]]) + if len(points) < 2: + raise ValueError("drag requires at least 2 path points") + return [{"type": "drag_mouse", "drag_mouse": {"path": points}}] + + if action_type == "wait": + return [{"type": "sleep", "sleep": {"duration_ms": action.get("ms", 1000)}}] + + if action_type == "goto": + url = action.get("url", "") + return [ + {"type": "press_key", "press_key": {"keys": ["l"], "hold_keys": ["Control_L"]}}, + {"type": "sleep", "sleep": {"duration_ms": 200}}, + {"type": "press_key", "press_key": {"keys": ["a"], "hold_keys": ["Control_L"]}}, + {"type": "type_text", "type_text": {"text": url}}, + {"type": "press_key", "press_key": {"keys": ["Return"]}}, + ] + + if action_type == "back": + return [{"type": "press_key", "press_key": {"keys": ["Left"], "hold_keys": ["Alt_L"]}}] + + if action_type == "screenshot": + return [] + + raise ValueError(f"Unknown CUA action: {action_type}") + + +async def _create_response(api_key: str, **kwargs) -> dict: + """Call the OpenAI Responses API with retry.""" + async with httpx.AsyncClient(timeout=120) as client: + for attempt in range(4): + try: + resp = await client.post( + "https://api.openai.com/v1/responses", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=kwargs, + ) + resp.raise_for_status() + return resp.json() + except httpx.HTTPStatusError as exc: + if exc.response.status_code >= 500 and attempt < 3: + await asyncio.sleep(2 ** attempt) + continue + raise + raise RuntimeError("Max retries exceeded") + + +class OpenAIProvider: + name = "openai" + + def __init__(self) -> None: + self._api_key = os.environ.get("OPENAI_API_KEY", "") + + def is_configured(self) -> bool: + return len(self._api_key) > 0 + + async def run_task(self, options: TaskOptions) -> TaskResult: + computer = 
options.kernel.browsers.computer + + # Navigate to starting page + goto_actions = _translate_action({"type": "goto", "url": "https://duckduckgo.com"}) + await asyncio.to_thread( + computer.batch, options.session_id, actions=goto_actions, + ) + + input_items = [ + {"role": "system", "content": f"Current date: {datetime.now().isoformat()}"}, + {"type": "message", "role": "user", "content": [{"type": "input_text", "text": options.query}]}, + ] + items: list[dict] = [] + + for _turn in range(50): + response = await _create_response( + self._api_key, + model=options.model or "gpt-5.4", + input=input_items + items, + tools=[{"type": "computer"}], + truncation="auto", + reasoning={"effort": "low", "summary": "concise"}, + ) + + output = response.get("output", []) + if not output: + raise RuntimeError("No output from model") + + for item in output: + items.append(item) + + if item.get("type") == "computer_call": + action_list = item.get("actions") or ([item["action"]] if "action" in item else []) + + batch: list[dict] = [] + for a in action_list: + batch.extend(_translate_action(a)) + if batch: + await asyncio.to_thread( + computer.batch, options.session_id, actions=batch, + ) + + # Safety checks + for check in item.get("pending_safety_checks", []): + print(f"Safety check: {check.get('message', '')}") + + await asyncio.sleep(0.3) + resp = await asyncio.to_thread( + computer.capture_screenshot, options.session_id, + ) + screenshot = base64.b64encode(resp.read()).decode() + + items.append({ + "type": "computer_call_output", + "call_id": item["call_id"], + "acknowledged_safety_checks": item.get("pending_safety_checks", []), + "output": { + "type": "computer_screenshot", + "image_url": f"data:image/png;base64,{screenshot}", + }, + }) + + # Check for final assistant message + last = output[-1] if output else {} + if last.get("role") == "assistant": + content = last.get("content", []) + texts = [c.get("text", "") for c in content if isinstance(c, dict) and "text" in c] + return 
TaskResult(result=" ".join(texts) or "(no response)", provider=self.name) + + return TaskResult(result="(max turns reached)", provider=self.name) diff --git a/pkg/templates/python/cua/pyproject.toml b/pkg/templates/python/cua/pyproject.toml new file mode 100644 index 0000000..37e844e --- /dev/null +++ b/pkg/templates/python/cua/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "python-cua" +version = "0.1.0" +description = "Unified CUA template with multi-provider fallback for Kernel" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "anthropic>=0.92.0", + "google-genai>=1.71.0", + "httpx>=0.28.1", + "kernel>=0.47.0", + "openai>=2.30.0", + "python-dotenv>=1.2.2", +] diff --git a/pkg/templates/python/cua/session.py b/pkg/templates/python/cua/session.py new file mode 100644 index 0000000..3cbe254 --- /dev/null +++ b/pkg/templates/python/cua/session.py @@ -0,0 +1,163 @@ +"""Kernel Browser Session Manager with optional replay recording.""" + +from __future__ import annotations + +import asyncio +import time +from dataclasses import dataclass, field +from typing import Any + +from kernel import Kernel + + +@dataclass +class SessionOptions: + invocation_id: str | None = None + stealth: bool = True + timeout_seconds: int = 300 + record_replay: bool = False + replay_grace_period: float = 5.0 + viewport_width: int = 1280 + viewport_height: int = 800 + proxy_id: str | None = None + profile: dict | None = None + extensions: list[dict] | None = None + + +@dataclass +class SessionInfo: + session_id: str = "" + live_view_url: str = "" + replay_id: str | None = None + replay_view_url: str | None = None + viewport_width: int = 1280 + viewport_height: int = 800 + + +class KernelBrowserSession: + """Manages Kernel browser lifecycle with optional replay recording.""" + + def __init__(self, kernel: Kernel, options: SessionOptions | None = None) -> None: + self.kernel = kernel + self.opts = options or SessionOptions() + self._session_id: str | None = None + 
self._live_view_url: str | None = None + self._replay_id: str | None = None + self._replay_view_url: str | None = None + + @property + def session_id(self) -> str: + if not self._session_id: + raise RuntimeError("Session not started. Call start() first.") + return self._session_id + + @property + def live_view_url(self) -> str | None: + return self._live_view_url + + @property + def replay_view_url(self) -> str | None: + return self._replay_view_url + + @property + def info(self) -> SessionInfo: + return SessionInfo( + session_id=self.session_id, + live_view_url=self._live_view_url or "", + replay_id=self._replay_id, + replay_view_url=self._replay_view_url, + viewport_width=self.opts.viewport_width, + viewport_height=self.opts.viewport_height, + ) + + async def start(self) -> SessionInfo: + create_kwargs: dict = { + "invocation_id": self.opts.invocation_id, + "stealth": self.opts.stealth, + "timeout_seconds": self.opts.timeout_seconds, + "viewport": { + "width": self.opts.viewport_width, + "height": self.opts.viewport_height, + }, + } + if self.opts.proxy_id: + create_kwargs["proxy_id"] = self.opts.proxy_id + if self.opts.profile: + create_kwargs["profile"] = self.opts.profile + if self.opts.extensions: + create_kwargs["extensions"] = self.opts.extensions + + browser = await asyncio.to_thread( + self.kernel.browsers.create, + **create_kwargs, + ) + + self._session_id = browser.session_id + self._live_view_url = getattr(browser, "browser_live_view_url", None) + + print(f"Browser session: {self._session_id}") + print(f"Live view: {self._live_view_url}") + + if self.opts.record_replay: + try: + replay = await asyncio.to_thread( + self.kernel.browsers.replays.start, self._session_id, + ) + self._replay_id = replay.replay_id + print(f"Replay recording started: {self._replay_id}") + except Exception as exc: + print(f"Warning: Failed to start replay: {exc}") + + return self.info + + async def stop(self) -> SessionInfo: + info = self.info + + if self._session_id: + try: + 
if self.opts.record_replay and self._replay_id: + if self.opts.replay_grace_period > 0: + await asyncio.sleep(self.opts.replay_grace_period) + await self._stop_replay() + info.replay_view_url = self._replay_view_url + finally: + print(f"Destroying browser session: {self._session_id}") + await asyncio.to_thread( + self.kernel.browsers.delete_by_id, self._session_id, + ) + + self._session_id = None + self._live_view_url = None + self._replay_id = None + self._replay_view_url = None + + return info + + async def _stop_replay(self) -> None: + if not self._session_id or not self._replay_id: + return + + await asyncio.to_thread( + self.kernel.browsers.replays.stop, + self._replay_id, + id=self._session_id, + ) + await asyncio.sleep(2) + + deadline = time.monotonic() + 60 + while time.monotonic() < deadline: + try: + replays = await asyncio.to_thread( + self.kernel.browsers.replays.list, self._session_id, + ) + for r in replays: + if r.replay_id == self._replay_id: + self._replay_view_url = getattr(r, "replay_view_url", None) + if self._replay_view_url: + print(f"Replay URL: {self._replay_view_url}") + return + except Exception: + pass + await asyncio.sleep(1) + + print("Warning: Replay may still be processing.") diff --git a/pkg/templates/typescript/cua/.env.example b/pkg/templates/typescript/cua/.env.example new file mode 100644 index 0000000..b56ea3b --- /dev/null +++ b/pkg/templates/typescript/cua/.env.example @@ -0,0 +1,26 @@ +# Copy this file to .env and fill in your API keys. +# Only the key for your chosen provider is required. + +# Primary provider: "anthropic", "openai", or "gemini" +CUA_PROVIDER=anthropic + +# Comma-separated fallback order (optional). +# If the primary provider fails, these are tried in order. 
+# CUA_FALLBACK_PROVIDERS=openai,gemini + +# Provider API keys — set the one(s) you plan to use +ANTHROPIC_API_KEY=your_anthropic_api_key_here +OPENAI_API_KEY=your_openai_api_key_here +GOOGLE_API_KEY=your_google_api_key_here + +# Browser config (proxy, profile, extensions, timeout) is set per-request +# via the payload "browser" field, not here. Example: +# kernel invoke ts-cua cua-task --payload '{ +# "query": "...", +# "browser": { +# "proxy_id": "proxy_abc123", +# "profile": { "name": "my-profile", "save_changes": true }, +# "extensions": [{ "name": "my-extension" }], +# "timeout_seconds": 600 +# } +# }' diff --git a/pkg/templates/typescript/cua/README.md b/pkg/templates/typescript/cua/README.md new file mode 100644 index 0000000..ada6069 --- /dev/null +++ b/pkg/templates/typescript/cua/README.md @@ -0,0 +1,85 @@ +# Unified CUA Template + +A multi-provider Computer Use Agent (CUA) template for [Kernel](https://kernel.sh). Supports **Anthropic**, **OpenAI**, and **Google Gemini** as interchangeable backends with automatic fallback. + +## Quick start + +### 1. Install dependencies + +```bash +npm install +``` + +### 2. Configure environment + +Copy the example env file and add your API keys: + +```bash +cp .env.example .env +``` + +Set `CUA_PROVIDER` to your preferred provider and add the matching API key: + +| Provider | Env var for key | Model used | +|-------------|----------------------|--------------------------------------------| +| `anthropic` | `ANTHROPIC_API_KEY` | `claude-sonnet-4-6` | +| `openai` | `OPENAI_API_KEY` | `gpt-5.4` | +| `gemini` | `GOOGLE_API_KEY` | `gemini-2.5-computer-use-preview-10-2025` | + +### 3. Deploy to Kernel + +```bash +kernel deploy index.ts --env-file .env +``` + +### 4. 
Invoke + +```bash +kernel invoke ts-cua cua-task --payload '{"query": "Go to https://news.ycombinator.com and get the top 5 stories"}' +``` + +## Multi-provider fallback + +Set `CUA_FALLBACK_PROVIDERS` to automatically try another provider if the primary fails: + +```env +CUA_PROVIDER=anthropic +CUA_FALLBACK_PROVIDERS=openai,gemini +``` + +This will try Anthropic first, then OpenAI, then Gemini. Only providers with valid API keys are used. + +## Replay recording + +Pass `record_replay: true` in the payload to capture a video replay of the browser session: + +```bash +kernel invoke ts-cua cua-task --payload '{"query": "Navigate to example.com", "record_replay": true}' +``` + +The response will include a `replay_url` you can open in your browser. + +## Project structure + +``` +index.ts — Kernel app entrypoint +session.ts — Browser session lifecycle with replay support +providers/ + index.ts — Provider factory and fallback logic + anthropic.ts — Anthropic Claude adapter + openai.ts — OpenAI GPT adapter + gemini.ts — Google Gemini adapter +``` + +## Customization + +Each provider adapter is self-contained. To customize a provider's behavior (system prompt, model, tool handling), edit the corresponding file in `providers/`. + +To add a new provider, create a new file that implements the `CuaProvider` interface and register it in `providers/index.ts`. 
+ +## Resources + +- [Kernel Docs](https://docs.kernel.sh) +- [Anthropic Computer Use](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use) +- [OpenAI Computer Use](https://platform.openai.com/docs/guides/computer-use) +- [Google Gemini Computer Use](https://ai.google.dev/gemini-api/docs/computer-use) diff --git a/pkg/templates/typescript/cua/_gitignore b/pkg/templates/typescript/cua/_gitignore new file mode 100644 index 0000000..0b43630 --- /dev/null +++ b/pkg/templates/typescript/cua/_gitignore @@ -0,0 +1,32 @@ +# Dependencies +node_modules/ +package-lock.json + +# TypeScript +*.tsbuildinfo +dist/ +build/ + +# Environment +.env +.env.local +.env.*.local + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +logs/ +*.log +npm-debug.log* + +# Misc +.cache/ +.temp/ diff --git a/pkg/templates/typescript/cua/index.ts b/pkg/templates/typescript/cua/index.ts new file mode 100644 index 0000000..dc165f0 --- /dev/null +++ b/pkg/templates/typescript/cua/index.ts @@ -0,0 +1,122 @@ +/** + * Unified CUA (Computer Use Agent) template with multi-provider support. + * + * Supports Anthropic, OpenAI, and Gemini as interchangeable providers. 
+ * Configure via environment variables: + * CUA_PROVIDER — primary provider ("anthropic", "openai", or "gemini") + * CUA_FALLBACK_PROVIDERS — comma-separated fallback order (optional) + * + * Each provider requires its own API key: + * ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY + */ + +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { KernelBrowserSession } from './session'; +import { resolveProviders, runWithFallback, type ProviderName } from './providers/index'; + +const kernel = new Kernel(); +const app = kernel.app('ts-cua'); + +interface BrowserConfig { + proxy_id?: string; + profile?: { id?: string; name?: string; save_changes?: boolean }; + extensions?: Array<{ id?: string; name?: string }>; + timeout_seconds?: number; +} + +interface CuaInput { + query: string; + provider?: ProviderName; + model?: string; + record_replay?: boolean; + session_id?: string; + browser?: BrowserConfig; +} + +interface CuaOutput { + result: string; + provider: string; + replay_url?: string; +} + +// Provider resolution is deferred to the action handler because env vars +// are not available during Hypeman's build/discovery phase. +// The resolved list is cached in _providers after the first successful call. +let _providers: ReturnType<typeof resolveProviders> | null = null; +function getProviders() { + if (!_providers) { + _providers = resolveProviders(); + console.log(`Configured providers: ${_providers.map(p => p.name).join(' -> ')}`); + } + return _providers; +} + +app.action( + 'cua-task', + async (ctx: KernelContext, payload?: CuaInput): Promise<CuaOutput> => { + if (!payload?.query) { + throw new Error('Query is required. Payload must include: { "query": "your task description" }'); + } + + let providers = getProviders(); + + // Per-request provider override: move requested provider to front + if (payload.provider) { + const requested = providers.find(p => p.name === payload.provider); + if (requested) { + providers = [requested, ...providers.filter(p => p !== requested)]; + } + } + + // Use an existing browser session (BYOB) or create a new one. 
+ // BYOB is useful for multi-turn CUA on a persistent browser, or HITL + // where a human uses the live view between CUA calls. + if (payload.session_id) { + const browser = await kernel.browsers.retrieve(payload.session_id); + const { result, provider } = await runWithFallback(providers, { + query: payload.query, + model: payload.model, + kernel, + sessionId: payload.session_id, + viewportWidth: browser.viewport?.width ?? 1280, + viewportHeight: browser.viewport?.height ?? 800, + }); + return { result, provider }; + } + + const session = new KernelBrowserSession(kernel, { + invocationId: ctx.invocation_id, + stealth: true, + recordReplay: payload.record_replay ?? false, + ...(payload.browser?.proxy_id ? { proxyId: payload.browser.proxy_id } : {}), + ...(payload.browser?.profile ? { profile: payload.browser.profile } : {}), + ...(payload.browser?.extensions ? { extensions: payload.browser.extensions } : {}), + ...(payload.browser?.timeout_seconds ? { timeoutSeconds: payload.browser.timeout_seconds } : {}), + }); + + await session.start(); + console.log('Live view:', session.liveViewUrl); + + try { + const { result, provider } = await runWithFallback(providers, { + query: payload.query, + model: payload.model, + kernel, + sessionId: session.sessionId, + viewportWidth: session.viewportWidth, + viewportHeight: session.viewportHeight, + }); + + const sessionInfo = await session.stop(); + + return { + result, + provider, + replay_url: sessionInfo.replayViewUrl, + }; + } catch (error) { + console.error('CUA task failed:', error); + await session.stop(); + throw error; + } + }, +); diff --git a/pkg/templates/typescript/cua/package.json b/pkg/templates/typescript/cua/package.json new file mode 100644 index 0000000..08c932d --- /dev/null +++ b/pkg/templates/typescript/cua/package.json @@ -0,0 +1,16 @@ +{ + "name": "ts-cua", + "module": "index.ts", + "type": "module", + "private": true, + "dependencies": { + "@anthropic-ai/sdk": "^0.86.1", + "@google/genai": "^1.49.0", + 
"@onkernel/sdk": "^0.47.0", + "openai": "^6.33.0" + }, + "devDependencies": { + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } +} diff --git a/pkg/templates/typescript/cua/providers/anthropic.ts b/pkg/templates/typescript/cua/providers/anthropic.ts new file mode 100644 index 0000000..6c8e4e8 --- /dev/null +++ b/pkg/templates/typescript/cua/providers/anthropic.ts @@ -0,0 +1,229 @@ +/** + * Anthropic CUA provider adapter. + * + * Uses the Anthropic SDK's beta computer-use API with Claude models. + */ + +import { Anthropic } from '@anthropic-ai/sdk'; +import type { CuaProvider, TaskOptions, TaskResult } from './index'; + +function getSystemPrompt(): string { + const date = new Date().toLocaleDateString('en-US', { weekday: 'long', month: 'long', day: 'numeric', year: 'numeric' }); + return ` +* You are utilising an Ubuntu virtual machine with internet access. +* When you connect to the display, CHROMIUM IS ALREADY OPEN. +* If you need to navigate to a new page, use ctrl+l to focus the url bar and then enter the url. +* After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. +* Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. +* Only when you confirm a step was executed correctly should you move on to the next one. +* The current date is ${date}. + + + +* When using Chromium, if a startup wizard appears, IGNORE IT. +* Click on the search bar and enter the appropriate URL there. +`; +} + +type BetaMessageParam = Anthropic.Beta.Messages.BetaMessageParam; +type BetaContentBlockParam = Anthropic.Beta.Messages.BetaContentBlockParam; + +export class AnthropicProvider implements CuaProvider { + readonly name = 'anthropic'; + private apiKey: string; + + constructor() { + this.apiKey = process.env.ANTHROPIC_API_KEY ?? 
''; + } + + isConfigured(): boolean { + return this.apiKey.length > 0; + } + + /** + * Run the Claude computer-use loop until the model ends its turn, stops + * requesting tools, or the iteration cap is reached. + */ + async runTask(options: TaskOptions): Promise<TaskResult> { + const { query, kernel, sessionId, viewportWidth = 1280, viewportHeight = 800 } = options; + const client = new Anthropic({ apiKey: this.apiKey, maxRetries: 4 }); + const model = options.model || 'claude-sonnet-4-6'; + + const messages: BetaMessageParam[] = [{ role: 'user', content: query }]; + // Cap the agent loop (same limit as the Gemini adapter) so a model that + // never ends its turn cannot keep calling the API forever. + const maxIterations = 50; + + for (let i = 0; i < maxIterations; i++) { + const response = await client.beta.messages.create({ + max_tokens: 4096, + messages, + model, + system: [{ type: 'text', text: getSystemPrompt(), cache_control: { type: 'ephemeral' } }], + tools: [{ + type: 'computer_20251124', + name: 'computer', + display_width_px: viewportWidth, + display_height_px: viewportHeight, + display_number: 1, + }], + betas: ['computer-use-2025-11-24', 'prompt-caching-2024-07-31'], + thinking: { type: 'enabled', budget_tokens: 1024 }, + }); + + // Build assistant content for the messages array + const assistantContent: BetaContentBlockParam[] = response.content.map(block => { + if (block.type === 'thinking') { + return { type: 'thinking' as const, thinking: block.thinking, signature: block.signature }; + } + if (block.type === 'text') { + return { type: 'text' as const, text: block.text }; + } + if (block.type === 'tool_use') { + return { type: 'tool_use' as const, id: block.id, name: block.name, input: block.input }; + } + return block as unknown as BetaContentBlockParam; + }); + messages.push({ role: 'assistant', content: assistantContent }); + + if (response.stop_reason === 'end_turn') { + const text = response.content + .filter((b): b is Anthropic.Beta.Messages.BetaTextBlock => b.type === 'text') + .map(b => b.text) + .join(''); + return { result: text, provider: this.name }; + } + + // Process tool calls + const toolResults: BetaContentBlockParam[] = []; + for (const block of response.content) { + if (block.type !== 'tool_use') continue; + + const input = block.input as Record<string, unknown>; + const action = 
input.action as string; + + try { + const screenshot = await this.executeAction(kernel, sessionId, action, input); + toolResults.push({ + type: 'tool_result' as unknown as 'text', + tool_use_id: block.id, + content: [{ type: 'image', source: { type: 'base64', media_type: 'image/png', data: screenshot } }], + } as unknown as BetaContentBlockParam); + } catch (error) { + toolResults.push({ + type: 'tool_result' as unknown as 'text', + tool_use_id: block.id, + content: [{ type: 'text', text: `Error: ${error instanceof Error ? error.message : String(error)}` }], + is_error: true, + } as unknown as BetaContentBlockParam); + } + } + + if (toolResults.length > 0) { + messages.push({ role: 'user', content: toolResults }); + } else { + // No tool use and not end_turn — model is done + const text = response.content + .filter((b): b is Anthropic.Beta.Messages.BetaTextBlock => b.type === 'text') + .map(b => b.text) + .join(''); + return { result: text || '(no response)', provider: this.name }; + } + } + + return { result: '(max iterations reached)', provider: this.name }; + } + + /** + * Execute one computer-tool action on the Kernel browser and return a + * base64 PNG screenshot taken afterwards. Throws on unknown actions. + */ + private async executeAction( + kernel: TaskOptions['kernel'], + sessionId: string, + action: string, + input: Record<string, unknown>, + ): Promise<string> { + const computer = kernel.browsers.computer; + + switch (action) { + case 'screenshot': break; + case 'key': { + // The computer tool sends the key/combo in "text"; "key" is kept as a fallback. + const key = (input.text ?? input.key) as string; + await computer.pressKey(sessionId, { keys: [this.mapKey(key)] }); + break; + } + case 'hold_key': { + const key = (input.text ?? input.key) as string; + // Tool durations are in seconds; pressKey is given milliseconds. + const seconds = (input.duration as number | undefined) ?? 
0.5; + await computer.pressKey(sessionId, { keys: [this.mapKey(key)], duration: Math.round(seconds * 1000) }); + break; + } + case 'type': { + const text = input.text as string; + await computer.typeText(sessionId, { text }); + break; + } + case 'cursor_position': break; + case 'mouse_move': { + const [x, y] = input.coordinate as [number, number]; + await computer.moveMouse(sessionId, { x, y }); + break; + } + case 'left_click': + case 'right_click': + case 'middle_click': { + const [x, y] = input.coordinate as [number, number]; + const button = action === 'right_click' ? 'right' : action === 'middle_click' ? 'middle' : 'left'; + await computer.clickMouse(sessionId, { x, y, button }); + break; + } + case 'double_click': { + const [x, y] = input.coordinate as [number, number]; + await computer.clickMouse(sessionId, { x, y, num_clicks: 2 }); + break; + } + case 'triple_click': { + const [x, y] = input.coordinate as [number, number]; + await computer.clickMouse(sessionId, { x, y, num_clicks: 3 }); + break; + } + case 'left_click_drag': { + const startCoordinate = input.start_coordinate as [number, number]; + const [ex, ey] = input.coordinate as [number, number]; + await computer.dragMouse(sessionId, { + path: [ + [startCoordinate[0], startCoordinate[1]], + [ex, ey], + ], + }); + break; + } + case 'scroll': { + const [x, y] = input.coordinate as [number, number]; + // The tool names these scroll_direction / scroll_amount; the short names are kept as a fallback. + const direction = (input.scroll_direction ?? input.direction) as string; + const amount = ((input.scroll_amount ?? input.amount) as number | undefined) ?? 3; + const deltaX = direction === 'left' ? -amount : direction === 'right' ? amount : 0; + const deltaY = direction === 'up' ? -amount : direction === 'down' ? amount : 0; + await computer.scroll(sessionId, { x, y, delta_x: deltaX, delta_y: deltaY }); + break; + } + case 'wait': { + // Tool durations are in seconds; setTimeout takes milliseconds. + const seconds = (input.duration as number | undefined) ?? 
1; + await new Promise(r => setTimeout(r, seconds * 1000)); + break; + } + default: + throw new Error(`Unknown action: ${action}`); + } + + // Take screenshot after every action + await new Promise(r => setTimeout(r, 500)); + const resp = await computer.captureScreenshot(sessionId); + const buf = Buffer.from(await resp.arrayBuffer()); + return buf.toString('base64'); + } + + /** Translate a key name or "a+b" combo through the lookup table below; unknown names pass through unchanged. */ + private mapKey(key: string): string { + const map: Record<string, string> = { + Return: 'Return', Enter: 'Return', Backspace: 'BackSpace', + Tab: 'Tab', Escape: 'Escape', space: 'space', Space: 'space', + Up: 'Up', Down: 'Down', Left: 'Left', Right: 'Right', + Home: 'Home', End: 'End', Page_Up: 'Prior', Page_Down: 'Next', + ctrl: 'Control_L', Control_L: 'Control_L', alt: 'Alt_L', Alt_L: 'Alt_L', + shift: 'Shift_L', Shift_L: 'Shift_L', super: 'Super_L', Super_L: 'Super_L', + }; + // Handle combos like "ctrl+l" + if (key.includes('+')) { + return key.split('+').map(k => map[k.trim()] ?? k.trim()).join('+'); + } + return map[key] ?? key; + } +} diff --git a/pkg/templates/typescript/cua/providers/gemini.ts b/pkg/templates/typescript/cua/providers/gemini.ts new file mode 100644 index 0000000..1cc1ba3 --- /dev/null +++ b/pkg/templates/typescript/cua/providers/gemini.ts @@ -0,0 +1,247 @@ +/** + * Gemini CUA provider adapter. + * + * Uses Google's GenAI SDK with the computer-use-preview model. + */ + +import { + GoogleGenAI, + Environment, + type Content, + type FunctionCall, + type Part, +} from '@google/genai'; +import type { CuaProvider, TaskOptions, TaskResult } from './index'; + +// Gemini uses a 0-1000 coordinate scale that maps to actual screen pixels. 
+const COORDINATE_SCALE = 1000; +// Fallback viewport; matches the 1280x800 default used by the browser session and the other adapters. +const DEFAULT_WIDTH = 1280; +const DEFAULT_HEIGHT = 800; + +const PREDEFINED_ACTIONS = [ + 'click_at', 'hover_at', 'type_text_at', 'scroll_document', + 'scroll_at', 'wait_5_seconds', 'go_back', 'go_forward', + 'search', 'navigate', 'key_combination', 'drag_and_drop', + 'open_web_browser', +]; + +function getSystemPrompt(): string { + const date = new Date().toLocaleDateString('en-US', { + weekday: 'long', year: 'numeric', month: 'long', day: 'numeric', + }); + return `You are a helpful assistant that can use a web browser. +You are operating a Chrome browser through computer use tools. +The browser is already open and ready for use. +When you need to navigate to a page, use the navigate action with a full URL. +After each action, carefully evaluate the screenshot to determine your next step. +Current date: ${date}.`; +} + +interface GeminiArgs { + x?: number; + y?: number; + text?: string; + url?: string; + key_combination?: string; + direction?: string; + magnitude?: number; + start_x?: number; + start_y?: number; + end_x?: number; + end_y?: number; + safety_decision?: { decision: string; explanation?: string }; + [key: string]: unknown; +} + +export class GeminiProvider implements CuaProvider { + readonly name = 'gemini'; + private apiKey: string; + + constructor() { + this.apiKey = process.env.GOOGLE_API_KEY ?? ''; + } + + isConfigured(): boolean { + return this.apiKey.length > 0; + } + + async runTask(options: TaskOptions): Promise<TaskResult> { + const { query, kernel, sessionId } = options; + const width = options.viewportWidth ?? DEFAULT_WIDTH; + const height = options.viewportHeight ?? 
DEFAULT_HEIGHT; + const ai = new GoogleGenAI({ apiKey: this.apiKey }); + const model = options.model || 'gemini-2.5-computer-use-preview-10-2025'; + + const contents: Content[] = [{ role: 'user', parts: [{ text: query }] }]; + const maxIterations = 50; + + for (let i = 0; i < maxIterations; i++) { + const response = await ai.models.generateContent({ + model, + contents, + config: { + temperature: 1, + topP: 0.95, + topK: 40, + maxOutputTokens: 8192, + systemInstruction: getSystemPrompt(), + tools: [{ computerUse: { environment: Environment.ENVIRONMENT_BROWSER } }], + thinkingConfig: { includeThoughts: true }, + }, + }); + + const candidateContent = response.candidates?.[0]?.content; + if (!candidateContent) break; + contents.push(candidateContent); + + // Extract text and function calls + const textParts = (candidateContent.parts ?? []) + .filter(p => 'text' in p && p.text) + .map(p => (p as { text: string }).text); + const functionCalls = (candidateContent.parts ?? []) + .filter(p => 'functionCall' in p) + .map(p => (p as { functionCall: FunctionCall }).functionCall); + + // A reply with no function calls is treated as the final answer. + if (functionCalls.length === 0) { + return { result: textParts.join(' ') || '(no response)', provider: this.name }; + } + + // Execute function calls + const responses: Part[] = []; + for (const fc of functionCalls) { + if (!fc.name) continue; + const args = (fc.args ?? {}) as GeminiArgs; + + if (args.safety_decision?.decision === 'require_confirmation') { + console.log('Safety check:', args.safety_decision.explanation); + } + + const result = await this.executeAction(kernel, sessionId, fc.name, args, width, height); + + const responseData: Record<string, unknown> = { url: result.url || 'about:blank' }; + const part: Part = { + functionResponse: { + name: fc.name, + response: result.error ? { error: result.error, url: 'about:blank' } : responseData, + ...(result.screenshot && PREDEFINED_ACTIONS.includes(fc.name) ? 
{ + parts: [{ inlineData: { mimeType: 'image/png', data: result.screenshot } }], + } : {}), + }, + }; + responses.push(part); + } + + contents.push({ role: 'user', parts: responses }); + } + + return { result: '(max iterations reached)', provider: this.name }; + } + + private denormalize(value: number | undefined, dimension: number): number { + if (value === undefined) return 0; + return Math.round((value / COORDINATE_SCALE) * dimension); + } + + private async executeAction( + kernel: TaskOptions['kernel'], + sessionId: string, + name: string, + args: GeminiArgs, + width: number, + height: number, + ): Promise<{ screenshot?: string; url?: string; error?: string }> { + const computer = kernel.browsers.computer; + + try { + switch (name) { + case 'click_at': { + const x = this.denormalize(args.x, width); + const y = this.denormalize(args.y, height); + await computer.clickMouse(sessionId, { x, y }); + break; + } + case 'hover_at': { + const x = this.denormalize(args.x, width); + const y = this.denormalize(args.y, height); + await computer.moveMouse(sessionId, { x, y }); + break; + } + case 'type_text_at': { + const x = this.denormalize(args.x, width); + const y = this.denormalize(args.y, height); + await computer.clickMouse(sessionId, { x, y }); + if (args.text) { + await computer.typeText(sessionId, { text: args.text }); + } + break; + } + case 'scroll_document': + case 'scroll_at': { + const x = name === 'scroll_at' ? this.denormalize(args.x, width) : width / 2; + const y = name === 'scroll_at' ? this.denormalize(args.y, height) : height / 2; + const magnitude = args.magnitude ?? 3; + const dir = args.direction ?? 'down'; + const deltaY = dir === 'up' ? -magnitude : dir === 'down' ? magnitude : 0; + const deltaX = dir === 'left' ? -magnitude : dir === 'right' ? 
magnitude : 0; + await computer.scroll(sessionId, { x, y, delta_x: deltaX, delta_y: deltaY }); + break; + } + case 'wait_5_seconds': + await new Promise(r => setTimeout(r, 5000)); + break; + case 'go_back': + await computer.pressKey(sessionId, { keys: ['Left'], hold_keys: ['Alt_L'] }); + break; + case 'go_forward': + await computer.pressKey(sessionId, { keys: ['Right'], hold_keys: ['Alt_L'] }); + break; + case 'navigate': + case 'search': { + const url = args.url ?? args.text ?? ''; + // Focus the address bar, select its contents, type the URL and submit. + await computer.batch(sessionId, { + actions: [ + { type: 'press_key', press_key: { keys: ['l'], hold_keys: ['Control_L'] } }, + { type: 'sleep', sleep: { duration_ms: 200 } }, + { type: 'press_key', press_key: { keys: ['a'], hold_keys: ['Control_L'] } }, + { type: 'type_text', type_text: { text: url } }, + { type: 'press_key', press_key: { keys: ['Return'] } }, + ] as Parameters<typeof computer.batch>[1]['actions'], + }); + await new Promise(r => setTimeout(r, 1500)); + break; + } + case 'key_combination': { + // Gemini names this argument "keys"; "key_combination" is kept as a fallback. + const combo = (args.keys as string | undefined) ?? args.key_combination ?? ''; + const parts = combo.split('+').map(k => k.trim()); + const holdKeys = parts.slice(0, -1); + const keys = parts.slice(-1); + await computer.pressKey(sessionId, { + keys: keys.length > 0 ? keys : parts, + ...(holdKeys.length > 0 ? 
{ hold_keys: holdKeys } : {}), + }); + break; + } + case 'drag_and_drop': { + const sx = this.denormalize(args.start_x, width); + const sy = this.denormalize(args.start_y, height); + const ex = this.denormalize(args.end_x, width); + const ey = this.denormalize(args.end_y, height); + await computer.dragMouse(sessionId, { path: [[sx, sy], [ex, ey]] }); + break; + } + case 'open_web_browser': + break; + default: + return { error: `Unknown action: ${name}` }; + } + + // Take screenshot after action + await new Promise(r => setTimeout(r, 500)); + const resp = await computer.captureScreenshot(sessionId); + const buf = Buffer.from(await resp.arrayBuffer()); + return { screenshot: buf.toString('base64'), url: 'about:blank' }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + } +} diff --git a/pkg/templates/typescript/cua/providers/index.ts b/pkg/templates/typescript/cua/providers/index.ts new file mode 100644 index 0000000..a1766d9 --- /dev/null +++ b/pkg/templates/typescript/cua/providers/index.ts @@ -0,0 +1,114 @@ +/** + * Provider factory with automatic fallback. + * + * Resolution order: + * 1. CUA_PROVIDER env var (required) + * 2. CUA_FALLBACK_PROVIDERS env var (optional, comma-separated) + * + * A provider is "available" when its API key env var is set. + */ + +import type { Kernel } from '@onkernel/sdk'; +import { AnthropicProvider } from './anthropic'; +import { OpenAIProvider } from './openai'; +import { GeminiProvider } from './gemini'; + +// Shared interface every provider adapter must implement. 
+/** Inputs handed to a provider adapter for a single task run. */
+export interface TaskOptions {
+  /** Natural-language task the agent should carry out. */
+  query: string;
+  /** Optional model override; each provider falls back to its own default. */
+  model?: string;
+  /** Kernel SDK client used to drive the browser. */
+  kernel: Kernel;
+  /** Id of the already-created browser session to act on. */
+  sessionId: string;
+  /** Optional viewport size hints (not every provider reads them). */
+  viewportWidth?: number;
+  viewportHeight?: number;
+}
+
+/** Outcome of a task run. */
+export interface TaskResult {
+  /** Final text produced by the agent. */
+  result: string;
+  /** Name of the provider that produced the result. */
+  provider: string;
+}
+
+/** Contract every provider adapter implements. */
+export interface CuaProvider {
+  readonly name: string;
+  /** Whether the provider has what it needs to run (see each adapter). */
+  isConfigured(): boolean;
+  runTask(options: TaskOptions): Promise<TaskResult>;
+}
+
+export type ProviderName = 'anthropic' | 'openai' | 'gemini';
+
+// Keyed by plain string because lookups use names read from environment variables.
+const PROVIDERS: Record<string, () => CuaProvider> = {
+  anthropic: () => new AnthropicProvider(),
+  openai: () => new OpenAIProvider(),
+  gemini: () => new GeminiProvider(),
+};
+
+/**
+ * Build the ordered, de-duplicated list of configured providers to try.
+ *
+ * Candidates are CUA_PROVIDER (when set) followed by the comma-separated entries of
+ * CUA_FALLBACK_PROVIDERS; names are trimmed and lower-cased. Unknown names and
+ * providers whose isConfigured() returns false are skipped with a warning.
+ *
+ * @returns Configured providers in the order they should be attempted.
+ * @throws Error if no candidate is both known and configured.
+ */
+export function resolveProviders(): CuaProvider[] {
+  const primaryName = (process.env.CUA_PROVIDER ?? '').trim().toLowerCase();
+  const fallbackNames = (process.env.CUA_FALLBACK_PROVIDERS ?? '')
+    .split(',')
+    .map(s => s.trim().toLowerCase())
+    .filter(Boolean);
+
+  const order = primaryName ? [primaryName, ...fallbackNames] : fallbackNames;
+
+  // Deduplicate while preserving order
+  const seen = new Set<string>();
+  const providers: CuaProvider[] = [];
+
+  for (const name of order) {
+    if (seen.has(name)) continue;
+    seen.add(name);
+
+    // Own-property lookup only: a name such as "constructor" must not resolve to an
+    // inherited Object.prototype member and be invoked as a factory.
+    const factory = Object.prototype.hasOwnProperty.call(PROVIDERS, name)
+      ? PROVIDERS[name]
+      : undefined;
+    if (!factory) {
+      console.warn(`Unknown provider "${name}", skipping.`);
+      continue;
+    }
+
+    const provider = factory();
+    if (provider.isConfigured()) {
+      providers.push(provider);
+    } else {
+      console.warn(`Provider "${name}" is not configured (missing API key), skipping.`);
+    }
+  }
+
+  if (providers.length === 0) {
+    const available = Object.keys(PROVIDERS).join(', ');
+    throw new Error(
+      'No CUA provider is configured. ' +
+        `Set CUA_PROVIDER to one of: ${available}, and provide the matching API key.`,
+    );
+  }
+
+  return providers;
+}
+
+/**
+ * Run a CUA task, trying each provider in order until one succeeds.
+ *
+ * @param providers Providers to attempt, in priority order.
+ * @param options   Task inputs passed unchanged to each provider.
+ * @returns The result of the first provider whose runTask() resolves.
+ * @throws Error if `providers` is empty, or if every provider rejects (the message
+ *         lists each provider's failure).
+ */
+export async function runWithFallback(
+  providers: CuaProvider[],
+  options: TaskOptions,
+): Promise<TaskResult> {
+  // Without this guard an empty list would surface as "All providers failed" with no detail.
+  if (providers.length === 0) {
+    throw new Error('No providers to try.');
+  }
+
+  const errors: Array<{ provider: string; error: unknown }> = [];
+
+  for (const provider of providers) {
+    try {
+      console.log(`Attempting provider: ${provider.name}`);
+      return await provider.runTask(options);
+    } catch (error) {
+      // Record the failure and move on to the next provider.
+      console.error(`Provider "${provider.name}" failed:`, error);
+      errors.push({ provider: provider.name, error });
+    }
+  }
+
+  const summary = errors
+    .map(e => ` ${e.provider}: ${e.error instanceof Error ? e.error.message : String(e.error)}`)
+    .join('\n');
+  throw new Error(`All providers failed:\n${summary}`);
+}
diff --git a/pkg/templates/typescript/cua/providers/openai.ts b/pkg/templates/typescript/cua/providers/openai.ts
new file mode 100644
index 0000000..a83274e
--- /dev/null
+++ b/pkg/templates/typescript/cua/providers/openai.ts
@@ -0,0 +1,255 @@
+/**
+ * OpenAI CUA provider adapter.
+ *
+ * Uses the OpenAI Responses API with computer use tool.
+ */
+
+import OpenAI from 'openai';
+import type {
+  ResponseInputItem,
+  ResponseItem,
+  ResponseComputerToolCall,
+  ResponseOutputMessage,
+} from 'openai/resources/responses/responses';
+import type { CuaProvider, TaskOptions, TaskResult } from './index';
+
+// Key names as emitted by the model -> X11-style keysym names used in press_key actions.
+const KEYSYM_MAP: Record<string, string> = {
+  ENTER: 'Return', Enter: 'Return', RETURN: 'Return',
+  BACKSPACE: 'BackSpace', Backspace: 'BackSpace',
+  DELETE: 'Delete', TAB: 'Tab', ESCAPE: 'Escape', Escape: 'Escape',
+  SPACE: 'space', Space: 'space',
+  UP: 'Up', DOWN: 'Down', LEFT: 'Left', RIGHT: 'Right',
+  HOME: 'Home', END: 'End',
+  PAGEUP: 'Prior', PAGE_UP: 'Prior', PageUp: 'Prior',
+  PAGEDOWN: 'Next', PAGE_DOWN: 'Next', PageDown: 'Next',
+  CTRL: 'Control_L', Ctrl: 'Control_L', CONTROL: 'Control_L', Control: 'Control_L',
+  ALT: 'Alt_L', Alt: 'Alt_L',
+  SHIFT: 'Shift_L', Shift: 'Shift_L',
+  META: 'Super_L', Meta: 'Super_L', CMD: 'Super_L', COMMAND: 'Super_L',
+  F1: 'F1', F2: 'F2', F3: 'F3', F4: 'F4', F5: 'F5', F6: 'F6',
+  F7: 'F7', F8: 'F8', F9: 'F9', F10: 'F10', F11: 'F11', F12: 'F12',
+};
+
+// Keysyms that are sent as hold_keys rather than as the pressed key.
+const MODIFIER_KEYSYMS = new Set([
+  'Control_L', 'Control_R', 'Alt_L', 'Alt_R',
+  'Shift_L', 'Shift_R', 'Super_L', 'Super_R',
+]);
+
+/**
+ * Translate key names to keysyms via KEYSYM_MAP.
+ *
+ * Exact matches win; otherwise the upper-cased name is tried so spellings such as
+ * "tab", "ctrl" or "delete" are translated too. Names with no mapping pass through.
+ */
+function translateKeys(keys: string[]): string[] {
+  return keys.map(k => KEYSYM_MAP[k] ?? KEYSYM_MAP[k.toUpperCase()] ?? k);
+}
+
+/**
+ * Split one key entry such as "ctrl+shift+t" into its parts.
+ *
+ * "+" is the separator, so the plus key itself needs special handling: a bare "+" and
+ * a combo ending in "++" (e.g. "ctrl++") yield the "plus" keysym instead of being
+ * discarded as an empty part.
+ */
+function splitKeyCombo(raw: string): string[] {
+  const combo = raw.trim();
+  if (combo === '+') return ['plus'];
+
+  const pieces = combo.split('+').map(p => p.trim());
+  // "a++" splits to ['a', '', '']: two trailing empties mean the last key is "+" itself.
+  const endsWithPlusKey =
+    pieces.length >= 3 && pieces[pieces.length - 1] === '' && pieces[pieces.length - 2] === '';
+
+  const parts = pieces.filter(p => p.length > 0);
+  if (endsWithPlusKey) parts.push('plus');
+  return parts;
+}
+
+/**
+ * Normalize a keypress into { keys, holdKeys } keysym lists.
+ *
+ * Combo strings are expanded, every name is translated, and modifiers found among
+ * `keys` are moved into `holdKeys` (merged with the translated `holdKeys` argument,
+ * without duplicates). If `keys` contains only modifiers they are returned as the
+ * pressed keys unchanged.
+ */
+function expandAndTranslateKeys(keys: string[], holdKeys: string[]): { keys: string[]; holdKeys: string[] } {
+  const expanded: string[] = [];
+  for (const raw of keys) {
+    expanded.push(...splitKeyCombo(raw));
+  }
+
+  const translated = translateKeys(expanded);
+  const translatedHold = translateKeys(holdKeys);
+
+  const holdFromKeys: string[] = [];
+  const primaryKeys: string[] = [];
+  for (const key of translated) {
+    if (MODIFIER_KEYSYMS.has(key)) holdFromKeys.push(key);
+    else primaryKeys.push(key);
+  }
+
+  // Only modifiers were requested: press them as-is instead of holding them around nothing.
+  if (primaryKeys.length === 0) return { keys: translated, holdKeys: translatedHold };
+
+  const merged = [...new Set([...translatedHold, ...holdFromKeys])];
+  return { keys: primaryKeys, holdKeys: merged };
+}
+
+/** Action payload as read from a computer_call item; only the fields used here are typed. */
+interface CuaAction {
+  type: string;
+  x?: number;
+  y?: number;
+  text?: string;
+  url?: string;
+  keys?: string[];
+  hold_keys?: string[];
+  button?: string | number;
+  scroll_x?: number;
+  scroll_y?: number;
+  ms?: number;
+  path?: Array<{ x: number; y: number }>;
+  [key: string]: unknown;
+}
+
+/** One entry of a computer.batch() request; the payload key matches `type`. */
+type BatchAction = {
+  type: string;
+  click_mouse?: { x: number; y: number; button?: string; num_clicks?: number };
+  move_mouse?: { x: number; y: number };
+  type_text?: { text: string };
+  press_key?: { keys: string[]; hold_keys?: string[] };
+  scroll?: { x: number; y: number; delta_x?: number; delta_y?: number };
+  drag_mouse?: { path: number[][] };
+  sleep?: { duration_ms: number };
+};
+
+/** Map a button given by name or number to a name: 2 -> middle, 3 -> right, otherwise left. */
+function normalizeButton(button?: string | number): string {
+  if (button === undefined || button === null) return 'left';
+  if (typeof button === 'number') return button === 2 ? 'middle' : button === 3 ? 'right' : 'left';
+  return button;
+}
+
+/**
+ * Translate one model action into zero or more batch actions.
+ *
+ * @returns An empty list for 'screenshot' (the caller captures one after every call).
+ * @throws Error for an unknown action type, or a drag with fewer than two points.
+ */
+function translateCuaAction(action: CuaAction): BatchAction[] {
+  switch (action.type) {
+    case 'click': {
+      // back/forward/wheel arrive as "buttons" but are not mouse clicks.
+      if (action.button === 'back') return [{ type: 'press_key', press_key: { hold_keys: ['Alt_L'], keys: ['Left'] } }];
+      if (action.button === 'forward') return [{ type: 'press_key', press_key: { hold_keys: ['Alt_L'], keys: ['Right'] } }];
+      if (action.button === 'wheel') {
+        return [{ type: 'scroll', scroll: { x: action.x ?? 0, y: action.y ?? 0, delta_x: action.scroll_x ?? 0, delta_y: action.scroll_y ?? 0 } }];
+      }
+      return [{ type: 'click_mouse', click_mouse: { x: action.x ?? 0, y: action.y ?? 0, button: normalizeButton(action.button) } }];
+    }
+    case 'double_click':
+      return [{ type: 'click_mouse', click_mouse: { x: action.x ?? 0, y: action.y ?? 0, num_clicks: 2 } }];
+    case 'type':
+      return [{ type: 'type_text', type_text: { text: action.text ?? '' } }];
+    case 'keypress': {
+      const n = expandAndTranslateKeys(action.keys ?? [], action.hold_keys ?? []);
+      return [{ type: 'press_key', press_key: { keys: n.keys, ...(n.holdKeys.length ? { hold_keys: n.holdKeys } : {}) } }];
+    }
+    case 'scroll':
+      return [{ type: 'scroll', scroll: { x: action.x ?? 0, y: action.y ?? 0, delta_x: action.scroll_x ?? 0, delta_y: action.scroll_y ?? 0 } }];
+    case 'move':
+      return [{ type: 'move_mouse', move_mouse: { x: action.x ?? 0, y: action.y ?? 0 } }];
+    case 'drag': {
+      const points = (action.path ?? []).map(p => [p.x, p.y]);
+      if (points.length < 2) throw new Error('drag requires at least 2 path points');
+      return [{ type: 'drag_mouse', drag_mouse: { path: points } }];
+    }
+    case 'wait':
+      return [{ type: 'sleep', sleep: { duration_ms: action.ms ?? 1000 } }];
+    case 'goto':
+      // Navigate through the keyboard: Ctrl+L, Ctrl+A, type the URL, Return.
+      return [
+        { type: 'press_key', press_key: { keys: ['l'], hold_keys: ['Control_L'] } },
+        { type: 'sleep', sleep: { duration_ms: 200 } },
+        { type: 'press_key', press_key: { keys: ['a'], hold_keys: ['Control_L'] } },
+        { type: 'type_text', type_text: { text: action.url ?? '' } },
+        { type: 'press_key', press_key: { keys: ['Return'] } },
+      ];
+    case 'back':
+      return [{ type: 'press_key', press_key: { keys: ['Left'], hold_keys: ['Alt_L'] } }];
+    case 'screenshot':
+      return [];
+    default:
+      throw new Error(`Unknown CUA action: ${action.type}`);
+  }
+}
+
+/** Provider adapter that runs the task loop against the OpenAI Responses API. */
+export class OpenAIProvider implements CuaProvider {
+  readonly name = 'openai';
+  private apiKey: string;
+
+  constructor() {
+    this.apiKey = process.env.OPENAI_API_KEY ?? '';
+  }
+
+  /** True when OPENAI_API_KEY was non-empty at construction time. */
+  isConfigured(): boolean {
+    return this.apiKey.length > 0;
+  }
+
+  /**
+   * Run the task loop: request a response, execute any computer_call actions in the
+   * browser, reply with a screenshot, and repeat for at most 50 turns.
+   *
+   * @returns The text of the final assistant message ('(no response)' if it is empty),
+   *          or '(max turns reached)' when the turn budget runs out.
+   * @throws Error when a response has no output; action-translation and API errors
+   *         are not caught here.
+   */
+  async runTask(options: TaskOptions): Promise<TaskResult> {
+    const { query, kernel, sessionId } = options;
+    const client = new OpenAI({ apiKey: this.apiKey });
+
+    // Navigate to a neutral starting page
+    await kernel.browsers.computer.batch(sessionId, {
+      actions: translateCuaAction({ type: 'goto', url: 'https://duckduckgo.com' }) as Parameters<typeof kernel.browsers.computer.batch>[1]['actions'],
+    });
+
+    const input: ResponseInputItem[] = [
+      {
+        role: 'system',
+        content: `Current date: ${new Date().toISOString()}`,
+      } as unknown as ResponseInputItem,
+      {
+        type: 'message',
+        role: 'user',
+        content: [{ type: 'input_text', text: query }],
+      },
+    ];
+
+    // Model output items plus our computer_call_output replies, resent in full each turn.
+    const items: ResponseItem[] = [];
+    const maxTurns = 50;
+
+    for (let turn = 0; turn < maxTurns; turn++) {
+      const response = await client.responses.create({
+        model: options.model || 'gpt-5.4',
+        input: [...input, ...items] as ResponseInputItem[],
+        tools: [{ type: 'computer' } as unknown as OpenAI.Responses.Tool],
+        truncation: 'auto',
+        reasoning: { effort: 'low', summary: 'concise' },
+      });
+
+      if (!response.output) throw new Error('No output from model');
+
+      for (const item of response.output as ResponseItem[]) {
+        items.push(item);
+
+        if (item.type === 'computer_call') {
+          // Accept either a single `action` or an `actions` array on the call.
+          const cc = item as ResponseComputerToolCall & {
+            action?: CuaAction;
+            actions?: CuaAction[];
+          };
+          const actionList: CuaAction[] = Array.isArray(cc.actions)
+            ? cc.actions
+            : cc.action ? [cc.action] : [];
+
+          // Execute actions
+          const batch: BatchAction[] = [];
+          for (const a of actionList) {
+            batch.push(...translateCuaAction(a));
+          }
+          if (batch.length > 0) {
+            await kernel.browsers.computer.batch(sessionId, {
+              actions: batch as Parameters<typeof kernel.browsers.computer.batch>[1]['actions'],
+            });
+          }
+
+          // Acknowledge safety checks
+          // NOTE: every pending check is logged and acknowledged without user confirmation.
+          const pending = cc.pending_safety_checks ?? [];
+          for (const check of pending) {
+            console.log(`Safety check: ${check.message ?? ''}`);
+          }
+
+          // Take screenshot
+          await new Promise(r => setTimeout(r, 300));
+          const screenshotResp = await kernel.browsers.computer.captureScreenshot(sessionId);
+          const buf = Buffer.from(await screenshotResp.arrayBuffer());
+          const screenshot = buf.toString('base64');
+
+          items.push({
+            type: 'computer_call_output',
+            call_id: cc.call_id,
+            acknowledged_safety_checks: pending,
+            output: {
+              type: 'computer_screenshot',
+              image_url: `data:image/png;base64,${screenshot}`,
+            },
+          } as unknown as ResponseItem);
+        }
+      }
+
+      // Check if the model produced a final assistant message
+      const lastItem = response.output[response.output.length - 1] as ResponseItem & { role?: string };
+      if (lastItem?.role === 'assistant') {
+        const msg = lastItem as ResponseOutputMessage;
+        const text = msg.content
+          ?.filter(c => c && 'text' in c)
+          .map(c => (c as { text: string }).text)
+          .join('') ?? '';
+        return { result: text || '(no response)', provider: this.name };
+      }
+    }
+
+    return { result: '(max turns reached)', provider: this.name };
+  }
+}
diff --git a/pkg/templates/typescript/cua/session.ts b/pkg/templates/typescript/cua/session.ts
new file mode 100644
index 0000000..8492238
--- /dev/null
+++ b/pkg/templates/typescript/cua/session.ts
@@ -0,0 +1,159 @@
+/**
+ * Kernel Browser Session Manager.
+ *
+ * Manages browser lifecycle with optional video replay recording.
+ */
+
+import type { Kernel } from '@onkernel/sdk';
+
+/** Options accepted by KernelBrowserSession; unset fields fall back to DEFAULTS where one exists. */
+export interface SessionOptions {
+  invocationId?: string;
+  stealth?: boolean;
+  timeoutSeconds?: number;
+  recordReplay?: boolean;
+  /** Seconds to wait before stopping the replay in stop(). */
+  replayGracePeriod?: number;
+  viewportWidth?: number;
+  viewportHeight?: number;
+  proxyId?: string;
+  profile?: { id?: string; name?: string; save_changes?: boolean };
+  extensions?: Array<{ id?: string; name?: string }>;
+}
+
+/** Snapshot of a session's identifiers and URLs. */
+export interface SessionInfo {
+  sessionId: string;
+  liveViewUrl: string;
+  replayId?: string;
+  replayViewUrl?: string;
+  viewportWidth: number;
+  viewportHeight: number;
+}
+
+const DEFAULTS = {
+  stealth: true,
+  timeoutSeconds: 300,
+  recordReplay: false,
+  replayGracePeriod: 5.0,
+  viewportWidth: 1280,
+  viewportHeight: 800,
+};
+
+/** Creates a Kernel browser, optionally records a replay, and deletes the browser on stop(). */
+export class KernelBrowserSession {
+  private kernel: Kernel;
+  // Fields with an entry in DEFAULTS are always present; the rest stay optional.
+  private opts: Required<Omit<SessionOptions, 'invocationId' | 'proxyId' | 'profile' | 'extensions'>> &
+    Pick<SessionOptions, 'invocationId' | 'proxyId' | 'profile' | 'extensions'>;
+
+  private _sessionId: string | null = null;
+  private _liveViewUrl: string | null = null;
+  private _replayId: string | null = null;
+  private _replayViewUrl: string | null = null;
+
+  constructor(kernel: Kernel, options: SessionOptions = {}) {
+    this.kernel = kernel;
+    this.opts = { ...DEFAULTS, ...options };
+  }
+
+  /** Id of the active session. Throws if start() has not completed. */
+  get sessionId(): string {
+    if (!this._sessionId) throw new Error('Session not started. Call start() first.');
+    return this._sessionId;
+  }
+
+  get liveViewUrl(): string | null { return this._liveViewUrl; }
+  get replayViewUrl(): string | null { return this._replayViewUrl; }
+  get viewportWidth(): number { return this.opts.viewportWidth; }
+  get viewportHeight(): number { return this.opts.viewportHeight; }
+
+  /** Snapshot of the active session. Throws if start() has not completed. */
+  get info(): SessionInfo {
+    return this.snapshot(this.sessionId);
+  }
+
+  /** Build a SessionInfo for the given id from the current fields; never throws. */
+  private snapshot(sessionId: string): SessionInfo {
+    return {
+      sessionId,
+      liveViewUrl: this._liveViewUrl || '',
+      replayId: this._replayId || undefined,
+      replayViewUrl: this._replayViewUrl || undefined,
+      viewportWidth: this.opts.viewportWidth,
+      viewportHeight: this.opts.viewportHeight,
+    };
+  }
+
+  /**
+   * Create the browser and, when recordReplay is set, start a replay recording.
+   * A failure to start the replay is logged and does not fail start().
+   */
+  async start(): Promise<SessionInfo> {
+    const browser = await this.kernel.browsers.create({
+      invocation_id: this.opts.invocationId,
+      stealth: this.opts.stealth,
+      timeout_seconds: this.opts.timeoutSeconds,
+      viewport: { width: this.opts.viewportWidth, height: this.opts.viewportHeight },
+      ...(this.opts.proxyId ? { proxy_id: this.opts.proxyId } : {}),
+      ...(this.opts.profile ? { profile: this.opts.profile } : {}),
+      ...(this.opts.extensions?.length ? { extensions: this.opts.extensions } : {}),
+    });
+
+    this._sessionId = browser.session_id;
+    this._liveViewUrl = browser.browser_live_view_url ?? null;
+
+    console.log(`Browser session: ${this._sessionId}`);
+    console.log(`Live view: ${this._liveViewUrl}`);
+
+    if (this.opts.recordReplay) {
+      try {
+        const replay = await this.kernel.browsers.replays.start(this._sessionId);
+        this._replayId = replay.replay_id;
+        console.log(`Replay recording started: ${this._replayId}`);
+      } catch (error) {
+        console.warn(`Failed to start replay: ${error}`);
+      }
+    }
+
+    return this.info;
+  }
+
+  /**
+   * Stop the replay (if one is recording), delete the browser, and clear the state.
+   *
+   * Safe to call when no session is active (before start() or after a previous
+   * stop()): nothing is deleted and the returned info has an empty sessionId.
+   *
+   * @returns The session info captured before teardown, with replayViewUrl filled in
+   *          when the replay URL became available.
+   */
+  async stop(): Promise<SessionInfo> {
+    // Snapshot from the raw field: the `info` getter throws when no session is active,
+    // which would turn a cleanup call into an error.
+    const info = this.snapshot(this._sessionId ?? '');
+
+    if (this._sessionId) {
+      try {
+        if (this.opts.recordReplay && this._replayId) {
+          if (this.opts.replayGracePeriod > 0) {
+            await sleep(this.opts.replayGracePeriod * 1000);
+          }
+          await this.stopReplay();
+          info.replayViewUrl = this._replayViewUrl || undefined;
+        }
+      } finally {
+        // Delete the browser even if stopping the replay failed.
+        console.log(`Destroying browser session: ${this._sessionId}`);
+        await this.kernel.browsers.deleteByID(this._sessionId);
+      }
+    }
+
+    this._sessionId = null;
+    this._liveViewUrl = null;
+    this._replayId = null;
+    this._replayViewUrl = null;
+
+    return info;
+  }
+
+  /** Stop the replay, then poll the replay list for up to 60 s for its view URL. */
+  private async stopReplay(): Promise<void> {
+    if (!this._sessionId || !this._replayId) return;
+
+    await this.kernel.browsers.replays.stop(this._replayId, { id: this._sessionId });
+    await sleep(2000);
+
+    // Poll for replay URL
+    const deadline = Date.now() + 60_000;
+    while (Date.now() < deadline) {
+      try {
+        const replays = await this.kernel.browsers.replays.list(this._sessionId!);
+        const match = replays.find(r => r.replay_id === this._replayId);
+        if (match) {
+          this._replayViewUrl = match.replay_view_url ?? null;
+          if (this._replayViewUrl) {
+            console.log(`Replay URL: ${this._replayViewUrl}`);
+          }
+          return;
+        }
+      } catch { /* polling */ }
+      await sleep(1000);
+    }
+    console.warn('Replay may still be processing.');
+  }
+}
+
+/** Resolve after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
diff --git a/pkg/templates/typescript/cua/tsconfig.json b/pkg/templates/typescript/cua/tsconfig.json
new file mode 100644
index 0000000..cbe5246
--- /dev/null
+++ b/pkg/templates/typescript/cua/tsconfig.json
@@ -0,0 +1,22 @@
+{
+  "compilerOptions": {
+    "target": "ESNext",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "esModuleInterop": true,
+    "forceConsistentCasingInFileNames": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "outDir": "./dist",
+    "rootDir": ".",
+    "declaration": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "allowImportingTsExtensions": true,
+    "noEmit": true
+  },
+  "include": ["./**/*.ts"],
+  "exclude": ["node_modules", "dist"]
+}