diff --git a/pkg/create/templates.go b/pkg/create/templates.go index fb5845f..f8541a1 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -19,6 +19,7 @@ const ( TemplateOpenAGIComputerUse = "openagi-computer-use" TemplateClaudeAgentSDK = "claude-agent-sdk" TemplateYutoriComputerUse = "yutori" + TemplateTzafonComputerUse = "tzafon" ) type TemplateInfo struct { @@ -90,6 +91,11 @@ var Templates = map[string]TemplateInfo{ Description: "Implements a Yutori n1 computer use agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, + TemplateTzafonComputerUse: { + Name: "Tzafon Northstar Computer Use", + Description: "Implements a Tzafon Northstar CUA Fast computer use agent", + Languages: []string{LanguageTypeScript, LanguagePython}, + }, } // GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language @@ -116,6 +122,8 @@ func GetSupportedTemplatesForLanguage(language string) TemplateKeyValues { return 2 case TemplateYutoriComputerUse: return 3 + case TemplateTzafonComputerUse: + return 4 default: return 10 } @@ -213,6 +221,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, }, + TemplateTzafonComputerUse: { + EntryPoint: "index.ts", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke ts-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'`, + }, }, LanguagePython: { TemplateSampleApp: { @@ -260,6 +273,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, }, + TemplateTzafonComputerUse: { + EntryPoint: "main.py", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for 
Alan Turing"}'`, + }, }, } diff --git a/pkg/templates/python/tzafon/.env.example b/pkg/templates/python/tzafon/.env.example new file mode 100644 index 0000000..03c01ff --- /dev/null +++ b/pkg/templates/python/tzafon/.env.example @@ -0,0 +1 @@ +TZAFON_API_KEY=your-tzafon-api-key diff --git a/pkg/templates/python/tzafon/README.md b/pkg/templates/python/tzafon/README.md new file mode 100644 index 0000000..618a73d --- /dev/null +++ b/pkg/templates/python/tzafon/README.md @@ -0,0 +1,57 @@ +# Kernel Python Sample App - Tzafon Northstar Computer Use + +This is a Kernel application that implements a CUA (computer use agent) loop using Tzafon's Northstar CUA Fast model with Kernel's Computer Controls API. The model is accessed via Tzafon's [Lightcone](https://docs.lightcone.ai) API platform. + +[Northstar CUA Fast](https://docs.lightcone.ai) is a vision language model trained with reinforcement learning for computer use tasks. + +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Tzafon**: [tzafon.ai](https://www.tzafon.ai) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your TZAFON_API_KEY +kernel deploy main.py --env-file .env +``` + +## Usage + +```bash +kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. + +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke python-tzafon-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Viewport Configuration + +Northstar CUA Fast works well with a **1280x800** viewport, which is the default. 
+ +## Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left or right mouse click at coordinates | +| `double_click` | Double-click at coordinates | +| `point_and_type` | Click at coordinates then type text (with optional Enter) | +| `key` | Press key combo (e.g. `Enter`, `ctrl+a`) | +| `scroll` | Scroll at coordinates | +| `drag` | Click-and-drag from start to end coordinates | +| `done` | Signal task completion with a result summary | + +## Resources + +- [Lightcone API Documentation](https://docs.lightcone.ai) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/tzafon/_gitignore b/pkg/templates/python/tzafon/_gitignore new file mode 100644 index 0000000..22e9be5 --- /dev/null +++ b/pkg/templates/python/tzafon/_gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.py[cod] +*$py.class +.env +*.log +.venv/ +venv/ diff --git a/pkg/templates/python/tzafon/loop.py b/pkg/templates/python/tzafon/loop.py new file mode 100644 index 0000000..f1c0256 --- /dev/null +++ b/pkg/templates/python/tzafon/loop.py @@ -0,0 +1,230 @@ +""" +Tzafon Northstar Sampling Loop + +Runs the Northstar CUA model via the Lightcone Responses API using explicit +function tools (click, type, key, scroll, drag, done). Full conversation +history is maintained in the input array — each tool result includes a fresh +screenshot so the model always sees the current screen state. + +@see https://docs.lightcone.ai +""" + +import asyncio +import json +from typing import Any +from kernel import Kernel +from tzafon import Lightcone + +from tools import ComputerTool + +MODEL = "tzafon.northstar-cua-fast" + +INSTRUCTIONS = ( + "Use a mouse and keyboard to interact with a Chromium browser and take screenshots.\n" + "* Chromium is already open on a Kernel cloud browser. 
If a startup wizard appears, ignore it.\n" + "* The screen's coordinate space is a 0-999 grid.\n" + "* To navigate to a URL, use point_and_type on the address bar, or key('ctrl+l') to focus it first.\n" + "* Some pages may take time to load. Wait and take successive screenshots to confirm the result.\n" + "* Whenever you click on an element, consult the screenshot to determine coordinates first.\n" + "* Click buttons, links, and icons in the center of the element, not on edges.\n" + "* If a click didn't work, try adjusting the coordinates slightly.\n" + "* For full-page scrolling, prefer key('PageDown') / key('PageUp') over the scroll tool.\n" + "* After each action, evaluate the screenshot to confirm it succeeded before moving on.\n" + "* When the task is complete, call done() with a summary of what you found or accomplished.\n" +) + +TOOLS = [ + { + "type": "function", "name": "click", + "description": "Single click at (x, y) in 0-999 grid.", + "parameters": { + "type": "object", + "properties": { + "x": {"type": "integer", "description": "X in 0-999 grid"}, + "y": {"type": "integer", "description": "Y in 0-999 grid"}, + "button": {"type": "string", "enum": ["left", "right"]}, + }, + "required": ["x", "y"], + }, + }, + { + "type": "function", "name": "double_click", + "description": "Double click at (x, y) in 0-999 grid.", + "parameters": { + "type": "object", + "properties": { + "x": {"type": "integer", "description": "X in 0-999 grid"}, + "y": {"type": "integer", "description": "Y in 0-999 grid"}, + }, + "required": ["x", "y"], + }, + }, + { + "type": "function", "name": "point_and_type", + "description": "Click at position then type text. 
For input fields, search bars, address bars.", + "parameters": { + "type": "object", + "properties": { + "x": {"type": "integer", "description": "X in 0-999 grid"}, + "y": {"type": "integer", "description": "Y in 0-999 grid"}, + "text": {"type": "string"}, + "press_enter": {"type": "boolean", "description": "Press Enter after typing"}, + }, + "required": ["x", "y", "text"], + }, + }, + { + "type": "function", "name": "key", + "description": "Press key combo (e.g. 'Enter', 'ctrl+a', 'Tab').", + "parameters": { + "type": "object", + "properties": {"keys": {"type": "string"}}, + "required": ["keys"], + }, + }, + { + "type": "function", "name": "scroll", + "description": "Scroll at (x, y) in 0-999 grid. Positive dy = down, negative = up.", + "parameters": { + "type": "object", + "properties": { + "x": {"type": "integer", "description": "X in 0-999 grid"}, + "y": {"type": "integer", "description": "Y in 0-999 grid"}, + "dy": {"type": "integer", "description": "Scroll notches. 3=down, -3=up."}, + }, + "required": ["x", "y", "dy"], + }, + }, + { + "type": "function", "name": "drag", + "description": "Drag from (x1, y1) to (x2, y2) in 0-999 grid.", + "parameters": { + "type": "object", + "properties": { + "x1": {"type": "integer", "description": "Start X in 0-999 grid"}, + "y1": {"type": "integer", "description": "Start Y in 0-999 grid"}, + "x2": {"type": "integer", "description": "End X in 0-999 grid"}, + "y2": {"type": "integer", "description": "End Y in 0-999 grid"}, + }, + "required": ["x1", "y1", "x2", "y2"], + }, + }, + { + "type": "function", "name": "done", + "description": "Task complete. 
Report findings.", + "parameters": { + "type": "object", + "properties": {"result": {"type": "string"}}, + "required": ["result"], + }, + }, +] + + +def _img(screenshot_url: str, text: str = "screenshot") -> dict: + return { + "role": "user", + "content": [ + {"type": "input_text", "text": text}, + {"type": "input_image", "image_url": screenshot_url, "detail": "auto"}, + ], + } + + +async def sampling_loop( + *, + task: str, + api_key: str, + kernel: Kernel, + session_id: str, + model: str = MODEL, + max_steps: int = 50, + viewport_width: int = 1280, + viewport_height: int = 800, +) -> dict[str, Any]: + """Run the Northstar CUA loop until the model calls done() or max steps.""" + tzafon = Lightcone(api_key=api_key) + computer = ComputerTool(kernel, session_id, viewport_width, viewport_height) + + screenshot_url = computer.capture_screenshot() + items: list[Any] = [_img(screenshot_url, text=f"{task}\n\nCurrent screenshot:")] + resp: Any = None + + for step in range(max_steps): + print(f"\n=== Step {step + 1}/{max_steps} ===") + + # Prevent unbounded payload growth — keep the task prompt + recent history + if len(items) > 30: + items = items[:2] + items[-20:] + + resp = tzafon.responses.create( + model=model, input=items, tools=TOOLS, + instructions=INSTRUCTIONS, + temperature=0, max_output_tokens=4096, + ) + + calls: list[tuple[str, str, dict]] = [] + for item in resp.output or []: + if item.type == "message": + for block in item.content or []: + text = block.text or "" + if text: + items.append({"role": "assistant", "content": text}) + print(f" Model: {text[:150]}") + + elif item.type == "function_call": + call_id = item.call_id + name = item.name + raw_args = item.arguments or "{}" + try: + args = json.loads(raw_args) if isinstance(raw_args, str) else raw_args + except (json.JSONDecodeError, TypeError): + args = {} + calls.append((call_id, name, args)) + items.append({ + "type": "function_call", "call_id": call_id, "name": name, + "arguments": raw_args if 
isinstance(raw_args, str) else json.dumps(raw_args), + }) + + if not calls: + continue + + for call_id, name, args in calls: + print(f" [{step + 1}] {name}({json.dumps(args)[:100]})") + + if name == "done": + result = args.get("result", "") + items.append({"type": "function_call_output", "call_id": call_id, "output": "ok"}) + print(f" Done: {result}") + return {"messages": [], "final_result": result} + + try: + await computer.execute_function(name, args) + except Exception as e: + print(f" Action failed: {e}") + items.append({"type": "function_call_output", "call_id": call_id, "output": f"Error: {e}"}) + continue + + await asyncio.sleep(0.5) + screenshot_url = computer.capture_screenshot() + + # Replace old screenshots with placeholders to save payload space + for it in items[:-1]: + c = it.get("content") if isinstance(it, dict) else None + if isinstance(c, list): + has_img = any(isinstance(p, dict) and p.get("type") == "input_image" for p in c) + if has_img: + it["content"] = [p for p in c if not (isinstance(p, dict) and p.get("type") == "input_image")] or "(old screenshot)" + + items.append({"type": "function_call_output", "call_id": call_id, "output": "[screenshot]"}) + items.append(_img(screenshot_url)) + + messages: list[str] = [] + if resp: + for item in resp.output or []: + if item.type == "message": + for block in item.content or []: + if block.text: + messages.append(block.text) + + return {"messages": messages, "final_result": None} diff --git a/pkg/templates/python/tzafon/main.py b/pkg/templates/python/tzafon/main.py new file mode 100644 index 0000000..0f4c7c9 --- /dev/null +++ b/pkg/templates/python/tzafon/main.py @@ -0,0 +1,65 @@ +import os +from typing import Optional, TypedDict + +import kernel +from loop import sampling_loop +from session import KernelBrowserSession + + +class QueryInput(TypedDict): + query: str + record_replay: Optional[bool] + + +class QueryOutput(TypedDict): + result: str + replay_url: Optional[str] + + +api_key = 
os.getenv("TZAFON_API_KEY") +if not api_key: + raise ValueError("TZAFON_API_KEY is not set") + +app = kernel.App("python-tzafon-cua") + + +@app.action("cua-task") +async def cua_task( + ctx: kernel.KernelContext, + payload: QueryInput, +) -> QueryOutput: + if not payload or not payload.get("query"): + raise ValueError("Query is required") + + record_replay = payload.get("record_replay", False) + + async with KernelBrowserSession( + invocation_id=ctx.invocation_id, + stealth=True, + record_replay=record_replay, + ) as session: + print("Kernel browser live view url:", session.live_view_url) + + loop_result = await sampling_loop( + task=payload["query"], + api_key=str(api_key), + kernel=session.kernel, + session_id=str(session.session_id), + viewport_width=session.viewport_width, + viewport_height=session.viewport_height, + ) + + final_result = loop_result.get("final_result") + messages = loop_result.get("messages", []) + + if final_result: + result = final_result + elif messages: + result = messages[-1] + else: + result = "Task completed" + + return { + "result": result, + "replay_url": session.replay_view_url, + } diff --git a/pkg/templates/python/tzafon/pyproject.toml b/pkg/templates/python/tzafon/pyproject.toml new file mode 100644 index 0000000..ec51d35 --- /dev/null +++ b/pkg/templates/python/tzafon/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "python-tzafon-cua" +version = "0.1.0" +description = "Kernel reference app for Tzafon Northstar Computer Use" +requires-python = ">=3.9" +dependencies = [ + "kernel>=0.35.0", + "tzafon>=2.31.0", +] diff --git a/pkg/templates/python/tzafon/session.py b/pkg/templates/python/tzafon/session.py new file mode 100644 index 0000000..0c22dc7 --- /dev/null +++ b/pkg/templates/python/tzafon/session.py @@ -0,0 +1,146 @@ +""" +Kernel Browser Session Manager. + +Provides an async context manager for managing Kernel browser lifecycle +with optional video replay recording. 
+""" + +import asyncio +import time +from dataclasses import dataclass, field +from typing import Optional + +from kernel import Kernel + + +@dataclass +class KernelBrowserSession: + """ + Manages Kernel browser lifecycle as an async context manager. + + Creates a browser session on entry and cleans it up on exit. + Optionally records a video replay of the entire session. + Provides session_id to computer tools. + + Usage: + async with KernelBrowserSession(record_replay=True) as session: + # Use session.session_id and session.kernel for operations + pass + # Browser is automatically cleaned up, replay URL available in session.replay_view_url + """ + + stealth: bool = True + timeout_seconds: int = 300 + + viewport_width: int = 1280 + viewport_height: int = 800 + + # Replay recording options + record_replay: bool = False + replay_grace_period: float = 5.0 + + # Invocation ID to link browser session to the action invocation + invocation_id: Optional[str] = None + + # Set after browser creation + session_id: Optional[str] = field(default=None, init=False) + live_view_url: Optional[str] = field(default=None, init=False) + cdp_ws_url: Optional[str] = field(default=None, init=False) + replay_id: Optional[str] = field(default=None, init=False) + replay_view_url: Optional[str] = field(default=None, init=False) + _kernel: Optional[Kernel] = field(default=None, init=False) + + async def __aenter__(self) -> "KernelBrowserSession": + self._kernel = Kernel() + + browser = self._kernel.browsers.create( + invocation_id=self.invocation_id, + stealth=self.stealth, + timeout_seconds=self.timeout_seconds, + viewport={ + "width": self.viewport_width, + "height": self.viewport_height, + }, + ) + + self.session_id = browser.session_id + self.live_view_url = browser.browser_live_view_url + self.cdp_ws_url = browser.cdp_ws_url + + print(f"Kernel browser created: {self.session_id}") + print(f"Live view URL: {self.live_view_url}") + + if self.record_replay: + try: + await 
self._start_replay() + except Exception as e: + print(f"Warning: Failed to start replay recording: {e}") + print("Continuing without replay recording.") + + return self + + async def _start_replay(self) -> None: + if not self._kernel or not self.session_id: + return + + print("Starting replay recording...") + replay = self._kernel.browsers.replays.start(self.session_id) + self.replay_id = replay.replay_id + print(f"Replay recording started: {self.replay_id}") + + async def _stop_and_get_replay_url(self) -> None: + if not self._kernel or not self.session_id or not self.replay_id: + return + + print("Stopping replay recording...") + self._kernel.browsers.replays.stop( + replay_id=self.replay_id, + id=self.session_id, + ) + print("Replay recording stopped. Processing video...") + + await asyncio.sleep(2) + + max_wait = 60 # seconds + start_time = time.time() + replay_ready = False + + while time.time() - start_time < max_wait: + try: + replays = self._kernel.browsers.replays.list(self.session_id) + for replay in replays: + if replay.replay_id == self.replay_id: + self.replay_view_url = replay.replay_view_url + replay_ready = True + break + if replay_ready: + break + except Exception: + pass + await asyncio.sleep(1) + + if not replay_ready: + print("Warning: Replay may still be processing") + elif self.replay_view_url: + print(f"Replay view URL: {self.replay_view_url}") + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + if self._kernel and self.session_id: + try: + if self.record_replay and self.replay_id: + if self.replay_grace_period > 0: + print(f"Waiting {self.replay_grace_period}s grace period...") + await asyncio.sleep(self.replay_grace_period) + await self._stop_and_get_replay_url() + finally: + print(f"Destroying browser session: {self.session_id}") + self._kernel.browsers.delete_by_id(self.session_id) + print("Browser session destroyed.") + + self._kernel = None + + @property + def kernel(self) -> Kernel: + if self._kernel is None: + raise 
RuntimeError("Session not initialized. Use async with context.") + return self._kernel diff --git a/pkg/templates/python/tzafon/tools/__init__.py b/pkg/templates/python/tzafon/tools/__init__.py new file mode 100644 index 0000000..27fce64 --- /dev/null +++ b/pkg/templates/python/tzafon/tools/__init__.py @@ -0,0 +1,9 @@ +"""Tzafon Northstar Computer Tools.""" + +from .base import ToolError +from .computer import ComputerTool + +__all__ = [ + "ToolError", + "ComputerTool", +] diff --git a/pkg/templates/python/tzafon/tools/base.py b/pkg/templates/python/tzafon/tools/base.py new file mode 100644 index 0000000..cc65c20 --- /dev/null +++ b/pkg/templates/python/tzafon/tools/base.py @@ -0,0 +1,7 @@ +"""Base tool types for Tzafon Northstar.""" + + +class ToolError(Exception): + def __init__(self, message: str): + self.message = message + super().__init__(message) diff --git a/pkg/templates/python/tzafon/tools/computer.py b/pkg/templates/python/tzafon/tools/computer.py new file mode 100644 index 0000000..1249000 --- /dev/null +++ b/pkg/templates/python/tzafon/tools/computer.py @@ -0,0 +1,118 @@ +""" +Tzafon Northstar Computer Tool + +Executes function tool calls from the Northstar model on the browser. +Coordinates arrive in a normalised 0-999 grid and are scaled to the +browser viewport before dispatch. 
+""" + +import asyncio +import base64 +from typing import Any + +from kernel import Kernel + +from .base import ToolError + +KEY_MAP: dict[str, str] = { + "return": "Return", "enter": "Return", + "space": "space", "tab": "Tab", + "backspace": "BackSpace", "delete": "Delete", + "escape": "Escape", "esc": "Escape", "insert": "Insert", + "up": "Up", "down": "Down", "left": "Left", "right": "Right", + "home": "Home", "end": "End", + "pageup": "Page_Up", "page_up": "Page_Up", + "pagedown": "Page_Down", "page_down": "Page_Down", + **{f"f{i}": f"F{i}" for i in range(1, 13)}, +} + +MODIFIER_MAP: dict[str, str] = { + "ctrl": "ctrl", "control": "ctrl", + "alt": "alt", "shift": "shift", + "meta": "super", "cmd": "super", "command": "super", "win": "super", +} + + +def _map_key(key_combo: str) -> str: + """Map a key combo string like 'ctrl+a' or 'Enter' to xdotool format.""" + parts = key_combo.split("+") if "+" in key_combo else [key_combo] + mapped = [] + for part in parts: + k = part.strip().lower() + mapped.append(MODIFIER_MAP.get(k) or KEY_MAP.get(k, part.strip())) + return "+".join(mapped) + + +class ComputerTool: + def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: int = 800): + self.kernel = kernel + self.session_id = session_id + self.width = width + self.height = height + + @staticmethod + def _coord(val: Any) -> int: + """Parse a coordinate value. 
Handles ints, floats, strings, and + the model's occasional '470,77' (comma-separated pair in one field).""" + if val is None: + return 0 + s = str(val) + if "," in s: + s = s.split(",")[0].strip() + return int(float(s)) + + def _scale(self, x: Any, y: Any) -> tuple[int, int]: + """Convert 0-999 grid coordinates to pixel coordinates.""" + x, y = self._coord(x), self._coord(y) + px = max(0, min(x * (self.width - 1) // 999, self.width - 1)) + py = max(0, min(y * (self.height - 1) // 999, self.height - 1)) + return px, py + + async def execute_function(self, name: str, args: dict) -> None: + if name == "click": + px, py = self._scale(args["x"], args["y"]) + self.kernel.browsers.computer.click_mouse( + self.session_id, x=px, y=py, button=args.get("button", "left"), + ) + + elif name == "double_click": + px, py = self._scale(args["x"], args["y"]) + self.kernel.browsers.computer.click_mouse( + self.session_id, x=px, y=py, num_clicks=2, + ) + + elif name == "point_and_type": + px, py = self._scale(args["x"], args["y"]) + self.kernel.browsers.computer.click_mouse(self.session_id, x=px, y=py) + await asyncio.sleep(0.3) + self.kernel.browsers.computer.type_text(self.session_id, text=args["text"]) + if args.get("press_enter"): + await asyncio.sleep(0.1) + self.kernel.browsers.computer.press_key(self.session_id, keys=["Return"]) + + elif name == "key": + self.kernel.browsers.computer.press_key( + self.session_id, keys=[_map_key(args["keys"])], + ) + + elif name == "scroll": + px, py = self._scale(args.get("x", 500), args.get("y", 500)) + dy = max(-10, min(10, int(args.get("dy", 3)))) + self.kernel.browsers.computer.scroll( + self.session_id, x=px, y=py, delta_x=0, delta_y=dy, + ) + + elif name == "drag": + px1, py1 = self._scale(args["x1"], args["y1"]) + px2, py2 = self._scale(args["x2"], args["y2"]) + self.kernel.browsers.computer.drag_mouse( + self.session_id, path=[[px1, py1], [px2, py2]], + ) + + else: + raise ToolError(f"Unknown function: {name}") + + def 
capture_screenshot(self) -> str: + res = self.kernel.browsers.computer.capture_screenshot(self.session_id) + b64 = base64.b64encode(res.read()).decode() + return f"data:image/png;base64,{b64}" diff --git a/pkg/templates/typescript/tzafon/.env.example b/pkg/templates/typescript/tzafon/.env.example new file mode 100644 index 0000000..03c01ff --- /dev/null +++ b/pkg/templates/typescript/tzafon/.env.example @@ -0,0 +1 @@ +TZAFON_API_KEY=your-tzafon-api-key diff --git a/pkg/templates/typescript/tzafon/README.md b/pkg/templates/typescript/tzafon/README.md new file mode 100644 index 0000000..a881db5 --- /dev/null +++ b/pkg/templates/typescript/tzafon/README.md @@ -0,0 +1,57 @@ +# Kernel TypeScript Sample App - Tzafon Northstar Computer Use + +This is a Kernel application that implements a CUA (computer use agent) loop using Tzafon's Northstar CUA Fast model with Kernel's Computer Controls API. The model is accessed via Tzafon's [Lightcone](https://docs.lightcone.ai) API platform. + +[Northstar CUA Fast](https://docs.lightcone.ai) is a vision language model trained with reinforcement learning for computer use tasks. + +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Tzafon**: [tzafon.ai](https://www.tzafon.ai) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your TZAFON_API_KEY +kernel deploy index.ts --env-file .env +``` + +## Usage + +```bash +kernel invoke ts-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. + +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke ts-tzafon-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. 
+ +## Viewport Configuration + +Northstar CUA Fast works well with a **1280x800** viewport, which is the default. + +## Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left or right mouse click at coordinates | +| `double_click` | Double-click at coordinates | +| `point_and_type` | Click at coordinates then type text (with optional Enter) | +| `key` | Press key combo (e.g. `Enter`, `ctrl+a`) | +| `scroll` | Scroll at coordinates | +| `drag` | Click-and-drag from start to end coordinates | +| `done` | Signal task completion with a result summary | + +## Resources + +- [Lightcone API Documentation](https://docs.lightcone.ai) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/tzafon/_gitignore b/pkg/templates/typescript/tzafon/_gitignore new file mode 100644 index 0000000..aa0926a --- /dev/null +++ b/pkg/templates/typescript/tzafon/_gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +.env +*.log diff --git a/pkg/templates/typescript/tzafon/index.ts b/pkg/templates/typescript/tzafon/index.ts new file mode 100644 index 0000000..d61a5f2 --- /dev/null +++ b/pkg/templates/typescript/tzafon/index.ts @@ -0,0 +1,65 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { samplingLoop } from './loop'; +import { KernelBrowserSession } from './session'; + +const kernel = new Kernel(); + +const app = kernel.app('ts-tzafon-cua'); + +interface QueryInput { + query: string; + record_replay?: boolean; +} + +interface QueryOutput { + result: string; + replay_url?: string; +} + +const TZAFON_API_KEY = process.env.TZAFON_API_KEY; + +if (!TZAFON_API_KEY) { + throw new Error('TZAFON_API_KEY is not set'); +} + +app.action( + 'cua-task', + async (ctx: KernelContext, payload?: QueryInput): Promise => { + if (!payload?.query) { + throw new Error('Query is required'); + } + + const session = new KernelBrowserSession(kernel, { + invocationId: ctx.invocation_id, + stealth: true, + recordReplay: 
payload.record_replay ?? false, + }); + + await session.start(); + console.log('Kernel browser live view url:', session.liveViewUrl); + + try { + const { finalResult, messages } = await samplingLoop({ + task: payload.query, + apiKey: TZAFON_API_KEY, + kernel, + sessionId: session.sessionId, + viewportWidth: session.viewportWidth, + viewportHeight: session.viewportHeight, + }); + + const result = finalResult ?? messages[messages.length - 1] ?? 'Task completed'; + + const sessionInfo = await session.stop(); + + return { + result, + replay_url: sessionInfo.replayViewUrl, + }; + } catch (error) { + console.error('Error in sampling loop:', error); + await session.stop(); + throw error; + } + }, +); diff --git a/pkg/templates/typescript/tzafon/loop.ts b/pkg/templates/typescript/tzafon/loop.ts new file mode 100644 index 0000000..fc0e8fa --- /dev/null +++ b/pkg/templates/typescript/tzafon/loop.ts @@ -0,0 +1,266 @@ +/** + * Tzafon Northstar Sampling Loop + * + * Runs the Northstar CUA model via the Lightcone Responses API using explicit + * function tools (click, type, key, scroll, drag, done). Full conversation + * history is maintained in the input array — each tool result includes a fresh + * screenshot so the model always sees the current screen state. + * + * @see https://docs.lightcone.ai + */ + +import type { Kernel } from '@onkernel/sdk'; +import Lightcone from '@tzafon/lightcone'; +import { ComputerTool } from './tools/computer'; + +const MODEL = 'tzafon.northstar-cua-fast'; + +const INSTRUCTIONS = [ + 'Use a mouse and keyboard to interact with a Chromium browser and take screenshots.', + '* Chromium is already open on a Kernel cloud browser. If a startup wizard appears, ignore it.', + "* The screen's coordinate space is a 0-999 grid.", + "* To navigate to a URL, use point_and_type on the address bar, or key('ctrl+l') to focus it first.", + '* Some pages may take time to load. 
Wait and take successive screenshots to confirm the result.', + '* Whenever you click on an element, consult the screenshot to determine coordinates first.', + '* Click buttons, links, and icons in the center of the element, not on edges.', + "* If a click didn't work, try adjusting the coordinates slightly.", + "* For full-page scrolling, prefer key('PageDown') / key('PageUp') over the scroll tool.", + '* After each action, evaluate the screenshot to confirm it succeeded before moving on.', + '* When the task is complete, call done() with a summary of what you found or accomplished.', +].join('\n'); + +interface FunctionTool { + type: 'function'; + name: string; + description: string; + parameters: Record; +} + +const TOOLS: FunctionTool[] = [ + { + type: 'function', name: 'click', + description: 'Single click at (x, y) in 0-999 grid.', + parameters: { + type: 'object', + properties: { + x: { type: 'integer', description: 'X in 0-999 grid' }, + y: { type: 'integer', description: 'Y in 0-999 grid' }, + button: { type: 'string', enum: ['left', 'right'] }, + }, + required: ['x', 'y'], + }, + }, + { + type: 'function', name: 'double_click', + description: 'Double click at (x, y) in 0-999 grid.', + parameters: { + type: 'object', + properties: { + x: { type: 'integer', description: 'X in 0-999 grid' }, + y: { type: 'integer', description: 'Y in 0-999 grid' }, + }, + required: ['x', 'y'], + }, + }, + { + type: 'function', name: 'point_and_type', + description: 'Click at position then type text. For input fields, search bars, address bars.', + parameters: { + type: 'object', + properties: { + x: { type: 'integer', description: 'X in 0-999 grid' }, + y: { type: 'integer', description: 'Y in 0-999 grid' }, + text: { type: 'string' }, + press_enter: { type: 'boolean', description: 'Press Enter after typing' }, + }, + required: ['x', 'y', 'text'], + }, + }, + { + type: 'function', name: 'key', + description: "Press key combo (e.g. 
'Enter', 'ctrl+a', 'Tab').", + parameters: { + type: 'object', + properties: { keys: { type: 'string' } }, + required: ['keys'], + }, + }, + { + type: 'function', name: 'scroll', + description: 'Scroll at (x, y) in 0-999 grid. Positive dy = down, negative = up.', + parameters: { + type: 'object', + properties: { + x: { type: 'integer', description: 'X in 0-999 grid' }, + y: { type: 'integer', description: 'Y in 0-999 grid' }, + dy: { type: 'integer', description: 'Scroll notches. 3=down, -3=up.' }, + }, + required: ['x', 'y', 'dy'], + }, + }, + { + type: 'function', name: 'drag', + description: 'Drag from (x1, y1) to (x2, y2) in 0-999 grid.', + parameters: { + type: 'object', + properties: { + x1: { type: 'integer', description: 'Start X in 0-999 grid' }, + y1: { type: 'integer', description: 'Start Y in 0-999 grid' }, + x2: { type: 'integer', description: 'End X in 0-999 grid' }, + y2: { type: 'integer', description: 'End Y in 0-999 grid' }, + }, + required: ['x1', 'y1', 'x2', 'y2'], + }, + }, + { + type: 'function', name: 'done', + description: 'Task complete. 
/** Inputs accepted by `samplingLoop`. */
interface SamplingLoopOptions {
  /** Natural-language task the agent should carry out. */
  task: string;
  /** API key handed to the Lightcone client. */
  apiKey: string;
  /** Kernel SDK client used to drive the browser. */
  kernel: Kernel;
  /** Kernel browser session the actions are executed against. */
  sessionId: string;
  /** Model name; defaults to `MODEL`. */
  model?: string;
  /** Maximum number of model round-trips (default 50). */
  maxSteps?: number;
  /** Browser viewport width in pixels (default 1280). */
  viewportWidth?: number;
  /** Browser viewport height in pixels (default 800). */
  viewportHeight?: number;
}

/** Outcome of a `samplingLoop` run. */
interface SamplingLoopResult {
  /** Text blocks of the last model response; empty when the run ended through `done()`. */
  messages: string[];
  /** The `result` argument of the model's `done()` call, when it made one. */
  finalResult?: string;
}

/** A tool call pulled out of a model response. `args` is null when the arguments were unusable. */
interface PendingCall {
  callId: string;
  name: string;
  args: Record<string, any> | null;
}

// History is trimmed once it exceeds MAX_HISTORY_ITEMS, keeping the first
// HISTORY_HEAD_ITEMS entries (task prompt + first reply) and the newest HISTORY_TAIL_ITEMS.
const MAX_HISTORY_ITEMS = 30;
const HISTORY_HEAD_ITEMS = 2;
const HISTORY_TAIL_ITEMS = 20;

/** Reads `obj[key]` when `obj` is an object that has the key; otherwise returns `fallback`. */
function get(obj: any, key: string, fallback?: any): any {
  if (obj && typeof obj === 'object' && key in obj) return obj[key];
  return fallback;
}

/** Builds a user message carrying a caption and a screenshot (data URL). */
function img(screenshotUrl: string, text = 'screenshot') {
  return {
    role: 'user',
    content: [
      { type: 'input_text', text },
      { type: 'input_image', image_url: screenshotUrl, detail: 'auto' },
    ],
  };
}

/**
 * Decodes the `arguments` of a function call.
 *
 * Returns null (instead of an empty object) when the value is not a JSON object, so the
 * caller can report the problem to the model rather than run a tool with no arguments.
 */
function parseToolArgs(raw: unknown): Record<string, any> | null {
  let parsed: unknown = raw;
  if (typeof raw === 'string') {
    try {
      parsed = JSON.parse(raw);
    } catch {
      return null;
    }
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) return null;
  return parsed as Record<string, any>;
}

/** Human-readable text for anything that can be thrown. */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Bounds the conversation history in place.
 *
 * Removes the middle of the history once it grows past MAX_HISTORY_ITEMS, then drops any
 * `function_call` / `function_call_output` whose counterpart was cut away, so the request
 * never contains one half of a tool-call pair.
 */
function trimHistory(items: any[]): void {
  if (items.length <= MAX_HISTORY_ITEMS) return;
  items.splice(HISTORY_HEAD_ITEMS, items.length - (HISTORY_HEAD_ITEMS + HISTORY_TAIL_ITEMS));

  const callIds = new Set<string>();
  const outputIds = new Set<string>();
  for (const it of items) {
    if (it?.type === 'function_call') callIds.add(it.call_id);
    else if (it?.type === 'function_call_output') outputIds.add(it.call_id);
  }
  for (let i = items.length - 1; i >= 0; i--) {
    const it = items[i];
    const orphanOutput = it?.type === 'function_call_output' && !callIds.has(it.call_id);
    const orphanCall = it?.type === 'function_call' && !outputIds.has(it.call_id);
    if (orphanOutput || orphanCall) items.splice(i, 1);
  }
}

/** Strips `input_image` parts from every history item except the last one. */
function dropOldScreenshots(items: any[]): void {
  for (const it of items.slice(0, -1)) {
    const c = it?.content;
    if (Array.isArray(c) && c.some((p: any) => p?.type === 'input_image')) {
      it.content = c.filter((p: any) => p?.type !== 'input_image');
      if (it.content.length === 0) it.content = '(old screenshot)';
    }
  }
}

/**
 * Runs the screenshot -> model -> action loop until the model calls `done()` or
 * `maxSteps` round-trips have been made.
 *
 * Each step sends the history to the model, records its text and tool calls, executes the
 * tool calls through `ComputerTool`, and appends a fresh screenshot after every action.
 *
 * @returns `finalResult` set to the `done()` summary when the model finished; otherwise the
 *          text of the last model response in `messages`.
 */
export async function samplingLoop({
  task,
  apiKey,
  kernel,
  sessionId,
  model = MODEL,
  maxSteps = 50,
  viewportWidth = 1280,
  viewportHeight = 800,
}: SamplingLoopOptions): Promise<SamplingLoopResult> {
  const tzafon = new Lightcone({ apiKey });
  const computer = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight);

  let screenshotUrl = await computer.captureScreenshot();
  const items: any[] = [img(screenshotUrl, `${task}\n\nCurrent screenshot:`)];

  let resp: any;

  for (let step = 0; step < maxSteps; step++) {
    console.log(`\n=== Step ${step + 1}/${maxSteps} ===`);

    // Prevent unbounded payload growth — keep the task prompt + recent history.
    trimHistory(items);

    resp = await tzafon.responses.create({
      model,
      input: items,
      tools: TOOLS,
      instructions: INSTRUCTIONS,
      temperature: 0,
      max_output_tokens: 4096,
    });

    const calls: PendingCall[] = [];

    for (const item of get(resp, 'output') ?? []) {
      const itemType = get(item, 'type');

      if (itemType === 'message') {
        for (const block of get(item, 'content') ?? []) {
          const text = get(block, 'text', '');
          if (text) {
            items.push({ role: 'assistant', content: text });
            console.log(` Model: ${text.slice(0, 150)}`);
          }
        }
      } else if (itemType === 'function_call') {
        const callId = get(item, 'call_id');
        const name = get(item, 'name');
        const rawArgs = get(item, 'arguments', '{}');
        calls.push({ callId, name, args: parseToolArgs(rawArgs) });
        // Echo the call back verbatim so the following output item has something to pair with.
        items.push({
          type: 'function_call', call_id: callId, name,
          arguments: typeof rawArgs === 'string' ? rawArgs : JSON.stringify(rawArgs),
        });
      }
    }

    // Text-only turn: the assistant text is already in the history, ask the model again.
    if (calls.length === 0) continue;

    for (const { callId, name, args } of calls) {
      if (args === null) {
        // Never execute a tool with made-up arguments: missing coordinates would be
        // interpreted as (0, 0). Tell the model instead so it can retry.
        console.log(` [${step + 1}] ${name}(<invalid arguments>)`);
        items.push({
          type: 'function_call_output',
          call_id: callId,
          output: 'Error: arguments were not a valid JSON object',
        });
        continue;
      }

      console.log(` [${step + 1}] ${name}(${JSON.stringify(args).slice(0, 100)})`);

      if (name === 'done') {
        const result = args.result ?? '';
        items.push({ type: 'function_call_output', call_id: callId, output: 'ok' });
        console.log(` Done: ${result}`);
        return { messages: [], finalResult: result };
      }

      try {
        await computer.executeFunction(name, args);
      } catch (e: unknown) {
        const reason = describeError(e);
        console.log(` Action failed: ${reason}`);
        items.push({ type: 'function_call_output', call_id: callId, output: `Error: ${reason}` });
        continue;
      }

      // Give the page a moment to react before looking at it again.
      await new Promise((r) => setTimeout(r, 500));
      screenshotUrl = await computer.captureScreenshot();

      // Replace old screenshots with placeholders to save payload space.
      dropOldScreenshots(items);

      items.push({ type: 'function_call_output', call_id: callId, output: '[screenshot]' });
      items.push(img(screenshotUrl));
    }
  }

  // Step budget exhausted without done(): surface whatever text the last response carried.
  const messages = (get(resp, 'output') ?? [])
    .filter((o: any) => get(o, 'type') === 'message')
    .flatMap((o: any) => (get(o, 'content') ?? []).map((c: any) => get(c, 'text')))
    .filter(Boolean);

  return { messages, finalResult: undefined };
}
undici-types@6.21.0: + resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + +snapshots: + + '@onkernel/sdk@0.35.0': {} + + '@types/node@22.19.15': + dependencies: + undici-types: 6.21.0 + + '@tzafon/lightcone@0.7.1': {} + + typescript@5.9.3: {} + + undici-types@6.21.0: {} diff --git a/pkg/templates/typescript/tzafon/session.ts b/pkg/templates/typescript/tzafon/session.ts new file mode 100644 index 0000000..dbc7f4e --- /dev/null +++ b/pkg/templates/typescript/tzafon/session.ts @@ -0,0 +1,225 @@ +/** + * Kernel Browser Session Manager. + * + * Provides a class for managing Kernel browser lifecycle + * with optional video replay recording. + */ + +import type { Kernel } from '@onkernel/sdk'; + +export interface SessionOptions { + invocationId?: string; + stealth?: boolean; + timeoutSeconds?: number; + recordReplay?: boolean; + replayGracePeriod?: number; + viewportWidth?: number; + viewportHeight?: number; +} + +export interface SessionInfo { + sessionId: string; + liveViewUrl: string; + cdpWsUrl: string; + replayId?: string; + replayViewUrl?: string; + viewportWidth: number; + viewportHeight: number; +} + +type SessionOptionsWithDefaults = Required> & Pick; + +const DEFAULT_OPTIONS: Required> = { + stealth: true, + timeoutSeconds: 300, + recordReplay: false, + replayGracePeriod: 5.0, + viewportWidth: 1280, + viewportHeight: 800, +}; + +/** + * Manages Kernel browser lifecycle with optional replay recording. 
/**
 * Manages Kernel browser lifecycle with optional replay recording.
 *
 * Usage:
 * ```typescript
 * const session = new KernelBrowserSession(kernel, options);
 * await session.start();
 * try {
 *   // Use session.sessionId for computer controls
 * } finally {
 *   await session.stop();
 * }
 * ```
 *
 * `stop()` is safe to call when no session is active (never started, or already stopped);
 * in that case it does nothing and returns a snapshot with an empty `sessionId`.
 */
export class KernelBrowserSession {
  private kernel: Kernel;
  private options: SessionOptionsWithDefaults;

  private _sessionId: string | null = null;
  private _liveViewUrl: string | null = null;
  private _cdpWsUrl: string | null = null;
  private _replayId: string | null = null;
  private _replayViewUrl: string | null = null;

  /**
   * @param kernel  Kernel SDK client used for every browser/replay call.
   * @param options Overrides merged on top of `DEFAULT_OPTIONS`.
   */
  constructor(kernel: Kernel, options: SessionOptions = {}) {
    this.kernel = kernel;
    this.options = { ...DEFAULT_OPTIONS, ...options };
  }

  /** Id of the active session. Throws when no session is active. */
  get sessionId(): string {
    if (!this._sessionId) {
      throw new Error('Session not started. Call start() first.');
    }
    return this._sessionId;
  }

  get liveViewUrl(): string | null {
    return this._liveViewUrl;
  }

  get cdpWsUrl(): string | null {
    return this._cdpWsUrl;
  }

  get replayViewUrl(): string | null {
    return this._replayViewUrl;
  }

  get viewportWidth(): number {
    return this.options.viewportWidth;
  }

  get viewportHeight(): number {
    return this.options.viewportHeight;
  }

  /** Snapshot of the active session. Throws (via `sessionId`) when no session is active. */
  get info(): SessionInfo {
    return this.snapshot(this.sessionId);
  }

  /** Builds a `SessionInfo` from the current fields using the given session id. */
  private snapshot(sessionId: string): SessionInfo {
    return {
      sessionId,
      liveViewUrl: this._liveViewUrl || '',
      cdpWsUrl: this._cdpWsUrl || '',
      replayId: this._replayId || undefined,
      replayViewUrl: this._replayViewUrl || undefined,
      viewportWidth: this.options.viewportWidth,
      viewportHeight: this.options.viewportHeight,
    };
  }

  /**
   * Creates the browser and, when `recordReplay` is set, starts a replay recording.
   * A failure to start the recording is logged and does not fail the session.
   */
  async start(): Promise<SessionInfo> {
    const browser = await this.kernel.browsers.create({
      invocation_id: this.options.invocationId,
      stealth: this.options.stealth,
      timeout_seconds: this.options.timeoutSeconds,
      viewport: {
        width: this.options.viewportWidth,
        height: this.options.viewportHeight,
      },
    });

    this._sessionId = browser.session_id ?? null;
    this._liveViewUrl = browser.browser_live_view_url ?? null;
    this._cdpWsUrl = browser.cdp_ws_url ?? null;

    console.log(`Kernel browser created: ${this._sessionId}`);
    console.log(`Live view URL: ${this._liveViewUrl}`);

    if (this.options.recordReplay) {
      try {
        await this.startReplay();
      } catch (error) {
        console.warn(`Warning: Failed to start replay recording: ${error}`);
        console.warn('Continuing without replay recording.');
      }
    }

    return this.info;
  }

  /** Starts a replay recording for the active session and remembers its id. */
  private async startReplay(): Promise<void> {
    if (!this._sessionId) {
      return;
    }

    console.log('Starting replay recording...');
    const replay = await this.kernel.browsers.replays.start(this._sessionId);
    this._replayId = replay.replay_id;
    console.log(`Replay recording started: ${this._replayId}`);
  }

  /**
   * Stops the recording, then polls the replay list (once per second, for up to 60 s)
   * until the recording shows up, storing its view URL.
   */
  private async stopReplay(): Promise<void> {
    if (!this._sessionId || !this._replayId) {
      return;
    }

    console.log('Stopping replay recording...');
    await this.kernel.browsers.replays.stop(this._replayId, {
      id: this._sessionId,
    });
    console.log('Replay recording stopped. Processing video...');

    await this.sleep(2000);

    const maxWait = 60000;
    const startTime = Date.now();
    let replayReady = false;

    while (Date.now() - startTime < maxWait) {
      try {
        const replays = await this.kernel.browsers.replays.list(this._sessionId);
        for (const replay of replays) {
          if (replay.replay_id === this._replayId) {
            this._replayViewUrl = replay.replay_view_url ?? null;
            replayReady = true;
            break;
          }
        }
        if (replayReady) {
          break;
        }
      } catch {
        // Ignore errors while polling
      }
      await this.sleep(1000);
    }

    if (!replayReady) {
      console.log('Warning: Replay may still be processing');
    } else if (this._replayViewUrl) {
      console.log(`Replay view URL: ${this._replayViewUrl}`);
    }
  }

  /**
   * Finalises the replay (if one is being recorded), destroys the browser and clears the
   * session state.
   *
   * @returns The session details as they were before teardown, including the replay view
   *          URL when one became available.
   */
  async stop(): Promise<SessionInfo> {
    // Build the snapshot without going through the `info` getter: that getter throws when
    // no session is active, which would make stop() fail inside a caller's `finally`
    // block and hide the error that actually ended the run.
    const info = this.snapshot(this._sessionId ?? '');

    if (this._sessionId) {
      try {
        if (this.options.recordReplay && this._replayId) {
          if (this.options.replayGracePeriod > 0) {
            console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`);
            await this.sleep(this.options.replayGracePeriod * 1000);
          }
          await this.stopReplay();
          info.replayViewUrl = this._replayViewUrl || undefined;
        }
      } finally {
        // The browser is destroyed even when stopping the replay failed.
        console.log(`Destroying browser session: ${this._sessionId}`);
        await this.kernel.browsers.deleteByID(this._sessionId);
        console.log('Browser session destroyed.');
      }
    }

    this._sessionId = null;
    this._liveViewUrl = null;
    this._cdpWsUrl = null;
    this._replayId = null;
    this._replayViewUrl = null;

    return info;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
import type { Kernel } from '@onkernel/sdk';

/** Raised when a tool call cannot be carried out as requested (unknown tool, bad arguments). */
export class ToolError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'ToolError';
  }
}

// Lower-cased key names the model may use -> key names sent to pressKey.
const KEY_MAP: Record<string, string> = {
  return: 'Return', enter: 'Return',
  space: 'space', tab: 'Tab',
  backspace: 'BackSpace', delete: 'Delete',
  escape: 'Escape', esc: 'Escape', insert: 'Insert',
  up: 'Up', down: 'Down', left: 'Left', right: 'Right',
  home: 'Home', end: 'End',
  pageup: 'Page_Up', page_up: 'Page_Up',
  pagedown: 'Page_Down', page_down: 'Page_Down',
  ...Object.fromEntries(Array.from({ length: 12 }, (_, i) => [`f${i + 1}`, `F${i + 1}`])),
};

// Lower-cased modifier spellings -> modifier names sent to pressKey.
const MODIFIER_MAP: Record<string, string> = {
  ctrl: 'ctrl', control: 'ctrl',
  alt: 'alt', shift: 'shift',
  meta: 'super', cmd: 'super', command: 'super', win: 'super',
};

/**
 * Translates a key combo such as `Ctrl+PageDown` part by part.
 * Parts found in neither table are passed through trimmed but otherwise unchanged.
 */
function mapKey(keyCombo: string): string {
  const parts = keyCombo.includes('+') ? keyCombo.split('+') : [keyCombo];
  return parts
    .map((p) => {
      const k = p.trim().toLowerCase();
      return MODIFIER_MAP[k] ?? KEY_MAP[k] ?? p.trim();
    })
    .join('+');
}

/**
 * Executes the model's function tool calls against a Kernel browser session.
 * Coordinates arrive in a normalised 0-999 grid and are scaled to the viewport
 * (`width` x `height`, default 1280x800) before dispatch.
 */
export class ComputerTool {
  private kernel: Kernel;
  private sessionId: string;
  private width: number;
  private height: number;

  constructor(kernel: Kernel, sessionId: string, width = 1280, height = 800) {
    this.kernel = kernel;
    this.sessionId = sessionId;
    this.width = width;
    this.height = height;
  }

  /**
   * Parse a coordinate value. Handles the model's occasional '470,77' format by keeping
   * the part before the comma. A missing value (null/undefined) maps to 0.
   *
   * @throws ToolError when the value is present but not a finite number, so a NaN is
   *         never forwarded to the browser API.
   */
  private coord(val: unknown): number {
    if (val == null) return 0;
    let s = String(val);
    // `?? ''` keeps this valid under `noUncheckedIndexedAccess`.
    if (s.includes(',')) s = s.split(',')[0] ?? '';
    s = s.trim();
    const n = Number(s);
    if (s === '' || !Number.isFinite(n)) {
      throw new ToolError(`Invalid coordinate: ${String(val)}`);
    }
    return Math.trunc(n);
  }

  /** Convert 0-999 grid coordinates to pixel coordinates, clamped to the viewport. */
  private scale(x: unknown, y: unknown): [number, number] {
    const cx = this.coord(x);
    const cy = this.coord(y);
    const px = Math.max(0, Math.min(Math.trunc(cx * (this.width - 1) / 999), this.width - 1));
    const py = Math.max(0, Math.min(Math.trunc(cy * (this.height - 1) / 999), this.height - 1));
    return [px, py];
  }

  /**
   * Runs one tool call.
   *
   * @param name Tool name: click, double_click, point_and_type, key, scroll or drag.
   * @param args Decoded tool arguments.
   * @throws ToolError for an unknown tool or unusable arguments. Arguments are validated
   *         before any browser action is sent, so a rejected call has no side effects.
   */
  async executeFunction(name: string, args: Record<string, any>): Promise<void> {
    switch (name) {
      case 'click': {
        const [px, py] = this.scale(args.x, args.y);
        await this.kernel.browsers.computer.clickMouse(this.sessionId, {
          x: px, y: py, button: args.button ?? 'left',
        });
        break;
      }

      case 'double_click': {
        const [px, py] = this.scale(args.x, args.y);
        await this.kernel.browsers.computer.clickMouse(this.sessionId, {
          x: px, y: py, num_clicks: 2,
        });
        break;
      }

      case 'point_and_type': {
        if (args.text == null) {
          throw new ToolError('point_and_type requires "text"');
        }
        const text = String(args.text);
        const [px, py] = this.scale(args.x, args.y);
        await this.kernel.browsers.computer.clickMouse(this.sessionId, { x: px, y: py });
        // Let the field take focus before typing.
        await sleep(300);
        await this.kernel.browsers.computer.typeText(this.sessionId, { text });
        if (args.press_enter) {
          await sleep(100);
          await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['Return'] });
        }
        break;
      }

      case 'key': {
        if (typeof args.keys !== 'string' || args.keys.trim() === '') {
          throw new ToolError('key requires a non-empty "keys" string');
        }
        await this.kernel.browsers.computer.pressKey(this.sessionId, {
          keys: [mapKey(args.keys)],
        });
        break;
      }

      case 'scroll': {
        const [px, py] = this.scale(args.x ?? 500, args.y ?? 500);
        // Fall back to the default of 3 when dy is missing or not a number, then clamp.
        const requested = Number(args.dy ?? 3);
        const dy = Math.max(-10, Math.min(10, Number.isFinite(requested) ? requested : 3));
        await this.kernel.browsers.computer.scroll(this.sessionId, {
          x: px, y: py, delta_x: 0, delta_y: dy,
        });
        break;
      }

      case 'drag': {
        const [px1, py1] = this.scale(args.x1, args.y1);
        const [px2, py2] = this.scale(args.x2, args.y2);
        await this.kernel.browsers.computer.dragMouse(this.sessionId, {
          path: [[px1, py1], [px2, py2]],
        });
        break;
      }

      default:
        throw new ToolError(`Unknown function: ${name}`);
    }
  }

  /** Captures the browser screen and returns it as a base64 PNG data URL. */
  async captureScreenshot(): Promise<string> {
    const res = await this.kernel.browsers.computer.captureScreenshot(this.sessionId);
    const buf = Buffer.from(await res.arrayBuffer());
    return `data:image/png;base64,${buf.toString('base64')}`;
  }
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}