From a922c0469e2f570384881dd7d572695323b95e39 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Mon, 19 Jan 2026 21:34:43 -0500 Subject: [PATCH 01/12] feat: Add Yutori n1 computer use templates for TypeScript and Python Add new CLI templates for Yutori's n1 computer use model, enabling users to quickly scaffold browser automation projects using Kernel's infrastructure. Templates (TypeScript & Python): - Agentic sampling loop with n1's OpenAI-compatible API - Computer tool mapping n1 actions (click, type, scroll, drag, etc.) to Kernel's Computer Controls API - Coordinate scaling from n1's 1000x1000 relative space to actual viewport - Session management with replay recording support - read_texts_and_links action using Playwright execution API (with fallback) Key implementation details: - n1 requires screenshots sent with role 'observation' (not 'user') - Model: n1-preview-2025-11 outputs coordinates in 1000x1000 space - Viewport: 1200x800 at 25Hz (closest to Yutori's recommended 1280x800) - Navigation actions (refresh, go_back, goto_url) use keyboard shortcuts via Computer Controls since n1 doesn't use Playwright directly Also updated: - .gitignore: Added qa-* to exclude QA testing directories - pkg/create/templates.go: Registered new yutori-computer-use templates - .cursor/commands/qa.md: Added Yutori templates to QA testing matrix Closes KERNEL-742 --- .cursor/commands/qa.md | 34 +- .gitignore | 3 + pkg/create/templates.go | 18 + .../python/yutori-computer-use/README.md | 65 +++ .../python/yutori-computer-use/_gitignore | 7 + .../python/yutori-computer-use/loop.py | 234 +++++++++ .../python/yutori-computer-use/main.py | 94 ++++ .../python/yutori-computer-use/pyproject.toml | 13 + .../python/yutori-computer-use/session.py | 153 ++++++ .../yutori-computer-use/tools/__init__.py | 11 + .../python/yutori-computer-use/tools/base.py | 19 + .../yutori-computer-use/tools/computer.py | 411 +++++++++++++++ .../typescript/yutori-computer-use/README.md | 65 +++ 
.../typescript/yutori-computer-use/_gitignore | 4 + .../typescript/yutori-computer-use/index.ts | 92 ++++ .../typescript/yutori-computer-use/loop.ts | 266 ++++++++++ .../yutori-computer-use/package.json | 16 + .../typescript/yutori-computer-use/session.ts | 233 +++++++++ .../yutori-computer-use/tools/computer.ts | 467 ++++++++++++++++++ .../yutori-computer-use/tsconfig.json | 9 + 20 files changed, 2210 insertions(+), 4 deletions(-) create mode 100644 pkg/templates/python/yutori-computer-use/README.md create mode 100644 pkg/templates/python/yutori-computer-use/_gitignore create mode 100644 pkg/templates/python/yutori-computer-use/loop.py create mode 100644 pkg/templates/python/yutori-computer-use/main.py create mode 100644 pkg/templates/python/yutori-computer-use/pyproject.toml create mode 100644 pkg/templates/python/yutori-computer-use/session.py create mode 100644 pkg/templates/python/yutori-computer-use/tools/__init__.py create mode 100644 pkg/templates/python/yutori-computer-use/tools/base.py create mode 100644 pkg/templates/python/yutori-computer-use/tools/computer.py create mode 100644 pkg/templates/typescript/yutori-computer-use/README.md create mode 100644 pkg/templates/typescript/yutori-computer-use/_gitignore create mode 100644 pkg/templates/typescript/yutori-computer-use/index.ts create mode 100644 pkg/templates/typescript/yutori-computer-use/loop.ts create mode 100644 pkg/templates/typescript/yutori-computer-use/package.json create mode 100644 pkg/templates/typescript/yutori-computer-use/session.ts create mode 100644 pkg/templates/typescript/yutori-computer-use/tools/computer.ts create mode 100644 pkg/templates/typescript/yutori-computer-use/tsconfig.json diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 46fb874..4132052 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -58,6 +58,7 @@ Here are all valid language + template combinations: | typescript | openai-computer-use | ts-openai-cua | ts-openai-cua | Yes | 
OPENAI_API_KEY | | typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | typescript | claude-agent-sdk | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | +| typescript | yutori-computer-use | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | | python | sample-app | py-sample-app | python-basic | No | - | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | | python | browser-use | py-browser-use | python-bu | Yes | OPENAI_API_KEY | @@ -65,6 +66,7 @@ Here are all valid language + template combinations: | python | openai-computer-use | py-openai-cua | python-openai-cua | Yes | OPENAI_API_KEY | | python | openagi-computer-use | py-openagi-cua | python-openagi-cua | Yes | OAGI_API_KEY | | python | claude-agent-sdk | py-claude-agent-sdk | py-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | +| python | yutori-computer-use | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | ### Create Commands @@ -80,6 +82,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n ts-openai-cua -l typescript -t openai-computer-use ../bin/kernel create -n ts-gemini-cua -l typescript -t gemini-computer-use ../bin/kernel create -n ts-claude-agent-sdk -l typescript -t claude-agent-sdk +../bin/kernel create -n ts-yutori-cua -l typescript -t yutori-computer-use # Python templates ../bin/kernel create -n py-sample-app -l python -t sample-app @@ -89,6 +92,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n py-openai-cua -l python -t openai-computer-use ../bin/kernel create -n py-openagi-cua -l python -t openagi-computer-use ../bin/kernel create -n py-claude-agent-sdk -l python -t claude-agent-sdk +../bin/kernel create -n py-yutori-cua -l python -t yutori-computer-use ``` ## Step 5: Deploy Each Template @@ -176,6 +180,15 @@ echo "ANTHROPIC_API_KEY=" > .env cd .. 
``` +**ts-yutori-cua** (needs YUTORI_API_KEY): + +```bash +cd ts-yutori-cua +echo "YUTORI_API_KEY=" > .env +../bin/kernel deploy index.ts --env-file .env +cd .. +``` + **py-browser-use** (needs OPENAI_API_KEY): ```bash @@ -221,6 +234,15 @@ echo "ANTHROPIC_API_KEY=" > .env cd .. ``` +**py-yutori-cua** (needs YUTORI_API_KEY): + +```bash +cd py-yutori-cua +echo "YUTORI_API_KEY=" > .env +../bin/kernel deploy main.py --env-file .env +cd .. +``` + ## Step 6: Provide Invoke Commands Once all deployments are complete, present the human with these invoke commands to test manually: @@ -235,6 +257,7 @@ kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipe kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}' kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. 
You are done successfully when the items are moved.", "record_replay": true}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -244,11 +267,12 @@ kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://m kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' ``` ## Step 7: Automated Runtime Testing (Optional) -**STOP and ask the human:** "Would you like me to automatically invoke all 15 templates and report back on their runtime status?" +**STOP and ask the human:** "Would you like me to automatically invoke all 17 templates and report back on their runtime status?" If the human agrees, invoke each template use the Kernel CLI and collect results. 
Present findings in this format: @@ -268,6 +292,7 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | ts-openai-cua | ts-openai-cua | | | | ts-gemini-cua | ts-gemini-cua | | | | ts-claude-agent-sdk | ts-claude-agent-sdk | | | +| ts-yutori-cua | ts-yutori-cua | | | | py-sample-app | python-basic | | | | py-captcha-solver | python-captcha-solver | | | | py-browser-use | python-bu | | | @@ -275,6 +300,7 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | py-openai-cua | python-openai-cua | | | | py-openagi-cua | python-openagi-cua | | | | py-claude-agent-sdk | py-claude-agent-sdk | | | +| py-yutori-cua | python-yutori-cua | | | Status values: - **SUCCESS**: App started and returned a result @@ -287,9 +313,9 @@ Notes should include brief error messages for failures or confirmation of succes - [ ] Built CLI with `make build` - [ ] Created QA directory - [ ] Got KERNEL_API_KEY from human -- [ ] Created all 15 template variations -- [ ] Got required API keys from human (OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, OAGI_API_KEY) -- [ ] Deployed all 15 apps +- [ ] Created all 17 template variations +- [ ] Got required API keys from human (OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, OAGI_API_KEY, YUTORI_API_KEY) +- [ ] Deployed all 17 apps - [ ] Provided invoke commands to human for manual testing - [ ] (Optional) Ran automated runtime testing and reviewed results diff --git a/.gitignore b/.gitignore index 900b34b..39ae834 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store kernel + +# QA testing directories +qa-* diff --git a/pkg/create/templates.go b/pkg/create/templates.go index f99c4e6..957d45f 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -18,6 +18,7 @@ const ( TemplateStagehand = "stagehand" TemplateOpenAGIComputerUse = "openagi-computer-use" TemplateClaudeAgentSDK = 
"claude-agent-sdk" + TemplateYutoriComputerUse = "yutori-computer-use" ) type TemplateInfo struct { @@ -84,6 +85,11 @@ var Templates = map[string]TemplateInfo{ Description: "Implements a Claude Agent SDK browser automation agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, + TemplateYutoriComputerUse: { + Name: "Yutori n1 Computer Use", + Description: "Implements a Yutori n1 computer use agent", + Languages: []string{LanguageTypeScript, LanguagePython}, + }, } // GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language @@ -108,6 +114,8 @@ func GetSupportedTemplatesForLanguage(language string) TemplateKeyValues { return 1 case TemplateGeminiComputerUse: return 2 + case TemplateYutoriComputerUse: + return 3 default: return 10 } @@ -200,6 +208,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}'`, }, + TemplateYutoriComputerUse: { + EntryPoint: "index.ts", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + }, }, LanguagePython: { TemplateSampleApp: { @@ -237,6 +250,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}'`, }, + TemplateYutoriComputerUse: { + EntryPoint: "main.py", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + }, }, } diff --git a/pkg/templates/python/yutori-computer-use/README.md b/pkg/templates/python/yutori-computer-use/README.md new file mode 100644 index 0000000..2f8ec2f --- /dev/null +++ 
b/pkg/templates/python/yutori-computer-use/README.md @@ -0,0 +1,65 @@ +# Kernel Python Sample App - Yutori n1 Computer Use + +This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. + +[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. + +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Yutori**: [yutori.com](https://yutori.com) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your YUTORI_API_KEY +kernel deploy main.py --env-file .env +``` + +## Usage + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. + +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Viewport Configuration + +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. + +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. + +See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. 
+ +## n1 Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left mouse click at coordinates | +| `scroll` | Scroll page in a direction | +| `type` | Type text into focused element | +| `key_press` | Send keyboard input | +| `hover` | Move mouse without clicking | +| `drag` | Click-and-drag operation | +| `wait` | Pause for UI to update | +| `refresh` | Reload current page | +| `go_back` | Navigate back in history | +| `goto_url` | Navigate to a URL | +| `stop` | End task with final answer | + +## Resources + +- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/yutori-computer-use/_gitignore b/pkg/templates/python/yutori-computer-use/_gitignore new file mode 100644 index 0000000..22e9be5 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/_gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.py[cod] +*$py.class +.env +*.log +.venv/ +venv/ diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py new file mode 100644 index 0000000..e2049af --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -0,0 +1,234 @@ +""" +Yutori n1 Sampling Loop + +Implements the agent loop for Yutori's n1 computer use model. 
+n1 uses an OpenAI-compatible API with specific conventions: +- Screenshots use role: "observation" (not "user") +- Coordinates are returned in 1000x1000 space and need scaling +- WebP format recommended for screenshots + +@see https://docs.yutori.com/reference/n1 +""" + +import json +import re +from typing import Any, Optional + +from kernel import Kernel +from openai import OpenAI + +from tools import ComputerTool, N1Action, ToolResult + +# n1 uses its own system prompt - custom prompts may degrade performance +# Per docs: "we generally do not recommend providing custom system prompts" + + +async def sampling_loop( + *, + model: str = "n1-preview-2025-11", + task: str, + api_key: str, + kernel: Kernel, + session_id: str, + max_tokens: int = 4096, + max_iterations: int = 50, + # Default viewport matches WXGA (1200x800) - closest to Yutori's recommended 1280x800 + viewport_width: int = 1200, + viewport_height: int = 800, +) -> dict[str, Any]: + """ + Run the n1 sampling loop until the model returns a stop action or max iterations. + + Args: + model: The n1 model to use + task: The user's task/query + api_key: Yutori API key + kernel: Kernel client instance + session_id: Browser session ID + max_tokens: Maximum tokens per response + max_iterations: Maximum loop iterations + viewport_width: Browser viewport width for coordinate scaling + viewport_height: Browser viewport height for coordinate scaling + + Returns: + Dict with 'messages' (conversation history) and 'final_answer' (if stopped) + """ + client = OpenAI( + api_key=api_key, + base_url="https://api.yutori.com/v1", + ) + + computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) + + # Take initial screenshot + initial_screenshot = computer_tool.screenshot() + + # Build conversation per n1 format: + # 1. User message with task + # 2. 
Observation message with screenshot + conversation_messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": [{"type": "text", "text": task}], + } + ] + + # Add initial screenshot as observation (n1's required format) + if initial_screenshot.get("base64_image"): + conversation_messages.append({ + "role": "observation", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" + }, + } + ], + }) + + iteration = 0 + final_answer: Optional[str] = None + + while iteration < max_iterations: + iteration += 1 + print(f"\n=== Iteration {iteration} ===") + + # Call the n1 API (no system prompt - n1 uses its own) + try: + response = client.chat.completions.create( + model=model, + messages=conversation_messages, + max_tokens=max_tokens, + temperature=0.3, + ) + except Exception as api_error: + print(f"API call failed: {api_error}") + raise api_error + + if not response.choices or len(response.choices) == 0: + print(f"No choices in response: {response}") + raise ValueError("No choices in API response") + + assistant_message = response.choices[0].message + if not assistant_message: + raise ValueError("No response from model") + + response_content = assistant_message.content or "" + print("Assistant response:", response_content) + + # Add assistant message to conversation + conversation_messages.append({ + "role": "assistant", + "content": response_content, + }) + + # Parse the action(s) from the response + # n1 returns JSON with "thoughts" and "actions" array + parsed = _parse_n1_response(response_content) + + if not parsed or not parsed.get("actions"): + print("No actions found in response, ending loop") + break + + # Execute each action in the actions array + for action in parsed["actions"]: + print(f"Executing action: {action.get('action_type')}", action) + + # Check for stop action + if action.get("action_type") == "stop": + final_answer = action.get("answer") + print(f"Stop action 
received, final answer: {final_answer}") + return {"messages": conversation_messages, "final_answer": final_answer} + + # Scale coordinates from n1's 1000x1000 space to actual viewport + scaled_action = _scale_coordinates(action, viewport_width, viewport_height) + + # Execute the action + result: ToolResult + try: + result = await computer_tool.execute(scaled_action) + except Exception as e: + print(f"Action failed: {e}") + result = {"error": str(e)} + + # After action, add observation with screenshot and optional text output + if result.get("base64_image") or result.get("output"): + observation_content = [] + + # Add text output first (e.g., from read_texts_and_links) + if result.get("output"): + observation_content.append({ + "type": "text", + "text": result["output"], + }) + + # Add screenshot + if result.get("base64_image"): + observation_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{result['base64_image']}" + }, + }) + + conversation_messages.append({ + "role": "observation", + "content": observation_content, + }) + elif result.get("error"): + # If there was an error, add it as text observation + conversation_messages.append({ + "role": "observation", + "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], + }) + + if iteration >= max_iterations: + print("Max iterations reached") + + return { + "messages": conversation_messages, + "final_answer": final_answer, + } + + +def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: + """ + Parse n1's response format: { "thoughts": "...", "actions": [...] 
} + """ + try: + # The response should be JSON + return json.loads(content) + except json.JSONDecodeError: + # Try to extract JSON from the response if it's wrapped in text + json_match = re.search(r'\{[\s\S]*\}', content) + if json_match: + try: + return json.loads(json_match.group(0)) + except json.JSONDecodeError: + print(f"Failed to parse action JSON: {json_match.group(0)}") + return None + + +def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action: + """ + Scale coordinates from n1's 1000x1000 space to actual viewport dimensions. + Per docs: "n1-preview-2025-11 outputs relative coordinates in 1000×1000" + """ + scaled = dict(action) + + if "center_coordinates" in scaled and scaled["center_coordinates"]: + coords = scaled["center_coordinates"] + scaled["center_coordinates"] = [ + round((coords[0] / 1000) * viewport_width), + round((coords[1] / 1000) * viewport_height), + ] + + if "start_coordinates" in scaled and scaled["start_coordinates"]: + coords = scaled["start_coordinates"] + scaled["start_coordinates"] = [ + round((coords[0] / 1000) * viewport_width), + round((coords[1] / 1000) * viewport_height), + ] + + return scaled diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py new file mode 100644 index 0000000..4830797 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -0,0 +1,94 @@ +import os +from typing import Optional, TypedDict + +import kernel +from loop import sampling_loop +from session import KernelBrowserSession + + +class QueryInput(TypedDict): + query: str + record_replay: Optional[bool] + + +class QueryOutput(TypedDict): + result: str + replay_url: Optional[str] + + +api_key = os.getenv("YUTORI_API_KEY") +if not api_key: + raise ValueError("YUTORI_API_KEY is not set") + +app = kernel.App("python-yutori-cua") + + +@app.action("cua-task") +async def cua_task( + ctx: kernel.KernelContext, + payload: QueryInput, +) -> QueryOutput: 
+ """ + Process a user query using Yutori n1 Computer Use with Kernel's browser automation. + + Args: + ctx: Kernel context containing invocation information + payload: An object containing: + - query: The task/query string to process + - record_replay: Optional boolean to enable video replay recording + + Returns: + A dictionary containing: + - result: The result of the sampling loop as a string + - replay_url: URL to view the replay (if recording was enabled) + """ + if not payload or not payload.get("query"): + raise ValueError("Query is required") + + record_replay = payload.get("record_replay", False) + + async with KernelBrowserSession( + stealth=True, + record_replay=record_replay, + ) as session: + print("Kernel browser live view url:", session.live_view_url) + + loop_result = await sampling_loop( + model="n1-preview-2025-11", + task=payload["query"], + api_key=str(api_key), + kernel=session.kernel, + session_id=session.session_id, + ) + + final_answer = loop_result.get("final_answer") + messages = loop_result.get("messages", []) + + if final_answer: + result = final_answer + else: + # Extract last assistant message + result = _extract_last_assistant_message(messages) + + return { + "result": result, + "replay_url": session.replay_view_url, + } + + +def _extract_last_assistant_message(messages: list) -> str: + """Extract the last assistant message text from the conversation.""" + import json + + for msg in reversed(messages): + if msg.get("role") == "assistant": + content = msg.get("content") + if isinstance(content, str): + # Try to parse the thoughts from JSON response + try: + parsed = json.loads(content) + if parsed.get("thoughts"): + return parsed["thoughts"] + except json.JSONDecodeError: + return content + return "Task completed" diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori-computer-use/pyproject.toml new file mode 100644 index 0000000..456eab0 --- /dev/null +++ 
b/pkg/templates/python/yutori-computer-use/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "python-yutori-cua" +version = "0.1.0" +description = "Kernel reference app for Yutori n1 Computer Use" +requires-python = ">=3.9" +dependencies = [ + "openai>=1.58.0", + "python-dateutil>=2.9.0", + "pydantic>=2.12.5", + "typing-extensions>=4.15.0", + "kernel>=0.24.0", + "python-dotenv>=1.2.1", +] diff --git a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori-computer-use/session.py new file mode 100644 index 0000000..522d4c9 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/session.py @@ -0,0 +1,153 @@ +""" +Kernel Browser Session Manager. + +Provides an async context manager for managing Kernel browser lifecycle +with optional video replay recording. +""" + +import asyncio +import time +from dataclasses import dataclass, field +from typing import Optional + +from kernel import Kernel + + +@dataclass +class KernelBrowserSession: + """ + Manages Kernel browser lifecycle as an async context manager. + + Creates a browser session on entry and cleans it up on exit. + Optionally records a video replay of the entire session. + Provides session_id to computer tools. 
+ + Usage: + async with KernelBrowserSession(record_replay=True) as session: + # Use session.session_id and session.kernel for operations + pass + # Browser is automatically cleaned up, replay URL available in session.replay_view_url + """ + + stealth: bool = True + timeout_seconds: int = 300 + + # Replay recording options + record_replay: bool = False + replay_grace_period: float = 5.0 # Seconds to wait before stopping replay + + # Set after browser creation + session_id: Optional[str] = field(default=None, init=False) + live_view_url: Optional[str] = field(default=None, init=False) + cdp_ws_url: Optional[str] = field(default=None, init=False) + replay_id: Optional[str] = field(default=None, init=False) + replay_view_url: Optional[str] = field(default=None, init=False) + _kernel: Optional[Kernel] = field(default=None, init=False) + + async def __aenter__(self) -> "KernelBrowserSession": + """Create a Kernel browser session and optionally start recording.""" + self._kernel = Kernel() + + # Create browser with viewport closest to Yutori n1's recommended 1280x800 + # Using 1200x800 (WXGA at 25Hz) - the closest supported Kernel viewport + # See: https://docs.yutori.com/reference/n1#screenshot-requirements + browser = self._kernel.browsers.create( + stealth=self.stealth, + timeout_seconds=self.timeout_seconds, + viewport={ + "width": 1200, + "height": 800, + "refresh_rate": 25, + }, + ) + + self.session_id = browser.session_id + self.live_view_url = browser.browser_live_view_url + self.cdp_ws_url = browser.cdp_ws_url + + print(f"Kernel browser created: {self.session_id}") + print(f"Live view URL: {self.live_view_url}") + + # Start replay recording if enabled + if self.record_replay: + try: + await self._start_replay() + except Exception as e: + print(f"Warning: Failed to start replay recording: {e}") + print("Continuing without replay recording.") + + return self + + async def _start_replay(self) -> None: + """Start recording a replay of the browser session.""" + if 
not self._kernel or not self.session_id: + return + + print("Starting replay recording...") + replay = self._kernel.browsers.replays.start(self.session_id) + self.replay_id = replay.replay_id + print(f"Replay recording started: {self.replay_id}") + + async def _stop_and_get_replay_url(self) -> None: + """Stop recording and get the replay URL.""" + if not self._kernel or not self.session_id or not self.replay_id: + return + + print("Stopping replay recording...") + self._kernel.browsers.replays.stop( + replay_id=self.replay_id, + id=self.session_id, + ) + print("Replay recording stopped. Processing video...") + + # Wait a moment for processing + await asyncio.sleep(2) + + # Poll for replay to be ready (with timeout) + max_wait = 60 # seconds + start_time = time.time() + replay_ready = False + + while time.time() - start_time < max_wait: + try: + replays = self._kernel.browsers.replays.list(self.session_id) + for replay in replays: + if replay.replay_id == self.replay_id: + self.replay_view_url = replay.replay_view_url + replay_ready = True + break + if replay_ready: + break + except Exception: + pass + await asyncio.sleep(1) + + if not replay_ready: + print("Warning: Replay may still be processing") + elif self.replay_view_url: + print(f"Replay view URL: {self.replay_view_url}") + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Stop recording and delete the browser session.""" + if self._kernel and self.session_id: + try: + # Stop replay if recording was enabled + if self.record_replay and self.replay_id: + # Wait grace period before stopping to capture final state + if self.replay_grace_period > 0: + print(f"Waiting {self.replay_grace_period}s grace period...") + await asyncio.sleep(self.replay_grace_period) + await self._stop_and_get_replay_url() + finally: + print(f"Destroying browser session: {self.session_id}") + self._kernel.browsers.delete_by_id(self.session_id) + print("Browser session destroyed.") + + self._kernel = None + + @property + 
class ToolError(Exception):
    """Signals that a tool action could not be carried out.

    The human-readable reason is available both as ``str(error)`` and as the
    ``message`` attribute.
    """

    def __init__(self, message: str):
        super().__init__(message)
        self.message = message


class ToolResult(TypedDict, total=False):
    """Outcome of one tool action; every key is optional."""

    # Screenshot of the browser, base64-encoded.
    base64_image: str
    # Text produced by the action.
    output: str
    # Description of what went wrong.
    error: str
# Per-keystroke delay handed to type_text(delay=...).
# NOTE(review): presumably milliseconds per the Kernel SDK - confirm.
TYPING_DELAY_MS = 12

# Pauses handed straight to asyncio.sleep(), which takes SECONDS.
SCREENSHOT_DELAY_SECONDS = 0.5
ACTION_DELAY_SECONDS = 0.5

# Deprecated aliases: the original names said "MS" although the values were
# always used as seconds. Kept so existing imports keep working.
SCREENSHOT_DELAY_MS = SCREENSHOT_DELAY_SECONDS
ACTION_DELAY_MS = ACTION_DELAY_SECONDS


# n1 action types
N1ActionType = Literal[
    "click",
    "scroll",
    "type",
    "key_press",
    "hover",
    "drag",
    "wait",
    "refresh",
    "go_back",
    "goto_url",
    "read_texts_and_links",
    "stop",
]


class N1Action(TypedDict, total=False):
    """One action emitted by n1; only the keys relevant to the action are set."""

    action_type: N1ActionType
    center_coordinates: tuple[int, int] | list[int]
    start_coordinates: tuple[int, int] | list[int]
    direction: Literal["up", "down", "left", "right"]
    amount: int
    text: str
    press_enter_after: bool
    clear_before_typing: bool
    key_comb: str
    url: str
    answer: str


# Key mappings from Playwright format (n1 output) to xdotool format (Kernel)
KEY_MAP = {
    "Enter": "Return",
    "Escape": "Escape",
    "Backspace": "BackSpace",
    "Tab": "Tab",
    "Delete": "Delete",
    "ArrowUp": "Up",
    "ArrowDown": "Down",
    "ArrowLeft": "Left",
    "ArrowRight": "Right",
    "Home": "Home",
    "End": "End",
    "PageUp": "Page_Up",
    "PageDown": "Page_Down",
    "F1": "F1",
    "F2": "F2",
    "F3": "F3",
    "F4": "F4",
    "F5": "F5",
    "F6": "F6",
    "F7": "F7",
    "F8": "F8",
    "F9": "F9",
    "F10": "F10",
    "F11": "F11",
    "F12": "F12",
}

# Lower-cased view of KEY_MAP so "enter" or "pagedown" resolve like their
# canonical Playwright spellings.
_KEY_MAP_LOWER = {name.lower(): mapped for name, mapped in KEY_MAP.items()}

MODIFIER_MAP = {
    "control": "ctrl",
    "ctrl": "ctrl",
    "alt": "alt",
    "shift": "shift",
    "meta": "super",
    "command": "super",
    "cmd": "super",
}


class ComputerTool:
    """Computer tool for Yutori n1 actions using Kernel browser.

    Each n1 action is translated into one or more calls on
    ``kernel.browsers.computer`` (or ``kernel.browsers.playwright`` for
    ``read_texts_and_links``) and normally answered with a fresh screenshot.

    NOTE(review): the Kernel client calls are made without ``await``; if the
    client is synchronous they block the event loop while they run - confirm
    whether that is acceptable for the deployment.
    """

    # Unit (dx, dy) vector for every scroll direction n1 can emit.
    _SCROLL_VECTORS = {
        "up": (0, -1),
        "down": (0, 1),
        "left": (-1, 0),
        "right": (1, 0),
    }

    def __init__(self, kernel: Kernel, session_id: str, width: int = 1200, height: int = 800):
        """Bind the tool to a browser session.

        Args:
            kernel: Kernel client used for all browser calls.
            session_id: Identifier of the browser session to control.
            width: Viewport width in pixels (used for the default click point).
            height: Viewport height in pixels (used for the default click point).
        """
        self.kernel = kernel
        self.session_id = session_id
        self.width = width
        self.height = height

    async def execute(self, action: N1Action) -> ToolResult:
        """Execute an n1 action and return the result.

        Raises:
            ToolError: if ``action_type`` is missing or not supported, or if
                the selected handler rejects the action.
        """
        action_type = action.get("action_type")

        handlers = {
            "click": self._handle_click,
            "scroll": self._handle_scroll,
            "type": self._handle_type,
            "key_press": self._handle_key_press,
            "hover": self._handle_hover,
            "drag": self._handle_drag,
            "wait": self._handle_wait,
            "refresh": self._handle_refresh,
            "go_back": self._handle_go_back,
            "goto_url": self._handle_goto_url,
            "read_texts_and_links": self._handle_read_texts_and_links,
            "stop": self._handle_stop,
        }

        handler = handlers.get(action_type)
        if not handler:
            raise ToolError(f"Unknown action type: {action_type}")

        return await handler(action)

    async def _screenshot_after(self, delay_seconds: float) -> ToolResult:
        """Sleep ``delay_seconds`` so the UI can settle, then take a screenshot."""
        await asyncio.sleep(delay_seconds)
        return self.screenshot()

    async def _handle_click(self, action: N1Action) -> ToolResult:
        """Single left click at ``center_coordinates``."""
        coords = self._get_coordinates(action.get("center_coordinates"))

        self.kernel.browsers.computer.click_mouse(
            self.session_id,
            x=coords["x"],
            y=coords["y"],
            button="left",
            click_type="click",
            num_clicks=1,
        )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_scroll(self, action: N1Action) -> ToolResult:
        """Scroll at ``center_coordinates`` by ``amount`` units in ``direction``.

        ``amount`` defaults to 3 when it is absent or explicitly ``None``.

        Raises:
            ToolError: on an unknown direction or a non-numeric amount.
        """
        coords = self._get_coordinates(action.get("center_coordinates"))
        direction = action.get("direction")

        # A JSON null for "amount" arrives as None; treat it like "absent"
        # instead of failing on None * 100.
        amount = action.get("amount")
        if amount is None:
            amount = 3

        if not isinstance(direction, str) or direction not in self._SCROLL_VECTORS:
            raise ToolError(f"Invalid scroll direction: {direction}")
        if not isinstance(amount, (int, float)):
            raise ToolError(f"Invalid scroll amount: {amount}")

        # Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels
        scroll_delta = int(amount * 100)
        unit_x, unit_y = self._SCROLL_VECTORS[direction]

        self.kernel.browsers.computer.scroll(
            self.session_id,
            x=coords["x"],
            y=coords["y"],
            delta_x=unit_x * scroll_delta,
            delta_y=unit_y * scroll_delta,
        )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_type(self, action: N1Action) -> ToolResult:
        """Type ``text``, optionally clearing the field first and pressing Enter after.

        Raises:
            ToolError: if ``text`` is missing or empty.
        """
        text = action.get("text")
        if not text:
            raise ToolError("text is required for type action")

        # Clear existing text if requested
        if action.get("clear_before_typing"):
            self.kernel.browsers.computer.press_key(
                self.session_id,
                keys=["ctrl+a"],
            )
            await asyncio.sleep(0.1)
            self.kernel.browsers.computer.press_key(
                self.session_id,
                keys=["BackSpace"],
            )
            await asyncio.sleep(0.1)

        # Type the text
        self.kernel.browsers.computer.type_text(
            self.session_id,
            text=text,
            delay=TYPING_DELAY_MS,
        )

        # Press Enter if requested
        if action.get("press_enter_after"):
            await asyncio.sleep(0.1)
            self.kernel.browsers.computer.press_key(
                self.session_id,
                keys=["Return"],
            )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_key_press(self, action: N1Action) -> ToolResult:
        """Press the key or key combination in ``key_comb``.

        Raises:
            ToolError: if ``key_comb`` is missing or empty.
        """
        key_comb = action.get("key_comb")
        if not key_comb:
            raise ToolError("key_comb is required for key_press action")

        mapped_key = self._map_key(key_comb)

        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=[mapped_key],
        )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_hover(self, action: N1Action) -> ToolResult:
        """Move the mouse to ``center_coordinates`` without clicking."""
        coords = self._get_coordinates(action.get("center_coordinates"))

        self.kernel.browsers.computer.move_mouse(
            self.session_id,
            x=coords["x"],
            y=coords["y"],
        )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_drag(self, action: N1Action) -> ToolResult:
        """Left-button drag from ``start_coordinates`` to ``center_coordinates``."""
        start_coords = self._get_coordinates(action.get("start_coordinates"))
        end_coords = self._get_coordinates(action.get("center_coordinates"))

        self.kernel.browsers.computer.drag_mouse(
            self.session_id,
            path=[[start_coords["x"], start_coords["y"]], [end_coords["x"], end_coords["y"]]],
            button="left",
        )

        return await self._screenshot_after(SCREENSHOT_DELAY_SECONDS)

    async def _handle_wait(self, action: N1Action) -> ToolResult:
        """Pause for a fixed 2 seconds so the UI can update, then screenshot."""
        return await self._screenshot_after(2)

    async def _handle_refresh(self, action: N1Action) -> ToolResult:
        """Refresh the page using keyboard shortcut (F5)."""
        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=["F5"],
        )

        # Wait for page to reload
        return await self._screenshot_after(2)

    async def _handle_go_back(self, action: N1Action) -> ToolResult:
        """Go back using keyboard shortcut (Alt+Left)."""
        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=["alt+Left"],
        )

        # Wait for navigation
        return await self._screenshot_after(1.5)

    async def _handle_goto_url(self, action: N1Action) -> ToolResult:
        """
        Navigate to URL using keyboard shortcuts:
        1. Ctrl+L to focus the URL bar
        2. Type the URL
        3. Press Enter

        Raises:
            ToolError: if ``url`` is missing or empty.
        """
        url = action.get("url")
        if not url:
            raise ToolError("url is required for goto_url action")

        # Focus URL bar with Ctrl+L
        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=["ctrl+l"],
        )
        await asyncio.sleep(ACTION_DELAY_SECONDS)

        # Select all existing text
        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=["ctrl+a"],
        )
        await asyncio.sleep(0.1)

        # Type the URL
        self.kernel.browsers.computer.type_text(
            self.session_id,
            text=url,
            delay=TYPING_DELAY_MS,
        )
        await asyncio.sleep(ACTION_DELAY_SECONDS)

        # Press Enter to navigate
        self.kernel.browsers.computer.press_key(
            self.session_id,
            keys=["Return"],
        )

        # Wait for page to load
        return await self._screenshot_after(2)

    async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult:
        """
        Read texts and links using Playwright's accessibility tree.
        Per n1 docs this is "implemented as an external VLM call" - we use
        Kernel's Playwright Execution API for the accessibility tree and
        Computer Controls API for the screenshot.

        Best effort: any failure falls back to returning a screenshot only.
        """
        try:
            # Get accessibility tree via Playwright Execution API
            result = self.kernel.browsers.playwright.execute(
                self.session_id,
                code="""
                    const url = page.url();
                    const title = await page.title();
                    const accessibilityTree = await page.accessibility.snapshot();
                    return { url, title, accessibilityTree };
                """,
                timeout_sec=30
            )

            # Get screenshot via Computer Controls API
            screenshot_result = self.screenshot()

            if result.success and result.result:
                data = result.result
                return {
                    "base64_image": screenshot_result.get("base64_image", ""),
                    "output": json.dumps({
                        "url": data.get("url"),
                        "title": data.get("title"),
                        "accessibilityTree": data.get("accessibilityTree")
                    }, indent=2)
                }

            # Fallback to just screenshot if Playwright execution fails
            print("Playwright execution failed, falling back to screenshot only")
            return screenshot_result
        except Exception as e:
            # Deliberate catch-all: this action degrades to a plain screenshot.
            print(f"read_texts_and_links failed: {e}")
            return self.screenshot()

    async def _handle_stop(self, action: N1Action) -> ToolResult:
        """Return the final answer without taking a screenshot."""
        return {"output": action.get("answer", "Task completed")}

    def screenshot(self) -> ToolResult:
        """Take a screenshot of the current browser state.

        Raises:
            ToolError: if capturing or encoding the screenshot fails.
        """
        try:
            response = self.kernel.browsers.computer.capture_screenshot(
                self.session_id
            )
            image_bytes = response.read()
            base64_image = base64.b64encode(image_bytes).decode("utf-8")
            return {"base64_image": base64_image}
        except Exception as e:
            raise ToolError(f"Failed to take screenshot: {e}") from e

    def _get_coordinates(
        self, coords: tuple[int, int] | list[int] | None
    ) -> dict[str, int]:
        """Convert n1 coordinates to Kernel format.

        Missing coordinates, or a sequence that does not hold exactly two
        values, fall back to the center of the viewport.

        Raises:
            ToolError: if either value is non-numeric or negative.
        """
        if coords is None or len(coords) != 2:
            # Default to center of screen
            return {"x": self.width // 2, "y": self.height // 2}

        x, y = coords
        if not isinstance(x, (int, float)) or not isinstance(y, (int, float)) or x < 0 or y < 0:
            raise ToolError(f"Invalid coordinates: {coords}")

        return {"x": int(x), "y": int(y)}

    @staticmethod
    def _map_key_part(name: str) -> str:
        """Map one key name; names with no known mapping are returned unchanged."""
        token = name.strip()
        lowered = token.lower()

        # Modifiers are matched case-insensitively ("Control" -> "ctrl").
        if lowered in MODIFIER_MAP:
            return MODIFIER_MAP[lowered]
        # Exact Playwright spelling first, then a case-insensitive fallback.
        if token in KEY_MAP:
            return KEY_MAP[token]
        return _KEY_MAP_LOWER.get(lowered, name)

    def _map_key(self, key: str) -> str:
        """Map key names from Playwright format (n1 output) to xdotool format (Kernel).

        Handles single keys ("Enter" -> "Return", "Control" -> "ctrl") as well
        as "+"-separated combinations ("Control+a" -> "ctrl+a"). Parts of a
        combination are stripped of surrounding whitespace.
        """
        if "+" not in key:
            return self._map_key_part(key)

        return "+".join(self._map_key_part(part.strip()) for part in key.split("+"))
+ +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Viewport Configuration + +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. + +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. + +See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. + +## n1 Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left mouse click at coordinates | +| `scroll` | Scroll page in a direction | +| `type` | Type text into focused element | +| `key_press` | Send keyboard input | +| `hover` | Move mouse without clicking | +| `drag` | Click-and-drag operation | +| `wait` | Pause for UI to update | +| `refresh` | Reload current page | +| `go_back` | Navigate back in history | +| `goto_url` | Navigate to a URL | +| `stop` | End task with final answer | + +## Resources + +- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/yutori-computer-use/_gitignore b/pkg/templates/typescript/yutori-computer-use/_gitignore new file mode 100644 index 0000000..aa0926a --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/_gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +.env +*.log diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts new file 
const kernel = new Kernel();

const app = kernel.app('ts-yutori-cua');

/** Payload accepted by the `cua-task` action. */
interface QueryInput {
  query: string;
  record_replay?: boolean;
}

/** Value returned by the `cua-task` action. */
interface QueryOutput {
  result: string;
  replay_url?: string;
}

// LLM API Keys are set in the environment during `kernel deploy -e YUTORI_API_KEY=XXX`
// See https://www.kernel.sh/docs/launch/deploy#environment-variables
const YUTORI_API_KEY = process.env.YUTORI_API_KEY;

if (!YUTORI_API_KEY) {
  throw new Error('YUTORI_API_KEY is not set');
}

/**
 * `cua-task`: run one n1 computer-use task in a fresh Kernel browser.
 *
 * Creates a browser session (optionally with replay recording), runs the
 * sampling loop for `payload.query`, stops the session and returns the final
 * answer together with the replay URL when one was recorded.
 *
 * Throws when `payload.query` is missing, and rethrows any error raised by
 * the sampling loop after attempting to stop the session.
 */
app.action(
  'cua-task',
  async (ctx: KernelContext, payload?: QueryInput): Promise<QueryOutput> => {
    if (!payload?.query) {
      throw new Error('Query is required');
    }

    // Create browser session with optional replay recording
    const session = new KernelBrowserSession(kernel, {
      stealth: true,
      recordReplay: payload.record_replay ?? false,
    });

    await session.start();
    console.log('Kernel browser live view url:', session.liveViewUrl);

    try {
      // Run the sampling loop
      const { finalAnswer, messages } = await samplingLoop({
        model: 'n1-preview-2025-11',
        task: payload.query,
        apiKey: YUTORI_API_KEY,
        kernel,
        sessionId: session.sessionId,
      });

      // Extract the result
      const result = finalAnswer || extractLastAssistantMessage(messages);

      // Stop session and get replay URL if recording was enabled
      const sessionInfo = await session.stop();

      return {
        result,
        replay_url: sessionInfo.replayViewUrl,
      };
    } catch (error) {
      console.error('Error in sampling loop:', error);
      // Cleanup is best effort here: a failure while stopping the session
      // must not replace the error that actually aborted the task.
      try {
        await session.stop();
      } catch (cleanupError) {
        console.error('Failed to stop browser session after error:', cleanupError);
      }
      throw error;
    }
  },
);
/**
 * Extract the text of the last assistant message in the conversation.
 *
 * Walks the messages from newest to oldest and looks at the first assistant
 * message whose content is a string. If that content is a JSON object with a
 * non-empty string `thoughts` field, the thoughts are returned; otherwise the
 * raw content is returned. Assistant messages with non-string content are
 * skipped.
 *
 * @returns the extracted text, or 'Task completed' when there is no
 *          assistant message with string content.
 */
function extractLastAssistantMessage(messages: { role: string; content: string | unknown[] }[]): string {
  for (let i = messages.length - 1; i >= 0; i--) {
    const msg = messages[i];
    if (msg.role !== 'assistant' || typeof msg.content !== 'string') {
      continue;
    }

    // Try to parse the thoughts from JSON response
    let thoughts: unknown;
    try {
      const parsed: unknown = JSON.parse(msg.content);
      if (parsed !== null && typeof parsed === 'object') {
        thoughts = (parsed as { thoughts?: unknown }).thoughts;
      }
    } catch {
      // Not JSON: the raw content is the answer.
    }

    // Valid JSON without usable thoughts must still yield this message's
    // text rather than falling through to an older message.
    return typeof thoughts === 'string' && thoughts.length > 0 ? thoughts : msg.content;
  }
  return 'Task completed';
}
/** Wrap a base64 PNG screenshot as an `image_url` content part. */
function screenshotPart(base64Image: string): MessageContent {
  return {
    type: 'image_url',
    image_url: {
      url: `data:image/png;base64,${base64Image}`,
    },
  };
}

/**
 * Turn a tool result into the observation message sent back to n1.
 *
 * Text output (e.g. from read_texts_and_links) comes first, then the
 * screenshot. A result with neither becomes an "Action failed" text
 * observation when it carries an error, and no message at all otherwise.
 */
function toObservation(result: ToolResult): Message | null {
  const parts: MessageContent[] = [];

  if (result.output) {
    parts.push({ type: 'text', text: result.output });
  }
  if (result.base64Image) {
    parts.push(screenshotPart(result.base64Image));
  }

  if (parts.length > 0) {
    return { role: 'observation', content: parts };
  }
  if (result.error) {
    return {
      role: 'observation',
      content: [{ type: 'text', text: `Action failed: ${result.error}` }],
    };
  }
  return null;
}

/**
 * Run the n1 sampling loop until the model returns a stop action, returns no
 * actions, or `maxIterations` model calls have been made.
 *
 * Conversation layout follows n1's format: a user message with the task,
 * then an observation with the initial screenshot, then alternating
 * assistant responses and observations. No system prompt is sent.
 *
 * @returns every message exchanged plus the `answer` of the stop action,
 *          when one was received.
 * @throws when the API call fails or returns no usable choice.
 */
export async function samplingLoop({
  model = 'n1-preview-2025-11',
  task,
  apiKey,
  kernel,
  sessionId,
  maxTokens = 4096,
  maxIterations = 50,
  // Default viewport is 1200x800 - the closest match to Yutori's recommended 1280x800 (WXGA)
  viewportWidth = 1200,
  viewportHeight = 800,
}: SamplingLoopOptions): Promise<SamplingLoopResult> {
  const client = new OpenAI({
    apiKey,
    baseURL: 'https://api.yutori.com/v1',
  });

  const computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight);

  // The conversation starts with the task, followed by the initial
  // screenshot as an observation (n1's required format).
  const firstShot = await computerTool.screenshot();
  const conversationMessages: Message[] = [
    {
      role: 'user',
      content: [{ type: 'text', text: task }],
    },
  ];
  if (firstShot.base64Image) {
    conversationMessages.push({
      role: 'observation',
      content: [screenshotPart(firstShot.base64Image)],
    });
  }

  let finalAnswer: string | undefined;
  let iterationsRun = 0;

  for (let turn = 1; turn <= maxIterations; turn++) {
    iterationsRun = turn;
    console.log(`\n=== Iteration ${turn} ===`);

    // Call the n1 API (no system prompt - n1 uses its own)
    let response;
    try {
      response = await client.chat.completions.create({
        model,
        messages: conversationMessages as OpenAI.ChatCompletionMessageParam[],
        max_tokens: maxTokens,
        temperature: 0.3,
      });
    } catch (apiError) {
      console.error('API call failed:', apiError);
      throw apiError;
    }

    if (!response.choices || response.choices.length === 0) {
      console.error('No choices in response:', JSON.stringify(response, null, 2));
      throw new Error('No choices in API response');
    }

    const reply = response.choices[0]?.message;
    if (!reply) {
      throw new Error('No response from model');
    }

    const replyText = reply.content || '';
    console.log('Assistant response:', replyText);

    // Record the assistant turn before acting on it
    conversationMessages.push({
      role: 'assistant',
      content: replyText,
    });

    // n1 returns JSON with "thoughts" and an "actions" array
    const plan = parseN1Response(replyText);
    const actions = plan?.actions;
    if (!actions || actions.length === 0) {
      console.log('No actions found in response, ending loop');
      break;
    }

    for (const action of actions) {
      console.log('Executing action:', action.action_type, action);

      // A stop action ends the whole run immediately
      if (action.action_type === 'stop') {
        finalAnswer = action.answer;
        console.log('Stop action received, final answer:', finalAnswer);
        return { messages: conversationMessages, finalAnswer };
      }

      // n1 works in a 1000x1000 space; convert to real viewport pixels
      const pixelAction = scaleCoordinates(action, viewportWidth, viewportHeight);

      let result: ToolResult;
      try {
        result = await computerTool.execute(pixelAction);
      } catch (error) {
        console.error('Action failed:', error);
        result = {
          error: error instanceof Error ? error.message : String(error),
        };
      }

      // Feed the outcome back to the model as an observation
      const observation = toObservation(result);
      if (observation) {
        conversationMessages.push(observation);
      }
    }
  }

  if (iterationsRun >= maxIterations) {
    console.log('Max iterations reached');
  }

  return {
    messages: conversationMessages,
    finalAnswer,
  };
}
/**
 * Convert an action's coordinates from n1's relative 1000x1000 grid into
 * viewport pixels.
 * Per docs: "n1-preview-2025-11 outputs relative coordinates in 1000×1000"
 *
 * Returns a shallow copy of `action`; `center_coordinates` and
 * `start_coordinates` are replaced by rounded pixel positions when present,
 * and every other field is carried over untouched. The input is not mutated.
 */
function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action {
  const toPixels = (point: [number, number]): [number, number] => [
    Math.round((point[0] / 1000) * viewportWidth),
    Math.round((point[1] / 1000) * viewportHeight),
  ];

  const { center_coordinates: center, start_coordinates: start } = action;

  return {
    ...action,
    ...(center ? { center_coordinates: toPixels(center) } : {}),
    ...(start ? { start_coordinates: toPixels(start) } : {}),
  };
}
/**
 * Manages Kernel browser lifecycle with optional replay recording.
 *
 * Usage:
 * ```typescript
 * const session = new KernelBrowserSession(kernel, options);
 * await session.start();
 * try {
 *   // Use session.sessionId for computer controls
 * } finally {
 *   await session.stop();
 * }
 * ```
 *
 * `stop()` is safe to call when the session was never started or has
 * already been stopped; it then returns the (empty) session info without
 * contacting Kernel.
 */
export class KernelBrowserSession {
  private kernel: Kernel;
  private options: Required<SessionOptions>;

  // Session state
  private _sessionId: string | null = null;
  private _liveViewUrl: string | null = null;
  private _cdpWsUrl: string | null = null;
  private _replayId: string | null = null;
  private _replayViewUrl: string | null = null;

  /**
   * @param kernel Kernel client used for all browser and replay calls.
   * @param options Session options; unset fields fall back to DEFAULT_OPTIONS.
   */
  constructor(kernel: Kernel, options: SessionOptions = {}) {
    this.kernel = kernel;
    this.options = { ...DEFAULT_OPTIONS, ...options };
  }

  /** Id of the running browser session. Throws if the session is not started. */
  get sessionId(): string {
    if (!this._sessionId) {
      throw new Error('Session not started. Call start() first.');
    }
    return this._sessionId;
  }

  get liveViewUrl(): string | null {
    return this._liveViewUrl;
  }

  get cdpWsUrl(): string | null {
    return this._cdpWsUrl;
  }

  get replayViewUrl(): string | null {
    return this._replayViewUrl;
  }

  /** Snapshot of the running session. Throws if the session is not started. */
  get info(): SessionInfo {
    const sessionId = this.sessionId;
    return { ...this.snapshot(), sessionId };
  }

  /** Current state as a SessionInfo; never throws (sessionId is '' when unset). */
  private snapshot(): SessionInfo {
    return {
      sessionId: this._sessionId || '',
      liveViewUrl: this._liveViewUrl || '',
      cdpWsUrl: this._cdpWsUrl || '',
      replayId: this._replayId || undefined,
      replayViewUrl: this._replayViewUrl || undefined,
    };
  }

  /**
   * Create a Kernel browser session and optionally start recording.
   *
   * A failure to start the replay recording is logged and otherwise
   * ignored; the session is still usable.
   */
  async start(): Promise<SessionInfo> {
    // Create browser with viewport closest to Yutori n1's recommended 1280x800
    // Using 1200x800 at 25Hz - the closest supported Kernel viewport
    // See: https://docs.yutori.com/reference/n1#screenshot-requirements
    const browser = await this.kernel.browsers.create({
      stealth: this.options.stealth,
      timeout_seconds: this.options.timeoutSeconds,
      viewport: {
        width: 1200,
        height: 800,
        refresh_rate: 25,
      },
    });

    this._sessionId = browser.session_id;
    this._liveViewUrl = browser.browser_live_view_url;
    this._cdpWsUrl = browser.cdp_ws_url;

    console.log(`Kernel browser created: ${this._sessionId}`);
    console.log(`Live view URL: ${this._liveViewUrl}`);

    // Start replay recording if enabled
    if (this.options.recordReplay) {
      try {
        await this.startReplay();
      } catch (error) {
        console.warn(`Warning: Failed to start replay recording: ${error}`);
        console.warn('Continuing without replay recording.');
      }
    }

    return this.info;
  }

  /**
   * Start recording a replay of the browser session.
   */
  private async startReplay(): Promise<void> {
    if (!this._sessionId) {
      return;
    }

    console.log('Starting replay recording...');
    const replay = await this.kernel.browsers.replays.start(this._sessionId);
    this._replayId = replay.replay_id;
    console.log(`Replay recording started: ${this._replayId}`);
  }

  /**
   * Stop recording and get the replay URL.
   *
   * After stopping, the replay list is polled once per second for up to 60
   * seconds. Errors while polling do not abort the wait, but the most recent
   * one is reported if the replay does not become available in time.
   */
  private async stopReplay(): Promise<void> {
    if (!this._sessionId || !this._replayId) {
      return;
    }

    console.log('Stopping replay recording...');
    await this.kernel.browsers.replays.stop(this._replayId, {
      id: this._sessionId,
    });
    console.log('Replay recording stopped. Processing video...');

    // Wait a moment for processing
    await this.sleep(2000);

    // Poll for replay to be ready (with timeout)
    const maxWait = 60000; // 60 seconds
    const startTime = Date.now();
    let replayReady = false;
    let lastError: unknown;

    while (Date.now() - startTime < maxWait) {
      try {
        const replays = await this.kernel.browsers.replays.list(this._sessionId);
        for (const replay of replays) {
          if (replay.replay_id === this._replayId) {
            this._replayViewUrl = replay.replay_view_url;
            replayReady = true;
            break;
          }
        }
        if (replayReady) {
          break;
        }
      } catch (error) {
        // Keep polling, but remember the failure so it can be reported
        lastError = error;
      }
      await this.sleep(1000);
    }

    if (!replayReady) {
      console.log('Warning: Replay may still be processing');
      if (lastError !== undefined) {
        console.warn(`Last error while polling for replay: ${lastError}`);
      }
    } else if (this._replayViewUrl) {
      console.log(`Replay view URL: ${this._replayViewUrl}`);
    }
  }

  /**
   * Stop recording, and delete the browser session.
   *
   * @returns the session info captured before teardown, with
   *          `replayViewUrl` filled in when a replay was recorded. When no
   *          session is active, the info has an empty `sessionId` and
   *          nothing is torn down.
   */
  async stop(): Promise<SessionInfo> {
    // Use the non-throwing snapshot: stop() is called from cleanup paths and
    // must not fail just because there is nothing to stop.
    const info = this.snapshot();

    if (this._sessionId) {
      try {
        // Stop replay if recording was enabled
        if (this.options.recordReplay && this._replayId) {
          // Wait grace period before stopping to capture final state
          if (this.options.replayGracePeriod > 0) {
            console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`);
            await this.sleep(this.options.replayGracePeriod * 1000);
          }
          await this.stopReplay();
          info.replayViewUrl = this._replayViewUrl || undefined;
        }
      } finally {
        // Always clean up the browser session, even if replay stopping fails
        console.log(`Destroying browser session: ${this._sessionId}`);
        await this.kernel.browsers.deleteByID(this._sessionId);
        console.log('Browser session destroyed.');
      }
    }

    // Reset state
    this._sessionId = null;
    this._liveViewUrl = null;
    this._cdpWsUrl = null;
    this._replayId = null;
    this._replayViewUrl = null;

    return info;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
+ */ + +import { Buffer } from 'buffer'; +import type { Kernel } from '@onkernel/sdk'; + +const TYPING_DELAY_MS = 12; +const SCREENSHOT_DELAY_MS = 500; +const ACTION_DELAY_MS = 500; + +export interface ToolResult { + base64Image?: string; + output?: string; + error?: string; +} + +export class ToolError extends Error { + constructor(message: string) { + super(message); + this.name = 'ToolError'; + } +} + +// n1 action types +export type N1ActionType = + | 'click' + | 'scroll' + | 'type' + | 'key_press' + | 'hover' + | 'drag' + | 'wait' + | 'refresh' + | 'go_back' + | 'goto_url' + | 'read_texts_and_links' + | 'stop'; + +export interface N1Action { + action_type: N1ActionType; + center_coordinates?: [number, number]; + start_coordinates?: [number, number]; + direction?: 'up' | 'down' | 'left' | 'right'; + amount?: number; + text?: string; + press_enter_after?: boolean; + clear_before_typing?: boolean; + key_comb?: string; + url?: string; + answer?: string; +} + +// Key mappings from Playwright format (n1 output) to xdotool format (Kernel) +const KEY_MAP: Record = { + 'Enter': 'Return', + 'Escape': 'Escape', + 'Backspace': 'BackSpace', + 'Tab': 'Tab', + 'Delete': 'Delete', + 'ArrowUp': 'Up', + 'ArrowDown': 'Down', + 'ArrowLeft': 'Left', + 'ArrowRight': 'Right', + 'Home': 'Home', + 'End': 'End', + 'PageUp': 'Page_Up', + 'PageDown': 'Page_Down', + 'F1': 'F1', + 'F2': 'F2', + 'F3': 'F3', + 'F4': 'F4', + 'F5': 'F5', + 'F6': 'F6', + 'F7': 'F7', + 'F8': 'F8', + 'F9': 'F9', + 'F10': 'F10', + 'F11': 'F11', + 'F12': 'F12', +}; + +const MODIFIER_MAP: Record = { + 'control': 'ctrl', + 'ctrl': 'ctrl', + 'alt': 'alt', + 'shift': 'shift', + 'meta': 'super', + 'command': 'super', + 'cmd': 'super', +}; + +/** + * Computer tool for Yutori n1 actions using Kernel browser. 
+ */ +export class ComputerTool { + private kernel: Kernel; + private sessionId: string; + private width: number; + private height: number; + + constructor(kernel: Kernel, sessionId: string, width = 1200, height = 800) { + this.kernel = kernel; + this.sessionId = sessionId; + this.width = width; + this.height = height; + } + + /** + * Execute an n1 action and return the result. + */ + async execute(action: N1Action): Promise { + const { action_type } = action; + + switch (action_type) { + case 'click': + return this.handleClick(action); + case 'scroll': + return this.handleScroll(action); + case 'type': + return this.handleType(action); + case 'key_press': + return this.handleKeyPress(action); + case 'hover': + return this.handleHover(action); + case 'drag': + return this.handleDrag(action); + case 'wait': + return this.handleWait(); + case 'refresh': + return this.handleRefresh(); + case 'go_back': + return this.handleGoBack(); + case 'goto_url': + return this.handleGotoUrl(action); + case 'read_texts_and_links': + return this.handleReadTextsAndLinks(); + case 'stop': + return this.handleStop(action); + default: + throw new ToolError(`Unknown action type: ${action_type}`); + } + } + + private async handleClick(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x: coords.x, + y: coords.y, + button: 'left', + click_type: 'click', + num_clicks: 1, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleScroll(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + const direction = action.direction; + const amount = action.amount ?? 
3; + + if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { + throw new ToolError(`Invalid scroll direction: ${direction}`); + } + + // Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels + const scrollDelta = amount * 100; + + let delta_x = 0; + let delta_y = 0; + + switch (direction) { + case 'up': + delta_y = -scrollDelta; + break; + case 'down': + delta_y = scrollDelta; + break; + case 'left': + delta_x = -scrollDelta; + break; + case 'right': + delta_x = scrollDelta; + break; + } + + await this.kernel.browsers.computer.scroll(this.sessionId, { + x: coords.x, + y: coords.y, + delta_x, + delta_y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleType(action: N1Action): Promise { + const text = action.text; + if (!text) { + throw new ToolError('text is required for type action'); + } + + // Clear existing text if requested + if (action.clear_before_typing) { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+a'], + }); + await this.sleep(100); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['BackSpace'], + }); + await this.sleep(100); + } + + // Type the text + await this.kernel.browsers.computer.typeText(this.sessionId, { + text, + delay: TYPING_DELAY_MS, + }); + + // Press Enter if requested + if (action.press_enter_after) { + await this.sleep(100); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['Return'], + }); + } + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleKeyPress(action: N1Action): Promise { + const keyComb = action.key_comb; + if (!keyComb) { + throw new ToolError('key_comb is required for key_press action'); + } + + const mappedKey = this.mapKey(keyComb); + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [mappedKey], + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private 
async handleHover(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + + await this.kernel.browsers.computer.moveMouse(this.sessionId, { + x: coords.x, + y: coords.y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleDrag(action: N1Action): Promise { + const startCoords = this.getCoordinates(action.start_coordinates); + const endCoords = this.getCoordinates(action.center_coordinates); + + await this.kernel.browsers.computer.dragMouse(this.sessionId, { + path: [[startCoords.x, startCoords.y], [endCoords.x, endCoords.y]], + button: 'left', + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleWait(): Promise { + // Default wait of 2 seconds for UI to update + await this.sleep(2000); + return this.screenshot(); + } + + /** + * Refresh the page using keyboard shortcut (Ctrl+R or F5) + */ + private async handleRefresh(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['F5'], + }); + + // Wait for page to reload + await this.sleep(2000); + return this.screenshot(); + } + + /** + * Go back using keyboard shortcut (Alt+Left) + */ + private async handleGoBack(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['alt+Left'], + }); + + // Wait for navigation + await this.sleep(1500); + return this.screenshot(); + } + + /** + * Navigate to URL using keyboard shortcuts: + * 1. Ctrl+L to focus the URL bar + * 2. Type the URL + * 3. 
Press Enter + */ + private async handleGotoUrl(action: N1Action): Promise { + const url = action.url; + if (!url) { + throw new ToolError('url is required for goto_url action'); + } + + // Focus URL bar with Ctrl+L + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+l'], + }); + await this.sleep(ACTION_DELAY_MS); + + // Select all existing text and type the new URL + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+a'], + }); + await this.sleep(100); + + // Type the URL + await this.kernel.browsers.computer.typeText(this.sessionId, { + text: url, + delay: TYPING_DELAY_MS, + }); + await this.sleep(ACTION_DELAY_MS); + + // Press Enter to navigate + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['Return'], + }); + + // Wait for page to load + await this.sleep(2000); + return this.screenshot(); + } + + /** + * Read texts and links using Playwright's accessibility tree. + * Per n1 docs this is "implemented as an external VLM call" - we use + * Kernel's Playwright Execution API for the accessibility tree and + * Computer Controls API for the screenshot. 
+ */ + private async handleReadTextsAndLinks(): Promise { + try { + // Get accessibility tree via Playwright Execution API + const result = await this.kernel.browsers.playwright.execute( + this.sessionId, + { + code: ` + const url = page.url(); + const title = await page.title(); + const accessibilityTree = await page.accessibility.snapshot(); + return { url, title, accessibilityTree }; + `, + timeout_sec: 30 + } + ); + + // Get screenshot via Computer Controls API + const screenshotResult = await this.screenshot(); + + if (result.success && result.result) { + const { url, title, accessibilityTree } = result.result as { + url: string; + title: string; + accessibilityTree: unknown; + }; + + return { + base64Image: screenshotResult.base64Image, + output: JSON.stringify({ url, title, accessibilityTree }, null, 2) + }; + } + + // Fallback to just screenshot if Playwright execution fails + console.warn('Playwright execution failed, falling back to screenshot only'); + return screenshotResult; + } catch (error) { + console.warn('read_texts_and_links failed:', error); + return this.screenshot(); + } + } + + private handleStop(action: N1Action): ToolResult { + // Return the final answer without taking a screenshot + return { + output: action.answer || 'Task completed', + }; + } + + /** + * Take a screenshot of the current browser state. 
+ */ + async screenshot(): Promise { + try { + const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); + const blob = await response.blob(); + const arrayBuffer = await blob.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + + return { + base64Image: buffer.toString('base64'), + }; + } catch (error) { + throw new ToolError(`Failed to take screenshot: ${error}`); + } + } + + private getCoordinates(coords?: [number, number]): { x: number; y: number } { + if (!coords || coords.length !== 2) { + // Default to center of screen + return { x: this.width / 2, y: this.height / 2 }; + } + + const [x, y] = coords; + if (typeof x !== 'number' || typeof y !== 'number' || x < 0 || y < 0) { + throw new ToolError(`Invalid coordinates: ${JSON.stringify(coords)}`); + } + + return { x, y }; + } + + /** + * Map key names from Playwright format (n1 output) to xdotool format (Kernel). + */ + private mapKey(key: string): string { + // Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") + if (key.includes('+')) { + const parts = key.split('+'); + const mappedParts = parts.map(part => { + const trimmed = part.trim(); + const lower = trimmed.toLowerCase(); + + // Map modifier names + if (MODIFIER_MAP[lower]) { + return MODIFIER_MAP[lower]; + } + + // Check KEY_MAP for special keys + return KEY_MAP[trimmed] || trimmed; + }); + return mappedParts.join('+'); + } + + return KEY_MAP[key] || key; + } + + private sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git a/pkg/templates/typescript/yutori-computer-use/tsconfig.json b/pkg/templates/typescript/yutori-computer-use/tsconfig.json new file mode 100644 index 0000000..13616f5 --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../tsconfig.base.json", + "compilerOptions": { + "outDir": "./dist", + "rootDir": "." 
+ }, + "include": ["./**/*.ts"], + "exclude": ["node_modules", "dist"] +} From 5e9fad93e0a6dc09cc3e4a72c701b87533ba2bcc Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 20 Jan 2026 07:26:36 -0500 Subject: [PATCH 02/12] fix: Use _snapshotForAI() for read_texts_and_links action Replace page.accessibility.snapshot() with page._snapshotForAI() which is specifically designed for AI agents and documented in Kernel's MCP server. The previous implementation used the experimental/deprecated accessibility API which failed silently and fell back to screenshot-only mode. _snapshotForAI() returns a structured representation of the page optimized for LLM consumption, including visible text, interactive elements (links, buttons, inputs), and page structure - exactly what n1 needs for reading texts and saving URLs for citation. --- .../python/yutori-computer-use/tools/computer.py | 12 ++++++------ .../yutori-computer-use/tools/computer.ts | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index cb216f2..87dad3e 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -320,20 +320,20 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: """ - Read texts and links using Playwright's accessibility tree. + Read texts and links using Playwright's _snapshotForAI(). Per n1 docs this is "implemented as an external VLM call" - we use - Kernel's Playwright Execution API for the accessibility tree and + Kernel's Playwright Execution API for the AI snapshot and Computer Controls API for the screenshot. 
""" try: - # Get accessibility tree via Playwright Execution API + # Get AI snapshot via Playwright Execution API result = self.kernel.browsers.playwright.execute( self.session_id, code=""" + const snapshot = await page._snapshotForAI(); const url = page.url(); const title = await page.title(); - const accessibilityTree = await page.accessibility.snapshot(); - return { url, title, accessibilityTree }; + return { url, title, snapshot }; """, timeout_sec=30 ) @@ -348,7 +348,7 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: "output": json.dumps({ "url": data.get("url"), "title": data.get("title"), - "accessibilityTree": data.get("accessibilityTree") + "snapshot": data.get("snapshot") }, indent=2) } diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index 66ba637..e01bffa 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -351,22 +351,22 @@ export class ComputerTool { } /** - * Read texts and links using Playwright's accessibility tree. + * Read texts and links using Playwright's _snapshotForAI(). * Per n1 docs this is "implemented as an external VLM call" - we use - * Kernel's Playwright Execution API for the accessibility tree and + * Kernel's Playwright Execution API for the AI snapshot and * Computer Controls API for the screenshot. 
*/ private async handleReadTextsAndLinks(): Promise { try { - // Get accessibility tree via Playwright Execution API + // Get AI snapshot via Playwright Execution API const result = await this.kernel.browsers.playwright.execute( this.sessionId, { code: ` + const snapshot = await page._snapshotForAI(); const url = page.url(); const title = await page.title(); - const accessibilityTree = await page.accessibility.snapshot(); - return { url, title, accessibilityTree }; + return { url, title, snapshot }; `, timeout_sec: 30 } @@ -376,15 +376,15 @@ export class ComputerTool { const screenshotResult = await this.screenshot(); if (result.success && result.result) { - const { url, title, accessibilityTree } = result.result as { + const { url, title, snapshot } = result.result as { url: string; title: string; - accessibilityTree: unknown; + snapshot: string; }; return { base64Image: screenshotResult.base64Image, - output: JSON.stringify({ url, title, accessibilityTree }, null, 2) + output: JSON.stringify({ url, title, snapshot }, null, 2) }; } From d2deaaee884728efc2859002a305c5260c853f00 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 20 Jan 2026 17:38:34 -0500 Subject: [PATCH 03/12] feat(yutori): add optional Playwright mode for viewport-only screenshots Add PlaywrightComputerTool adapter that connects via CDP WebSocket for browser-only screenshots, optimized for Yutori n1's training data per their documentation recommendations. 
Changes: - Add PlaywrightComputerTool class (TS + Python) using CDP connection - Add 'mode' parameter to sampling loop ('computer_use' | 'playwright') - Default to 'computer_use' mode (stable); 'playwright' is opt-in - Add configurable viewport dimensions (1200x800) - Expose cdp_ws_url from session for Playwright connection - Add playwright-core (TS) and playwright (Python) dependencies The playwright mode provides viewport-only screenshots without OS UI or browser chrome, improving n1 model performance per Yutori's docs: https://docs.yutori.com/reference/n1#screenshot-requirements --- .../python/yutori-computer-use/loop.py | 297 +++++++------ .../python/yutori-computer-use/main.py | 13 +- .../python/yutori-computer-use/pyproject.toml | 1 + .../python/yutori-computer-use/session.py | 12 +- .../yutori-computer-use/tools/__init__.py | 2 + .../tools/playwright_computer.py | 352 +++++++++++++++ .../typescript/yutori-computer-use/index.ts | 13 +- .../typescript/yutori-computer-use/loop.ts | 304 +++++++------ .../yutori-computer-use/package.json | 3 +- .../typescript/yutori-computer-use/session.ts | 27 +- .../tools/playwright-computer.ts | 410 ++++++++++++++++++ 11 files changed, 1168 insertions(+), 266 deletions(-) create mode 100644 pkg/templates/python/yutori-computer-use/tools/playwright_computer.py create mode 100644 pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index e2049af..3a19bc1 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -7,17 +7,35 @@ - Coordinates are returned in 1000x1000 space and need scaling - WebP format recommended for screenshots +Supports two modes: +- computer_use: Uses Kernel's Computer Controls API (full VM screenshots) +- playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + @see 
https://docs.yutori.com/reference/n1 """ import json import re -from typing import Any, Optional +from typing import Any, Literal, Optional, Protocol from kernel import Kernel from openai import OpenAI from tools import ComputerTool, N1Action, ToolResult +from tools.playwright_computer import PlaywrightComputerTool + +# Mode for browser interaction +BrowserMode = Literal["computer_use", "playwright"] + + +class N1ComputerToolProtocol(Protocol): + """Protocol for tools that can execute n1 actions.""" + + async def execute(self, action: N1Action) -> ToolResult: + ... + + async def screenshot(self) -> ToolResult: + ... # n1 uses its own system prompt - custom prompts may degrade performance # Per docs: "we generally do not recommend providing custom system prompts" @@ -30,11 +48,13 @@ async def sampling_loop( api_key: str, kernel: Kernel, session_id: str, + cdp_ws_url: Optional[str] = None, max_tokens: int = 4096, max_iterations: int = 50, - # Default viewport matches WXGA (1200x800) - closest to Yutori's recommended 1280x800 - viewport_width: int = 1200, + # Default viewport matches Yutori's recommended 1280x800 (WXGA 16:10) + viewport_width: int = 1280, viewport_height: int = 800, + mode: BrowserMode = "computer_use", ) -> dict[str, Any]: """ Run the n1 sampling loop until the model returns a stop action or max iterations. 
@@ -45,10 +65,12 @@ async def sampling_loop( api_key: Yutori API key kernel: Kernel client instance session_id: Browser session ID + cdp_ws_url: CDP WebSocket URL for playwright mode max_tokens: Maximum tokens per response max_iterations: Maximum loop iterations viewport_width: Browser viewport width for coordinate scaling viewport_height: Browser viewport height for coordinate scaling + mode: Browser interaction mode ('computer_use' or 'playwright') Returns: Dict with 'messages' (conversation history) and 'final_answer' (if stopped) @@ -58,138 +80,159 @@ async def sampling_loop( base_url="https://api.yutori.com/v1", ) - computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) + # Create the appropriate tool based on mode + computer_tool: N1ComputerToolProtocol + playwright_tool: Optional[PlaywrightComputerTool] = None - # Take initial screenshot - initial_screenshot = computer_tool.screenshot() + print(f"Mode requested: {mode!r}, cdp_ws_url available: {cdp_ws_url is not None}") - # Build conversation per n1 format: - # 1. User message with task - # 2. 
Observation message with screenshot - conversation_messages: list[dict[str, Any]] = [ - { - "role": "user", - "content": [{"type": "text", "text": task}], - } - ] - - # Add initial screenshot as observation (n1's required format) - if initial_screenshot.get("base64_image"): - conversation_messages.append({ - "role": "observation", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" - }, - } - ], - }) - - iteration = 0 - final_answer: Optional[str] = None - - while iteration < max_iterations: - iteration += 1 - print(f"\n=== Iteration {iteration} ===") - - # Call the n1 API (no system prompt - n1 uses its own) - try: - response = client.chat.completions.create( - model=model, - messages=conversation_messages, - max_tokens=max_tokens, - temperature=0.3, - ) - except Exception as api_error: - print(f"API call failed: {api_error}") - raise api_error - - if not response.choices or len(response.choices) == 0: - print(f"No choices in response: {response}") - raise ValueError("No choices in API response") - - assistant_message = response.choices[0].message - if not assistant_message: - raise ValueError("No response from model") - - response_content = assistant_message.content or "" - print("Assistant response:", response_content) - - # Add assistant message to conversation - conversation_messages.append({ - "role": "assistant", - "content": response_content, - }) - - # Parse the action(s) from the response - # n1 returns JSON with "thoughts" and "actions" array - parsed = _parse_n1_response(response_content) - - if not parsed or not parsed.get("actions"): - print("No actions found in response, ending loop") - break - - # Execute each action in the actions array - for action in parsed["actions"]: - print(f"Executing action: {action.get('action_type')}", action) - - # Check for stop action - if action.get("action_type") == "stop": - final_answer = action.get("answer") - print(f"Stop action 
received, final answer: {final_answer}") - return {"messages": conversation_messages, "final_answer": final_answer} - - # Scale coordinates from n1's 1000x1000 space to actual viewport - scaled_action = _scale_coordinates(action, viewport_width, viewport_height) - - # Execute the action - result: ToolResult - try: - result = await computer_tool.execute(scaled_action) - except Exception as e: - print(f"Action failed: {e}") - result = {"error": str(e)} - - # After action, add observation with screenshot and optional text output - if result.get("base64_image") or result.get("output"): - observation_content = [] - - # Add text output first (e.g., from read_texts_and_links) - if result.get("output"): - observation_content.append({ - "type": "text", - "text": result["output"], - }) + if mode == "playwright": + if not cdp_ws_url: + raise ValueError("cdp_ws_url is required for playwright mode") + print(f"Connecting to CDP WebSocket: {cdp_ws_url[:50]}...") + playwright_tool = PlaywrightComputerTool(cdp_ws_url, viewport_width, viewport_height) + await playwright_tool.connect() + computer_tool = playwright_tool + print("Using playwright mode (viewport-only screenshots)") + else: + computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) + print("Using computer_use mode (Computer Controls API)") - # Add screenshot - if result.get("base64_image"): - observation_content.append({ + try: + # Take initial screenshot + initial_screenshot = await computer_tool.screenshot() + + # Build conversation per n1 format: + # 1. User message with task + # 2. 
Observation message with screenshot + conversation_messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": [{"type": "text", "text": task}], + } + ] + + # Add initial screenshot as observation (n1's required format) + if initial_screenshot.get("base64_image"): + conversation_messages.append({ + "role": "observation", + "content": [ + { "type": "image_url", "image_url": { - "url": f"data:image/png;base64,{result['base64_image']}" + "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" }, + } + ], + }) + + iteration = 0 + final_answer: Optional[str] = None + + while iteration < max_iterations: + iteration += 1 + print(f"\n=== Iteration {iteration} ===") + + # Call the n1 API (no system prompt - n1 uses its own) + try: + response = client.chat.completions.create( + model=model, + messages=conversation_messages, + max_tokens=max_tokens, + temperature=0.3, + ) + except Exception as api_error: + print(f"API call failed: {api_error}") + raise api_error + + if not response.choices or len(response.choices) == 0: + print(f"No choices in response: {response}") + raise ValueError("No choices in API response") + + assistant_message = response.choices[0].message + if not assistant_message: + raise ValueError("No response from model") + + response_content = assistant_message.content or "" + print("Assistant response:", response_content) + + # Add assistant message to conversation + conversation_messages.append({ + "role": "assistant", + "content": response_content, + }) + + # Parse the action(s) from the response + # n1 returns JSON with "thoughts" and "actions" array + parsed = _parse_n1_response(response_content) + + if not parsed or not parsed.get("actions"): + print("No actions found in response, ending loop") + break + + # Execute each action in the actions array + for action in parsed["actions"]: + print(f"Executing action: {action.get('action_type')}", action) + + # Check for stop action + if action.get("action_type") == "stop": + final_answer 
= action.get("answer") + print(f"Stop action received, final answer: {final_answer}") + return {"messages": conversation_messages, "final_answer": final_answer} + + # Scale coordinates from n1's 1000x1000 space to actual viewport + scaled_action = _scale_coordinates(action, viewport_width, viewport_height) + + # Execute the action + result: ToolResult + try: + result = await computer_tool.execute(scaled_action) + except Exception as e: + print(f"Action failed: {e}") + result = {"error": str(e)} + + # After action, add observation with screenshot and optional text output + if result.get("base64_image") or result.get("output"): + observation_content = [] + + # Add text output first (e.g., from read_texts_and_links) + if result.get("output"): + observation_content.append({ + "type": "text", + "text": result["output"], + }) + + # Add screenshot + if result.get("base64_image"): + observation_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{result['base64_image']}" + }, + }) + + conversation_messages.append({ + "role": "observation", + "content": observation_content, + }) + elif result.get("error"): + # If there was an error, add it as text observation + conversation_messages.append({ + "role": "observation", + "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], }) - conversation_messages.append({ - "role": "observation", - "content": observation_content, - }) - elif result.get("error"): - # If there was an error, add it as text observation - conversation_messages.append({ - "role": "observation", - "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], - }) - - if iteration >= max_iterations: - print("Max iterations reached") - - return { - "messages": conversation_messages, - "final_answer": final_answer, - } + if iteration >= max_iterations: + print("Max iterations reached") + + return { + "messages": conversation_messages, + "final_answer": final_answer, + } + finally: + # Clean up 
playwright connection if used + if playwright_tool: + await playwright_tool.disconnect() def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index 4830797..24693c3 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -2,13 +2,17 @@ from typing import Optional, TypedDict import kernel -from loop import sampling_loop +from loop import sampling_loop, BrowserMode from session import KernelBrowserSession class QueryInput(TypedDict): query: str record_replay: Optional[bool] + # Browser interaction mode: + # - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default + # - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + mode: Optional[BrowserMode] class QueryOutput(TypedDict): @@ -46,6 +50,7 @@ async def cua_task( raise ValueError("Query is required") record_replay = payload.get("record_replay", False) + mode: BrowserMode = payload.get("mode") or "computer_use" async with KernelBrowserSession( stealth=True, @@ -58,7 +63,11 @@ async def cua_task( task=payload["query"], api_key=str(api_key), kernel=session.kernel, - session_id=session.session_id, + session_id=str(session.session_id), + cdp_ws_url=session.cdp_ws_url, + viewport_width=session.viewport_width, + viewport_height=session.viewport_height, + mode=mode, ) final_answer = loop_result.get("final_answer") diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori-computer-use/pyproject.toml index 456eab0..8a3bd44 100644 --- a/pkg/templates/python/yutori-computer-use/pyproject.toml +++ b/pkg/templates/python/yutori-computer-use/pyproject.toml @@ -10,4 +10,5 @@ dependencies = [ "typing-extensions>=4.15.0", "kernel>=0.24.0", "python-dotenv>=1.2.1", + "playwright>=1.52.0", ] diff --git 
a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori-computer-use/session.py index 522d4c9..9c51bbe 100644 --- a/pkg/templates/python/yutori-computer-use/session.py +++ b/pkg/templates/python/yutori-computer-use/session.py @@ -32,6 +32,11 @@ class KernelBrowserSession: stealth: bool = True timeout_seconds: int = 300 + # Viewport dimensions - Yutori n1 recommended: 1280x800, Kernel supports 1200x800 + # See: https://docs.yutori.com/reference/n1#screenshot-requirements + viewport_width: int = 1200 + viewport_height: int = 800 + # Replay recording options record_replay: bool = False replay_grace_period: float = 5.0 # Seconds to wait before stopping replay @@ -48,15 +53,14 @@ async def __aenter__(self) -> "KernelBrowserSession": """Create a Kernel browser session and optionally start recording.""" self._kernel = Kernel() - # Create browser with viewport closest to Yutori n1's recommended 1280x800 - # Using 1200x800 (WXGA at 25Hz) - the closest supported Kernel viewport + # Create browser with Yutori n1's recommended viewport # See: https://docs.yutori.com/reference/n1#screenshot-requirements browser = self._kernel.browsers.create( stealth=self.stealth, timeout_seconds=self.timeout_seconds, viewport={ - "width": 1200, - "height": 800, + "width": self.viewport_width, + "height": self.viewport_height, "refresh_rate": 25, }, ) diff --git a/pkg/templates/python/yutori-computer-use/tools/__init__.py b/pkg/templates/python/yutori-computer-use/tools/__init__.py index 63da518..b01c1a2 100644 --- a/pkg/templates/python/yutori-computer-use/tools/__init__.py +++ b/pkg/templates/python/yutori-computer-use/tools/__init__.py @@ -2,10 +2,12 @@ from .base import ToolError, ToolResult from .computer import ComputerTool, N1Action +from .playwright_computer import PlaywrightComputerTool __all__ = [ "ToolError", "ToolResult", "ComputerTool", "N1Action", + "PlaywrightComputerTool", ] diff --git 
a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py new file mode 100644 index 0000000..bd86e88 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -0,0 +1,352 @@ +""" +Yutori n1 Playwright Computer Tool + +Maps n1 action format to Playwright methods via CDP WebSocket connection. +Uses viewport-only screenshots optimized for Yutori n1's training data. + +See: https://docs.yutori.com/reference/n1#screenshot-requirements +""" + +import asyncio +import base64 +import json +from typing import Optional + +from playwright.async_api import async_playwright, Browser, BrowserContext, Page + +from .base import ToolError, ToolResult +from .computer import N1Action + +SCREENSHOT_DELAY_MS = 0.5 + +# Key mappings from n1 output format to Playwright format +KEY_MAP = { + "Return": "Enter", + "BackSpace": "Backspace", + "Page_Up": "PageUp", + "Page_Down": "PageDown", +} + +MODIFIER_MAP = { + "ctrl": "Control", + "super": "Meta", + "command": "Meta", + "cmd": "Meta", +} + + +class PlaywrightComputerTool: + """ + Computer tool for Yutori n1 actions using Playwright via CDP connection. + Provides viewport-only screenshots optimized for n1 model performance. + """ + + def __init__(self, cdp_ws_url: str, width: int = 1200, height: int = 800): + self.cdp_ws_url = cdp_ws_url + self.width = width + self.height = height + self._playwright = None + self._browser: Optional[Browser] = None + self._context: Optional[BrowserContext] = None + self._page: Optional[Page] = None + + async def connect(self) -> None: + """ + Connect to the browser via CDP WebSocket. + Must be called before executing any actions. 
+ """ + if self._browser: + return # Already connected + + self._playwright = await async_playwright().start() + self._browser = await self._playwright.chromium.connect_over_cdp(self.cdp_ws_url) + + # Get existing context or create new one + contexts = self._browser.contexts + self._context = contexts[0] if contexts else await self._browser.new_context() + + # Handle new page events + self._context.on("page", self._handle_new_page) + + # Get existing page or create new one + pages = self._context.pages + self._page = pages[0] if pages else await self._context.new_page() + + # Set viewport size to the configured dimensions (default 1200x800) + await self._page.set_viewport_size({"width": self.width, "height": self.height}) + self._page.on("close", self._handle_page_close) + + async def disconnect(self) -> None: + """Disconnect from the browser.""" + # Don't close the browser itself - just stop the playwright connection + # The browser lifecycle is managed by Kernel + if self._playwright: + await self._playwright.stop() + self._playwright = None + self._browser = None + self._context = None + self._page = None + + def _handle_new_page(self, page: Page) -> None: + """Handle the creation of a new page.""" + print("New page created") + self._page = page + page.on("close", self._handle_page_close) + + def _handle_page_close(self, closed_page: Page) -> None: + """Handle the closure of a page.""" + print("Page closed") + if self._page == closed_page and self._context: + pages = self._context.pages + if pages: + self._page = pages[-1] + else: + print("Warning: All pages have been closed.") + self._page = None + + def _assert_page(self) -> Page: + """Assert that page is available and return it.""" + if not self._page: + raise ToolError("Page not available.
Did you call connect()?") + return self._page + + async def execute(self, action: N1Action) -> ToolResult: + """Execute an n1 action and return the result.""" + action_type = action.get("action_type") + + handlers = { + "click": self._handle_click, + "scroll": self._handle_scroll, + "type": self._handle_type, + "key_press": self._handle_key_press, + "hover": self._handle_hover, + "drag": self._handle_drag, + "wait": self._handle_wait, + "refresh": self._handle_refresh, + "go_back": self._handle_go_back, + "goto_url": self._handle_goto_url, + "read_texts_and_links": self._handle_read_texts_and_links, + "stop": self._handle_stop, + } + + handler = handlers.get(action_type) + if not handler: + raise ToolError(f"Unknown action type: {action_type}") + + return await handler(action) + + async def _handle_click(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.click(coords["x"], coords["y"]) + await asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_scroll(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + direction = action.get("direction") + amount = action.get("amount", 3) + + if direction not in ("up", "down", "left", "right"): + raise ToolError(f"Invalid scroll direction: {direction}") + + # Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels + scroll_delta = amount * 100 + + # Move mouse to position first + await page.mouse.move(coords["x"], coords["y"]) + + # Playwright's wheel method takes delta_x and delta_y + delta_x = 0 + delta_y = 0 + + if direction == "up": + delta_y = -scroll_delta + elif direction == "down": + delta_y = scroll_delta + elif direction == "left": + delta_x = -scroll_delta + elif direction == "right": + delta_x = scroll_delta + + await page.mouse.wheel(delta_x, delta_y) + await 
asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_type(self, action: N1Action) -> ToolResult: + page = self._assert_page() + text = action.get("text") + if not text: + raise ToolError("text is required for type action") + + # Clear existing text if requested + if action.get("clear_before_typing"): + await page.keyboard.press("Control+a") + await asyncio.sleep(0.1) + await page.keyboard.press("Backspace") + await asyncio.sleep(0.1) + + # Type the text + await page.keyboard.type(text) + + # Press Enter if requested + if action.get("press_enter_after"): + await asyncio.sleep(0.1) + await page.keyboard.press("Enter") + + await asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_key_press(self, action: N1Action) -> ToolResult: + page = self._assert_page() + key_comb = action.get("key_comb") + if not key_comb: + raise ToolError("key_comb is required for key_press action") + + mapped_key = self._map_key_to_playwright(key_comb) + await page.keyboard.press(mapped_key) + + await asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_hover(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.move(coords["x"], coords["y"]) + + await asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_drag(self, action: N1Action) -> ToolResult: + page = self._assert_page() + start_coords = self._get_coordinates(action.get("start_coordinates")) + end_coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.move(start_coords["x"], start_coords["y"]) + await page.mouse.down() + await page.mouse.move(end_coords["x"], end_coords["y"]) + await page.mouse.up() + + await asyncio.sleep(SCREENSHOT_DELAY_MS) + return await self.screenshot() + + async def _handle_wait(self, action: N1Action) -> ToolResult: + # Default wait of 2 
seconds for UI to update + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_refresh(self, action: N1Action) -> ToolResult: + page = self._assert_page() + await page.reload() + + # Wait for page to reload + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_go_back(self, action: N1Action) -> ToolResult: + page = self._assert_page() + await page.go_back() + + # Wait for navigation + await asyncio.sleep(1.5) + return await self.screenshot() + + async def _handle_goto_url(self, action: N1Action) -> ToolResult: + page = self._assert_page() + url = action.get("url") + if not url: + raise ToolError("url is required for goto_url action") + + await page.goto(url) + + # Wait for page to load + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: + """ + Read texts and links using Playwright's _snapshotForAI(). + Directly calls the method on the CDP-connected page. + """ + page = self._assert_page() + try: + # Call _snapshotForAI directly on the page + # This is an internal Playwright method for AI accessibility + snapshot = await page._snapshot_for_ai() # type: ignore + url = page.url + title = await page.title() + + # Get viewport-only screenshot + screenshot_result = await self.screenshot() + + return { + "base64_image": screenshot_result.get("base64_image", ""), + "output": json.dumps({"url": url, "title": title, "snapshot": snapshot}, indent=2), + } + except Exception as e: + print(f"read_texts_and_links failed: {e}") + return await self.screenshot() + + async def _handle_stop(self, action: N1Action) -> ToolResult: + """Return the final answer without taking a screenshot.""" + return {"output": action.get("answer", "Task completed")} + + async def screenshot(self) -> ToolResult: + """ + Take a viewport-only screenshot of the current browser state. + This captures only the browser content, not the OS UI or browser chrome. 
+ """ + page = self._assert_page() + try: + # full_page=False captures only the viewport (browser content) + buffer = await page.screenshot(full_page=False) + base64_image = base64.b64encode(buffer).decode("utf-8") + return {"base64_image": base64_image} + except Exception as e: + raise ToolError(f"Failed to take screenshot: {e}") + + def get_current_url(self) -> str: + """Get the current page URL.""" + page = self._assert_page() + return page.url + + def _get_coordinates( + self, coords: tuple[int, int] | list[int] | None + ) -> dict[str, int]: + """Convert n1 coordinates to dict format.""" + if coords is None or len(coords) != 2: + # Default to center of viewport + return {"x": self.width // 2, "y": self.height // 2} + + x, y = coords + if not isinstance(x, (int, float)) or not isinstance(y, (int, float)) or x < 0 or y < 0: + raise ToolError(f"Invalid coordinates: {coords}") + + return {"x": int(x), "y": int(y)} + + def _map_key_to_playwright(self, key: str) -> str: + """ + Map key names to Playwright format. + n1 outputs keys in Playwright format, but some may need adjustment. 
+ """ + # Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") + if "+" in key: + parts = key.split("+") + mapped_parts = [] + for part in parts: + trimmed = part.strip() + lower = trimmed.lower() + + # Map modifier names + if lower in MODIFIER_MAP: + mapped_parts.append(MODIFIER_MAP[lower]) + else: + # Check KEY_MAP for special keys + mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) + + return "+".join(mapped_parts) + + return KEY_MAP.get(key, key) diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index c39ff0e..e9bbdce 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -1,5 +1,5 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; -import { samplingLoop } from './loop'; +import { samplingLoop, type BrowserMode } from './loop'; import { KernelBrowserSession } from './session'; const kernel = new Kernel(); @@ -9,6 +9,12 @@ const app = kernel.app('ts-yutori-cua'); interface QueryInput { query: string; record_replay?: boolean; + /** + * Browser interaction mode: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + */ + mode?: BrowserMode; } interface QueryOutput { @@ -42,12 +48,17 @@ app.action( try { // Run the sampling loop + const mode = payload.mode ?? 'computer_use'; const { finalAnswer, messages } = await samplingLoop({ model: 'n1-preview-2025-11', task: payload.query, apiKey: YUTORI_API_KEY, kernel, sessionId: session.sessionId, + cdpWsUrl: session.cdpWsUrl ?? 
undefined, + viewportWidth: session.viewportWidth, + viewportHeight: session.viewportHeight, + mode, }); // Extract the result diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 9ba8074..2449f70 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -7,12 +7,26 @@ * - Coordinates are returned in 1000x1000 space and need scaling * - WebP format recommended for screenshots * + * Supports two modes: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + * * @see https://docs.yutori.com/reference/n1 */ import OpenAI from 'openai'; import type { Kernel } from '@onkernel/sdk'; import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; +import { PlaywrightComputerTool } from './tools/playwright-computer'; + +/** Mode for browser interaction */ +export type BrowserMode = 'computer_use' | 'playwright'; + +/** Interface for tool that can execute n1 actions */ +interface N1ComputerTool { + execute(action: N1Action): Promise; + screenshot(): Promise; +} // n1 uses its own system prompt - custom prompts may degrade performance // Per docs: "we generally do not recommend providing custom system prompts" @@ -36,12 +50,21 @@ interface SamplingLoopOptions { apiKey: string; kernel: Kernel; sessionId: string; + /** CDP WebSocket URL for playwright mode */ + cdpWsUrl?: string; maxTokens?: number; maxIterations?: number; - /** Viewport width for coordinate scaling (default: 1200 to match WXGA) */ + /** Viewport width for coordinate scaling (default: 1200, closest to Yutori's 1280 recommendation) */ viewportWidth?: number; - /** Viewport height for coordinate scaling (default: 800 to match WXGA) */ + /** Viewport height for coordinate scaling (default: 800 per Yutori recommendation) */ viewportHeight?: 
number; + /** + * Browser interaction mode: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + * @default 'computer_use' + */ + mode?: BrowserMode; } interface SamplingLoopResult { @@ -58,165 +81,192 @@ export async function samplingLoop({ apiKey, kernel, sessionId, + cdpWsUrl, maxTokens = 4096, maxIterations = 50, - // Default viewport matches WXGA (1200x800) - closest to Yutori's recommended 1280x800 + // Default viewport: 1200x800 (closest Kernel-supported size to Yutori's recommended 1280x800) viewportWidth = 1200, viewportHeight = 800, + mode = 'computer_use', }: SamplingLoopOptions): Promise { const client = new OpenAI({ apiKey, baseURL: 'https://api.yutori.com/v1', }); - const computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); - - // Take initial screenshot - const initialScreenshot = await computerTool.screenshot(); - - // Build conversation per n1 format: - // 1. User message with task - // 2. 
Observation message with screenshot - const conversationMessages: Message[] = [ - { - role: 'user', - content: [{ type: 'text', text: task }], - }, - ]; - - // Add initial screenshot as observation (n1's required format) - if (initialScreenshot.base64Image) { - conversationMessages.push({ - role: 'observation', - content: [ - { - type: 'image_url', - image_url: { - url: `data:image/png;base64,${initialScreenshot.base64Image}`, - }, - }, - ], - }); - } + // Create the appropriate tool based on mode + let computerTool: N1ComputerTool; + let playwrightTool: PlaywrightComputerTool | null = null; - let iteration = 0; - let finalAnswer: string | undefined; - - while (iteration < maxIterations) { - iteration++; - console.log(`\n=== Iteration ${iteration} ===`); - - // Call the n1 API (no system prompt - n1 uses its own) - let response; - try { - response = await client.chat.completions.create({ - model, - messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], - max_tokens: maxTokens, - temperature: 0.3, - }); - } catch (apiError) { - console.error('API call failed:', apiError); - throw apiError; - } + console.log(`Mode requested: '${mode}', cdpWsUrl available: ${cdpWsUrl != null}`); - if (!response.choices || response.choices.length === 0) { - console.error('No choices in response:', JSON.stringify(response, null, 2)); - throw new Error('No choices in API response'); + if (mode === 'playwright') { + if (!cdpWsUrl) { + throw new Error('cdpWsUrl is required for playwright mode'); } + console.log(`Connecting to CDP WebSocket: ${cdpWsUrl.substring(0, 50)}...`); + playwrightTool = new PlaywrightComputerTool(cdpWsUrl, viewportWidth, viewportHeight); + await playwrightTool.connect(); + computerTool = playwrightTool; + console.log('Using playwright mode (viewport-only screenshots)'); + } else { + computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); + console.log('Using computer_use mode (Computer Controls API)'); + } - const 
assistantMessage = response.choices[0]?.message; - if (!assistantMessage) { - throw new Error('No response from model'); - } + try { + // Take initial screenshot + const initialScreenshot = await computerTool.screenshot(); + + // Build conversation per n1 format: + // 1. User message with task + // 2. Observation message with screenshot + const conversationMessages: Message[] = [ + { + role: 'user', + content: [{ type: 'text', text: task }], + }, + ]; - const responseContent = assistantMessage.content || ''; - console.log('Assistant response:', responseContent); + // Add initial screenshot as observation (n1's required format) + if (initialScreenshot.base64Image) { + conversationMessages.push({ + role: 'observation', + content: [ + { + type: 'image_url', + image_url: { + url: `data:image/png;base64,${initialScreenshot.base64Image}`, + }, + }, + ], + }); + } - // Add assistant message to conversation - conversationMessages.push({ - role: 'assistant', - content: responseContent, - }); + let iteration = 0; + let finalAnswer: string | undefined; - // Parse the action(s) from the response - // n1 returns JSON with "thoughts" and "actions" array - const parsed = parseN1Response(responseContent); + while (iteration < maxIterations) { + iteration++; + console.log(`\n=== Iteration ${iteration} ===`); - if (!parsed || !parsed.actions || parsed.actions.length === 0) { - console.log('No actions found in response, ending loop'); - break; - } + // Call the n1 API (no system prompt - n1 uses its own) + let response; + try { + response = await client.chat.completions.create({ + model, + messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], + max_tokens: maxTokens, + temperature: 0.3, + }); + } catch (apiError) { + console.error('API call failed:', apiError); + throw apiError; + } - // Execute each action in the actions array - for (const action of parsed.actions) { - console.log('Executing action:', action.action_type, action); + if (!response.choices || 
response.choices.length === 0) { + console.error('No choices in response:', JSON.stringify(response, null, 2)); + throw new Error('No choices in API response'); + } - // Check for stop action - if (action.action_type === 'stop') { - finalAnswer = action.answer; - console.log('Stop action received, final answer:', finalAnswer); - return { messages: conversationMessages, finalAnswer }; + const assistantMessage = response.choices[0]?.message; + if (!assistantMessage) { + throw new Error('No response from model'); } - // Scale coordinates from n1's 1000x1000 space to actual viewport - const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); + const responseContent = assistantMessage.content || ''; + console.log('Assistant response:', responseContent); - // Execute the action - let result: ToolResult; - try { - result = await computerTool.execute(scaledAction); - } catch (error) { - console.error('Action failed:', error); - result = { - error: error instanceof Error ? error.message : String(error), - }; + // Add assistant message to conversation + conversationMessages.push({ + role: 'assistant', + content: responseContent, + }); + + // Parse the action(s) from the response + // n1 returns JSON with "thoughts" and "actions" array + const parsed = parseN1Response(responseContent); + + if (!parsed || !parsed.actions || parsed.actions.length === 0) { + console.log('No actions found in response, ending loop'); + break; } - // After action, add observation with screenshot and optional text output - if (result.base64Image || result.output) { - const observationContent: MessageContent[] = []; + // Execute each action in the actions array + for (const action of parsed.actions) { + console.log('Executing action:', action.action_type, action); - // Add text output first (e.g., from read_texts_and_links) - if (result.output) { - observationContent.push({ - type: 'text', - text: result.output, - }); + // Check for stop action + if (action.action_type === 'stop') 
{ + finalAnswer = action.answer; + console.log('Stop action received, final answer:', finalAnswer); + return { messages: conversationMessages, finalAnswer }; } - // Add screenshot - if (result.base64Image) { - observationContent.push({ - type: 'image_url', - image_url: { - url: `data:image/png;base64,${result.base64Image}`, - }, - }); + // Scale coordinates from n1's 1000x1000 space to actual viewport + const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); + + // Execute the action + let result: ToolResult; + try { + result = await computerTool.execute(scaledAction); + } catch (error) { + console.error('Action failed:', error); + result = { + error: error instanceof Error ? error.message : String(error), + }; } - conversationMessages.push({ - role: 'observation', - content: observationContent, - }); - } else if (result.error) { - // If there was an error, add it as text observation - conversationMessages.push({ - role: 'observation', - content: [{ type: 'text', text: `Action failed: ${result.error}` }], - }); + // After action, add observation with screenshot and optional text output + if (result.base64Image || result.output) { + const observationContent: MessageContent[] = []; + + // Add text output first (e.g., from read_texts_and_links) + if (result.output) { + observationContent.push({ + type: 'text', + text: result.output, + }); + } + + // Add screenshot + if (result.base64Image) { + observationContent.push({ + type: 'image_url', + image_url: { + url: `data:image/png;base64,${result.base64Image}`, + }, + }); + } + + conversationMessages.push({ + role: 'observation', + content: observationContent, + }); + } else if (result.error) { + // If there was an error, add it as text observation + conversationMessages.push({ + role: 'observation', + content: [{ type: 'text', text: `Action failed: ${result.error}` }], + }); + } } } - } - if (iteration >= maxIterations) { - console.log('Max iterations reached'); - } + if (iteration >= maxIterations) 
{ + console.log('Max iterations reached'); + } - return { - messages: conversationMessages, - finalAnswer, - }; + return { + messages: conversationMessages, + finalAnswer, + }; + } finally { + // Clean up playwright connection if used + if (playwrightTool) { + await playwrightTool.disconnect(); + } + } } /** diff --git a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori-computer-use/package.json index 9f40a6f..86461f9 100644 --- a/pkg/templates/typescript/yutori-computer-use/package.json +++ b/pkg/templates/typescript/yutori-computer-use/package.json @@ -6,7 +6,8 @@ "dependencies": { "@onkernel/sdk": "^0.24.0", "openai": "^4.77.0", - "luxon": "^3.7.2" + "luxon": "^3.7.2", + "playwright-core": "^1.52.0" }, "devDependencies": { "@types/node": "^22.15.17", diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori-computer-use/session.ts index f597105..0e17baf 100644 --- a/pkg/templates/typescript/yutori-computer-use/session.ts +++ b/pkg/templates/typescript/yutori-computer-use/session.ts @@ -16,6 +16,10 @@ export interface SessionOptions { recordReplay?: boolean; /** Grace period in seconds before stopping replay */ replayGracePeriod?: number; + /** Viewport width (default: 1200, the closest Kernel-supported size to Yutori's 1280 recommendation) */ + viewportWidth?: number; + /** Viewport height (default: 800 per Yutori recommendation) */ + viewportHeight?: number; } export interface SessionInfo { @@ -24,6 +28,8 @@ export interface SessionInfo { cdpWsUrl: string; replayId?: string; replayViewUrl?: string; + viewportWidth: number; + viewportHeight: number; } const DEFAULT_OPTIONS: Required = { @@ -31,6 +37,10 @@ const DEFAULT_OPTIONS: Required = { timeoutSeconds: 300, recordReplay: false, replayGracePeriod: 5.0, + // Yutori n1 recommended viewport: 1280x800, but Kernel supports 1200x800 + // See: https://docs.yutori.com/reference/n1#screenshot-requirements + viewportWidth: 1200, + viewportHeight: 800, }; /** @@ -82,6 +92,14
@@ export class KernelBrowserSession { return this._replayViewUrl; } + get viewportWidth(): number { + return this.options.viewportWidth; + } + + get viewportHeight(): number { + return this.options.viewportHeight; + } + get info(): SessionInfo { return { sessionId: this.sessionId, @@ -89,6 +107,8 @@ export class KernelBrowserSession { cdpWsUrl: this._cdpWsUrl || '', replayId: this._replayId || undefined, replayViewUrl: this._replayViewUrl || undefined, + viewportWidth: this.options.viewportWidth, + viewportHeight: this.options.viewportHeight, }; } @@ -96,15 +116,14 @@ export class KernelBrowserSession { * Create a Kernel browser session and optionally start recording. */ async start(): Promise { - // Create browser with viewport closest to Yutori n1's recommended 1280x800 - // Using 1200x800 (WXGA at 25Hz) - the closest supported Kernel viewport + // Create browser with the configured viewport (default 1200x800, the closest Kernel-supported size to Yutori n1's recommended 1280x800) // See: https://docs.yutori.com/reference/n1#screenshot-requirements const browser = await this.kernel.browsers.create({ stealth: this.options.stealth, timeout_seconds: this.options.timeoutSeconds, viewport: { - width: 1200, - height: 800, + width: this.options.viewportWidth, + height: this.options.viewportHeight, refresh_rate: 25, }, }); diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts new file mode 100644 index 0000000..35a385a --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -0,0 +1,410 @@ +/** + * Yutori n1 Playwright Computer Tool + * + * Maps n1 action format to Playwright methods via CDP WebSocket connection. + * Uses viewport-only screenshots optimized for Yutori n1's training data.
+ * + * @see https://docs.yutori.com/reference/n1#screenshot-requirements + */ + +import { chromium, type Browser, type BrowserContext, type Page } from 'playwright-core'; +import type { ToolResult, N1Action } from './computer'; +import { ToolError } from './computer'; + +const SCREENSHOT_DELAY_MS = 500; + +// Key mappings from n1 output format to Playwright format +const KEY_MAP: Record = { + 'Return': 'Enter', + 'BackSpace': 'Backspace', + 'Page_Up': 'PageUp', + 'Page_Down': 'PageDown', +}; + +const MODIFIER_MAP: Record = { + 'ctrl': 'Control', + 'super': 'Meta', + 'command': 'Meta', + 'cmd': 'Meta', +}; + +/** + * Computer tool for Yutori n1 actions using Playwright via CDP connection. + * Provides viewport-only screenshots optimized for n1 model performance. + */ +export class PlaywrightComputerTool { + private cdpWsUrl: string; + private width: number; + private height: number; + private browser: Browser | null = null; + private context: BrowserContext | null = null; + private page: Page | null = null; + + constructor(cdpWsUrl: string, width = 1200, height = 800) { + this.cdpWsUrl = cdpWsUrl; + this.width = width; + this.height = height; + } + + /** + * Connect to the browser via CDP WebSocket. + * Must be called before executing any actions. 
+ */ + async connect(): Promise { + if (this.browser) { + return; // Already connected + } + + this.browser = await chromium.connectOverCDP(this.cdpWsUrl); + + // Get existing context or create new one + this.context = this.browser.contexts()[0]; + if (!this.context) { + this.context = await this.browser.newContext(); + } + + // Handle new page events + this.context.on('page', this.handleNewPage.bind(this)); + + // Get existing page or create new one + this.page = this.context.pages()[0]; + if (!this.page) { + this.page = await this.context.newPage(); + } + + // Set viewport size to the configured dimensions (default 1200x800) + await this.page.setViewportSize({ width: this.width, height: this.height }); + this.page.on('close', this.handlePageClose.bind(this)); + } + + /** + * Disconnect from the browser. + */ + async disconnect(): Promise { + if (this.browser) { + // Don't close the browser itself - only drop our local references (no CDP disconnect call is made here) + // The browser lifecycle is managed by Kernel + this.browser = null; + this.context = null; + this.page = null; + } + } + + private handleNewPage(page: Page): void { + console.log('New page created'); + this.page = page; + page.on('close', this.handlePageClose.bind(this)); + } + + private handlePageClose(closedPage: Page): void { + console.log('Page closed'); + if (this.page === closedPage && this.context) { + const pages = this.context.pages(); + if (pages.length > 0) { + this.page = pages[pages.length - 1]; + } else { + console.warn('Warning: All pages have been closed.'); + this.page = null; + } + } + } + + private assertPage(): asserts this is { page: Page } { + if (!this.page) { + throw new ToolError('Page not available. Did you call connect()?'); + } + } + + /** + * Execute an n1 action and return the result.
+ */ + async execute(action: N1Action): Promise { + this.assertPage(); + const { action_type } = action; + + switch (action_type) { + case 'click': + return this.handleClick(action); + case 'scroll': + return this.handleScroll(action); + case 'type': + return this.handleType(action); + case 'key_press': + return this.handleKeyPress(action); + case 'hover': + return this.handleHover(action); + case 'drag': + return this.handleDrag(action); + case 'wait': + return this.handleWait(); + case 'refresh': + return this.handleRefresh(); + case 'go_back': + return this.handleGoBack(); + case 'goto_url': + return this.handleGotoUrl(action); + case 'read_texts_and_links': + return this.handleReadTextsAndLinks(); + case 'stop': + return this.handleStop(action); + default: + throw new ToolError(`Unknown action type: ${action_type}`); + } + } + + private async handleClick(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + + await this.page.mouse.click(coords.x, coords.y); + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleScroll(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + const direction = action.direction; + const amount = action.amount ?? 
3; + + if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { + throw new ToolError(`Invalid scroll direction: ${direction}`); + } + + // Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels + const scrollDelta = amount * 100; + + // Move mouse to position first + await this.page.mouse.move(coords.x, coords.y); + + // Playwright's wheel method takes deltaX and deltaY + let deltaX = 0; + let deltaY = 0; + + switch (direction) { + case 'up': + deltaY = -scrollDelta; + break; + case 'down': + deltaY = scrollDelta; + break; + case 'left': + deltaX = -scrollDelta; + break; + case 'right': + deltaX = scrollDelta; + break; + } + + await this.page.mouse.wheel(deltaX, deltaY); + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleType(action: N1Action): Promise { + this.assertPage(); + const text = action.text; + if (!text) { + throw new ToolError('text is required for type action'); + } + + // Clear existing text if requested + if (action.clear_before_typing) { + await this.page.keyboard.press('Control+a'); + await this.sleep(100); + await this.page.keyboard.press('Backspace'); + await this.sleep(100); + } + + // Type the text + await this.page.keyboard.type(text); + + // Press Enter if requested + if (action.press_enter_after) { + await this.sleep(100); + await this.page.keyboard.press('Enter'); + } + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleKeyPress(action: N1Action): Promise { + this.assertPage(); + const keyComb = action.key_comb; + if (!keyComb) { + throw new ToolError('key_comb is required for key_press action'); + } + + const mappedKey = this.mapKeyToPlaywright(keyComb); + await this.page.keyboard.press(mappedKey); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleHover(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + + await 
this.page.mouse.move(coords.x, coords.y); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleDrag(action: N1Action): Promise { + this.assertPage(); + const startCoords = this.getCoordinates(action.start_coordinates); + const endCoords = this.getCoordinates(action.center_coordinates); + + await this.page.mouse.move(startCoords.x, startCoords.y); + await this.page.mouse.down(); + await this.page.mouse.move(endCoords.x, endCoords.y); + await this.page.mouse.up(); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleWait(): Promise { + // Default wait of 2 seconds for UI to update + await this.sleep(2000); + return this.screenshot(); + } + + private async handleRefresh(): Promise { + this.assertPage(); + await this.page.reload(); + + // Wait for page to reload + await this.sleep(2000); + return this.screenshot(); + } + + private async handleGoBack(): Promise { + this.assertPage(); + await this.page.goBack(); + + // Wait for navigation + await this.sleep(1500); + return this.screenshot(); + } + + private async handleGotoUrl(action: N1Action): Promise { + this.assertPage(); + const url = action.url; + if (!url) { + throw new ToolError('url is required for goto_url action'); + } + + await this.page.goto(url); + + // Wait for page to load + await this.sleep(2000); + return this.screenshot(); + } + + /** + * Read texts and links using Playwright's _snapshotForAI(). + * Directly calls the method on the CDP-connected page. 
+ */ + private async handleReadTextsAndLinks(): Promise { + this.assertPage(); + try { + // Call _snapshotForAI directly on the page + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const snapshot = await (this.page as any)._snapshotForAI(); + const url = this.page.url(); + const title = await this.page.title(); + + // Get viewport-only screenshot + const screenshotResult = await this.screenshot(); + + return { + base64Image: screenshotResult.base64Image, + output: JSON.stringify({ url, title, snapshot }, null, 2), + }; + } catch (error) { + console.warn('read_texts_and_links failed:', error); + return this.screenshot(); + } + } + + private handleStop(action: N1Action): ToolResult { + // Return the final answer without taking a screenshot + return { + output: action.answer || 'Task completed', + }; + } + + /** + * Take a viewport-only screenshot of the current browser state. + * This captures only the browser content, not the OS UI or browser chrome. + */ + async screenshot(): Promise { + this.assertPage(); + try { + // fullPage: false captures only the viewport (browser content) + const buffer = await this.page.screenshot({ fullPage: false }); + + return { + base64Image: buffer.toString('base64'), + }; + } catch (error) { + throw new ToolError(`Failed to take screenshot: ${error}`); + } + } + + /** + * Get the current page URL. + */ + getCurrentUrl(): string { + this.assertPage(); + return this.page.url(); + } + + private getCoordinates(coords?: [number, number]): { x: number; y: number } { + if (!coords || coords.length !== 2) { + // Default to center of viewport + return { x: this.width / 2, y: this.height / 2 }; + } + + const [x, y] = coords; + if (typeof x !== 'number' || typeof y !== 'number' || x < 0 || y < 0) { + throw new ToolError(`Invalid coordinates: ${JSON.stringify(coords)}`); + } + + return { x, y }; + } + + /** + * Map key names to Playwright format. + * n1 outputs keys in Playwright format, but some may need adjustment. 
+ */ + private mapKeyToPlaywright(key: string): string { + // Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") + if (key.includes('+')) { + const parts = key.split('+'); + const mappedParts = parts.map((part) => { + const trimmed = part.trim(); + const lower = trimmed.toLowerCase(); + + // Map modifier names + if (MODIFIER_MAP[lower]) { + return MODIFIER_MAP[lower]; + } + + // Check KEY_MAP for special keys + return KEY_MAP[trimmed] || trimmed; + }); + return mappedParts.join('+'); + } + + return KEY_MAP[key] || key; + } + + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); + } +} From 748aa2b6b69cbbd2752f18a00179b3916e27e7fd Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 20 Jan 2026 18:43:16 -0500 Subject: [PATCH 04/12] Update qa.md Add templates + modes for Yutori to QA file --- .cursor/commands/qa.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 4132052..9d4f397 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -59,6 +59,9 @@ Here are all valid language + template combinations: | typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | typescript | claude-agent-sdk | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | | typescript | yutori-computer-use | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | + +> **Note:** The `yutori-computer-use` template supports two modes: `computer_use` (default, full VM screenshots) and `playwright` (viewport-only screenshots via CDP). Both modes should be tested. 
+ | python | sample-app | py-sample-app | python-basic | No | - | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | | python | browser-use | py-browser-use | python-bu | Yes | OPENAI_API_KEY | @@ -68,6 +71,10 @@ Here are all valid language + template combinations: | python | claude-agent-sdk | py-claude-agent-sdk | py-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | | python | yutori-computer-use | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | +> **Yutori Modes:** +> - `computer_use` (default): Uses Kernel's Computer Controls API with full VM screenshots +> - `playwright`: Uses Playwright via CDP WebSocket for viewport-only screenshots (optimized for n1 model) + ### Create Commands Run each of these (they are non-interactive when all flags are provided): @@ -257,7 +264,8 @@ kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipe kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}' kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. 
You are done successfully when the items are moved.", "record_replay": true}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "computer_use"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "playwright"}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -267,12 +275,13 @@ kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://m kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' -kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. 
You are done successfully when the items are moved.", "record_replay": true, "mode": "computer_use"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "playwright"}' ``` ## Step 7: Automated Runtime Testing (Optional) -**STOP and ask the human:** "Would you like me to automatically invoke all 17 templates and report back on their runtime status?" +**STOP and ask the human:** "Would you like me to automatically invoke all 19 test cases and report back on their runtime status?" If the human agrees, invoke each template use the Kernel CLI and collect results. Present findings in this format: @@ -292,7 +301,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | ts-openai-cua | ts-openai-cua | | | | ts-gemini-cua | ts-gemini-cua | | | | ts-claude-agent-sdk | ts-claude-agent-sdk | | | -| ts-yutori-cua | ts-yutori-cua | | | +| ts-yutori-cua | ts-yutori-cua | | mode: computer_use | +| ts-yutori-cua | ts-yutori-cua | | mode: playwright | | py-sample-app | python-basic | | | | py-captcha-solver | python-captcha-solver | | | | py-browser-use | python-bu | | | @@ -300,7 +310,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | py-openai-cua | python-openai-cua | | | | py-openagi-cua | python-openagi-cua | | | | py-claude-agent-sdk | py-claude-agent-sdk | | | -| py-yutori-cua | python-yutori-cua | | | +| py-yutori-cua | python-yutori-cua | | mode: computer_use | +| py-yutori-cua | python-yutori-cua | | mode: playwright | Status values: - **SUCCESS**: App started and returned a result From 8e2df1bb03de34d60f06bdb86e0c3777027efed7 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 07:26:16 -0500 Subject: [PATCH 05/12] Fix Drag 
Operation on Playwright Mode Fix drag operations that previously weren't working properly on Playwright mode operations. --- .../tools/playwright_computer.py | 14 +++++++++++++- .../tools/playwright-computer.ts | 14 +++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py index bd86e88..fedf7b3 100644 --- a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -225,9 +225,21 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) end_coords = self._get_coordinates(action.get("center_coordinates")) + # Move to start position await page.mouse.move(start_coords["x"], start_coords["y"]) + await asyncio.sleep(0.1) + + # Press mouse button and wait for drag to register await page.mouse.down() - await page.mouse.move(end_coords["x"], end_coords["y"]) + await asyncio.sleep(0.15) + + # Move gradually to end position using steps for proper drag-and-drop + # The steps parameter makes Playwright simulate intermediate mouse positions + # which is required for HTML5 drag-and-drop to work properly + await page.mouse.move(end_coords["x"], end_coords["y"], steps=20) + await asyncio.sleep(0.1) + + # Release mouse button await page.mouse.up() await asyncio.sleep(SCREENSHOT_DELAY_MS) diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts index 35a385a..c397c3e 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -258,9 +258,21 @@ export class PlaywrightComputerTool { const startCoords = this.getCoordinates(action.start_coordinates); 
const endCoords = this.getCoordinates(action.center_coordinates); + // Move to start position await this.page.mouse.move(startCoords.x, startCoords.y); + await this.sleep(100); + + // Press mouse button and wait for drag to register await this.page.mouse.down(); - await this.page.mouse.move(endCoords.x, endCoords.y); + await this.sleep(150); + + // Move gradually to end position using steps for proper drag-and-drop + // The steps parameter makes Playwright simulate intermediate mouse positions + // which is required for HTML5 drag-and-drop to work properly + await this.page.mouse.move(endCoords.x, endCoords.y, { steps: 20 }); + await this.sleep(100); + + // Release mouse button await this.page.mouse.up(); await this.sleep(SCREENSHOT_DELAY_MS); From 71e7a85b31c0ec801f6fcfdbc3213373e7c658cd Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 07:57:58 -0500 Subject: [PATCH 06/12] Fix read_text_and_links action Use ariaSnapshot instead of the existing method, as ariaSnapshot is stably available in both Python and TypeScript versions. --- .../yutori-computer-use/tools/playwright_computer.py | 10 +++++----- .../yutori-computer-use/tools/playwright-computer.ts | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py index fedf7b3..708d8bb 100644 --- a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -280,14 +280,14 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: """ - Read texts and links using Playwright's _snapshotForAI(). - Directly calls the method on the CDP-connected page. + Read texts and links using Playwright's ariaSnapshot() API. 
+ Returns accessibility tree representation of the page content. """ page = self._assert_page() try: - # Call _snapshotForAI directly on the page - # This is an internal Playwright method for AI accessibility - snapshot = await page._snapshot_for_ai() # type: ignore + # Use the public aria_snapshot() API on the body locator + # This provides an accessibility tree representation of the page + snapshot = await page.locator("body").aria_snapshot() url = page.url title = await page.title() diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts index c397c3e..0306520 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -318,15 +318,15 @@ export class PlaywrightComputerTool { } /** - * Read texts and links using Playwright's _snapshotForAI(). - * Directly calls the method on the CDP-connected page. + * Read texts and links using Playwright's ariaSnapshot() API. + * Returns accessibility tree representation of the page content. 
*/ private async handleReadTextsAndLinks(): Promise { this.assertPage(); try { - // Call _snapshotForAI directly on the page - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const snapshot = await (this.page as any)._snapshotForAI(); + // Use the public ariaSnapshot() API on the body locator + // This provides an accessibility tree representation of the page + const snapshot = await this.page.locator('body').ariaSnapshot(); const url = this.page.url(); const title = await this.page.title(); From ffcc96e10fd2097a0ff5bb32fc2c86eaa21101aa Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 15:57:36 -0500 Subject: [PATCH 07/12] Fix sync issue Issue: The ComputerTool.screenshot() method was a synchronous function, but: The N1ComputerToolProtocol expected it to be async The PlaywrightComputerTool.screenshot() was async The loop.py code tried to await it Fix: Changed def screenshot() to async def screenshot() Updated all handler methods to await self.screenshot() instead of return self.screenshot() --- .../yutori-computer-use/tools/computer.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index 87dad3e..4363ba2 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -136,7 +136,7 @@ async def _handle_click(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_scroll(self, action: N1Action) -> ToolResult: coords = self._get_coordinates(action.get("center_coordinates")) @@ -170,7 +170,7 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_type(self, action: 
N1Action) -> ToolResult: text = action.get("text") @@ -206,7 +206,7 @@ async def _handle_type(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_key_press(self, action: N1Action) -> ToolResult: key_comb = action.get("key_comb") @@ -221,7 +221,7 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_hover(self, action: N1Action) -> ToolResult: coords = self._get_coordinates(action.get("center_coordinates")) @@ -233,7 +233,7 @@ async def _handle_hover(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_drag(self, action: N1Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) @@ -246,12 +246,12 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_MS) - return self.screenshot() + return await self.screenshot() async def _handle_wait(self, action: N1Action) -> ToolResult: # Default wait of 2 seconds for UI to update await asyncio.sleep(2) - return self.screenshot() + return await self.screenshot() async def _handle_refresh(self, action: N1Action) -> ToolResult: """Refresh the page using keyboard shortcut (F5).""" @@ -262,7 +262,7 @@ async def _handle_refresh(self, action: N1Action) -> ToolResult: # Wait for page to reload await asyncio.sleep(2) - return self.screenshot() + return await self.screenshot() async def _handle_go_back(self, action: N1Action) -> ToolResult: """Go back using keyboard shortcut (Alt+Left).""" @@ -273,7 +273,7 @@ async def _handle_go_back(self, action: N1Action) -> ToolResult: # Wait for navigation await asyncio.sleep(1.5) - return self.screenshot() + return await self.screenshot() async def 
_handle_goto_url(self, action: N1Action) -> ToolResult: """ @@ -316,7 +316,7 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: # Wait for page to load await asyncio.sleep(2) - return self.screenshot() + return await self.screenshot() async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: """ @@ -339,7 +339,7 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: ) # Get screenshot via Computer Controls API - screenshot_result = self.screenshot() + screenshot_result = await self.screenshot() if result.success and result.result: data = result.result @@ -357,13 +357,13 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: return screenshot_result except Exception as e: print(f"read_texts_and_links failed: {e}") - return self.screenshot() + return await self.screenshot() async def _handle_stop(self, action: N1Action) -> ToolResult: """Return the final answer without taking a screenshot.""" return {"output": action.get("answer", "Task completed")} - def screenshot(self) -> ToolResult: + async def screenshot(self) -> ToolResult: """Take a screenshot of the current browser state.""" try: response = self.kernel.browsers.computer.capture_screenshot( From 1ae6ddbd716abaf2768a38a784f3e207e475d3e1 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 17:20:55 -0500 Subject: [PATCH 08/12] Update default delays Update default delays for actions and screenshots --- .../yutori-computer-use/tools/computer.py | 23 +++++++++--------- .../tools/playwright_computer.py | 24 +++++++++---------- .../yutori-computer-use/tools/computer.ts | 4 ++-- .../tools/playwright-computer.ts | 12 ++++------ 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index 4363ba2..a881cff 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ 
b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -13,9 +13,10 @@ from .base import ToolError, ToolResult -TYPING_DELAY_MS = 12 -SCREENSHOT_DELAY_MS = 0.5 -ACTION_DELAY_MS = 0.5 +TYPING_DELAY_MS = 12 # Typing delay in milliseconds (used by Kernel API) +# Delays in seconds for asyncio.sleep (matches TypeScript 300ms = 0.3s) +SCREENSHOT_DELAY_S = 0.3 +ACTION_DELAY_S = 0.3 # n1 action types @@ -135,7 +136,7 @@ async def _handle_click(self, action: N1Action) -> ToolResult: num_clicks=1, ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_scroll(self, action: N1Action) -> ToolResult: @@ -169,7 +170,7 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: delta_y=delta_y, ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_type(self, action: N1Action) -> ToolResult: @@ -205,7 +206,7 @@ async def _handle_type(self, action: N1Action) -> ToolResult: keys=["Return"], ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_key_press(self, action: N1Action) -> ToolResult: @@ -220,7 +221,7 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: keys=[mapped_key], ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_hover(self, action: N1Action) -> ToolResult: @@ -232,7 +233,7 @@ async def _handle_hover(self, action: N1Action) -> ToolResult: y=coords["y"], ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_drag(self, action: N1Action) -> ToolResult: @@ -245,7 +246,7 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: button="left", ) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) 
return await self.screenshot() async def _handle_wait(self, action: N1Action) -> ToolResult: @@ -291,7 +292,7 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: self.session_id, keys=["ctrl+l"], ) - await asyncio.sleep(ACTION_DELAY_MS) + await asyncio.sleep(ACTION_DELAY_S) # Select all existing text self.kernel.browsers.computer.press_key( @@ -306,7 +307,7 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: text=url, delay=TYPING_DELAY_MS, ) - await asyncio.sleep(ACTION_DELAY_MS) + await asyncio.sleep(ACTION_DELAY_S) # Press Enter to navigate self.kernel.browsers.computer.press_key( diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py index 708d8bb..dc88d97 100644 --- a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -17,7 +17,9 @@ from .base import ToolError, ToolResult from .computer import N1Action -SCREENSHOT_DELAY_MS = 0.5 +# Delay after actions before taking screenshot (in seconds for asyncio.sleep) +# Matches TypeScript SCREENSHOT_DELAY_MS = 300 (300ms = 0.3s) +SCREENSHOT_DELAY_S = 0.3 # Key mappings from n1 output format to Playwright format KEY_MAP = { @@ -140,7 +142,7 @@ async def _handle_click(self, action: N1Action) -> ToolResult: coords = self._get_coordinates(action.get("center_coordinates")) await page.mouse.click(coords["x"], coords["y"]) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_scroll(self, action: N1Action) -> ToolResult: @@ -172,7 +174,7 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: delta_x = scroll_delta await page.mouse.wheel(delta_x, delta_y) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_type(self, action: 
N1Action) -> ToolResult: @@ -196,7 +198,7 @@ async def _handle_type(self, action: N1Action) -> ToolResult: await asyncio.sleep(0.1) await page.keyboard.press("Enter") - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_key_press(self, action: N1Action) -> ToolResult: @@ -208,7 +210,7 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: mapped_key = self._map_key_to_playwright(key_comb) await page.keyboard.press(mapped_key) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_hover(self, action: N1Action) -> ToolResult: @@ -217,7 +219,7 @@ async def _handle_hover(self, action: N1Action) -> ToolResult: await page.mouse.move(coords["x"], coords["y"]) - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_drag(self, action: N1Action) -> ToolResult: @@ -227,22 +229,20 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: # Move to start position await page.mouse.move(start_coords["x"], start_coords["y"]) - await asyncio.sleep(0.1) - # Press mouse button and wait for drag to register + # Press mouse button and wait for dragstart event await page.mouse.down() - await asyncio.sleep(0.15) + await asyncio.sleep(0.05) # Move gradually to end position using steps for proper drag-and-drop # The steps parameter makes Playwright simulate intermediate mouse positions # which is required for HTML5 drag-and-drop to work properly - await page.mouse.move(end_coords["x"], end_coords["y"], steps=20) - await asyncio.sleep(0.1) + await page.mouse.move(end_coords["x"], end_coords["y"], steps=12) # Release mouse button await page.mouse.up() - await asyncio.sleep(SCREENSHOT_DELAY_MS) + await asyncio.sleep(0.3) return await self.screenshot() async def _handle_wait(self, action: N1Action) -> ToolResult: diff --git 
a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index e01bffa..ae2a950 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -8,8 +8,8 @@ import { Buffer } from 'buffer'; import type { Kernel } from '@onkernel/sdk'; const TYPING_DELAY_MS = 12; -const SCREENSHOT_DELAY_MS = 500; -const ACTION_DELAY_MS = 500; +const SCREENSHOT_DELAY_MS = 300; +const ACTION_DELAY_MS = 300; export interface ToolResult { base64Image?: string; diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts index 0306520..c9e8806 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -11,7 +11,7 @@ import { chromium, type Browser, type BrowserContext, type Page } from 'playwrig import type { ToolResult, N1Action } from './computer'; import { ToolError } from './computer'; -const SCREENSHOT_DELAY_MS = 500; +const SCREENSHOT_DELAY_MS = 300; // Key mappings from n1 output format to Playwright format const KEY_MAP: Record = { @@ -260,22 +260,20 @@ export class PlaywrightComputerTool { // Move to start position await this.page.mouse.move(startCoords.x, startCoords.y); - await this.sleep(100); - // Press mouse button and wait for drag to register + // Press mouse button and wait for dragstart event await this.page.mouse.down(); - await this.sleep(150); + await this.sleep(50); // Move gradually to end position using steps for proper drag-and-drop // The steps parameter makes Playwright simulate intermediate mouse positions // which is required for HTML5 drag-and-drop to work properly - await this.page.mouse.move(endCoords.x, endCoords.y, { steps: 20 }); - await this.sleep(100); + await 
this.page.mouse.move(endCoords.x, endCoords.y, { steps: 12 }); // Release mouse button await this.page.mouse.up(); - await this.sleep(SCREENSHOT_DELAY_MS); + await this.sleep(300); return this.screenshot(); } From 0a88d32185b88a48845c6f33602c565fcf23b315 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 17:24:27 -0500 Subject: [PATCH 09/12] Update Yutori CUA tasks in qa.md to specify dragging items instead of moving. Clarified instructions for both computer_use and playwright modes to enhance user understanding and execution accuracy. --- .cursor/commands/qa.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 9d4f397..b1d65e1 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -264,8 +264,8 @@ kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipe kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}' kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. 
You are done successfully when the items are moved.", "record_replay": true, "mode": "computer_use"}' -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "playwright"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "computer_use"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -275,8 +275,8 @@ kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://m kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? 
button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' -kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "computer_use"}' -kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true, "mode": "playwright"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "computer_use"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' ``` ## Step 7: Automated Runtime Testing (Optional) From 8cc168346be94d92986e2e2c2a804351eeedd870 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 17:41:41 -0500 Subject: [PATCH 10/12] ran deslop The cleanup removed ~300 lines of redundant inline comments and verbose method docstrings while keeping the useful class-level documentation you restored. 
The templates now match the minimal-comment style of the existing anthropic/openai templates in the codebase. --- .../python/yutori-computer-use/loop.py | 49 +--------------- .../python/yutori-computer-use/main.py | 3 +- .../python/yutori-computer-use/session.py | 9 --- .../python/yutori-computer-use/tools/base.py | 2 - .../yutori-computer-use/tools/computer.py | 39 ------------- .../tools/playwright_computer.py | 57 ------------------- .../typescript/yutori-computer-use/index.ts | 3 - .../typescript/yutori-computer-use/loop.ts | 30 ---------- .../typescript/yutori-computer-use/session.ts | 17 ------ .../yutori-computer-use/tools/computer.ts | 42 -------------- .../tools/playwright-computer.ts | 57 ------------------- 11 files changed, 2 insertions(+), 306 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 3a19bc1..ef5748c 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -29,8 +29,6 @@ class N1ComputerToolProtocol(Protocol): - """Protocol for tools that can execute n1 actions.""" - async def execute(self, action: N1Action) -> ToolResult: ... @@ -51,36 +49,16 @@ async def sampling_loop( cdp_ws_url: Optional[str] = None, max_tokens: int = 4096, max_iterations: int = 50, - # Default viewport matches Yutori's recommended 1280x800 (WXGA 16:10) viewport_width: int = 1280, viewport_height: int = 800, mode: BrowserMode = "computer_use", ) -> dict[str, Any]: - """ - Run the n1 sampling loop until the model returns a stop action or max iterations. 
- - Args: - model: The n1 model to use - task: The user's task/query - api_key: Yutori API key - kernel: Kernel client instance - session_id: Browser session ID - cdp_ws_url: CDP WebSocket URL for playwright mode - max_tokens: Maximum tokens per response - max_iterations: Maximum loop iterations - viewport_width: Browser viewport width for coordinate scaling - viewport_height: Browser viewport height for coordinate scaling - mode: Browser interaction mode ('computer_use' or 'playwright') - - Returns: - Dict with 'messages' (conversation history) and 'final_answer' (if stopped) - """ + """Run the n1 sampling loop until the model returns a stop action or max iterations.""" client = OpenAI( api_key=api_key, base_url="https://api.yutori.com/v1", ) - # Create the appropriate tool based on mode computer_tool: N1ComputerToolProtocol playwright_tool: Optional[PlaywrightComputerTool] = None @@ -99,12 +77,8 @@ async def sampling_loop( print("Using computer_use mode (Computer Controls API)") try: - # Take initial screenshot initial_screenshot = await computer_tool.screenshot() - # Build conversation per n1 format: - # 1. User message with task - # 2. 
Observation message with screenshot conversation_messages: list[dict[str, Any]] = [ { "role": "user", @@ -112,7 +86,6 @@ async def sampling_loop( } ] - # Add initial screenshot as observation (n1's required format) if initial_screenshot.get("base64_image"): conversation_messages.append({ "role": "observation", @@ -133,7 +106,6 @@ async def sampling_loop( iteration += 1 print(f"\n=== Iteration {iteration} ===") - # Call the n1 API (no system prompt - n1 uses its own) try: response = client.chat.completions.create( model=model, @@ -156,34 +128,27 @@ async def sampling_loop( response_content = assistant_message.content or "" print("Assistant response:", response_content) - # Add assistant message to conversation conversation_messages.append({ "role": "assistant", "content": response_content, }) - # Parse the action(s) from the response - # n1 returns JSON with "thoughts" and "actions" array parsed = _parse_n1_response(response_content) if not parsed or not parsed.get("actions"): print("No actions found in response, ending loop") break - # Execute each action in the actions array for action in parsed["actions"]: print(f"Executing action: {action.get('action_type')}", action) - # Check for stop action if action.get("action_type") == "stop": final_answer = action.get("answer") print(f"Stop action received, final answer: {final_answer}") return {"messages": conversation_messages, "final_answer": final_answer} - # Scale coordinates from n1's 1000x1000 space to actual viewport scaled_action = _scale_coordinates(action, viewport_width, viewport_height) - # Execute the action result: ToolResult try: result = await computer_tool.execute(scaled_action) @@ -191,18 +156,15 @@ async def sampling_loop( print(f"Action failed: {e}") result = {"error": str(e)} - # After action, add observation with screenshot and optional text output if result.get("base64_image") or result.get("output"): observation_content = [] - # Add text output first (e.g., from read_texts_and_links) if 
result.get("output"): observation_content.append({ "type": "text", "text": result["output"], }) - # Add screenshot if result.get("base64_image"): observation_content.append({ "type": "image_url", @@ -216,7 +178,6 @@ async def sampling_loop( "content": observation_content, }) elif result.get("error"): - # If there was an error, add it as text observation conversation_messages.append({ "role": "observation", "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], @@ -230,15 +191,11 @@ async def sampling_loop( "final_answer": final_answer, } finally: - # Clean up playwright connection if used if playwright_tool: await playwright_tool.disconnect() def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: - """ - Parse n1's response format: { "thoughts": "...", "actions": [...] } - """ try: # The response should be JSON return json.loads(content) @@ -254,10 +211,6 @@ def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action: - """ - Scale coordinates from n1's 1000x1000 space to actual viewport dimensions. 
- Per docs: "n1-preview-2025-11 outputs relative coordinates in 1000×1000" - """ scaled = dict(action) if "center_coordinates" in scaled and scaled["center_coordinates"]: diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index 24693c3..d909c67 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -86,9 +86,8 @@ async def cua_task( def _extract_last_assistant_message(messages: list) -> str: - """Extract the last assistant message text from the conversation.""" import json - + for msg in reversed(messages): if msg.get("role") == "assistant": content = msg.get("content") diff --git a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori-computer-use/session.py index 9c51bbe..f4f2d01 100644 --- a/pkg/templates/python/yutori-computer-use/session.py +++ b/pkg/templates/python/yutori-computer-use/session.py @@ -32,8 +32,6 @@ class KernelBrowserSession: stealth: bool = True timeout_seconds: int = 300 - # Viewport dimensions - Yutori n1 recommended: 1280x800, Kernel supports 1200x800 - # See: https://docs.yutori.com/reference/n1#screenshot-requirements viewport_width: int = 1200 viewport_height: int = 800 @@ -50,11 +48,8 @@ class KernelBrowserSession: _kernel: Optional[Kernel] = field(default=None, init=False) async def __aenter__(self) -> "KernelBrowserSession": - """Create a Kernel browser session and optionally start recording.""" self._kernel = Kernel() - # Create browser with Yutori n1's recommended viewport - # See: https://docs.yutori.com/reference/n1#screenshot-requirements browser = self._kernel.browsers.create( stealth=self.stealth, timeout_seconds=self.timeout_seconds, @@ -83,7 +78,6 @@ async def __aenter__(self) -> "KernelBrowserSession": return self async def _start_replay(self) -> None: - """Start recording a replay of the browser session.""" if not self._kernel or not self.session_id: return @@ -93,7 
+87,6 @@ async def _start_replay(self) -> None: print(f"Replay recording started: {self.replay_id}") async def _stop_and_get_replay_url(self) -> None: - """Stop recording and get the replay URL.""" if not self._kernel or not self.session_id or not self.replay_id: return @@ -132,7 +125,6 @@ async def _stop_and_get_replay_url(self) -> None: print(f"Replay view URL: {self.replay_view_url}") async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - """Stop recording and delete the browser session.""" if self._kernel and self.session_id: try: # Stop replay if recording was enabled @@ -151,7 +143,6 @@ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: @property def kernel(self) -> Kernel: - """Get the Kernel client instance.""" if self._kernel is None: raise RuntimeError("Session not initialized. Use async with context.") return self._kernel diff --git a/pkg/templates/python/yutori-computer-use/tools/base.py b/pkg/templates/python/yutori-computer-use/tools/base.py index e50a455..d12a38e 100644 --- a/pkg/templates/python/yutori-computer-use/tools/base.py +++ b/pkg/templates/python/yutori-computer-use/tools/base.py @@ -12,8 +12,6 @@ def __init__(self, message: str): class ToolResult(TypedDict, total=False): - """Result from executing a tool action.""" - base64_image: str output: str error: str diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index a881cff..4460161 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -91,8 +91,6 @@ class N1Action(TypedDict, total=False): class ComputerTool: - """Computer tool for Yutori n1 actions using Kernel browser.""" - def __init__(self, kernel: Kernel, session_id: str, width: int = 1200, height: int = 800): self.kernel = kernel self.session_id = session_id @@ -100,7 +98,6 @@ def __init__(self, kernel: Kernel, session_id: str, width: int = 1200, 
height: i self.height = height async def execute(self, action: N1Action) -> ToolResult: - """Execute an n1 action and return the result.""" action_type = action.get("action_type") handlers = { @@ -147,7 +144,6 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: if direction not in ("up", "down", "left", "right"): raise ToolError(f"Invalid scroll direction: {direction}") - # Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels scroll_delta = amount * 100 delta_x = 0 @@ -178,7 +174,6 @@ async def _handle_type(self, action: N1Action) -> ToolResult: if not text: raise ToolError("text is required for type action") - # Clear existing text if requested if action.get("clear_before_typing"): self.kernel.browsers.computer.press_key( self.session_id, @@ -191,14 +186,12 @@ async def _handle_type(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(0.1) - # Type the text self.kernel.browsers.computer.type_text( self.session_id, text=text, delay=TYPING_DELAY_MS, ) - # Press Enter if requested if action.get("press_enter_after"): await asyncio.sleep(0.1) self.kernel.browsers.computer.press_key( @@ -250,58 +243,42 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: return await self.screenshot() async def _handle_wait(self, action: N1Action) -> ToolResult: - # Default wait of 2 seconds for UI to update await asyncio.sleep(2) return await self.screenshot() async def _handle_refresh(self, action: N1Action) -> ToolResult: - """Refresh the page using keyboard shortcut (F5).""" self.kernel.browsers.computer.press_key( self.session_id, keys=["F5"], ) - - # Wait for page to reload await asyncio.sleep(2) return await self.screenshot() async def _handle_go_back(self, action: N1Action) -> ToolResult: - """Go back using keyboard shortcut (Alt+Left).""" self.kernel.browsers.computer.press_key( self.session_id, keys=["alt+Left"], ) - - # Wait for navigation await asyncio.sleep(1.5) return await self.screenshot() async def _handle_goto_url(self, 
action: N1Action) -> ToolResult: - """ - Navigate to URL using keyboard shortcuts: - 1. Ctrl+L to focus the URL bar - 2. Type the URL - 3. Press Enter - """ url = action.get("url") if not url: raise ToolError("url is required for goto_url action") - # Focus URL bar with Ctrl+L self.kernel.browsers.computer.press_key( self.session_id, keys=["ctrl+l"], ) await asyncio.sleep(ACTION_DELAY_S) - # Select all existing text self.kernel.browsers.computer.press_key( self.session_id, keys=["ctrl+a"], ) await asyncio.sleep(0.1) - # Type the URL self.kernel.browsers.computer.type_text( self.session_id, text=url, @@ -309,25 +286,15 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(ACTION_DELAY_S) - # Press Enter to navigate self.kernel.browsers.computer.press_key( self.session_id, keys=["Return"], ) - - # Wait for page to load await asyncio.sleep(2) return await self.screenshot() async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: - """ - Read texts and links using Playwright's _snapshotForAI(). - Per n1 docs this is "implemented as an external VLM call" - we use - Kernel's Playwright Execution API for the AI snapshot and - Computer Controls API for the screenshot. 
- """ try: - # Get AI snapshot via Playwright Execution API result = self.kernel.browsers.playwright.execute( self.session_id, code=""" @@ -339,7 +306,6 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: timeout_sec=30 ) - # Get screenshot via Computer Controls API screenshot_result = await self.screenshot() if result.success and result.result: @@ -353,7 +319,6 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: }, indent=2) } - # Fallback to just screenshot if Playwright execution fails print("Playwright execution failed, falling back to screenshot only") return screenshot_result except Exception as e: @@ -361,11 +326,9 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: return await self.screenshot() async def _handle_stop(self, action: N1Action) -> ToolResult: - """Return the final answer without taking a screenshot.""" return {"output": action.get("answer", "Task completed")} async def screenshot(self) -> ToolResult: - """Take a screenshot of the current browser state.""" try: response = self.kernel.browsers.computer.capture_screenshot( self.session_id @@ -379,7 +342,6 @@ async def screenshot(self) -> ToolResult: def _get_coordinates( self, coords: tuple[int, int] | list[int] | None ) -> dict[str, int]: - """Convert n1 coordinates to Kernel format.""" if coords is None or len(coords) != 2: # Default to center of screen return {"x": self.width // 2, "y": self.height // 2} @@ -391,7 +353,6 @@ def _get_coordinates( return {"x": int(x), "y": int(y)} def _map_key(self, key: str) -> str: - """Map key names from Playwright format (n1 output) to xdotool format (Kernel).""" # Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") if "+" in key: parts = key.split("+") diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py index dc88d97..df98628 100644 --- 
a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -38,11 +38,6 @@ class PlaywrightComputerTool: - """ - Computer tool for Yutori n1 actions using Playwright via CDP connection. - Provides viewport-only screenshots optimized for n1 model performance. - """ - def __init__(self, cdp_ws_url: str, width: int = 1200, height: int = 800): self.cdp_ws_url = cdp_ws_url self.width = width @@ -53,10 +48,6 @@ def __init__(self, cdp_ws_url: str, width: int = 1200, height: int = 800): self._page: Optional[Page] = None async def connect(self) -> None: - """ - Connect to the browser via CDP WebSocket. - Must be called before executing any actions. - """ if self._browser: return # Already connected @@ -79,9 +70,6 @@ async def connect(self) -> None: self._page.on("close", self._handle_page_close) async def disconnect(self) -> None: - """Disconnect from the browser.""" - # Don't close the browser itself - just stop the playwright connection - # The browser lifecycle is managed by Kernel if self._playwright: await self._playwright.stop() self._playwright = None @@ -90,13 +78,11 @@ async def disconnect(self) -> None: self._page = None def _handle_new_page(self, page: Page) -> None: - """Handle the creation of a new page.""" print("New page created") self._page = page page.on("close", self._handle_page_close) def _handle_page_close(self, closed_page: Page) -> None: - """Handle the closure of a page.""" print("Page closed") if self._page == closed_page and self._context: pages = self._context.pages @@ -107,13 +93,11 @@ def _handle_page_close(self, closed_page: Page) -> None: self._page = None def _assert_page(self) -> Page: - """Assert that page is available and return it.""" if not self._page: raise ToolError("Page not available. 
Did you call connect()?") return self._page async def execute(self, action: N1Action) -> ToolResult: - """Execute an n1 action and return the result.""" action_type = action.get("action_type") handlers = { @@ -154,13 +138,10 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: if direction not in ("up", "down", "left", "right"): raise ToolError(f"Invalid scroll direction: {direction}") - # Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels scroll_delta = amount * 100 - # Move mouse to position first await page.mouse.move(coords["x"], coords["y"]) - # Playwright's wheel method takes delta_x and delta_y delta_x = 0 delta_y = 0 @@ -183,17 +164,14 @@ async def _handle_type(self, action: N1Action) -> ToolResult: if not text: raise ToolError("text is required for type action") - # Clear existing text if requested if action.get("clear_before_typing"): await page.keyboard.press("Control+a") await asyncio.sleep(0.1) await page.keyboard.press("Backspace") await asyncio.sleep(0.1) - # Type the text await page.keyboard.type(text) - # Press Enter if requested if action.get("press_enter_after"): await asyncio.sleep(0.1) await page.keyboard.press("Enter") @@ -227,42 +205,28 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) end_coords = self._get_coordinates(action.get("center_coordinates")) - # Move to start position await page.mouse.move(start_coords["x"], start_coords["y"]) - - # Press mouse button and wait for dragstart event await page.mouse.down() await asyncio.sleep(0.05) - - # Move gradually to end position using steps for proper drag-and-drop - # The steps parameter makes Playwright simulate intermediate mouse positions - # which is required for HTML5 drag-and-drop to work properly await page.mouse.move(end_coords["x"], end_coords["y"], steps=12) - - # Release mouse button await page.mouse.up() await asyncio.sleep(0.3) return await self.screenshot() async def 
_handle_wait(self, action: N1Action) -> ToolResult: - # Default wait of 2 seconds for UI to update await asyncio.sleep(2) return await self.screenshot() async def _handle_refresh(self, action: N1Action) -> ToolResult: page = self._assert_page() await page.reload() - - # Wait for page to reload await asyncio.sleep(2) return await self.screenshot() async def _handle_go_back(self, action: N1Action) -> ToolResult: page = self._assert_page() await page.go_back() - - # Wait for navigation await asyncio.sleep(1.5) return await self.screenshot() @@ -273,25 +237,16 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: raise ToolError("url is required for goto_url action") await page.goto(url) - - # Wait for page to load await asyncio.sleep(2) return await self.screenshot() async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: - """ - Read texts and links using Playwright's ariaSnapshot() API. - Returns accessibility tree representation of the page content. - """ page = self._assert_page() try: - # Use the public aria_snapshot() API on the body locator - # This provides an accessibility tree representation of the page snapshot = await page.locator("body").aria_snapshot() url = page.url title = await page.title() - # Get viewport-only screenshot screenshot_result = await self.screenshot() return { @@ -303,17 +258,11 @@ async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: return await self.screenshot() async def _handle_stop(self, action: N1Action) -> ToolResult: - """Return the final answer without taking a screenshot.""" return {"output": action.get("answer", "Task completed")} async def screenshot(self) -> ToolResult: - """ - Take a viewport-only screenshot of the current browser state. - This captures only the browser content, not the OS UI or browser chrome. 
- """ page = self._assert_page() try: - # full_page=False captures only the viewport (browser content) buffer = await page.screenshot(full_page=False) base64_image = base64.b64encode(buffer).decode("utf-8") return {"base64_image": base64_image} @@ -321,14 +270,12 @@ async def screenshot(self) -> ToolResult: raise ToolError(f"Failed to take screenshot: {e}") def get_current_url(self) -> str: - """Get the current page URL.""" page = self._assert_page() return page.url def _get_coordinates( self, coords: tuple[int, int] | list[int] | None ) -> dict[str, int]: - """Convert n1 coordinates to dict format.""" if coords is None or len(coords) != 2: # Default to center of viewport return {"x": self.width // 2, "y": self.height // 2} @@ -340,10 +287,6 @@ def _get_coordinates( return {"x": int(x), "y": int(y)} def _map_key_to_playwright(self, key: str) -> str: - """ - Map key names to Playwright format. - n1 outputs keys in Playwright format, but some may need adjustment. - """ # Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") if "+" in key: parts = key.split("+") diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index e9bbdce..afe51ba 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -79,9 +79,6 @@ app.action( }, ); -/** - * Extract the last assistant message text from the conversation. 
- */ function extractLastAssistantMessage(messages: { role: string; content: string | unknown[] }[]): string { for (let i = messages.length - 1; i >= 0; i--) { const msg = messages[i]; diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 2449f70..351aa9c 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -22,7 +22,6 @@ import { PlaywrightComputerTool } from './tools/playwright-computer'; /** Mode for browser interaction */ export type BrowserMode = 'computer_use' | 'playwright'; -/** Interface for tool that can execute n1 actions */ interface N1ComputerTool { execute(action: N1Action): Promise; screenshot(): Promise; @@ -72,9 +71,6 @@ interface SamplingLoopResult { finalAnswer?: string; } -/** - * Run the n1 sampling loop until the model returns a stop action or max iterations. - */ export async function samplingLoop({ model = 'n1-preview-2025-11', task, @@ -94,7 +90,6 @@ export async function samplingLoop({ baseURL: 'https://api.yutori.com/v1', }); - // Create the appropriate tool based on mode let computerTool: N1ComputerTool; let playwrightTool: PlaywrightComputerTool | null = null; @@ -115,12 +110,8 @@ export async function samplingLoop({ } try { - // Take initial screenshot const initialScreenshot = await computerTool.screenshot(); - // Build conversation per n1 format: - // 1. User message with task - // 2. 
Observation message with screenshot const conversationMessages: Message[] = [ { role: 'user', @@ -128,7 +119,6 @@ export async function samplingLoop({ }, ]; - // Add initial screenshot as observation (n1's required format) if (initialScreenshot.base64Image) { conversationMessages.push({ role: 'observation', @@ -150,7 +140,6 @@ export async function samplingLoop({ iteration++; console.log(`\n=== Iteration ${iteration} ===`); - // Call the n1 API (no system prompt - n1 uses its own) let response; try { response = await client.chat.completions.create({ @@ -177,14 +166,11 @@ export async function samplingLoop({ const responseContent = assistantMessage.content || ''; console.log('Assistant response:', responseContent); - // Add assistant message to conversation conversationMessages.push({ role: 'assistant', content: responseContent, }); - // Parse the action(s) from the response - // n1 returns JSON with "thoughts" and "actions" array const parsed = parseN1Response(responseContent); if (!parsed || !parsed.actions || parsed.actions.length === 0) { @@ -192,21 +178,17 @@ export async function samplingLoop({ break; } - // Execute each action in the actions array for (const action of parsed.actions) { console.log('Executing action:', action.action_type, action); - // Check for stop action if (action.action_type === 'stop') { finalAnswer = action.answer; console.log('Stop action received, final answer:', finalAnswer); return { messages: conversationMessages, finalAnswer }; } - // Scale coordinates from n1's 1000x1000 space to actual viewport const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); - // Execute the action let result: ToolResult; try { result = await computerTool.execute(scaledAction); @@ -217,11 +199,9 @@ export async function samplingLoop({ }; } - // After action, add observation with screenshot and optional text output if (result.base64Image || result.output) { const observationContent: MessageContent[] = []; - // Add text output first 
(e.g., from read_texts_and_links) if (result.output) { observationContent.push({ type: 'text', @@ -229,7 +209,6 @@ export async function samplingLoop({ }); } - // Add screenshot if (result.base64Image) { observationContent.push({ type: 'image_url', @@ -244,7 +223,6 @@ export async function samplingLoop({ content: observationContent, }); } else if (result.error) { - // If there was an error, add it as text observation conversationMessages.push({ role: 'observation', content: [{ type: 'text', text: `Action failed: ${result.error}` }], @@ -262,16 +240,12 @@ export async function samplingLoop({ finalAnswer, }; } finally { - // Clean up playwright connection if used if (playwrightTool) { await playwrightTool.disconnect(); } } } -/** - * Parse n1's response format: { "thoughts": "...", "actions": [...] } - */ function parseN1Response(content: string): { thoughts?: string; actions?: N1Action[] } | null { try { // The response should be JSON @@ -291,10 +265,6 @@ function parseN1Response(content: string): { thoughts?: string; actions?: N1Acti } } -/** - * Scale coordinates from n1's 1000x1000 space to actual viewport dimensions. 
- * Per docs: "n1-preview-2025-11 outputs relative coordinates in 1000×1000" - */ function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action { const scaled = { ...action }; diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori-computer-use/session.ts index 0e17baf..24b1b9b 100644 --- a/pkg/templates/typescript/yutori-computer-use/session.ts +++ b/pkg/templates/typescript/yutori-computer-use/session.ts @@ -37,8 +37,6 @@ const DEFAULT_OPTIONS: Required = { timeoutSeconds: 300, recordReplay: false, replayGracePeriod: 5.0, - // Yutori n1 recommended viewport: 1280x800, but Kernel supports 1200x800 - // See: https://docs.yutori.com/reference/n1#screenshot-requirements viewportWidth: 1200, viewportHeight: 800, }; @@ -112,12 +110,7 @@ export class KernelBrowserSession { }; } - /** - * Create a Kernel browser session and optionally start recording. - */ async start(): Promise { - // Create browser with Yutori n1's recommended viewport - // See: https://docs.yutori.com/reference/n1#screenshot-requirements const browser = await this.kernel.browsers.create({ stealth: this.options.stealth, timeout_seconds: this.options.timeoutSeconds, @@ -148,9 +141,6 @@ export class KernelBrowserSession { return this.info; } - /** - * Start recording a replay of the browser session. - */ private async startReplay(): Promise { if (!this._sessionId) { return; @@ -162,9 +152,6 @@ export class KernelBrowserSession { console.log(`Replay recording started: ${this._replayId}`); } - /** - * Stop recording and get the replay URL. - */ private async stopReplay(): Promise { if (!this._sessionId || !this._replayId) { return; @@ -210,9 +197,6 @@ export class KernelBrowserSession { } } - /** - * Stop recording, and delete the browser session. 
- */ async stop(): Promise { const info = this.info; @@ -229,7 +213,6 @@ export class KernelBrowserSession { info.replayViewUrl = this._replayViewUrl || undefined; } } finally { - // Always clean up the browser session, even if replay stopping fails console.log(`Destroying browser session: ${this._sessionId}`); await this.kernel.browsers.deleteByID(this._sessionId); console.log('Browser session destroyed.'); diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index ae2a950..46fd76e 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -92,9 +92,6 @@ const MODIFIER_MAP: Record = { 'cmd': 'super', }; -/** - * Computer tool for Yutori n1 actions using Kernel browser. - */ export class ComputerTool { private kernel: Kernel; private sessionId: string; @@ -108,9 +105,6 @@ export class ComputerTool { this.height = height; } - /** - * Execute an n1 action and return the result. 
- */ async execute(action: N1Action): Promise { const { action_type } = action; @@ -168,7 +162,6 @@ export class ComputerTool { throw new ToolError(`Invalid scroll direction: ${direction}`); } - // Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels const scrollDelta = amount * 100; let delta_x = 0; @@ -206,7 +199,6 @@ export class ComputerTool { throw new ToolError('text is required for type action'); } - // Clear existing text if requested if (action.clear_before_typing) { await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['ctrl+a'], @@ -218,13 +210,11 @@ export class ComputerTool { await this.sleep(100); } - // Type the text await this.kernel.browsers.computer.typeText(this.sessionId, { text, delay: TYPING_DELAY_MS, }); - // Press Enter if requested if (action.press_enter_after) { await this.sleep(100); await this.kernel.browsers.computer.pressKey(this.sessionId, { @@ -278,84 +268,58 @@ export class ComputerTool { } private async handleWait(): Promise { - // Default wait of 2 seconds for UI to update await this.sleep(2000); return this.screenshot(); } - /** - * Refresh the page using keyboard shortcut (Ctrl+R or F5) - */ private async handleRefresh(): Promise { await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['F5'], }); - // Wait for page to reload await this.sleep(2000); return this.screenshot(); } - /** - * Go back using keyboard shortcut (Alt+Left) - */ private async handleGoBack(): Promise { await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['alt+Left'], }); - // Wait for navigation await this.sleep(1500); return this.screenshot(); } - /** - * Navigate to URL using keyboard shortcuts: - * 1. Ctrl+L to focus the URL bar - * 2. Type the URL - * 3. 
Press Enter - */ private async handleGotoUrl(action: N1Action): Promise { const url = action.url; if (!url) { throw new ToolError('url is required for goto_url action'); } - // Focus URL bar with Ctrl+L await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['ctrl+l'], }); await this.sleep(ACTION_DELAY_MS); - // Select all existing text and type the new URL await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['ctrl+a'], }); await this.sleep(100); - // Type the URL await this.kernel.browsers.computer.typeText(this.sessionId, { text: url, delay: TYPING_DELAY_MS, }); await this.sleep(ACTION_DELAY_MS); - // Press Enter to navigate await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['Return'], }); - // Wait for page to load await this.sleep(2000); return this.screenshot(); } - /** - * Read texts and links using Playwright's _snapshotForAI(). - * Per n1 docs this is "implemented as an external VLM call" - we use - * Kernel's Playwright Execution API for the AI snapshot and - * Computer Controls API for the screenshot. - */ private async handleReadTextsAndLinks(): Promise { try { // Get AI snapshot via Playwright Execution API @@ -404,9 +368,6 @@ export class ComputerTool { }; } - /** - * Take a screenshot of the current browser state. - */ async screenshot(): Promise { try { const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); @@ -436,9 +397,6 @@ export class ComputerTool { return { x, y }; } - /** - * Map key names from Playwright format (n1 output) to xdotool format (Kernel). 
- */ private mapKey(key: string): string { // Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") if (key.includes('+')) { diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts index c9e8806..d6ce229 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -28,10 +28,6 @@ const MODIFIER_MAP: Record = { 'cmd': 'Meta', }; -/** - * Computer tool for Yutori n1 actions using Playwright via CDP connection. - * Provides viewport-only screenshots optimized for n1 model performance. - */ export class PlaywrightComputerTool { private cdpWsUrl: string; private width: number; @@ -46,10 +42,6 @@ export class PlaywrightComputerTool { this.height = height; } - /** - * Connect to the browser via CDP WebSocket. - * Must be called before executing any actions. - */ async connect(): Promise { if (this.browser) { return; // Already connected @@ -77,13 +69,8 @@ export class PlaywrightComputerTool { this.page.on('close', this.handlePageClose.bind(this)); } - /** - * Disconnect from the browser. - */ async disconnect(): Promise { if (this.browser) { - // Don't close the browser itself - just disconnect the CDP connection - // The browser lifecycle is managed by Kernel this.browser = null; this.context = null; this.page = null; @@ -115,9 +102,6 @@ export class PlaywrightComputerTool { } } - /** - * Execute an n1 action and return the result. 
- */ async execute(action: N1Action): Promise { this.assertPage(); const { action_type } = action; @@ -171,13 +155,10 @@ export class PlaywrightComputerTool { throw new ToolError(`Invalid scroll direction: ${direction}`); } - // Each scroll amount unit ≈ 10-15% of screen, roughly 100 pixels const scrollDelta = amount * 100; - // Move mouse to position first await this.page.mouse.move(coords.x, coords.y); - // Playwright's wheel method takes deltaX and deltaY let deltaX = 0; let deltaY = 0; @@ -208,7 +189,6 @@ export class PlaywrightComputerTool { throw new ToolError('text is required for type action'); } - // Clear existing text if requested if (action.clear_before_typing) { await this.page.keyboard.press('Control+a'); await this.sleep(100); @@ -216,10 +196,8 @@ export class PlaywrightComputerTool { await this.sleep(100); } - // Type the text await this.page.keyboard.type(text); - // Press Enter if requested if (action.press_enter_after) { await this.sleep(100); await this.page.keyboard.press('Enter'); @@ -258,19 +236,10 @@ export class PlaywrightComputerTool { const startCoords = this.getCoordinates(action.start_coordinates); const endCoords = this.getCoordinates(action.center_coordinates); - // Move to start position await this.page.mouse.move(startCoords.x, startCoords.y); - - // Press mouse button and wait for dragstart event await this.page.mouse.down(); await this.sleep(50); - - // Move gradually to end position using steps for proper drag-and-drop - // The steps parameter makes Playwright simulate intermediate mouse positions - // which is required for HTML5 drag-and-drop to work properly await this.page.mouse.move(endCoords.x, endCoords.y, { steps: 12 }); - - // Release mouse button await this.page.mouse.up(); await this.sleep(300); @@ -278,7 +247,6 @@ export class PlaywrightComputerTool { } private async handleWait(): Promise { - // Default wait of 2 seconds for UI to update await this.sleep(2000); return this.screenshot(); } @@ -286,8 +254,6 @@ export 
class PlaywrightComputerTool { private async handleRefresh(): Promise { this.assertPage(); await this.page.reload(); - - // Wait for page to reload await this.sleep(2000); return this.screenshot(); } @@ -295,8 +261,6 @@ export class PlaywrightComputerTool { private async handleGoBack(): Promise { this.assertPage(); await this.page.goBack(); - - // Wait for navigation await this.sleep(1500); return this.screenshot(); } @@ -309,26 +273,17 @@ export class PlaywrightComputerTool { } await this.page.goto(url); - - // Wait for page to load await this.sleep(2000); return this.screenshot(); } - /** - * Read texts and links using Playwright's ariaSnapshot() API. - * Returns accessibility tree representation of the page content. - */ private async handleReadTextsAndLinks(): Promise { this.assertPage(); try { - // Use the public ariaSnapshot() API on the body locator - // This provides an accessibility tree representation of the page const snapshot = await this.page.locator('body').ariaSnapshot(); const url = this.page.url(); const title = await this.page.title(); - // Get viewport-only screenshot const screenshotResult = await this.screenshot(); return { @@ -348,14 +303,9 @@ export class PlaywrightComputerTool { }; } - /** - * Take a viewport-only screenshot of the current browser state. - * This captures only the browser content, not the OS UI or browser chrome. - */ async screenshot(): Promise { this.assertPage(); try { - // fullPage: false captures only the viewport (browser content) const buffer = await this.page.screenshot({ fullPage: false }); return { @@ -366,9 +316,6 @@ export class PlaywrightComputerTool { } } - /** - * Get the current page URL. - */ getCurrentUrl(): string { this.assertPage(); return this.page.url(); @@ -388,10 +335,6 @@ export class PlaywrightComputerTool { return { x, y }; } - /** - * Map key names to Playwright format. - * n1 outputs keys in Playwright format, but some may need adjustment. 
- */ private mapKeyToPlaywright(key: string): string { // Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") if (key.includes('+')) { From f6be454bbd0035b7d51b8c76aa66a6a92ac6051a Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 17:45:26 -0500 Subject: [PATCH 11/12] CLI: Update SDK to cee2050 and add credential-providers commands (#82) (#88) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR updates the Go SDK to cee2050be3f8136505d41c20c2903dfca2cbc479 and adds CLI commands for new SDK methods. ## SDK Update - Updated kernel-go-sdk to cee2050be3f8136505d41c20c2903dfca2cbc479 ## Coverage Analysis This PR was generated by performing a full enumeration of SDK methods and CLI commands. ## New Commands - `kernel credential-providers list` - List configured external credential providers - `kernel credential-providers get ` - Get a credential provider by ID - `kernel credential-providers create` - Create a new credential provider (supports 1Password) - `kernel credential-providers update ` - Update a credential provider's configuration - `kernel credential-providers delete ` - Delete a credential provider - `kernel credential-providers test ` - Test a credential provider connection ## Breaking Changes Fixed - Fixed `browsers.Get()` calls to pass new required `BrowserGetParams` parameter Triggered by: kernel/kernel-go-sdk@cee2050be3f8136505d41c20c2903dfca2cbc479 Reviewer: @masnwilliams --- > [!NOTE] > Introduces new CLI surfaces and updates for latest SDK. > > - **Agent Auth CLI**: `kernel agents auth` with `create/get/list/delete`, `invocations {create/get/exchange/submit}`, and end‑to‑end `run` flow (auto field submission, TOTP, optional live view); docs and examples added to `README.md`. > - **Credential Providers CLI**: `kernel credential-providers {list/get/create/update/delete/test}` (supports 1Password), wired into root. 
> - **Browsers API updates**: adapt to SDK breaking change (`browsers.Get` now requires `BrowserGetParams`); add `process resize` and filesystem watch (`fs watch start/stop/events`) commands; tests updated accordingly. > - **Dependencies**: bump `kernel-go-sdk` to cee2050… and add `pquerna/otp`; regenerate `go.sum`. > > Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 0b27df6f798b970b7a62adc297bfa198db027ef1. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot). --------- Co-authored-by: Mason Williams <43387599+masnwilliams@users.noreply.github.com> Co-authored-by: Cursor Agent Co-authored-by: Cursor Agent Co-authored-by: Cursor Agent --- README.md | 99 +++ cmd/agents.go | 1360 +++++++++++++++++++++++++++++++++++ cmd/browsers.go | 262 ++++++- cmd/browsers_test.go | 27 +- cmd/credential_providers.go | 466 ++++++++++++ cmd/root.go | 2 + go.mod | 4 +- go.sum | 9 +- 8 files changed, 2177 insertions(+), 52 deletions(-) create mode 100644 cmd/agents.go create mode 100644 cmd/credential_providers.go diff --git a/README.md b/README.md index af92648..398bb45 100644 --- a/README.md +++ b/README.md @@ -427,6 +427,76 @@ Commands with JSON output support: - `kernel proxies delete ` - Delete a proxy configuration - `-y, --yes` - Skip confirmation prompt +### Agent Auth + +Automated authentication for web services. The `run` command orchestrates the full auth flow automatically. 
+ +- `kernel agents auth run` - Run a complete authentication flow + - `--domain ` - Target domain for authentication (required) + - `--profile ` - Profile name to use/create (required) + - `--value ` - Field name=value pair (repeatable, e.g., `--value username=foo --value password=bar`) + - `--credential ` - Existing credential name to use + - `--save-credential-as ` - Save provided credentials under this name + - `--totp-secret ` - Base32 TOTP secret for automatic 2FA + - `--proxy-id ` - Proxy ID to use + - `--login-url ` - Custom login page URL + - `--allowed-domain ` - Additional allowed domains (repeatable) + - `--timeout ` - Maximum time to wait for auth completion (default: 5m) + - `--open` - Open live view URL in browser when human intervention needed + - `--output json`, `-o json` - Output JSONL events + +- `kernel agents auth create` - Create an auth agent + - `--domain ` - Target domain for authentication (required) + - `--profile-name ` - Name of the profile to use (required) + - `--credential-name ` - Optional credential name to link + - `--login-url ` - Optional login page URL + - `--allowed-domain ` - Additional allowed domains (repeatable) + - `--proxy-id ` - Optional proxy ID to use + - `--output json`, `-o json` - Output raw JSON object + +- `kernel agents auth list` - List auth agents + - `--domain ` - Filter by domain + - `--profile-name ` - Filter by profile name + - `--limit ` - Maximum number of results to return + - `--offset ` - Number of results to skip + - `--output json`, `-o json` - Output raw JSON array + +- `kernel agents auth get ` - Get an auth agent by ID + - `--output json`, `-o json` - Output raw JSON object + +- `kernel agents auth delete ` - Delete an auth agent + - `-y, --yes` - Skip confirmation prompt + +### Credentials + +- `kernel credentials create` - Create a new credential + - `--name ` - Unique name for the credential (required) + - `--domain ` - Target domain (required) + - `--value ` - Field name=value pair 
(repeatable) + - `--sso-provider ` - SSO provider (google, github, microsoft) + - `--totp-secret ` - Base32-encoded TOTP secret for 2FA + - `--output json`, `-o json` - Output raw JSON object + +- `kernel credentials list` - List credentials + - `--domain ` - Filter by domain + - `--output json`, `-o json` - Output raw JSON array + +- `kernel credentials get ` - Get a credential by ID or name + - `--output json`, `-o json` - Output raw JSON object + +- `kernel credentials update ` - Update a credential + - `--name ` - New name + - `--value ` - Field values to update (repeatable) + - `--sso-provider ` - SSO provider + - `--totp-secret ` - TOTP secret + - `--output json`, `-o json` - Output raw JSON object + +- `kernel credentials delete ` - Delete a credential + - `-y, --yes` - Skip confirmation prompt + +- `kernel credentials totp-code ` - Get current TOTP code + - `--output json`, `-o json` - Output raw JSON object + ## Examples ### Create a new app @@ -641,6 +711,35 @@ kernel proxies get prx_123 kernel proxies delete prx_123 --yes ``` +### Agent auth + +```bash +# Run a complete auth flow with inline credentials +kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass + +# Auth with TOTP for automatic 2FA handling +kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass \ + --totp-secret JBSWY3DPEHPK3PXP + +# Save credentials for future re-auth +kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass \ + --save-credential-as github-creds + +# Re-use existing saved credential +kernel agents auth run --domain github.com --profile my-github \ + --credential github-creds + +# Auto-open browser when human intervention is needed +kernel agents auth run --domain github.com --profile my-github \ + --credential github-creds --open + +# Use the authenticated profile with a browser +kernel browsers 
create --profile-name my-github +``` + ## Getting Help - `kernel --help` - Show all available commands diff --git a/cmd/agents.go b/cmd/agents.go new file mode 100644 index 0000000..718306f --- /dev/null +++ b/cmd/agents.go @@ -0,0 +1,1360 @@ +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/kernel/cli/pkg/util" + "github.com/kernel/kernel-go-sdk" + "github.com/kernel/kernel-go-sdk/option" + "github.com/kernel/kernel-go-sdk/packages/pagination" + "github.com/pkg/browser" + "github.com/pquerna/otp/totp" + "github.com/pterm/pterm" + "github.com/spf13/cobra" +) + +// AgentAuthService defines the subset of the Kernel SDK agent auth client that we use. +type AgentAuthService interface { + New(ctx context.Context, body kernel.AgentAuthNewParams, opts ...option.RequestOption) (res *kernel.AuthAgent, err error) + Get(ctx context.Context, id string, opts ...option.RequestOption) (res *kernel.AuthAgent, err error) + List(ctx context.Context, query kernel.AgentAuthListParams, opts ...option.RequestOption) (res *pagination.OffsetPagination[kernel.AuthAgent], err error) + Delete(ctx context.Context, id string, opts ...option.RequestOption) (err error) +} + +// AgentAuthInvocationsService defines the subset of the Kernel SDK agent auth invocations client that we use. 
+type AgentAuthInvocationsService interface { + New(ctx context.Context, body kernel.AgentAuthInvocationNewParams, opts ...option.RequestOption) (res *kernel.AuthAgentInvocationCreateResponse, err error) + Get(ctx context.Context, invocationID string, opts ...option.RequestOption) (res *kernel.AgentAuthInvocationResponse, err error) + Exchange(ctx context.Context, invocationID string, body kernel.AgentAuthInvocationExchangeParams, opts ...option.RequestOption) (res *kernel.AgentAuthInvocationExchangeResponse, err error) + Submit(ctx context.Context, invocationID string, body kernel.AgentAuthInvocationSubmitParams, opts ...option.RequestOption) (res *kernel.AgentAuthSubmitResponse, err error) +} + +// AgentAuthCmd handles agent auth operations independent of cobra. +type AgentAuthCmd struct { + auth AgentAuthService + invocations AgentAuthInvocationsService +} + +type AgentAuthCreateInput struct { + Domain string + ProfileName string + CredentialName string + LoginURL string + AllowedDomains []string + ProxyID string + Output string +} + +type AgentAuthGetInput struct { + ID string + Output string +} + +type AgentAuthListInput struct { + Domain string + ProfileName string + Limit int + Offset int + Output string +} + +type AgentAuthDeleteInput struct { + ID string + SkipConfirm bool +} + +type AgentAuthInvocationCreateInput struct { + AuthAgentID string + SaveCredentialAs string + Output string +} + +type AgentAuthInvocationGetInput struct { + InvocationID string + Output string +} + +type AgentAuthInvocationExchangeInput struct { + InvocationID string + Code string + Output string +} + +type AgentAuthInvocationSubmitInput struct { + InvocationID string + FieldValues map[string]string + SSOButton string + SelectedMfaType string + Output string +} + +// AgentAuthRunInput contains all parameters for the automated auth run flow. 
+type AgentAuthRunInput struct { + Domain string + ProfileName string + Values map[string]string + CredentialName string + SaveCredentialAs string + TotpSecret string + ProxyID string + LoginURL string + AllowedDomains []string + Timeout time.Duration + OpenLiveView bool + Output string +} + +// AgentAuthRunResult is the result of a successful auth run. +type AgentAuthRunResult struct { + ProfileName string `json:"profile_name"` + ProfileID string `json:"profile_id"` + Domain string `json:"domain"` + AuthAgentID string `json:"auth_agent_id"` +} + +// AgentAuthRunEvent represents a status update during the auth run (for JSON output). +type AgentAuthRunEvent struct { + Type string `json:"type"` // status, error, success, waiting + Step string `json:"step,omitempty"` + Status string `json:"status,omitempty"` + Message string `json:"message,omitempty"` + LiveViewURL string `json:"live_view_url,omitempty"` +} + +// AgentAuthRunCmd handles the automated auth run flow. +type AgentAuthRunCmd struct { + auth AgentAuthService + invocations AgentAuthInvocationsService + profiles ProfilesService + credentials CredentialsService +} + +func (c AgentAuthCmd) Create(ctx context.Context, in AgentAuthCreateInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.Domain == "" { + return fmt.Errorf("--domain is required") + } + if in.ProfileName == "" { + return fmt.Errorf("--profile-name is required") + } + + params := kernel.AgentAuthNewParams{ + AuthAgentCreateRequest: kernel.AuthAgentCreateRequestParam{ + Domain: in.Domain, + ProfileName: in.ProfileName, + }, + } + if in.CredentialName != "" { + params.AuthAgentCreateRequest.CredentialName = kernel.Opt(in.CredentialName) + } + if in.LoginURL != "" { + params.AuthAgentCreateRequest.LoginURL = kernel.Opt(in.LoginURL) + } + if len(in.AllowedDomains) > 0 { + params.AuthAgentCreateRequest.AllowedDomains = in.AllowedDomains + } + if in.ProxyID != "" { + 
params.AuthAgentCreateRequest.Proxy = kernel.AuthAgentCreateRequestProxyParam{ + ProxyID: kernel.Opt(in.ProxyID), + } + } + + if in.Output != "json" { + pterm.Info.Printf("Creating auth agent for %s...\n", in.Domain) + } + + agent, err := c.auth.New(ctx, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(agent) + } + + pterm.Success.Printf("Created auth agent: %s\n", agent.ID) + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"ID", agent.ID}, + {"Domain", agent.Domain}, + {"Profile Name", agent.ProfileName}, + {"Status", string(agent.Status)}, + {"Can Reauth", fmt.Sprintf("%t", agent.CanReauth)}, + } + if agent.CredentialName != "" { + tableData = append(tableData, []string{"Credential Name", agent.CredentialName}) + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) Get(ctx context.Context, in AgentAuthGetInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + agent, err := c.auth.Get(ctx, in.ID) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(agent) + } + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"ID", agent.ID}, + {"Domain", agent.Domain}, + {"Profile Name", agent.ProfileName}, + {"Status", string(agent.Status)}, + {"Can Reauth", fmt.Sprintf("%t", agent.CanReauth)}, + {"Has Selectors", fmt.Sprintf("%t", agent.HasSelectors)}, + } + if agent.CredentialID != "" { + tableData = append(tableData, []string{"Credential ID", agent.CredentialID}) + } + if agent.CredentialName != "" { + tableData = append(tableData, []string{"Credential Name", agent.CredentialName}) + } + if agent.PostLoginURL != "" { + tableData = append(tableData, []string{"Post-Login URL", agent.PostLoginURL}) + } + if !agent.LastAuthCheckAt.IsZero() { + tableData = append(tableData, []string{"Last Auth Check", 
util.FormatLocal(agent.LastAuthCheckAt)}) + } + if len(agent.AllowedDomains) > 0 { + tableData = append(tableData, []string{"Allowed Domains", strings.Join(agent.AllowedDomains, ", ")}) + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) List(ctx context.Context, in AgentAuthListInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + params := kernel.AgentAuthListParams{} + if in.Domain != "" { + params.Domain = kernel.Opt(in.Domain) + } + if in.ProfileName != "" { + params.ProfileName = kernel.Opt(in.ProfileName) + } + if in.Limit > 0 { + params.Limit = kernel.Opt(int64(in.Limit)) + } + if in.Offset > 0 { + params.Offset = kernel.Opt(int64(in.Offset)) + } + + page, err := c.auth.List(ctx, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + var agents []kernel.AuthAgent + if page != nil { + agents = page.Items + } + + if in.Output == "json" { + if len(agents) == 0 { + fmt.Println("[]") + return nil + } + return util.PrintPrettyJSONSlice(agents) + } + + if len(agents) == 0 { + pterm.Info.Println("No auth agents found") + return nil + } + + tableData := pterm.TableData{{"ID", "Domain", "Profile Name", "Status", "Can Reauth"}} + for _, agent := range agents { + tableData = append(tableData, []string{ + agent.ID, + agent.Domain, + agent.ProfileName, + string(agent.Status), + fmt.Sprintf("%t", agent.CanReauth), + }) + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) Delete(ctx context.Context, in AgentAuthDeleteInput) error { + if !in.SkipConfirm { + msg := fmt.Sprintf("Are you sure you want to delete auth agent '%s'?", in.ID) + pterm.DefaultInteractiveConfirm.DefaultText = msg + ok, _ := pterm.DefaultInteractiveConfirm.Show() + if !ok { + pterm.Info.Println("Deletion cancelled") + return nil + } + } + + if err := c.auth.Delete(ctx, in.ID); err != nil { + if util.IsNotFound(err) { + pterm.Info.Printf("Auth agent 
'%s' not found\n", in.ID) + return nil + } + return util.CleanedUpSdkError{Err: err} + } + pterm.Success.Printf("Deleted auth agent: %s\n", in.ID) + return nil +} + +func (c AgentAuthCmd) InvocationCreate(ctx context.Context, in AgentAuthInvocationCreateInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.AuthAgentID == "" { + return fmt.Errorf("--auth-agent-id is required") + } + + params := kernel.AgentAuthInvocationNewParams{ + AuthAgentInvocationCreateRequest: kernel.AuthAgentInvocationCreateRequestParam{ + AuthAgentID: in.AuthAgentID, + }, + } + if in.SaveCredentialAs != "" { + params.AuthAgentInvocationCreateRequest.SaveCredentialAs = kernel.Opt(in.SaveCredentialAs) + } + + if in.Output != "json" { + pterm.Info.Println("Creating auth invocation...") + } + + resp, err := c.invocations.New(ctx, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(resp) + } + + pterm.Success.Printf("Created invocation: %s\n", resp.InvocationID) + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"Invocation ID", resp.InvocationID}, + {"Type", string(resp.Type)}, + {"Handoff Code", resp.HandoffCode}, + {"Hosted URL", resp.HostedURL}, + {"Expires At", util.FormatLocal(resp.ExpiresAt)}, + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) InvocationGet(ctx context.Context, in AgentAuthInvocationGetInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + resp, err := c.invocations.Get(ctx, in.InvocationID) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(resp) + } + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"App Name", resp.AppName}, + {"Domain", resp.Domain}, + {"Type", string(resp.Type)}, + {"Status", 
string(resp.Status)}, + {"Step", string(resp.Step)}, + {"Expires At", util.FormatLocal(resp.ExpiresAt)}, + } + if resp.LiveViewURL != "" { + tableData = append(tableData, []string{"Live View URL", resp.LiveViewURL}) + } + if resp.ErrorMessage != "" { + tableData = append(tableData, []string{"Error Message", resp.ErrorMessage}) + } + if resp.ExternalActionMessage != "" { + tableData = append(tableData, []string{"External Action", resp.ExternalActionMessage}) + } + if len(resp.PendingFields) > 0 { + var fields []string + for _, f := range resp.PendingFields { + fields = append(fields, f.Name) + } + tableData = append(tableData, []string{"Pending Fields", strings.Join(fields, ", ")}) + } + if len(resp.SubmittedFields) > 0 { + tableData = append(tableData, []string{"Submitted Fields", strings.Join(resp.SubmittedFields, ", ")}) + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) InvocationExchange(ctx context.Context, in AgentAuthInvocationExchangeInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.Code == "" { + return fmt.Errorf("--code is required") + } + + params := kernel.AgentAuthInvocationExchangeParams{ + Code: in.Code, + } + + resp, err := c.invocations.Exchange(ctx, in.InvocationID, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(resp) + } + + pterm.Success.Printf("Exchanged code for JWT\n") + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"Invocation ID", resp.InvocationID}, + {"JWT", resp.Jwt}, + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c AgentAuthCmd) InvocationSubmit(ctx context.Context, in AgentAuthInvocationSubmitInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + // Validate that exactly one of the submit types is provided + hasFields := 
len(in.FieldValues) > 0 + hasSSO := in.SSOButton != "" + hasMFA := in.SelectedMfaType != "" + + count := 0 + if hasFields { + count++ + } + if hasSSO { + count++ + } + if hasMFA { + count++ + } + + if count == 0 { + return fmt.Errorf("must provide one of: --field (field values), --sso-button, or --mfa-type") + } + if count > 1 { + return fmt.Errorf("can only provide one of: --field (field values), --sso-button, or --mfa-type") + } + + var params kernel.AgentAuthInvocationSubmitParams + if hasFields { + params.OfFieldValues = &kernel.AgentAuthInvocationSubmitParamsBodyFieldValues{ + FieldValues: in.FieldValues, + } + } else if hasSSO { + params.OfSSOButton = &kernel.AgentAuthInvocationSubmitParamsBodySSOButton{ + SSOButton: in.SSOButton, + } + } else if hasMFA { + params.OfSelectedMfaType = &kernel.AgentAuthInvocationSubmitParamsBodySelectedMfaType{ + SelectedMfaType: in.SelectedMfaType, + } + } + + if in.Output != "json" { + pterm.Info.Println("Submitting to invocation...") + } + + resp, err := c.invocations.Submit(ctx, in.InvocationID, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(resp) + } + + if resp.Accepted { + pterm.Success.Println("Submission accepted") + } else { + pterm.Warning.Println("Submission not accepted") + } + return nil +} + +const ( + totpPeriod = 30 // TOTP codes are valid for 30-second windows + minSecondsRemaining = 5 // Minimum seconds remaining before we wait for next window +) + +// generateTOTPCode generates a TOTP code from a base32 secret. +// Waits for a fresh window if needed to ensure enough time to submit the code. +// If quiet is true, suppresses human-readable console output (for JSON mode). 
+func generateTOTPCode(secret string, quiet bool) (string, error) { + // Check if we have enough time in the current window + now := time.Now().Unix() + secondsIntoWindow := now % totpPeriod + remaining := totpPeriod - secondsIntoWindow + + if remaining < minSecondsRemaining { + waitTime := remaining + 1 // Wait until just after the new window starts + if !quiet { + pterm.Info.Printf("TOTP window has only %ds remaining, waiting %ds for fresh window...\n", remaining, waitTime) + } + time.Sleep(time.Duration(waitTime) * time.Second) + } + + // Clean the secret (remove spaces that may be added for readability) + cleanSecret := strings.ReplaceAll(strings.ToUpper(secret), " ", "") + + code, err := totp.GenerateCode(cleanSecret, time.Now()) + if err != nil { + return "", fmt.Errorf("failed to generate TOTP code: %w", err) + } + return code, nil +} + +// Run executes the full automated auth flow: create profile, credential, auth agent, and run invocation to completion. +func (c AgentAuthRunCmd) Run(ctx context.Context, in AgentAuthRunInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.Domain == "" { + return fmt.Errorf("--domain is required") + } + if in.ProfileName == "" { + return fmt.Errorf("--profile is required") + } + + // Validate that we have credentials to work with + if in.CredentialName == "" && len(in.Values) == 0 { + return fmt.Errorf("must provide either --credential or --value flags with credentials") + } + + jsonOutput := in.Output == "json" + emitEvent := func(event AgentAuthRunEvent) { + if jsonOutput { + data, _ := json.Marshal(event) + fmt.Println(string(data)) + } + } + + // Step 1: Find or create the profile + if !jsonOutput { + pterm.Info.Printf("Looking for profile '%s'...\n", in.ProfileName) + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: "Looking for profile"}) + + var profileID string + profile, err := c.profiles.Get(ctx, in.ProfileName) + if err != 
nil { + if !util.IsNotFound(err) { + return util.CleanedUpSdkError{Err: err} + } + // Profile not found, create it + if !jsonOutput { + pterm.Info.Printf("Creating profile '%s'...\n", in.ProfileName) + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: "Creating profile"}) + + newProfile, err := c.profiles.New(ctx, kernel.ProfileNewParams{ + Name: kernel.Opt(in.ProfileName), + }) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + profileID = newProfile.ID + if !jsonOutput { + pterm.Success.Printf("Created profile: %s\n", newProfile.ID) + } + } else { + profileID = profile.ID + if !jsonOutput { + pterm.Success.Printf("Found existing profile: %s\n", profile.ID) + } + } + + // Step 2: Handle credentials + var credentialName string + if in.CredentialName != "" { + // Using existing credential + credentialName = in.CredentialName + if !jsonOutput { + pterm.Info.Printf("Using existing credential '%s'\n", credentialName) + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: "Using existing credential"}) + } else if in.SaveCredentialAs != "" { + // Create new credential with provided values + credentialName = in.SaveCredentialAs + if !jsonOutput { + pterm.Info.Printf("Creating credential '%s'...\n", credentialName) + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: "Creating credential"}) + + params := kernel.CredentialNewParams{ + CreateCredentialRequest: kernel.CreateCredentialRequestParam{ + Name: credentialName, + Domain: in.Domain, + Values: in.Values, + }, + } + if in.TotpSecret != "" { + params.CreateCredentialRequest.TotpSecret = kernel.Opt(in.TotpSecret) + } + + _, err := c.credentials.New(ctx, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + if !jsonOutput { + pterm.Success.Printf("Created credential: %s\n", credentialName) + } + } + + // Step 3: Create auth agent + if !jsonOutput { + pterm.Info.Printf("Creating auth agent for %s...\n", in.Domain) + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: 
"Creating auth agent"}) + + agentParams := kernel.AgentAuthNewParams{ + AuthAgentCreateRequest: kernel.AuthAgentCreateRequestParam{ + Domain: in.Domain, + ProfileName: in.ProfileName, + }, + } + if credentialName != "" { + agentParams.AuthAgentCreateRequest.CredentialName = kernel.Opt(credentialName) + } + if in.LoginURL != "" { + agentParams.AuthAgentCreateRequest.LoginURL = kernel.Opt(in.LoginURL) + } + if len(in.AllowedDomains) > 0 { + agentParams.AuthAgentCreateRequest.AllowedDomains = in.AllowedDomains + } + if in.ProxyID != "" { + agentParams.AuthAgentCreateRequest.Proxy = kernel.AuthAgentCreateRequestProxyParam{ + ProxyID: kernel.Opt(in.ProxyID), + } + } + + agent, err := c.auth.New(ctx, agentParams) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + if !jsonOutput { + pterm.Success.Printf("Created auth agent: %s\n", agent.ID) + } + + // Step 4: Create invocation + if !jsonOutput { + pterm.Info.Println("Starting authentication flow...") + } + emitEvent(AgentAuthRunEvent{Type: "status", Message: "Starting authentication"}) + + invocationParams := kernel.AgentAuthInvocationNewParams{ + AuthAgentInvocationCreateRequest: kernel.AuthAgentInvocationCreateRequestParam{ + AuthAgentID: agent.ID, + }, + } + if in.SaveCredentialAs != "" && credentialName == "" { + // Save credential during invocation if we have values but didn't create upfront + invocationParams.AuthAgentInvocationCreateRequest.SaveCredentialAs = kernel.Opt(in.SaveCredentialAs) + } + + invocation, err := c.invocations.New(ctx, invocationParams) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + // Step 5: Polling loop + deadline := time.Now().Add(in.Timeout) + pollInterval := 2 * time.Second + var lastStep string + liveViewShown := false + fieldsSubmitted := make(map[string]bool) + + if !jsonOutput { + pterm.Info.Println("Waiting for authentication to complete...") + } + + for { + if time.Now().After(deadline) { + emitEvent(AgentAuthRunEvent{Type: "error", Message: 
"Timeout waiting for authentication"}) + return fmt.Errorf("timeout waiting for authentication to complete") + } + + resp, err := c.invocations.Get(ctx, invocation.InvocationID) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + // Emit status update if step changed + if string(resp.Step) != lastStep { + lastStep = string(resp.Step) + emitEvent(AgentAuthRunEvent{ + Type: "status", + Step: lastStep, + Status: string(resp.Status), + LiveViewURL: resp.LiveViewURL, + }) + if !jsonOutput { + pterm.Info.Printf("Step: %s (Status: %s)\n", resp.Step, resp.Status) + } + } + + // Check terminal states + switch resp.Status { + case kernel.AgentAuthInvocationResponseStatusSuccess: + if !jsonOutput { + pterm.Success.Println("Authentication successful!") + pterm.Success.Printf("Profile '%s' is now authenticated for %s\n", in.ProfileName, in.Domain) + } + result := AgentAuthRunResult{ + ProfileName: in.ProfileName, + ProfileID: profileID, + Domain: in.Domain, + AuthAgentID: agent.ID, + } + if jsonOutput { + emitEvent(AgentAuthRunEvent{Type: "success", Message: "Authentication successful"}) + data, err := json.MarshalIndent(result, "", " ") + if err != nil { + return err + } + fmt.Println(string(data)) + return nil + } + return nil + + case kernel.AgentAuthInvocationResponseStatusFailed: + errMsg := "Authentication failed" + if resp.ErrorMessage != "" { + errMsg = resp.ErrorMessage + } + emitEvent(AgentAuthRunEvent{Type: "error", Message: errMsg}) + return fmt.Errorf("authentication failed: %s", errMsg) + + case kernel.AgentAuthInvocationResponseStatusExpired: + emitEvent(AgentAuthRunEvent{Type: "error", Message: "Authentication session expired"}) + return fmt.Errorf("authentication session expired") + + case kernel.AgentAuthInvocationResponseStatusCanceled: + emitEvent(AgentAuthRunEvent{Type: "error", Message: "Authentication was canceled"}) + return fmt.Errorf("authentication was canceled") + } + + // Handle awaiting_input step + if resp.Step == 
kernel.AgentAuthInvocationResponseStepAwaitingInput { + // Check for pending fields + if len(resp.PendingFields) > 0 { + // Build field values to submit + submitValues := make(map[string]string) + missingFields := []string{} + + for _, field := range resp.PendingFields { + fieldName := field.Name + // Check if we already submitted this field + if fieldsSubmitted[fieldName] { + continue + } + + // Try to find a matching value + if val, ok := in.Values[fieldName]; ok { + submitValues[fieldName] = val + } else { + // Check common field name aliases + matched := false + aliases := map[string][]string{ + "identifier": {"username", "email", "login"}, + "username": {"identifier", "email", "login"}, + "email": {"identifier", "username", "login"}, + "password": {"pass", "passwd"}, + } + if alts, ok := aliases[fieldName]; ok { + for _, alt := range alts { + if val, ok := in.Values[alt]; ok { + submitValues[fieldName] = val + matched = true + break + } + } + } + + // Check if this looks like a TOTP/verification code field + if !matched && in.TotpSecret != "" { + fieldLower := strings.ToLower(fieldName) + totpPatterns := []string{"totp", "code", "verification", "otp", "2fa", "mfa", "authenticator", "token"} + for _, pattern := range totpPatterns { + if strings.Contains(fieldLower, pattern) { + code, err := generateTOTPCode(in.TotpSecret, jsonOutput) + if err == nil { + submitValues[fieldName] = code + matched = true + if !jsonOutput { + pterm.Info.Printf("Generated TOTP code for field: %s\n", fieldName) + } + } + break + } + } + } + + if !matched { + missingFields = append(missingFields, fieldName) + } + } + } + + // Submit if we have values + if len(submitValues) > 0 { + if !jsonOutput { + var fieldNames []string + for k := range submitValues { + fieldNames = append(fieldNames, k) + } + pterm.Info.Printf("Submitting fields: %s\n", strings.Join(fieldNames, ", ")) + } + + submitParams := kernel.AgentAuthInvocationSubmitParams{ + OfFieldValues: 
&kernel.AgentAuthInvocationSubmitParamsBodyFieldValues{ + FieldValues: submitValues, + }, + } + _, err := c.invocations.Submit(ctx, invocation.InvocationID, submitParams) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + // Mark fields as submitted + for k := range submitValues { + fieldsSubmitted[k] = true + } + } + + // Show live view if we have missing fields + if len(missingFields) > 0 && !liveViewShown && resp.LiveViewURL != "" { + liveViewShown = true + emitEvent(AgentAuthRunEvent{ + Type: "waiting", + Message: fmt.Sprintf("Need human input for: %s", strings.Join(missingFields, ", ")), + LiveViewURL: resp.LiveViewURL, + }) + if !jsonOutput { + pterm.Warning.Printf("Missing values for fields: %s\n", strings.Join(missingFields, ", ")) + pterm.Info.Printf("Live view: %s\n", resp.LiveViewURL) + } + if in.OpenLiveView { + _ = browser.OpenURL(resp.LiveViewURL) + } + } + } + + // Check for MFA options + if len(resp.MfaOptions) > 0 { + // Check if TOTP is available and we have a secret + hasTOTP := false + for _, opt := range resp.MfaOptions { + if opt.Type == "totp" { + hasTOTP = true + break + } + } + + if hasTOTP && in.TotpSecret != "" { + // Generate and submit TOTP code + code, err := generateTOTPCode(in.TotpSecret, jsonOutput) + if err != nil { + return err + } + + if !jsonOutput { + pterm.Info.Println("Submitting TOTP code...") + } + + submitParams := kernel.AgentAuthInvocationSubmitParams{ + OfFieldValues: &kernel.AgentAuthInvocationSubmitParamsBodyFieldValues{ + FieldValues: map[string]string{"totp": code}, + }, + } + _, err = c.invocations.Submit(ctx, invocation.InvocationID, submitParams) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + } else if !liveViewShown && resp.LiveViewURL != "" { + // Need human for MFA + liveViewShown = true + var optTypes []string + for _, opt := range resp.MfaOptions { + optTypes = append(optTypes, opt.Type) + } + emitEvent(AgentAuthRunEvent{ + Type: "waiting", + Message: fmt.Sprintf("MFA 
required: %s", strings.Join(optTypes, ", ")), + LiveViewURL: resp.LiveViewURL, + }) + if !jsonOutput { + pterm.Warning.Printf("MFA required. Options: %s\n", strings.Join(optTypes, ", ")) + pterm.Info.Printf("Complete MFA at: %s\n", resp.LiveViewURL) + } + if in.OpenLiveView { + _ = browser.OpenURL(resp.LiveViewURL) + } + } + } + } + + // Handle awaiting_external_action step + if resp.Step == kernel.AgentAuthInvocationResponseStepAwaitingExternalAction && !liveViewShown { + liveViewShown = true + msg := "External action required" + if resp.ExternalActionMessage != "" { + msg = resp.ExternalActionMessage + } + emitEvent(AgentAuthRunEvent{ + Type: "waiting", + Message: msg, + LiveViewURL: resp.LiveViewURL, + }) + if !jsonOutput { + pterm.Warning.Printf("%s\n", msg) + if resp.LiveViewURL != "" { + pterm.Info.Printf("Live view: %s\n", resp.LiveViewURL) + } + } + if in.OpenLiveView && resp.LiveViewURL != "" { + _ = browser.OpenURL(resp.LiveViewURL) + } + } + + // Wait before next poll + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(pollInterval): + } + } +} + +// --- Cobra wiring --- + +var agentsCmd = &cobra.Command{ + Use: "agents", + Short: "Manage agents", + Long: "Commands for managing Kernel agents (auth, etc.)", +} + +var agentsAuthCmd = &cobra.Command{ + Use: "auth", + Short: "Manage auth agents", + Long: "Commands for managing authentication agents that handle login flows", +} + +var agentsAuthCreateCmd = &cobra.Command{ + Use: "create", + Short: "Create an auth agent", + Long: "Create or find an auth agent for a specific domain and profile combination", + Args: cobra.NoArgs, + RunE: runAgentsAuthCreate, +} + +var agentsAuthGetCmd = &cobra.Command{ + Use: "get ", + Short: "Get an auth agent by ID", + Args: cobra.ExactArgs(1), + RunE: runAgentsAuthGet, +} + +var agentsAuthListCmd = &cobra.Command{ + Use: "list", + Short: "List auth agents", + Args: cobra.NoArgs, + RunE: runAgentsAuthList, +} + +var agentsAuthDeleteCmd = &cobra.Command{ + 
Use: "delete ", + Short: "Delete an auth agent", + Args: cobra.ExactArgs(1), + RunE: runAgentsAuthDelete, +} + +var agentsAuthInvocationsCmd = &cobra.Command{ + Use: "invocations", + Short: "Manage auth invocations", + Long: "Commands for managing authentication invocations (login flows)", +} + +var agentsAuthInvocationsCreateCmd = &cobra.Command{ + Use: "create", + Short: "Create an auth invocation", + Long: "Start a new authentication flow for an auth agent", + Args: cobra.NoArgs, + RunE: runAgentsAuthInvocationsCreate, +} + +var agentsAuthInvocationsGetCmd = &cobra.Command{ + Use: "get ", + Short: "Get an auth invocation", + Args: cobra.ExactArgs(1), + RunE: runAgentsAuthInvocationsGet, +} + +var agentsAuthInvocationsExchangeCmd = &cobra.Command{ + Use: "exchange ", + Short: "Exchange a handoff code for a JWT", + Args: cobra.ExactArgs(1), + RunE: runAgentsAuthInvocationsExchange, +} + +var agentsAuthInvocationsSubmitCmd = &cobra.Command{ + Use: "submit ", + Short: "Submit field values to an invocation", + Long: `Submit field values, SSO button click, or MFA selection to an auth invocation. + +Examples: + # Submit field values + kernel agents auth invocations submit --field username=myuser --field password=mypass + + # Click an SSO button + kernel agents auth invocations submit --sso-button "//button[@id='google-sso']" + + # Select an MFA method + kernel agents auth invocations submit --mfa-type sms`, + Args: cobra.ExactArgs(1), + RunE: runAgentsAuthInvocationsSubmit, +} + +var agentsAuthRunCmd = &cobra.Command{ + Use: "run", + Short: "Run a complete auth flow", + Long: `Run a complete authentication flow for a domain, automatically handling credential submission and polling. + +This command orchestrates the entire agent auth process: +1. Creates or finds a profile with the given name +2. Creates a credential if --save-credential-as is specified +3. Creates an auth agent linking domain, profile, and credential +4. 
Starts an invocation and polls until completion +5. Auto-submits credentials when prompted +6. Auto-submits TOTP codes if --totp-secret is provided +7. Shows live view URL when human intervention is needed + +Examples: + # Basic auth with inline credentials + kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass + + # With TOTP for automatic 2FA + kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass \ + --totp-secret JBSWY3DPEHPK3PXP + + # Save credentials for future re-auth + kernel agents auth run --domain github.com --profile my-github \ + --value username=myuser --value password=mypass \ + --save-credential-as github-creds + + # Re-use existing saved credential + kernel agents auth run --domain github.com --profile my-github \ + --credential github-creds + + # Auto-open browser for human intervention + kernel agents auth run --domain github.com --profile my-github \ + --credential github-creds --open`, + Args: cobra.NoArgs, + RunE: runAgentsAuthRun, +} + +func init() { + // Auth create flags + agentsAuthCreateCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + agentsAuthCreateCmd.Flags().String("domain", "", "Target domain for authentication (required)") + agentsAuthCreateCmd.Flags().String("profile-name", "", "Name of the profile to use (required)") + agentsAuthCreateCmd.Flags().String("credential-name", "", "Optional credential name to link for auto-fill") + agentsAuthCreateCmd.Flags().String("login-url", "", "Optional login page URL") + agentsAuthCreateCmd.Flags().StringSlice("allowed-domain", []string{}, "Additional allowed domains (repeatable)") + agentsAuthCreateCmd.Flags().String("proxy-id", "", "Optional proxy ID to use") + _ = agentsAuthCreateCmd.MarkFlagRequired("domain") + _ = agentsAuthCreateCmd.MarkFlagRequired("profile-name") + + // Auth get flags + agentsAuthGetCmd.Flags().StringP("output", 
"o", "", "Output format: json for raw API response") + + // Auth list flags + agentsAuthListCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + agentsAuthListCmd.Flags().String("domain", "", "Filter by domain") + agentsAuthListCmd.Flags().String("profile-name", "", "Filter by profile name") + agentsAuthListCmd.Flags().Int("limit", 0, "Maximum number of results to return") + agentsAuthListCmd.Flags().Int("offset", 0, "Number of results to skip") + + // Auth delete flags + agentsAuthDeleteCmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt") + + // Invocations create flags + agentsAuthInvocationsCreateCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + agentsAuthInvocationsCreateCmd.Flags().String("auth-agent-id", "", "ID of the auth agent (required)") + agentsAuthInvocationsCreateCmd.Flags().String("save-credential-as", "", "Save credentials under this name on success") + _ = agentsAuthInvocationsCreateCmd.MarkFlagRequired("auth-agent-id") + + // Invocations get flags + agentsAuthInvocationsGetCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + + // Invocations exchange flags + agentsAuthInvocationsExchangeCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + agentsAuthInvocationsExchangeCmd.Flags().String("code", "", "Handoff code from the start endpoint (required)") + _ = agentsAuthInvocationsExchangeCmd.MarkFlagRequired("code") + + // Invocations submit flags + agentsAuthInvocationsSubmitCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + agentsAuthInvocationsSubmitCmd.Flags().StringArray("field", []string{}, "Field name=value pair (repeatable)") + agentsAuthInvocationsSubmitCmd.Flags().String("sso-button", "", "Selector of SSO button to click") + agentsAuthInvocationsSubmitCmd.Flags().String("mfa-type", "", "MFA type to select (sms, call, email, totp, push, security_key)") + + // Auth run 
flags + agentsAuthRunCmd.Flags().StringP("output", "o", "", "Output format: json for JSONL events") + agentsAuthRunCmd.Flags().String("domain", "", "Target domain for authentication (required)") + agentsAuthRunCmd.Flags().String("profile", "", "Profile name to use/create (required)") + agentsAuthRunCmd.Flags().StringArray("value", []string{}, "Field name=value pair (e.g., --value username=foo --value password=bar)") + agentsAuthRunCmd.Flags().String("credential", "", "Existing credential name to use") + agentsAuthRunCmd.Flags().String("save-credential-as", "", "Save provided credentials under this name") + agentsAuthRunCmd.Flags().String("totp-secret", "", "Base32 TOTP secret for automatic 2FA") + agentsAuthRunCmd.Flags().String("proxy-id", "", "Proxy ID to use") + agentsAuthRunCmd.Flags().String("login-url", "", "Custom login page URL") + agentsAuthRunCmd.Flags().StringSlice("allowed-domain", []string{}, "Additional allowed domains") + agentsAuthRunCmd.Flags().Duration("timeout", 5*time.Minute, "Maximum time to wait for auth completion") + agentsAuthRunCmd.Flags().Bool("open", false, "Open live view URL in browser when human intervention needed") + _ = agentsAuthRunCmd.MarkFlagRequired("domain") + _ = agentsAuthRunCmd.MarkFlagRequired("profile") + + // Wire up commands + agentsAuthInvocationsCmd.AddCommand(agentsAuthInvocationsCreateCmd) + agentsAuthInvocationsCmd.AddCommand(agentsAuthInvocationsGetCmd) + agentsAuthInvocationsCmd.AddCommand(agentsAuthInvocationsExchangeCmd) + agentsAuthInvocationsCmd.AddCommand(agentsAuthInvocationsSubmitCmd) + + agentsAuthCmd.AddCommand(agentsAuthCreateCmd) + agentsAuthCmd.AddCommand(agentsAuthGetCmd) + agentsAuthCmd.AddCommand(agentsAuthListCmd) + agentsAuthCmd.AddCommand(agentsAuthDeleteCmd) + agentsAuthCmd.AddCommand(agentsAuthInvocationsCmd) + agentsAuthCmd.AddCommand(agentsAuthRunCmd) + + agentsCmd.AddCommand(agentsAuthCmd) +} + +func runAgentsAuthCreate(cmd *cobra.Command, args []string) error { + client := 
getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + domain, _ := cmd.Flags().GetString("domain") + profileName, _ := cmd.Flags().GetString("profile-name") + credentialName, _ := cmd.Flags().GetString("credential-name") + loginURL, _ := cmd.Flags().GetString("login-url") + allowedDomains, _ := cmd.Flags().GetStringSlice("allowed-domain") + proxyID, _ := cmd.Flags().GetString("proxy-id") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.Create(cmd.Context(), AgentAuthCreateInput{ + Domain: domain, + ProfileName: profileName, + CredentialName: credentialName, + LoginURL: loginURL, + AllowedDomains: allowedDomains, + ProxyID: proxyID, + Output: output, + }) +} + +func runAgentsAuthGet(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.Get(cmd.Context(), AgentAuthGetInput{ + ID: args[0], + Output: output, + }) +} + +func runAgentsAuthList(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + domain, _ := cmd.Flags().GetString("domain") + profileName, _ := cmd.Flags().GetString("profile-name") + limit, _ := cmd.Flags().GetInt("limit") + offset, _ := cmd.Flags().GetInt("offset") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.List(cmd.Context(), AgentAuthListInput{ + Domain: domain, + ProfileName: profileName, + Limit: limit, + Offset: offset, + Output: output, + }) +} + +func runAgentsAuthDelete(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + skip, _ := cmd.Flags().GetBool("yes") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.Delete(cmd.Context(), AgentAuthDeleteInput{ + ID: args[0], + SkipConfirm: skip, + }) +} + 
+func runAgentsAuthInvocationsCreate(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + authAgentID, _ := cmd.Flags().GetString("auth-agent-id") + saveCredentialAs, _ := cmd.Flags().GetString("save-credential-as") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.InvocationCreate(cmd.Context(), AgentAuthInvocationCreateInput{ + AuthAgentID: authAgentID, + SaveCredentialAs: saveCredentialAs, + Output: output, + }) +} + +func runAgentsAuthInvocationsGet(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.InvocationGet(cmd.Context(), AgentAuthInvocationGetInput{ + InvocationID: args[0], + Output: output, + }) +} + +func runAgentsAuthInvocationsExchange(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + code, _ := cmd.Flags().GetString("code") + + svc := client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.InvocationExchange(cmd.Context(), AgentAuthInvocationExchangeInput{ + InvocationID: args[0], + Code: code, + Output: output, + }) +} + +func runAgentsAuthInvocationsSubmit(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + fieldPairs, _ := cmd.Flags().GetStringArray("field") + ssoButton, _ := cmd.Flags().GetString("sso-button") + mfaType, _ := cmd.Flags().GetString("mfa-type") + + // Parse field pairs into map + fieldValues := make(map[string]string) + for _, pair := range fieldPairs { + parts := strings.SplitN(pair, "=", 2) + if len(parts) != 2 { + return fmt.Errorf("invalid field format: %s (expected key=value)", pair) + } + fieldValues[parts[0]] = parts[1] + } + + svc := 
client.Agents.Auth + c := AgentAuthCmd{auth: &svc, invocations: &svc.Invocations} + return c.InvocationSubmit(cmd.Context(), AgentAuthInvocationSubmitInput{ + InvocationID: args[0], + FieldValues: fieldValues, + SSOButton: ssoButton, + SelectedMfaType: mfaType, + Output: output, + }) +} + +func runAgentsAuthRun(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + + output, _ := cmd.Flags().GetString("output") + domain, _ := cmd.Flags().GetString("domain") + profileName, _ := cmd.Flags().GetString("profile") + valuePairs, _ := cmd.Flags().GetStringArray("value") + credentialName, _ := cmd.Flags().GetString("credential") + saveCredentialAs, _ := cmd.Flags().GetString("save-credential-as") + totpSecret, _ := cmd.Flags().GetString("totp-secret") + proxyID, _ := cmd.Flags().GetString("proxy-id") + loginURL, _ := cmd.Flags().GetString("login-url") + allowedDomains, _ := cmd.Flags().GetStringSlice("allowed-domain") + timeout, _ := cmd.Flags().GetDuration("timeout") + openLiveView, _ := cmd.Flags().GetBool("open") + + // Parse value pairs into map + values := make(map[string]string) + for _, pair := range valuePairs { + parts := strings.SplitN(pair, "=", 2) + if len(parts) != 2 { + return fmt.Errorf("invalid value format: %s (expected key=value)", pair) + } + values[parts[0]] = parts[1] + } + + authSvc := client.Agents.Auth + profilesSvc := client.Profiles + credentialsSvc := client.Credentials + + c := AgentAuthRunCmd{ + auth: &authSvc, + invocations: &authSvc.Invocations, + profiles: &profilesSvc, + credentials: &credentialsSvc, + } + + return c.Run(cmd.Context(), AgentAuthRunInput{ + Domain: domain, + ProfileName: profileName, + Values: values, + CredentialName: credentialName, + SaveCredentialAs: saveCredentialAs, + TotpSecret: totpSecret, + ProxyID: proxyID, + LoginURL: loginURL, + AllowedDomains: allowedDomains, + Timeout: timeout, + OpenLiveView: openLiveView, + Output: output, + }) +} diff --git a/cmd/browsers.go b/cmd/browsers.go index 
9297ef7..3f3291a 100644 --- a/cmd/browsers.go +++ b/cmd/browsers.go @@ -29,7 +29,7 @@ import ( // BrowsersService defines the subset of the Kernel SDK browser client that we use. // See https://github.com/kernel/kernel-go-sdk/blob/main/browser.go type BrowsersService interface { - Get(ctx context.Context, id string, opts ...option.RequestOption) (res *kernel.BrowserGetResponse, err error) + Get(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (res *kernel.BrowserGetResponse, err error) List(ctx context.Context, query kernel.BrowserListParams, opts ...option.RequestOption) (res *pagination.OffsetPagination[kernel.BrowserListResponse], err error) New(ctx context.Context, body kernel.BrowserNewParams, opts ...option.RequestOption) (res *kernel.BrowserNewResponse, err error) Update(ctx context.Context, id string, body kernel.BrowserUpdateParams, opts ...option.RequestOption) (res *kernel.BrowserUpdateResponse, err error) @@ -66,12 +66,20 @@ type BrowserFSService interface { type BrowserProcessService interface { Exec(ctx context.Context, id string, body kernel.BrowserProcessExecParams, opts ...option.RequestOption) (res *kernel.BrowserProcessExecResponse, err error) Kill(ctx context.Context, processID string, params kernel.BrowserProcessKillParams, opts ...option.RequestOption) (res *kernel.BrowserProcessKillResponse, err error) + Resize(ctx context.Context, processID string, params kernel.BrowserProcessResizeParams, opts ...option.RequestOption) (res *kernel.BrowserProcessResizeResponse, err error) Spawn(ctx context.Context, id string, body kernel.BrowserProcessSpawnParams, opts ...option.RequestOption) (res *kernel.BrowserProcessSpawnResponse, err error) Status(ctx context.Context, processID string, query kernel.BrowserProcessStatusParams, opts ...option.RequestOption) (res *kernel.BrowserProcessStatusResponse, err error) Stdin(ctx context.Context, processID string, params kernel.BrowserProcessStdinParams, opts 
...option.RequestOption) (res *kernel.BrowserProcessStdinResponse, err error) StdoutStreamStreaming(ctx context.Context, processID string, query kernel.BrowserProcessStdoutStreamParams, opts ...option.RequestOption) (stream *ssestream.Stream[kernel.BrowserProcessStdoutStreamResponse]) } +// BrowserFWatchService defines the subset we use for browser filesystem watch APIs. +type BrowserFWatchService interface { + EventsStreaming(ctx context.Context, watchID string, query kernel.BrowserFWatchEventsParams, opts ...option.RequestOption) (stream *ssestream.Stream[kernel.BrowserFWatchEventsResponse]) + Start(ctx context.Context, id string, body kernel.BrowserFWatchStartParams, opts ...option.RequestOption) (res *kernel.BrowserFWatchStartResponse, err error) + Stop(ctx context.Context, watchID string, body kernel.BrowserFWatchStopParams, opts ...option.RequestOption) (err error) +} + // BrowserLogService defines the subset we use for browser log APIs. type BrowserLogService interface { StreamStreaming(ctx context.Context, id string, query kernel.BrowserLogStreamParams, opts ...option.RequestOption) (stream *ssestream.Stream[shared.LogEvent]) @@ -100,6 +108,12 @@ type BoolFlag struct { Value bool } +// Int64Flag captures whether an int64 flag was set explicitly and its value. +type Int64Flag struct { + Set bool + Value int64 +} + // Regular expression to validate CUID2 identifiers (24 lowercase alphanumeric characters). 
var cuidRegex = regexp.MustCompile(`^[a-z0-9]{24}$`) @@ -193,6 +207,7 @@ type BrowsersCmd struct { browsers BrowsersService replays BrowserReplaysService fs BrowserFSService + fsWatch BrowserFWatchService process BrowserProcessService logs BrowserLogService computer BrowserComputerService @@ -437,7 +452,7 @@ func (b BrowsersCmd) View(ctx context.Context, in BrowsersViewInput) error { return fmt.Errorf("unsupported --output value: use 'json'") } - browser, err := b.browsers.Get(ctx, in.Identifier) + browser, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -471,7 +486,7 @@ func (b BrowsersCmd) Get(ctx context.Context, in BrowsersGetInput) error { return fmt.Errorf("unsupported --output value: use 'json'") } - browser, err := b.browsers.Get(ctx, in.Identifier) + browser, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -574,7 +589,7 @@ func (b BrowsersCmd) LogsStream(ctx context.Context, in BrowsersLogsStreamInput) pterm.Error.Println("logs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -676,7 +691,7 @@ func (b BrowsersCmd) ComputerClickMouse(ctx context.Context, in BrowsersComputer pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -705,7 +720,7 @@ func (b BrowsersCmd) ComputerMoveMouse(ctx context.Context, in BrowsersComputerM pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return 
util.CleanedUpSdkError{Err: err} } @@ -725,7 +740,7 @@ func (b BrowsersCmd) ComputerScreenshot(ctx context.Context, in BrowsersComputer pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -761,7 +776,7 @@ func (b BrowsersCmd) ComputerTypeText(ctx context.Context, in BrowsersComputerTy pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -781,7 +796,7 @@ func (b BrowsersCmd) ComputerPressKey(ctx context.Context, in BrowsersComputerPr pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -808,7 +823,7 @@ func (b BrowsersCmd) ComputerScroll(ctx context.Context, in BrowsersComputerScro pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -834,7 +849,7 @@ func (b BrowsersCmd) ComputerDragMouse(ctx context.Context, in BrowsersComputerD pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -870,7 +885,7 @@ func (b BrowsersCmd) ComputerSetCursor(ctx context.Context, in BrowsersComputerS pterm.Error.Println("computer service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + 
br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -916,7 +931,7 @@ func (b BrowsersCmd) ReplaysList(ctx context.Context, in BrowsersReplaysListInpu return fmt.Errorf("unsupported --output value: use 'json'") } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -950,7 +965,7 @@ func (b BrowsersCmd) ReplaysStart(ctx context.Context, in BrowsersReplaysStartIn return fmt.Errorf("unsupported --output value: use 'json'") } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -976,7 +991,7 @@ func (b BrowsersCmd) ReplaysStart(ctx context.Context, in BrowsersReplaysStartIn } func (b BrowsersCmd) ReplaysStop(ctx context.Context, in BrowsersReplaysStopInput) error { - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -989,7 +1004,7 @@ func (b BrowsersCmd) ReplaysStop(ctx context.Context, in BrowsersReplaysStopInpu } func (b BrowsersCmd) ReplaysDownload(ctx context.Context, in BrowsersReplaysDownloadInput) error { - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1062,6 +1077,31 @@ type BrowsersProcessStdoutStreamInput struct { ProcessID string } +type BrowsersProcessResizeInput struct { + Identifier string + ProcessID string + Cols int64 + Rows int64 +} + +// FS Watch +type BrowsersFSWatchStartInput struct { + Identifier string + Path string + Recursive BoolFlag + Output string +} + +type BrowsersFSWatchStopInput struct { + Identifier string + WatchID string +} + +type 
BrowsersFSWatchEventsInput struct { + Identifier string + WatchID string +} + // Playwright type BrowsersPlaywrightExecuteInput struct { Identifier string @@ -1074,7 +1114,7 @@ func (b BrowsersCmd) PlaywrightExecute(ctx context.Context, in BrowsersPlaywrigh pterm.Error.Println("playwright service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1120,7 +1160,7 @@ func (b BrowsersCmd) ProcessExec(ctx context.Context, in BrowsersProcessExecInpu pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1187,7 +1227,7 @@ func (b BrowsersCmd) ProcessSpawn(ctx context.Context, in BrowsersProcessSpawnIn pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1226,7 +1266,7 @@ func (b BrowsersCmd) ProcessKill(ctx context.Context, in BrowsersProcessKillInpu pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1244,7 +1284,7 @@ func (b BrowsersCmd) ProcessStatus(ctx context.Context, in BrowsersProcessStatus pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1262,7 +1302,7 @@ func (b BrowsersCmd) ProcessStdin(ctx context.Context, in 
BrowsersProcessStdinIn pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1279,7 +1319,7 @@ func (b BrowsersCmd) ProcessStdoutStream(ctx context.Context, in BrowsersProcess pterm.Error.Println("process service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1308,6 +1348,97 @@ func (b BrowsersCmd) ProcessStdoutStream(ctx context.Context, in BrowsersProcess return nil } +func (b BrowsersCmd) ProcessResize(ctx context.Context, in BrowsersProcessResizeInput) error { + if b.process == nil { + pterm.Error.Println("process service not available") + return nil + } + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + params := kernel.BrowserProcessResizeParams{ID: br.SessionID, Cols: in.Cols, Rows: in.Rows} + _, err = b.process.Resize(ctx, in.ProcessID, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + pterm.Success.Printf("Resized process %s PTY to %dx%d\n", in.ProcessID, in.Cols, in.Rows) + return nil +} + +// FS Watch +func (b BrowsersCmd) FSWatchStart(ctx context.Context, in BrowsersFSWatchStartInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if b.fsWatch == nil { + pterm.Error.Println("fs watch service not available") + return nil + } + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + params := kernel.BrowserFWatchStartParams{Path: in.Path} + if in.Recursive.Set { + params.Recursive = kernel.Opt(in.Recursive.Value) + } + res, err 
:= b.fsWatch.Start(ctx, br.SessionID, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(res) + } + + pterm.Success.Printf("Started watch on %s with ID: %s\n", in.Path, res.WatchID) + return nil +} + +func (b BrowsersCmd) FSWatchStop(ctx context.Context, in BrowsersFSWatchStopInput) error { + if b.fsWatch == nil { + pterm.Error.Println("fs watch service not available") + return nil + } + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + err = b.fsWatch.Stop(ctx, in.WatchID, kernel.BrowserFWatchStopParams{ID: br.SessionID}) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + pterm.Success.Printf("Stopped watch %s\n", in.WatchID) + return nil +} + +func (b BrowsersCmd) FSWatchEvents(ctx context.Context, in BrowsersFSWatchEventsInput) error { + if b.fsWatch == nil { + pterm.Error.Println("fs watch service not available") + return nil + } + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + stream := b.fsWatch.EventsStreaming(ctx, in.WatchID, kernel.BrowserFWatchEventsParams{ID: br.SessionID}) + if stream == nil { + pterm.Error.Println("failed to open watch events stream") + return nil + } + defer stream.Close() + for stream.Next() { + ev := stream.Current() + pterm.Printf("[%s] %s: %s\n", ev.Type, ev.Name, ev.Path) + } + if err := stream.Err(); err != nil { + return util.CleanedUpSdkError{Err: err} + } + return nil +} + // FS (minimal scaffolding) type BrowsersFSNewDirInput struct { Identifier string @@ -1397,7 +1528,7 @@ func (b BrowsersCmd) FSNewDirectory(ctx context.Context, in BrowsersFSNewDirInpu pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { 
return util.CleanedUpSdkError{Err: err} } @@ -1417,7 +1548,7 @@ func (b BrowsersCmd) FSDeleteDirectory(ctx context.Context, in BrowsersFSDeleteD pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1433,7 +1564,7 @@ func (b BrowsersCmd) FSDeleteFile(ctx context.Context, in BrowsersFSDeleteFileIn pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1449,7 +1580,7 @@ func (b BrowsersCmd) FSDownloadDirZip(ctx context.Context, in BrowsersFSDownload pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1486,7 +1617,7 @@ func (b BrowsersCmd) FSFileInfo(ctx context.Context, in BrowsersFSFileInfoInput) pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1513,7 +1644,7 @@ func (b BrowsersCmd) FSListFiles(ctx context.Context, in BrowsersFSListFilesInpu pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1547,7 +1678,7 @@ func (b BrowsersCmd) FSMove(ctx context.Context, in BrowsersFSMoveInput) error { pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := 
b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1563,7 +1694,7 @@ func (b BrowsersCmd) FSReadFile(ctx context.Context, in BrowsersFSReadFileInput) pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1595,7 +1726,7 @@ func (b BrowsersCmd) FSSetPermissions(ctx context.Context, in BrowsersFSSetPerms pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1618,7 +1749,7 @@ func (b BrowsersCmd) FSUpload(ctx context.Context, in BrowsersFSUploadInput) err pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1676,7 +1807,7 @@ func (b BrowsersCmd) FSUploadZip(ctx context.Context, in BrowsersFSUploadZipInpu pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1698,7 +1829,7 @@ func (b BrowsersCmd) FSWriteFile(ctx context.Context, in BrowsersFSWriteFileInpu pterm.Error.Println("fs service not available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1731,7 +1862,7 @@ func (b BrowsersCmd) ExtensionsUpload(ctx context.Context, in BrowsersExtensions pterm.Error.Println("browsers service not 
available") return nil } - br, err := b.browsers.Get(ctx, in.Identifier) + br, err := b.browsers.Get(ctx, in.Identifier, kernel.BrowserGetParams{}) if err != nil { return util.CleanedUpSdkError{Err: err} } @@ -1947,7 +2078,12 @@ func init() { procStdin.Flags().String("data-b64", "", "Base64-encoded data to write to stdin") _ = procStdin.MarkFlagRequired("data-b64") procStdoutStream := &cobra.Command{Use: "stdout-stream ", Short: "Stream process stdout/stderr", Args: cobra.ExactArgs(2), RunE: runBrowsersProcessStdoutStream} - procRoot.AddCommand(procExec, procSpawn, procKill, procStatus, procStdin, procStdoutStream) + procResize := &cobra.Command{Use: "resize ", Short: "Resize a PTY-backed process terminal", Args: cobra.ExactArgs(2), RunE: runBrowsersProcessResize} + procResize.Flags().Int64("cols", 0, "New terminal columns (required)") + procResize.Flags().Int64("rows", 0, "New terminal rows (required)") + _ = procResize.MarkFlagRequired("cols") + _ = procResize.MarkFlagRequired("rows") + procRoot.AddCommand(procExec, procSpawn, procKill, procStatus, procStdin, procStdoutStream, procResize) browsersCmd.AddCommand(procRoot) // fs @@ -2012,7 +2148,18 @@ func init() { fsWriteFile.Flags().String("source", "", "Local source file path") _ = fsWriteFile.MarkFlagRequired("source") - fsRoot.AddCommand(fsNewDir, fsDelDir, fsDelFile, fsDownloadZip, fsFileInfo, fsListFiles, fsMove, fsReadFile, fsSetPerms, fsUpload, fsUploadZip, fsWriteFile) + // fs watch + fsWatchRoot := &cobra.Command{Use: "watch", Short: "Watch directories for changes"} + fsWatchStart := &cobra.Command{Use: "start ", Short: "Start watching a directory", Args: cobra.ExactArgs(1), RunE: runBrowsersFSWatchStart} + fsWatchStart.Flags().String("path", "", "Directory to watch (required)") + _ = fsWatchStart.MarkFlagRequired("path") + fsWatchStart.Flags().Bool("recursive", false, "Watch recursively") + fsWatchStart.Flags().StringP("output", "o", "", "Output format: json for raw API response") + fsWatchStop := 
&cobra.Command{Use: "stop ", Short: "Stop watching a directory", Args: cobra.ExactArgs(2), RunE: runBrowsersFSWatchStop} + fsWatchEvents := &cobra.Command{Use: "events ", Short: "Stream filesystem events", Args: cobra.ExactArgs(2), RunE: runBrowsersFSWatchEvents} + fsWatchRoot.AddCommand(fsWatchStart, fsWatchStop, fsWatchEvents) + + fsRoot.AddCommand(fsNewDir, fsDelDir, fsDelFile, fsDownloadZip, fsFileInfo, fsListFiles, fsMove, fsReadFile, fsSetPerms, fsUpload, fsUploadZip, fsWriteFile, fsWatchRoot) browsersCmd.AddCommand(fsRoot) // extensions @@ -2112,7 +2259,6 @@ func init() { browsersCreateCmd.Flags().String("pool-id", "", "Browser pool ID to acquire from (mutually exclusive with --pool-name)") browsersCreateCmd.Flags().String("pool-name", "", "Browser pool name to acquire from (mutually exclusive with --pool-id)") - // no flags for view; it takes a single positional argument } @@ -2441,6 +2587,44 @@ func runBrowsersProcessStdoutStream(cmd *cobra.Command, args []string) error { return b.ProcessStdoutStream(cmd.Context(), BrowsersProcessStdoutStreamInput{Identifier: args[0], ProcessID: args[1]}) } +func runBrowsersProcessResize(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + svc := client.Browsers + cols, _ := cmd.Flags().GetInt64("cols") + rows, _ := cmd.Flags().GetInt64("rows") + b := BrowsersCmd{browsers: &svc, process: &svc.Process} + return b.ProcessResize(cmd.Context(), BrowsersProcessResizeInput{Identifier: args[0], ProcessID: args[1], Cols: cols, Rows: rows}) +} + +func runBrowsersFSWatchStart(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + svc := client.Browsers + path, _ := cmd.Flags().GetString("path") + recursive, _ := cmd.Flags().GetBool("recursive") + output, _ := cmd.Flags().GetString("output") + b := BrowsersCmd{browsers: &svc, fsWatch: &svc.Fs.Watch} + return b.FSWatchStart(cmd.Context(), BrowsersFSWatchStartInput{ + Identifier: args[0], + Path: path, + Recursive: BoolFlag{Set: 
cmd.Flags().Changed("recursive"), Value: recursive}, + Output: output, + }) +} + +func runBrowsersFSWatchStop(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + svc := client.Browsers + b := BrowsersCmd{browsers: &svc, fsWatch: &svc.Fs.Watch} + return b.FSWatchStop(cmd.Context(), BrowsersFSWatchStopInput{Identifier: args[0], WatchID: args[1]}) +} + +func runBrowsersFSWatchEvents(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + svc := client.Browsers + b := BrowsersCmd{browsers: &svc, fsWatch: &svc.Fs.Watch} + return b.FSWatchEvents(cmd.Context(), BrowsersFSWatchEventsInput{Identifier: args[0], WatchID: args[1]}) +} + func runBrowsersPlaywrightExecute(cmd *cobra.Command, args []string) error { client := getKernelClient(cmd) svc := client.Browsers diff --git a/cmd/browsers_test.go b/cmd/browsers_test.go index 2e9a6e8..447b6bd 100644 --- a/cmd/browsers_test.go +++ b/cmd/browsers_test.go @@ -54,7 +54,7 @@ func setupStdoutCapture(t *testing.T) { // FakeBrowsersService is a configurable fake implementing BrowsersService. 
type FakeBrowsersService struct { - GetFunc func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) + GetFunc func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) ListFunc func(ctx context.Context, query kernel.BrowserListParams, opts ...option.RequestOption) (*pagination.OffsetPagination[kernel.BrowserListResponse], error) NewFunc func(ctx context.Context, body kernel.BrowserNewParams, opts ...option.RequestOption) (*kernel.BrowserNewResponse, error) UpdateFunc func(ctx context.Context, id string, body kernel.BrowserUpdateParams, opts ...option.RequestOption) (*kernel.BrowserUpdateResponse, error) @@ -63,9 +63,9 @@ type FakeBrowsersService struct { LoadExtensionsFunc func(ctx context.Context, id string, body kernel.BrowserLoadExtensionsParams, opts ...option.RequestOption) error } -func (f *FakeBrowsersService) Get(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { +func (f *FakeBrowsersService) Get(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { if f.GetFunc != nil { - return f.GetFunc(ctx, id, opts...) + return f.GetFunc(ctx, id, query, opts...) 
} return nil, errors.New("not found") } @@ -292,7 +292,7 @@ func TestBrowsersView_ByID_PrintsURL(t *testing.T) { }) fake := &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return &kernel.BrowserGetResponse{ SessionID: "abc", BrowserLiveViewURL: "http://live-url", @@ -325,7 +325,7 @@ func TestBrowsersView_HeadlessBrowser_ShowsWarning(t *testing.T) { setupStdoutCapture(t) fake := &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return &kernel.BrowserGetResponse{ SessionID: "abc", Headless: true, @@ -344,7 +344,7 @@ func TestBrowsersView_PrintsErrorOnGetFailure(t *testing.T) { setupStdoutCapture(t) fake := &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return nil, errors.New("get error") }, } @@ -360,7 +360,7 @@ func TestBrowsersGet_PrintsDetails(t *testing.T) { created := time.Date(2025, 1, 2, 3, 4, 5, 0, time.UTC) fake := &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return &kernel.BrowserGetResponse{ SessionID: "sess-123", CdpWsURL: "ws://cdp-url", @@ -404,7 +404,7 @@ func TestBrowsersGet_JSONOutput(t *testing.T) { }) fake := 
&FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { // Unmarshal JSON to populate RawJSON() properly jsonData := `{"session_id": "sess-json", "cdp_ws_url": "ws://cdp", "created_at": "2024-01-01T00:00:00Z", "headless": false, "stealth": false, "timeout_seconds": 60}` var resp kernel.BrowserGetResponse @@ -442,7 +442,7 @@ func TestBrowsersGet_Error(t *testing.T) { setupStdoutCapture(t) fake := &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return nil, errors.New("get failed") }, } @@ -580,6 +580,7 @@ func (f *FakeFSService) WriteFile(ctx context.Context, id string, contents io.Re type FakeProcessService struct { ExecFunc func(ctx context.Context, id string, body kernel.BrowserProcessExecParams, opts ...option.RequestOption) (*kernel.BrowserProcessExecResponse, error) KillFunc func(ctx context.Context, processID string, params kernel.BrowserProcessKillParams, opts ...option.RequestOption) (*kernel.BrowserProcessKillResponse, error) + ResizeFunc func(ctx context.Context, processID string, params kernel.BrowserProcessResizeParams, opts ...option.RequestOption) (*kernel.BrowserProcessResizeResponse, error) SpawnFunc func(ctx context.Context, id string, body kernel.BrowserProcessSpawnParams, opts ...option.RequestOption) (*kernel.BrowserProcessSpawnResponse, error) StatusFunc func(ctx context.Context, processID string, query kernel.BrowserProcessStatusParams, opts ...option.RequestOption) (*kernel.BrowserProcessStatusResponse, error) StdinFunc func(ctx context.Context, processID string, params 
kernel.BrowserProcessStdinParams, opts ...option.RequestOption) (*kernel.BrowserProcessStdinResponse, error) @@ -598,6 +599,12 @@ func (f *FakeProcessService) Kill(ctx context.Context, processID string, params } return &kernel.BrowserProcessKillResponse{Ok: true}, nil } +func (f *FakeProcessService) Resize(ctx context.Context, processID string, params kernel.BrowserProcessResizeParams, opts ...option.RequestOption) (*kernel.BrowserProcessResizeResponse, error) { + if f.ResizeFunc != nil { + return f.ResizeFunc(ctx, processID, params, opts...) + } + return &kernel.BrowserProcessResizeResponse{Ok: true}, nil +} func (f *FakeProcessService) Spawn(ctx context.Context, id string, body kernel.BrowserProcessSpawnParams, opts ...option.RequestOption) (*kernel.BrowserProcessSpawnResponse, error) { if f.SpawnFunc != nil { return f.SpawnFunc(ctx, id, body, opts...) @@ -730,7 +737,7 @@ func (f *FakeComputerService) SetCursorVisibility(ctx context.Context, id string // newFakeBrowsersServiceWithSimpleGet returns a FakeBrowsersService with a GetFunc that returns a browser with SessionID "id". 
func newFakeBrowsersServiceWithSimpleGet() *FakeBrowsersService { return &FakeBrowsersService{ - GetFunc: func(ctx context.Context, id string, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { + GetFunc: func(ctx context.Context, id string, query kernel.BrowserGetParams, opts ...option.RequestOption) (*kernel.BrowserGetResponse, error) { return &kernel.BrowserGetResponse{SessionID: "id"}, nil }, } diff --git a/cmd/credential_providers.go b/cmd/credential_providers.go new file mode 100644 index 0000000..1da25f0 --- /dev/null +++ b/cmd/credential_providers.go @@ -0,0 +1,466 @@ +package cmd + +import ( + "context" + "fmt" + "strings" + + "github.com/kernel/cli/pkg/util" + "github.com/kernel/kernel-go-sdk" + "github.com/kernel/kernel-go-sdk/option" + "github.com/pterm/pterm" + "github.com/spf13/cobra" +) + +// CredentialProvidersService defines the subset of the Kernel SDK credential provider client that we use. +type CredentialProvidersService interface { + New(ctx context.Context, body kernel.CredentialProviderNewParams, opts ...option.RequestOption) (res *kernel.CredentialProvider, err error) + Get(ctx context.Context, id string, opts ...option.RequestOption) (res *kernel.CredentialProvider, err error) + Update(ctx context.Context, id string, body kernel.CredentialProviderUpdateParams, opts ...option.RequestOption) (res *kernel.CredentialProvider, err error) + List(ctx context.Context, opts ...option.RequestOption) (res *[]kernel.CredentialProvider, err error) + Delete(ctx context.Context, id string, opts ...option.RequestOption) (err error) + Test(ctx context.Context, id string, opts ...option.RequestOption) (res *kernel.CredentialProviderTestResult, err error) +} + +// CredentialProvidersCmd handles credential provider operations independent of cobra. 
+type CredentialProvidersCmd struct { + providers CredentialProvidersService +} + +type CredentialProvidersListInput struct { + Output string +} + +type CredentialProvidersGetInput struct { + ID string + Output string +} + +type CredentialProvidersCreateInput struct { + ProviderType string + Token string + CacheTtlSeconds int64 + Output string +} + +type CredentialProvidersUpdateInput struct { + ID string + Token string + CacheTtlSeconds int64 + Enabled BoolFlag + Priority Int64Flag + Output string +} + +type CredentialProvidersDeleteInput struct { + ID string + SkipConfirm bool +} + +type CredentialProvidersTestInput struct { + ID string + Output string +} + +func (c CredentialProvidersCmd) List(ctx context.Context, in CredentialProvidersListInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + providers, err := c.providers.List(ctx) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + if providers == nil || len(*providers) == 0 { + fmt.Println("[]") + return nil + } + return util.PrintPrettyJSONSlice(*providers) + } + + if providers == nil || len(*providers) == 0 { + pterm.Info.Println("No credential providers found") + return nil + } + + tableData := pterm.TableData{{"ID", "Provider Type", "Enabled", "Priority", "Created At"}} + for _, p := range *providers { + tableData = append(tableData, []string{ + p.ID, + string(p.ProviderType), + fmt.Sprintf("%t", p.Enabled), + fmt.Sprintf("%d", p.Priority), + util.FormatLocal(p.CreatedAt), + }) + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c CredentialProvidersCmd) Get(ctx context.Context, in CredentialProvidersGetInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + provider, err := c.providers.Get(ctx, in.ID) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + 
return util.PrintPrettyJSON(provider) + } + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"ID", provider.ID}, + {"Provider Type", string(provider.ProviderType)}, + {"Enabled", fmt.Sprintf("%t", provider.Enabled)}, + {"Priority", fmt.Sprintf("%d", provider.Priority)}, + {"Created At", util.FormatLocal(provider.CreatedAt)}, + {"Updated At", util.FormatLocal(provider.UpdatedAt)}, + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c CredentialProvidersCmd) Create(ctx context.Context, in CredentialProvidersCreateInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.ProviderType == "" { + return fmt.Errorf("--provider-type is required") + } + if in.Token == "" { + return fmt.Errorf("--token is required") + } + + // Validate provider type + providerType := strings.ToLower(in.ProviderType) + if providerType != "onepassword" { + return fmt.Errorf("invalid provider type: %s (must be 'onepassword')", in.ProviderType) + } + + params := kernel.CredentialProviderNewParams{ + CreateCredentialProviderRequest: kernel.CreateCredentialProviderRequestParam{ + Token: in.Token, + ProviderType: kernel.CreateCredentialProviderRequestProviderTypeOnepassword, + }, + } + if in.CacheTtlSeconds > 0 { + params.CreateCredentialProviderRequest.CacheTtlSeconds = kernel.Opt(in.CacheTtlSeconds) + } + + if in.Output != "json" { + pterm.Info.Printf("Creating credential provider (%s)...\n", providerType) + } + + provider, err := c.providers.New(ctx, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(provider) + } + + pterm.Success.Printf("Created credential provider: %s\n", provider.ID) + + tableData := pterm.TableData{ + {"Property", "Value"}, + {"ID", provider.ID}, + {"Provider Type", string(provider.ProviderType)}, + {"Enabled", fmt.Sprintf("%t", provider.Enabled)}, + {"Priority", fmt.Sprintf("%d", 
provider.Priority)}, + } + + PrintTableNoPad(tableData, true) + return nil +} + +func (c CredentialProvidersCmd) Update(ctx context.Context, in CredentialProvidersUpdateInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + params := kernel.CredentialProviderUpdateParams{ + UpdateCredentialProviderRequest: kernel.UpdateCredentialProviderRequestParam{}, + } + if in.Token != "" { + params.UpdateCredentialProviderRequest.Token = kernel.Opt(in.Token) + } + if in.CacheTtlSeconds > 0 { + params.UpdateCredentialProviderRequest.CacheTtlSeconds = kernel.Opt(in.CacheTtlSeconds) + } + if in.Enabled.Set { + params.UpdateCredentialProviderRequest.Enabled = kernel.Opt(in.Enabled.Value) + } + if in.Priority.Set { + params.UpdateCredentialProviderRequest.Priority = kernel.Opt(in.Priority.Value) + } + + if in.Output != "json" { + pterm.Info.Printf("Updating credential provider '%s'...\n", in.ID) + } + + provider, err := c.providers.Update(ctx, in.ID, params) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(provider) + } + + pterm.Success.Printf("Updated credential provider: %s\n", provider.ID) + return nil +} + +func (c CredentialProvidersCmd) Delete(ctx context.Context, in CredentialProvidersDeleteInput) error { + if !in.SkipConfirm { + msg := fmt.Sprintf("Are you sure you want to delete credential provider '%s'?", in.ID) + pterm.DefaultInteractiveConfirm.DefaultText = msg + ok, _ := pterm.DefaultInteractiveConfirm.Show() + if !ok { + pterm.Info.Println("Deletion cancelled") + return nil + } + } + + if err := c.providers.Delete(ctx, in.ID); err != nil { + if util.IsNotFound(err) { + pterm.Info.Printf("Credential provider '%s' not found\n", in.ID) + return nil + } + return util.CleanedUpSdkError{Err: err} + } + pterm.Success.Printf("Deleted credential provider: %s\n", in.ID) + return nil +} + +func (c CredentialProvidersCmd) Test(ctx 
context.Context, in CredentialProvidersTestInput) error { + if in.Output != "" && in.Output != "json" { + return fmt.Errorf("unsupported --output value: use 'json'") + } + + if in.Output != "json" { + pterm.Info.Printf("Testing credential provider '%s'...\n", in.ID) + } + + result, err := c.providers.Test(ctx, in.ID) + if err != nil { + return util.CleanedUpSdkError{Err: err} + } + + if in.Output == "json" { + return util.PrintPrettyJSON(result) + } + + if result.Success { + pterm.Success.Println("Connection test successful") + } else { + pterm.Error.Printf("Connection test failed: %s\n", result.Error) + } + + if len(result.Vaults) > 0 { + pterm.Info.Println("Accessible vaults:") + tableData := pterm.TableData{{"Vault ID", "Vault Name"}} + for _, v := range result.Vaults { + tableData = append(tableData, []string{v.ID, v.Name}) + } + PrintTableNoPad(tableData, true) + } else { + pterm.Info.Println("No vaults accessible") + } + + return nil +} + +// --- Cobra wiring --- + +var credentialProvidersCmd = &cobra.Command{ + Use: "credential-providers", + Aliases: []string{"credential-provider", "cred-providers", "cred-provider"}, + Short: "Manage external credential providers", + Long: "Commands for managing external credential providers (e.g., 1Password) for automatic credential lookup", +} + +var credentialProvidersListCmd = &cobra.Command{ + Use: "list", + Short: "List credential providers", + Args: cobra.NoArgs, + RunE: runCredentialProvidersList, +} + +var credentialProvidersGetCmd = &cobra.Command{ + Use: "get <id>", + Short: "Get a credential provider by ID", + Args: cobra.ExactArgs(1), + RunE: runCredentialProvidersGet, +} + +var credentialProvidersCreateCmd = &cobra.Command{ + Use: "create", + Short: "Create a new credential provider", + Long: `Create a new external credential provider for automatic credential lookup. 
+ +Currently supported provider types: + - onepassword: 1Password service account integration + +Examples: + # Create a 1Password credential provider + kernel credential-providers create --provider-type onepassword --token "ops_xxx..." + + # Create with custom cache TTL + kernel credential-providers create --provider-type onepassword --token "ops_xxx..." --cache-ttl 600`, + Args: cobra.NoArgs, + RunE: runCredentialProvidersCreate, +} + +var credentialProvidersUpdateCmd = &cobra.Command{ + Use: "update <id>", + Short: "Update a credential provider", + Long: `Update a credential provider's configuration (token, cache TTL, enabled status, or priority).`, + Args: cobra.ExactArgs(1), + RunE: runCredentialProvidersUpdate, +} + +var credentialProvidersDeleteCmd = &cobra.Command{ + Use: "delete <id>", + Short: "Delete a credential provider", + Args: cobra.ExactArgs(1), + RunE: runCredentialProvidersDelete, +} + +var credentialProvidersTestCmd = &cobra.Command{ + Use: "test <id>", + Short: "Test a credential provider connection", + Long: `Validate the credential provider's token and list accessible vaults.`, + Args: cobra.ExactArgs(1), + RunE: runCredentialProvidersTest, +} + +func init() { + credentialProvidersCmd.AddCommand(credentialProvidersListCmd) + credentialProvidersCmd.AddCommand(credentialProvidersGetCmd) + credentialProvidersCmd.AddCommand(credentialProvidersCreateCmd) + credentialProvidersCmd.AddCommand(credentialProvidersUpdateCmd) + credentialProvidersCmd.AddCommand(credentialProvidersDeleteCmd) + credentialProvidersCmd.AddCommand(credentialProvidersTestCmd) + + // List flags + credentialProvidersListCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + + // Get flags + credentialProvidersGetCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + + // Create flags + credentialProvidersCreateCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + 
credentialProvidersCreateCmd.Flags().String("provider-type", "", "Provider type (e.g., onepassword)") + credentialProvidersCreateCmd.Flags().String("token", "", "Service account token for the provider") + credentialProvidersCreateCmd.Flags().Int64("cache-ttl", 0, "How long to cache credential lists in seconds (default 300)") + _ = credentialProvidersCreateCmd.MarkFlagRequired("provider-type") + _ = credentialProvidersCreateCmd.MarkFlagRequired("token") + + // Update flags + credentialProvidersUpdateCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") + credentialProvidersUpdateCmd.Flags().String("token", "", "New service account token (to rotate credentials)") + credentialProvidersUpdateCmd.Flags().Int64("cache-ttl", 0, "How long to cache credential lists in seconds") + credentialProvidersUpdateCmd.Flags().Bool("enabled", true, "Whether the provider is enabled for credential lookups") + credentialProvidersUpdateCmd.Flags().Int64("priority", 0, "Priority order for credential lookups (lower numbers are checked first)") + + // Delete flags + credentialProvidersDeleteCmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt") + + // Test flags + credentialProvidersTestCmd.Flags().StringP("output", "o", "", "Output format: json for raw API response") +} + +func runCredentialProvidersList(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.List(cmd.Context(), CredentialProvidersListInput{ + Output: output, + }) +} + +func runCredentialProvidersGet(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.Get(cmd.Context(), CredentialProvidersGetInput{ + ID: args[0], + Output: output, + }) +} + +func 
runCredentialProvidersCreate(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + providerType, _ := cmd.Flags().GetString("provider-type") + token, _ := cmd.Flags().GetString("token") + cacheTtl, _ := cmd.Flags().GetInt64("cache-ttl") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.Create(cmd.Context(), CredentialProvidersCreateInput{ + ProviderType: providerType, + Token: token, + CacheTtlSeconds: cacheTtl, + Output: output, + }) +} + +func runCredentialProvidersUpdate(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + token, _ := cmd.Flags().GetString("token") + cacheTtl, _ := cmd.Flags().GetInt64("cache-ttl") + enabled, _ := cmd.Flags().GetBool("enabled") + priority, _ := cmd.Flags().GetInt64("priority") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.Update(cmd.Context(), CredentialProvidersUpdateInput{ + ID: args[0], + Token: token, + CacheTtlSeconds: cacheTtl, + Enabled: BoolFlag{Set: cmd.Flags().Changed("enabled"), Value: enabled}, + Priority: Int64Flag{Set: cmd.Flags().Changed("priority"), Value: priority}, + Output: output, + }) +} + +func runCredentialProvidersDelete(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + skip, _ := cmd.Flags().GetBool("yes") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.Delete(cmd.Context(), CredentialProvidersDeleteInput{ + ID: args[0], + SkipConfirm: skip, + }) +} + +func runCredentialProvidersTest(cmd *cobra.Command, args []string) error { + client := getKernelClient(cmd) + output, _ := cmd.Flags().GetString("output") + + svc := client.CredentialProviders + c := CredentialProvidersCmd{providers: &svc} + return c.Test(cmd.Context(), CredentialProvidersTestInput{ + ID: args[0], + Output: output, + }) +} diff 
--git a/cmd/root.go b/cmd/root.go index 318a14d..de4b5bb 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -140,6 +140,8 @@ func init() { rootCmd.AddCommand(proxies.ProxiesCmd) rootCmd.AddCommand(extensionsCmd) rootCmd.AddCommand(credentialsCmd) + rootCmd.AddCommand(credentialProvidersCmd) + rootCmd.AddCommand(agentsCmd) rootCmd.AddCommand(createCmd) rootCmd.AddCommand(mcp.MCPCmd) rootCmd.AddCommand(upgradeCmd) diff --git a/go.mod b/go.mod index ff373fe..343f7db 100644 --- a/go.mod +++ b/go.mod @@ -9,8 +9,9 @@ require ( github.com/charmbracelet/lipgloss/v2 v2.0.0-beta.1 github.com/golang-jwt/jwt/v5 v5.2.2 github.com/joho/godotenv v1.5.1 - github.com/kernel/kernel-go-sdk v0.26.0 + github.com/kernel/kernel-go-sdk v0.27.1-0.20260121054822-cee2050be3f8 github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c + github.com/pquerna/otp v1.5.0 github.com/pterm/pterm v0.12.80 github.com/samber/lo v1.51.0 github.com/spf13/cobra v1.9.1 @@ -25,6 +26,7 @@ require ( atomicgo.dev/cursor v0.2.0 // indirect atomicgo.dev/keyboard v0.2.9 // indirect atomicgo.dev/schedule v0.1.0 // indirect + github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc // indirect github.com/charmbracelet/colorprofile v0.3.0 // indirect github.com/charmbracelet/x/ansi v0.8.0 // indirect github.com/charmbracelet/x/cellbuf v0.0.13 // indirect diff --git a/go.sum b/go.sum index 02041b3..f760efe 100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,8 @@ github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lpr github.com/atomicgo/cursor v0.0.1/go.mod h1:cBON2QmmrysudxNBFthvMtN32r3jxVRIvzkUiF/RuIk= github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= +github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc h1:biVzkmvwrH8WK8raXaxBx6fRVTlJILwEwQGL1I/ByEI= +github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc/go.mod 
h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boyter/gocodewalker v1.4.0 h1:fVmFeQxKpj5tlpjPcyTtJ96btgaHYd9yn6m+T/66et4= github.com/boyter/gocodewalker v1.4.0/go.mod h1:hXG8xzR1uURS+99P5/3xh3uWHjaV2XfoMMmvPyhrCDg= github.com/charmbracelet/colorprofile v0.3.0 h1:KtLh9uuu1RCt+Hml4s6Hz+kB1PfV3wi++1h5ia65yKQ= @@ -64,8 +66,8 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/kernel/kernel-go-sdk v0.26.0 h1:IBiEohSSZN5MEZjmnfqseT3tEip6+xg7Zxr79vJYMBA= -github.com/kernel/kernel-go-sdk v0.26.0/go.mod h1:EeZzSuHZVeHKxKCPUzxou2bovNGhXaz0RXrSqKNf1AQ= +github.com/kernel/kernel-go-sdk v0.27.1-0.20260121054822-cee2050be3f8 h1:D44gjEjkLww0lwnhNNJgaNLNVwkEgtkyt5w66epvE/Y= +github.com/kernel/kernel-go-sdk v0.27.1-0.20260121054822-cee2050be3f8/go.mod h1:EeZzSuHZVeHKxKCPUzxou2bovNGhXaz0RXrSqKNf1AQ= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.10/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= @@ -97,6 +99,8 @@ github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmd github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pquerna/otp v1.5.0 h1:NMMR+WrmaqXU4EzdGJEE1aUUI0AMRzsp96fFFWNPwxs= +github.com/pquerna/otp v1.5.0/go.mod h1:dkJfzwRKNiegxyNb54X/3fLwhCynbMspSyWKnvi1AEg= github.com/pterm/pterm v0.12.27/go.mod 
h1:PhQ89w4i95rhgE+xedAoqous6K9X+r6aSOI2eFF7DZI= github.com/pterm/pterm v0.12.29/go.mod h1:WI3qxgvoQFFGKGjGnJR849gU0TsEOvKn5Q8LlY1U7lg= github.com/pterm/pterm v0.12.30/go.mod h1:MOqLIyMOgmTDz9yorcYbcw+HsgoZo3BQfg2wtl3HEFE= @@ -121,6 +125,7 @@ github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= From 1277f3eba2f5a553d062f26467c08333790c2dea Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 21 Jan 2026 20:09:10 -0500 Subject: [PATCH 12/12] Handle cursor bugbot comments. Update viewport width in sampling loop and remove unused dependencies from Python and TypeScript templates. 
--- pkg/templates/python/yutori-computer-use/loop.py | 2 +- pkg/templates/python/yutori-computer-use/pyproject.toml | 3 --- pkg/templates/typescript/yutori-computer-use/package.json | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index ef5748c..236d4ad 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -49,7 +49,7 @@ async def sampling_loop( cdp_ws_url: Optional[str] = None, max_tokens: int = 4096, max_iterations: int = 50, - viewport_width: int = 1280, + viewport_width: int = 1200, viewport_height: int = 800, mode: BrowserMode = "computer_use", ) -> dict[str, Any]: diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori-computer-use/pyproject.toml index 8a3bd44..cca32fc 100644 --- a/pkg/templates/python/yutori-computer-use/pyproject.toml +++ b/pkg/templates/python/yutori-computer-use/pyproject.toml @@ -5,9 +5,6 @@ description = "Kernel reference app for Yutori n1 Computer Use" requires-python = ">=3.9" dependencies = [ "openai>=1.58.0", - "python-dateutil>=2.9.0", - "pydantic>=2.12.5", - "typing-extensions>=4.15.0", "kernel>=0.24.0", "python-dotenv>=1.2.1", "playwright>=1.52.0", diff --git a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori-computer-use/package.json index 86461f9..2bc4fbe 100644 --- a/pkg/templates/typescript/yutori-computer-use/package.json +++ b/pkg/templates/typescript/yutori-computer-use/package.json @@ -6,12 +6,10 @@ "dependencies": { "@onkernel/sdk": "^0.24.0", "openai": "^4.77.0", - "luxon": "^3.7.2", "playwright-core": "^1.52.0" }, "devDependencies": { "@types/node": "^22.15.17", - "@types/luxon": "^3.6.2", "typescript": "^5.9.3" } }