diff --git a/README.md b/README.md
index af92648..c40d2f0 100644
--- a/README.md
+++ b/README.md
@@ -145,6 +145,7 @@ Commands with JSON output support:
- `captcha-solver` - Template demonstrating Kernel's auto-CAPTCHA solver
- `stagehand` - Template with Stagehand SDK (TypeScript only)
- `browser-use` - Template with Browser Use SDK (Python only)
+ - `lead-scraper` - Google Maps lead scraper using Browser Use (Python only)
- `anthropic-computer-use` - Anthropic Computer Use prompt loop
- `openai-computer-use` - OpenAI Computer Use Agent sample
- `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only)
@@ -449,6 +450,9 @@ kernel create --name my-cu-app --language py --template anthropic-computer-use
# Create a Claude Agent SDK app (TypeScript or Python)
kernel create --name my-claude-agent --language ts --template claude-agent-sdk
+
+# Create a Google Maps Lead Scraper (Python)
+kernel create --name my-lead-scraper --language py --template lead-scraper
```

### Deploy with environment variables
diff --git a/pkg/templates/python/lead-scraper/.env.example b/pkg/templates/python/lead-scraper/.env.example
new file mode 100644
index 0000000..b74e0a2
--- /dev/null
+++ b/pkg/templates/python/lead-scraper/.env.example
@@ -0,0 +1,2 @@
+# Copy this file to .env and fill in your API key
+OPENAI_API_KEY=your_openai_api_key_here
diff --git a/pkg/templates/python/lead-scraper/README.md b/pkg/templates/python/lead-scraper/README.md
new file mode 100644
index 0000000..e9e98b0
--- /dev/null
+++ b/pkg/templates/python/lead-scraper/README.md
@@ -0,0 +1,113 @@
+# Kernel Lead Scraper Template - Google Maps
+
+A ready-to-use lead scraper that extracts local business data from Google Maps using [browser-use](https://github.com/browser-use/browser-use) and the Kernel platform.
+
+## What It Does
+
+This template creates an AI-powered web scraper that:
+1. Navigates to Google Maps
+2. Searches for businesses by type and location
+3.
Scrolls through results to load more listings
+4. Extracts structured lead data (name, phone, address, website, rating, reviews)
+5. Returns clean JSON ready for your CRM or outreach tools
+
+## Quick Start
+
+### 1. Install Dependencies
+
+```bash
+uv sync
+```
+
+### 2. Set Up Environment
+
+```bash
+cp .env.example .env
+# Edit .env and add your OpenAI API key
+```
+
+### 3. Deploy to Kernel
+
+```bash
+kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY
+```
+
+### 4. Run the Scraper
+
+```bash
+kernel invoke lead-scraper scrape-leads \
+  --data '{"query": "restaurants", "location": "Austin, TX", "max_results": 5}'
+```
+
+## API Reference
+
+### Action: `scrape-leads`
+
+**Input Parameters:**
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `query` | string | ❌ | "restaurants" | Business type to search (e.g., "plumbers", "gyms") |
+| `location` | string | ❌ | "New York, NY" | Geographic location (e.g., "Miami, FL") |
+| `max_results` | integer | ❌ | 1 | Maximum leads to scrape (1-5) |
+
+**Example Output:**
+
+```json
+{
+  "leads": [
+    {
+      "name": "Joe's Pizza",
+      "phone": "(512) 555-0123",
+      "address": "123 Main St, Austin, TX 78701",
+      "website": "https://joespizza.com",
+      "rating": 4.5,
+      "review_count": 234,
+      "category": "Pizza restaurant"
+    }
+  ],
+  "total_found": 1,
+  "query": "pizza restaurants",
+  "location": "Austin, TX"
+}
+```
+
+## Use Cases
+
+- **Sales Teams**: Build targeted prospect lists for cold outreach
+- **Marketing Agencies**: Find local businesses needing marketing services
+- **Service Providers**: Identify potential B2B clients in your area
+- **Market Research**: Analyze competitor density and ratings by location
+
+## Customization
+
+### Modify the Search Prompt
+
+Edit the `SCRAPER_PROMPT` in `main.py` to customize what data the AI extracts:
+
+```python
+SCRAPER_PROMPT = """
+Navigate to Google Maps and search for {query} in {location}.
+# Add your custom extraction instructions here +""" +``` + +### Add New Fields + +1. Update `BusinessLead` model in `models.py` +2. Modify the prompt to extract the new fields +3. Redeploy with `kernel deploy main.py` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| No results found | Try a broader search query or different location | +| Timeout errors | Reduce `max_results` or check your network | +| Rate limiting | Add delays between requests in production | + +## Resources + +- [Kernel Documentation](https://www.kernel.sh/docs) +- [Browser Use Docs](https://docs.browser-use.com) +- [Pydantic Models](https://docs.pydantic.dev) diff --git a/pkg/templates/python/lead-scraper/_gitignore b/pkg/templates/python/lead-scraper/_gitignore new file mode 100644 index 0000000..75475bc --- /dev/null +++ b/pkg/templates/python/lead-scraper/_gitignore @@ -0,0 +1,79 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +.project +.pydevproject +.settings/ + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.tox/ +.nox/ +coverage.xml +*.cover +.hypothesis/ + +# Logs +*.log +logs/ + +# OS +.DS_Store +Thumbs.db + +# Browser Use specific +.playwright-screenshots/ +.playwright-videos/ +.playwright-report/ +test-results/ +blob-report/ +playwright/.cache/ +playwright/.local-browsers/ + +# Lead Scraper specific +leads_output/ +*.csv +*.json + +# Misc +.cache/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.temp/ +.tmp/ diff --git a/pkg/templates/python/lead-scraper/formaters.py b/pkg/templates/python/lead-scraper/formaters.py new file mode 100644 index 0000000..60256c2 --- /dev/null +++ b/pkg/templates/python/lead-scraper/formaters.py @@ -0,0 +1,208 @@ +import json +import re +from typing import Any, 
Iterable +from models import BusinessLead + +_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.IGNORECASE | re.DOTALL) +_TRAILING_COMMA_RE = re.compile(r",\s*([\]}])") +_SMART_QUOTES = { + "\u201c": '"', "\u201d": '"', # “ ” + "\u2018": "'", "\u2019": "'", # ‘ ’ +} + + +def parse_leads_from_result(result_text: str) -> list[BusinessLead]: + """ + Robustly extract a JSON array of leads from an LLM/browser agent output and + convert it into BusinessLead objects. + + Strategy: + 1) Prefer JSON inside ```json ... ``` fenced blocks + 2) Else try to decode from the first '[' onwards using JSONDecoder.raw_decode + 3) Normalize a few common LLM issues (smart quotes, trailing commas, "null" strings) + """ + if not result_text or not result_text.strip(): + return [] + + candidates = _extract_json_candidates(result_text) + + for candidate in candidates: + parsed = _try_parse_json_list(candidate) + if parsed is None: + continue + + leads: list[BusinessLead] = [] + for raw in parsed: + lead = _to_business_lead(raw) + if lead is not None: + leads.append(lead) + + if leads: + return leads # first successful parse wins + + # Fallback: try to parse markdown format (when agent returns numbered lists) + leads = _parse_markdown_leads(result_text) + if leads: + return leads + + return [] + + +def _parse_markdown_leads(text: str) -> list[BusinessLead]: + """ + Parse markdown-formatted lead data when JSON parsing fails. + Handles format like: + 1. 
**Business Name** + - Address: 123 Main St + - Rating: 4.5 + - Phone: +1 555-1234 + """ + leads = [] + + # Pattern to match numbered entries with bold names + entry_pattern = re.compile( + r'\d+\.\s*\*\*(.+?)\*\*\s*\n((?:\s*-\s*.+\n?)+)', + re.MULTILINE + ) + + for match in entry_pattern.finditer(text): + name = match.group(1).strip() + details = match.group(2) + + # Extract fields from the dash-prefixed lines + def extract_field(pattern: str, txt: str) -> str | None: + m = re.search(pattern, txt, re.IGNORECASE) + return m.group(1).strip() if m else None + + address = extract_field(r'-\s*Address:\s*(.+?)(?:\n|$)', details) + rating_str = extract_field(r'-\s*Rating:\s*([\d.]+)', details) + review_str = extract_field(r'-\s*Review\s*Count:\s*([\d,]+)', details) + category = extract_field(r'-\s*Category:\s*(.+?)(?:\n|$)', details) + phone = extract_field(r'-\s*Phone:\s*(.+?)(?:\n|$)', details) + website = extract_field(r'-\s*Website:\s*(.+?)(?:\n|$)', details) + + # Clean up "Not available" etc + if phone and phone.lower() in ('not available', 'n/a', 'none'): + phone = None + if website and website.lower() in ('not available', 'n/a', 'none'): + website = None + + try: + lead = BusinessLead( + name=name, + address=address, + rating=float(rating_str) if rating_str else None, + review_count=int(review_str.replace(',', '')) if review_str else None, + category=category, + phone=phone, + website=website, + ) + leads.append(lead) + except Exception: + continue + + return leads + + +def _extract_json_candidates(text: str) -> list[str]: + """ + Return possible JSON snippets, ordered from most to least likely. 
+ """ + # 1) Fenced code blocks first + fenced = [m.group(1) for m in _JSON_FENCE_RE.finditer(text)] + if fenced: + return fenced + + # 2) Otherwise try from first '[' onward (common "Return ONLY a JSON array") + idx = text.find("[") + return [text[idx:]] if idx != -1 else [] + + +def _normalize_llm_json(s: str) -> str: + # Replace smart quotes + for k, v in _SMART_QUOTES.items(): + s = s.replace(k, v) + + # Some models do ``key``: ``value``. Convert double-backticks to quotes carefully. + # (Keep this minimal: it can still be wrong, but it helps common cases.) + s = s.replace("``", '"') + + # Convert string "null" to JSON null + s = s.replace('"null"', "null") + + # Remove trailing commas before ] or } + s = _TRAILING_COMMA_RE.sub(r"\1", s) + + return s.strip() + + +def _try_parse_json_list(candidate: str) -> list[dict[str, Any]] | None: + """ + Attempt to parse a JSON array from a candidate snippet. + Returns a list of dicts or None. + """ + candidate = _normalize_llm_json(candidate) + + # 1) Direct parse + try: + data = json.loads(candidate) + return data if isinstance(data, list) else None + except json.JSONDecodeError: + pass + + # 2) Decoder-based parse from first '[' (more robust than find/rfind slicing) + start = candidate.find("[") + if start == -1: + return None + + decoder = json.JSONDecoder() + try: + obj, _end = decoder.raw_decode(candidate[start:]) + return obj if isinstance(obj, list) else None + except json.JSONDecodeError: + return None + + +def _to_business_lead(raw: Any) -> BusinessLead | None: + """ + Convert one raw object into a BusinessLead, best-effort. 
"""
+    if not isinstance(raw, dict):
+        return None
+
+    try:
+        # Optionally coerce some common fields
+        rating = raw.get("rating")
+        if isinstance(rating, str):
+            rating = _safe_float(rating)
+
+        review_count = raw.get("review_count")
+        if isinstance(review_count, str):
+            review_count = _safe_int(review_count)
+
+        return BusinessLead(
+            name=(raw.get("name") or "Unknown").strip() if isinstance(raw.get("name"), str) else (raw.get("name") or "Unknown"),
+            phone=raw.get("phone"),
+            address=raw.get("address"),
+            website=raw.get("website"),
+            rating=rating,
+            review_count=review_count,
+            category=raw.get("category"),
+        )
+    except Exception:
+        # Keep parsing the rest; caller decides how to log
+        return None
+
+
+def _safe_float(x: str) -> float | None:
+    # Tolerates thousands separators and stray whitespace; None on failure.
+    try:
+        return float(x.replace(",", "").strip())
+    except Exception:
+        return None
+
+
+def _safe_int(x: str) -> int | None:
+    # Tolerates thousands separators and stray whitespace; None on failure.
+    try:
+        return int(x.replace(",", "").strip())
+    except Exception:
+        return None
diff --git a/pkg/templates/python/lead-scraper/main.py b/pkg/templates/python/lead-scraper/main.py
new file mode 100644
index 0000000..aa7a7eb
--- /dev/null
+++ b/pkg/templates/python/lead-scraper/main.py
@@ -0,0 +1,170 @@
+"""
+Google Maps Lead Scraper - Kernel Template
+
+This template demonstrates how to build a lead scraper using browser-use
+to extract local business data from Google Maps.
+
+Usage:
+    kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY
+    kernel invoke lead-scraper scrape-leads --data '{"query": "restaurants", "location": "Austin, TX"}'
+"""
+
+import json
+# NOTE(review): `json` appears unused here (parsing lives in formaters.py) — candidate for removal.
+
+import kernel
+from browser_use import Agent, Browser
+from browser_use.llm import ChatOpenAI
+from kernel import Kernel
+from formaters import parse_leads_from_result
+
+from models import BusinessLead, ScrapeInput, ScrapeOutput
+
+# Initialize Kernel client and app
+client = Kernel()
+app = kernel.App("lead-scraper")
+
+# LLM for the browser-use agent
+# API key is set via: kernel deploy main.py -e OPENAI_API_KEY=XXX
+llm = ChatOpenAI(model="gpt-4.1")
+
+# ============================================================================
+# SCRAPER PROMPT
+# Customize this prompt to change what data the agent extracts
+# ============================================================================
+SCRAPER_PROMPT = """
+You are a lead generation assistant. Scrape business information from Google Maps.
+
+**Instructions:**
+1. Navigate to https://www.google.com/maps
+2. Search for: "{query} in {location}"
+3. Wait for results to load
+4. For each of the max {max_results} businesses in the list:
+   a. Click on the listing to open its detail view
+   b. SCROLL DOWN in the detail panel to see all info (phone/website are often below)
+   c. Extract: name, address, rating, review count, category, phone number, website
+   d. Click back or the X to close the detail view and return to the list
+5. After collecting data for max {max_results} businesses, return the JSON
+
+**What to extract:**
+- Business name (REQUIRED)
+- Address (REQUIRED)
+- Star rating (REQUIRED)
+- Review count (optional)
+- Category (optional)
+- Phone number (scroll down in detail view to find it, null if not shown)
+- Website URL (scroll down in detail view to find it, null if not shown)
+
+**Important:**
+- SCROLL DOWN inside each business detail panel to find phone/website
+- Use null for any field that isn't available
+- Task is SUCCESSFUL when you return at least 1 complete business
+
+**CRITICAL - Output Format:**
+You MUST return ONLY a valid JSON array. No markdown, no explanations, no numbered lists.
+Return EXACTLY this format:
+[
+  {{"name": "Business Name", "address": "123 Main St", "rating": 4.5, "review_count": 100, "category": "Restaurant", "phone": "+1 555-1234", "website": "https://example.com"}}
+]
+"""
+
+@app.action("scrape-leads")
+async def scrape_leads(ctx: kernel.KernelContext, input_data: dict) -> dict:
+    """
+    Scrape local business leads from Google Maps.
+
+    This action uses browser-use to navigate Google Maps, search for businesses,
+    and extract structured lead data.
+
+    Args:
+        ctx: Kernel context containing invocation information
+        input_data: Dictionary with query, location, and max_results
+
+    Returns:
+        ScrapeOutput containing list of leads and metadata
+
+    Example:
+        kernel invoke lead-scraper scrape-leads \
+            --data '{"query": "plumbers", "location": "Miami, FL", "max_results": 5}'
+    """
+    # Validate input - default to empty dict if no payload provided
+    scrape_input = ScrapeInput(**(input_data or {}))
+
+    # Use attribute access for Pydantic model (not dictionary subscript)
+    input_query = scrape_input.query
+    input_location = scrape_input.location
+    input_max_results = scrape_input.max_results
+
+    # Format the prompt with user parameters
+    task_prompt = SCRAPER_PROMPT.format(
+        query=input_query,
+        location=input_location,
+        max_results=input_max_results,
+    )
+
+    print(f"Starting lead scrape: {input_query} in {input_location}")
+    print(f"Target: {input_max_results} leads")
+
+    # Create Kernel browser session
+    kernel_browser = None
+
+    try:
+
+        kernel_browser = client.browsers.create(
+            invocation_id=ctx.invocation_id,
+            stealth=True,  # Use stealth mode to avoid detection
+        )
+        print(f"Browser live view: {kernel_browser.browser_live_view_url}")
+
+        # Connect browser-use to the Kernel browser
+        browser = Browser(
+            cdp_url=kernel_browser.cdp_ws_url,
+            headless=False,
+            window_size={"width": 1920, "height": 1080},
+            viewport={"width": 1920, "height": 1080},
+            device_scale_factor=1.0,
+        )
+
+        # Create and run the browser-use agent
+        agent = Agent(
+            task=task_prompt,
+            llm=llm,
+            browser_session=browser,
+        )
+
+        print("Running browser-use agent...")
+        # Limit steps to prevent timeouts (this is a template demo)
+        result = await agent.run(max_steps=25)
+
+        # Parse the result from final_result
+        leads = []
+        final_text = result.final_result()
+
+        if final_text:
+            print(f"Parsing final_result ({len(final_text)} chars)...")
+            leads = parse_leads_from_result(final_text)
+        else:
+            # If no final_result, check the last action for done text
+            print("No final_result, checking last action...")
+            action_results = result.action_results()
+            if action_results:
+                last_action = action_results[-1]
+                if hasattr(last_action, 'extracted_content') and last_action.extracted_content:
+                    content = last_action.extracted_content
+                    print(f"Found content in last action ({len(content)} chars)...")
+                    leads = parse_leads_from_result(content)
+
+        print(f"Successfully extracted {len(leads)} leads")
+
+        output = ScrapeOutput(
+            leads=leads,
+            total_found=len(leads),
+            query=input_query,
+            location=input_location,
+        )
+        return output.model_dump()
+
+    finally:
+        # Always clean up the browser session
+        if kernel_browser is not None:
+            client.browsers.delete_by_id(kernel_browser.session_id)
+            print("Browser session cleaned up")
diff --git a/pkg/templates/python/lead-scraper/models.py b/pkg/templates/python/lead-scraper/models.py
new file mode 100644
index 0000000..2d3c6e4
--- /dev/null
+++ b/pkg/templates/python/lead-scraper/models.py
@@ -0,0 +1,65 @@
+from pydantic import BaseModel, Field
+from typing import Optional
+
+
+class ScrapeInput(BaseModel):
+    """Input parameters for the lead scraper.
+
+    Attributes:
+        query: The type of business to search (e.g., "restaurants", "plumbers", "gyms")
+        location: The geographic location to search (e.g., "Austin, TX", "New York, NY")
+        max_results: Maximum number of leads to scrape (default: 1, max: 5)
+    """
+
+    query: str = Field(
+        default="restaurants",
+        description="Type of business to search for (e.g., 'restaurants', 'plumbers')"
+    )
+    location: str = Field(
+        default="New York, NY",
+        description="Geographic location (e.g., 'Austin, TX', 'New York, NY')"
+    )
+    max_results: int = Field(
+        default=1,
+        ge=1,
+        le=5,
+        description="Maximum number of leads to scrape (1-5)",
+    )
+
+
+class BusinessLead(BaseModel):
+    """Structured data for a business lead scraped from Google Maps.
+
+    Attributes:
+        name: Business name
+        phone: Phone number (if available)
+        address: Full address
+        website: Website URL (if available)
+        rating: Star rating (1-5)
+        review_count: Number of reviews
+        category: Business category/type
+    """
+
+    name: str = Field(description="Business name")
+    phone: Optional[str] = Field(default=None, description="Phone number")
+    address: Optional[str] = Field(default=None, description="Full address")
+    website: Optional[str] = Field(default=None, description="Website URL")
+    # ge/le bounds enforce the 1-5 star scale documented above
+    rating: Optional[float] = Field(default=None, ge=1, le=5, description="Star rating")
+    review_count: Optional[int] = Field(default=None, ge=0, description="Number of reviews")
+    category: Optional[str] = Field(default=None, description="Business category")
+
+
+class ScrapeOutput(BaseModel):
+    """Output from the lead scraper.
+
+    Attributes:
+        leads: List of scraped business leads
+        total_found: Total number of leads found
+        query: The original search query
+        location: The original search location
+    """
+
+    leads: list[BusinessLead] = Field(default_factory=list, description="List of scraped leads")
+    total_found: int = Field(default=0, description="Total number of leads found")
+    query: str = Field(description="Original search query")
+    location: str = Field(description="Original search location")
diff --git a/pkg/templates/python/lead-scraper/pyproject.toml b/pkg/templates/python/lead-scraper/pyproject.toml
new file mode 100644
index 0000000..2c33639
--- /dev/null
+++ b/pkg/templates/python/lead-scraper/pyproject.toml
@@ -0,0 +1,11 @@
+[project]
+name = "lead-scraper"
+version = "0.1.0"
+description = "Google Maps Lead Scraper - A Kernel template for scraping local business leads"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "browser-use>=0.11.1",
+    "kernel>=0.23.0",
+    "pydantic>=2.12.5",
+]