"""agent_tools.py — Tools the local AI can call to research properties.

PRINCIPLE:
  The local AI (Ollama) runs a reasoning loop that decides which tool to call
  next. Each tool returns structured data that the AI uses to decide the next
  step.

Available tools:
  - web_search(query, max_results)
  - fetch_url(url)
  - extract_with_local_llm(text, schema, instructions)
  - save_document(deal_id, category, filename, content)
  - download_pdf(url, deal_id, category, filename)
  - remember_portal(state, county, portal_type, url, notes)
  - lookup_portal(state, county, portal_type)
  - geocode_address(address)
  - finish(summary)  ← ends the loop

COST:
  - web_search: $0 (DuckDuckGo HTML)
  - fetch_url: $0 (Playwright local)
  - extract_with_local_llm: $0 (Ollama local)
  - save_document, remember_portal, lookup_portal: $0 (filesystem/JSON)
  - geocode_address: $0 (Census)

Tools return plain dicts and report failures through an "error" key so the
reasoning loop can read the failure and pick another step.
"""
from __future__ import annotations

import hashlib
import json
import logging
import re
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlsplit

_log = logging.getLogger(__name__)

# ────────────────────────────────────────────────────────────────────────────
# Paths
# ────────────────────────────────────────────────────────────────────────────
PROJECT_ROOT = Path(__file__).resolve().parent
PORTAL_DIR_PATH = PROJECT_ROOT / "portal_directory.json"
PROPERTIES_ROOT = PROJECT_ROOT / "properties"


def _http_url_error(url: Any) -> Optional[str]:
    """Return an error message unless *url* is an absolute http(s) URL.

    URLs come from the model and from search results, so schemes such as
    ``file://`` or ``ftp://`` are rejected before any network/browser call.
    Returns None when the URL is acceptable.
    """
    if not isinstance(url, str) or not url.strip():
        return "url must be a non-empty string"
    try:
        parts = urlsplit(url.strip())
    except ValueError as e:
        return f"malformed url {url!r}: {e}"
    if parts.scheme.lower() not in ("http", "https") or not parts.netloc:
        return f"unsupported url {url!r}; only absolute http(s) URLs are allowed"
    return None


# ────────────────────────────────────────────────────────────────────────────
# Tool 1: web_search (DuckDuckGo, free)
# ────────────────────────────────────────────────────────────────────────────
def web_search(query: str, max_results: int = 5) -> dict:
    """Search the web via DuckDuckGo (free, no API key).

    Args:
        query: search query text.
        max_results: maximum number of results; values that cannot be read as
            an integer fall back to 5, and values below 1 are raised to 1.

    Returns:
        {"query": ..., "results": [{"title", "url", "snippet"}, ...]}
        plus an "error" key (with empty "results") when the search failed.
    """
    try:
        from ddgs import DDGS
    except ImportError:
        return {"query": query, "error": "ddgs package not installed", "results": []}

    # The model may send the count as a string or as 0; coerce instead of failing.
    try:
        limit = max(1, int(max_results))
    except (TypeError, ValueError):
        limit = 5

    try:
        with DDGS() as ddgs:
            raw = ddgs.text(query, max_results=limit, region="us-en")
            # `or ""` guards against fields that are present but None.
            results = [
                {
                    "title": (r.get("title") or "")[:200],
                    "url": r.get("href") or "",
                    "snippet": (r.get("body") or "")[:300],
                }
                for r in raw or []
            ]
        return {"query": query, "results": results}
    except Exception as e:
        return {"query": query, "error": f"{type(e).__name__}: {e}", "results": []}


# ────────────────────────────────────────────────────────────────────────────
# Tool 2: fetch_url (Playwright, free)
# ────────────────────────────────────────────────────────────────────────────
_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"

# Filter out javascript:/mailto: anchors BEFORE taking the first 20, so the
# cap applies to usable links rather than to raw anchors.
_TOP_LINKS_JS = """
    Array.from(document.querySelectorAll('a[href]'))
        .filter(a => a.href && !a.href.startsWith('javascript:') && !a.href.startsWith('mailto:'))
        .slice(0, 20)
        .map(a => ({href: a.href, text: (a.innerText || a.textContent || '').trim().substring(0,80)}))
"""


def fetch_url(url: str, wait_seconds: int = 5, timeout_seconds: int = 20) -> dict:
    """Fetch a URL via Playwright (free, handles JS).

    Args:
        url: absolute http(s) URL to load; other schemes are rejected.
        wait_seconds: seconds to wait after DOMContentLoaded so JS can render.
        timeout_seconds: navigation and default page timeout, in seconds.

    Returns:
        {"url", "status", "title", "text", "links": [{"href","text"}],
         "html_size", "error"}; "error" is None on success. Title is capped at
        200 chars, text at 8000 chars, links at 20 entries.
    """
    out = {"url": url, "status": None, "title": "", "text": "", "links": [], "html_size": 0, "error": None}

    url_error = _http_url_error(url)
    if url_error:
        out["error"] = url_error
        return out

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        out["error"] = "playwright not available"
        return out

    try:
        timeout_ms = timeout_seconds * 1000
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=_UA, viewport={"width": 1280, "height": 900})
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                # wait_for_timeout keeps Playwright's event loop running while
                # we wait; time.sleep would block it.
                page.wait_for_timeout(max(0, wait_seconds) * 1000)
                out["status"] = resp.status if resp else None
                out["title"] = (page.title() or "")[:200]
                # document.body can be null (e.g. non-HTML responses).
                body_text = page.evaluate("document.body ? document.body.innerText : ''")
                out["text"] = (body_text or "")[:8000]
                out["html_size"] = len(page.content())
                # Top 20 links
                out["links"] = (page.evaluate(_TOP_LINKS_JS) or [])[:20]
            finally:
                browser.close()
    except Exception as e:
        out["error"] = f"{type(e).__name__}: {e}"
    return out


# ────────────────────────────────────────────────────────────────────────────
# Tool 3: extract_with_local_llm (Ollama JSON mode, free)
# ────────────────────────────────────────────────────────────────────────────
def extract_with_local_llm(
    *,
    text: str,
    schema: dict,
    instructions: str,
    model: str = "llama3.1:8b",
) -> dict:
    """Use a local Ollama model to extract structured data from text.

    Args:
        text: raw text or HTML chunk (truncated to 6000 chars to fit context)
        schema: JSON Schema describing fields to extract
        instructions: prose instructions for the model
        model: Ollama model name (default llama3.1:8b)

    Returns:
        {"extracted": <parsed JSON>, "error": None} on success, otherwise
        {"extracted": {}, "error": str}.
    """
    try:
        import ollama
    except ImportError:
        return {"extracted": {}, "error": "ollama not installed"}

    try:
        # Cap text to protect the context window. Prompt construction lives
        # inside the try so a None text or unserializable schema yields an
        # error dict instead of an exception.
        snippet = (text or "")[:6000]
        prompt = (
            f"{instructions}\n\n"
            f"Schema to fill (JSON):\n```json\n{json.dumps(schema, indent=2)}\n```\n\n"
            f"Text to extract from:\n```\n{snippet}\n```\n\n"
            f"Return ONLY valid JSON matching the schema. No prose. No markdown wrapper. "
            f"If a field is not present in the text, use null."
        )
        resp = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            format="json",
            options={"temperature": 0.1, "num_ctx": 8192, "num_predict": 800},
        )
        content = resp["message"]["content"]
        try:
            data = json.loads(content)
            return {"extracted": data, "error": None}
        except json.JSONDecodeError as e:
            return {"extracted": {}, "error": f"JSON decode: {e}: {content[:200]}"}
    except Exception as e:
        return {"extracted": {}, "error": f"{type(e).__name__}: {e}"}


# ────────────────────────────────────────────────────────────────────────────
# Tool 4: save_document (filesystem)
# ────────────────────────────────────────────────────────────────────────────
VALID_DOCUMENT_CATEGORIES = frozenset({
    "deeds",
    "liens",
    "court_records",
    "property_appraiser",
    "photos",
    "due_diligence",
    "offers",
})

# Upper bound for a single download_pdf() body, so one URL cannot exhaust memory.
_MAX_DOWNLOAD_BYTES = 50 * 1024 * 1024


def _sanitize_filename(filename: Any, max_len: int = 120) -> str:
    """Return a filesystem-safe version of *filename*, or "" if unusable.

    Runs of characters outside ``[A-Za-z0-9._-]`` become "_" (which also
    removes path separators). Over-long names are shortened from the stem so a
    short extension survives. Names that are empty or consist only of dots
    ("", ".", "..") are rejected because they would resolve to a directory.
    """
    if not isinstance(filename, str):
        return ""
    safe = re.sub(r"[^A-Za-z0-9._\-]+", "_", filename)
    if len(safe) > max_len:
        stem, dot, ext = safe.rpartition(".")
        if dot and stem and 0 < len(ext) <= 10:
            safe = stem[: max_len - len(ext) - 1] + "." + ext
        else:
            safe = safe[:max_len]
    return safe if safe.strip(".") else ""


def save_document(
    *,
    deal_id: int,
    category: str,
    filename: str,
    content: str | bytes,
) -> dict:
    """Save text/binary content to properties/{deal_folder}/{category}/{filename}.

    category: deeds | liens | court_records | property_appraiser | photos
              | due_diligence | offers

    Returns:
        {"saved_to": str, "size": int} on success, otherwise {"error": str}.
    """
    if category not in VALID_DOCUMENT_CATEGORIES:
        return {"error": f"invalid category {category!r}; valid: {sorted(VALID_DOCUMENT_CATEGORIES)}"}

    safe_name = _sanitize_filename(filename)
    if not safe_name:
        return {"error": f"invalid filename {filename!r}"}

    try:
        from deals_db import get_deal_by_id
        from properties_store import ensure_property_folder
        deal = get_deal_by_id(deal_id)
        if not deal:
            return {"error": f"deal_id={deal_id} not found"}
        folder = ensure_property_folder(deal)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}

    target = folder / category / safe_name
    try:
        # mkdir is inside the try so a permissions/disk failure is reported
        # as an error dict like every other write failure.
        target.parent.mkdir(parents=True, exist_ok=True)
        if isinstance(content, (bytes, bytearray)):
            target.write_bytes(content)
        else:
            target.write_text(content, encoding="utf-8")
        return {"saved_to": str(target), "size": target.stat().st_size}
    except Exception as e:
        return {"error": f"write failed: {e}"}


def download_pdf(*, deal_id: int, category: str, filename: str, url: str) -> dict:
    """Download a PDF (or image) from URL and save it to the properties folder.

    The category and URL are validated before any network traffic. The body
    must look like a PDF, JPEG or PNG (by Content-Type or magic bytes) and may
    not exceed _MAX_DOWNLOAD_BYTES. When *filename* has no known extension,
    one matching the detected content is appended.

    Returns:
        The save_document() result, or {"error": str} on failure.
    """
    if category not in VALID_DOCUMENT_CATEGORIES:
        return {"error": f"invalid category {category!r}; valid: {sorted(VALID_DOCUMENT_CATEGORIES)}"}

    # urlopen would otherwise accept file:// and ftp:// URLs.
    url_error = _http_url_error(url)
    if url_error:
        return {"error": f"download failed: {url_error}"}

    try:
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            # Read one byte past the cap so an oversized body is detectable.
            content = resp.read(_MAX_DOWNLOAD_BYTES + 1)
            content_type = (resp.headers.get("Content-Type", "") or "").lower()
    except Exception as e:
        return {"error": f"download failed: {type(e).__name__}: {e}"}

    if len(content) > _MAX_DOWNLOAD_BYTES:
        return {"error": f"download failed: body exceeds {_MAX_DOWNLOAD_BYTES} bytes"}

    # Sanity check: must be a PDF or image
    is_pdf = content_type.startswith("application/pdf") or content[:4] == b"%PDF"
    is_png = content[:4] == b"\x89PNG"
    # Every JPEG starts with FF D8 FF; the 4th byte varies by marker type.
    is_jpeg = content[:3] == b"\xff\xd8\xff"
    is_img = content_type.startswith("image/") or is_png or is_jpeg
    if not (is_pdf or is_img):
        return {"error": f"unexpected content-type {content_type!r}; first bytes: {content[:8].hex()}"}

    if not filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png")):
        if is_pdf:
            filename += ".pdf"
        elif is_png or content_type.startswith("image/png"):
            filename += ".png"
        else:
            filename += ".jpg"

    return save_document(deal_id=deal_id, category=category, filename=filename, content=content)


# ────────────────────────────────────────────────────────────────────────────
# Tool 5: portal directory (memory of portals per county)
# ────────────────────────────────────────────────────────────────────────────
VALID_PORTAL_TYPES = {
    "court_records",       # case docket / lis pendens / civil suit search
    "property_appraiser",  # PA, tax assessor
    "recorder",            # Official Records: deeds, mortgages, liens
    "tax_collector",       # tax bills, delinquencies
    "code_enforcement",    # violations
    "clerk_auction",       # foreclosure auction site (realauction.com etc.)
    "building_dept",       # permits, COs
}


def _load_portal_directory() -> dict:
    """Load the portal directory JSON; return {} when missing or unusable.

    Best-effort by design: an unreadable or malformed file is logged and
    treated as an empty directory rather than raising.
    """
    try:
        raw = PORTAL_DIR_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        _log.warning("could not read %s: %s", PORTAL_DIR_PATH, e)
        return {}
    try:
        data = json.loads(raw)
    except ValueError as e:  # json.JSONDecodeError subclasses ValueError
        _log.warning("ignoring malformed %s: %s", PORTAL_DIR_PATH, e)
        return {}
    if not isinstance(data, dict):
        _log.warning("ignoring %s: top-level JSON value is not an object", PORTAL_DIR_PATH)
        return {}
    return data


def _save_portal_directory(data: dict) -> None:
    """Write the portal directory JSON atomically (temp file + rename).

    Raises:
        OSError: if the temp file cannot be written or renamed.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)
    tmp_path = PORTAL_DIR_PATH.with_name(PORTAL_DIR_PATH.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Rename over the real file so an interrupted write cannot truncate it.
    tmp_path.replace(PORTAL_DIR_PATH)


def _find_county_key(state_data: dict, county: Any) -> Optional[str]:
    """Return the stored key for *county*, or None if there is no match.

    An exact match wins; otherwise keys are compared ignoring case and
    surrounding whitespace, so "wake " finds an entry stored as "Wake".
    """
    if not isinstance(county, str):
        return None
    if county in state_data:
        return county
    wanted = county.strip().casefold()
    for key in state_data:
        if isinstance(key, str) and key.strip().casefold() == wanted:
            return key
    return None


def lookup_portal(state: str, county: str, portal_type: str) -> dict:
    """Get known portal URL for (state, county, portal_type).

    The state is matched upper-cased; the county is matched exactly first and
    then ignoring case and surrounding whitespace.

    Returns:
        {"found": bool, "url": str|None, "notes": str|None, "last_verified": str|None}
        (plus the echoed state/county/portal_type), or
        {"found": False, "error": str} for an invalid portal_type.
    """
    if portal_type not in VALID_PORTAL_TYPES:
        return {"found": False, "error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    not_found = {"found": False, "state": state, "county": county, "portal_type": portal_type}

    data = _load_portal_directory()
    state_data = data.get(str(state).strip().upper(), {})
    if not isinstance(state_data, dict):
        return not_found
    county_key = _find_county_key(state_data, county)
    county_data = state_data.get(county_key, {}) if county_key is not None else {}
    entry = county_data.get(portal_type) if isinstance(county_data, dict) else None
    if not isinstance(entry, dict) or not entry:
        return not_found
    return {
        "found": True,
        "state": state,
        "county": county,
        "portal_type": portal_type,
        "url": entry.get("url"),
        "notes": entry.get("notes"),
        "last_verified": entry.get("last_verified"),
    }


def remember_portal(
    *,
    state: str,
    county: str,
    portal_type: str,
    url: str,
    notes: Optional[str] = None,
) -> dict:
    """Persist: 'For {county}, {state} the {portal_type} is {url}'.

    Lets the agent build up institutional memory over time. An existing
    county entry that differs only by case/whitespace is reused rather than
    duplicated. "last_verified" is set to today's UTC date (YYYY-MM-DD).

    Returns:
        {"ok": True, "state", "county", "portal_type", "url"} on success,
        otherwise {"error": str}.
    """
    if portal_type not in VALID_PORTAL_TYPES:
        return {"error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    data = _load_portal_directory()
    state_key = str(state).strip().upper()
    state_data = data.get(state_key)
    if not isinstance(state_data, dict):
        state_data = data[state_key] = {}

    county_key = _find_county_key(state_data, county)
    if county_key is None:
        county_key = str(county).strip()
    county_data = state_data.get(county_key)
    if not isinstance(county_data, dict):
        county_data = state_data[county_key] = {}

    county_data[portal_type] = {
        "url": url,
        "notes": notes,
        "last_verified": datetime.now(timezone.utc).date().isoformat(),
    }
    try:
        _save_portal_directory(data)
    except (OSError, TypeError, ValueError) as e:
        # Report like the other tools do instead of raising into the loop.
        return {"error": f"save failed: {type(e).__name__}: {e}"}
    return {"ok": True, "state": state, "county": county, "portal_type": portal_type, "url": url}


# ────────────────────────────────────────────────────────────────────────────
# Tool 6: geocode_address (Census, free)
# ────────────────────────────────────────────────────────────────────────────
def geocode_address(address: str) -> dict:
    """Geocode an address using Census (free).

    Returns:
        Whatever data_fetchers.census_geocode.fetch_geocode returns, or
        {"error": str} when it returns a falsy value or raises.
    """
    try:
        from data_fetchers.census_geocode import fetch_geocode
        return fetch_geocode(address) or {"error": "no result"}
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}


# ────────────────────────────────────────────────────────────────────────────
# Tool spec dictionary for Ollama tool calling
# ────────────────────────────────────────────────────────────────────────────
# Sorted so the enum order sent to the model is stable across runs (iterating
# a set of strings directly is not, because str hashing is randomized).
_PORTAL_TYPE_ENUM = sorted(VALID_PORTAL_TYPES)

# Document categories accepted by save_document / download_pdf.
_DOCUMENT_CATEGORY_ENUM = [
    "deeds",
    "liens",
    "court_records",
    "property_appraiser",
    "photos",
    "due_diligence",
    "offers",
]

OLLAMA_TOOL_SPECS = [
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web (DuckDuckGo, free). Use for discovering county portals, finding case# info, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query in English"},
                    "max_results": {"type": "integer", "description": "Max results to return (default 5)", "default": 5},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Load a URL with Playwright (handles JS). Returns title, text, and top links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string"},
                    "wait_seconds": {"type": "integer", "description": "Seconds to wait for JS to render after page load (default 5)", "default": 5},
                },
                "required": ["url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "extract_with_local_llm",
            "description": "Extract structured JSON from text/HTML using a local LLM. Use after fetch_url to parse the page.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "Text or HTML to extract from (cap 6K chars)"},
                    "schema": {"type": "object", "description": "JSON schema describing fields to extract"},
                    "instructions": {"type": "string", "description": "Prose instructions for the extractor"},
                },
                "required": ["text", "schema", "instructions"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "lookup_portal",
            "description": "Check if we already know the portal URL for a (state, county, portal_type). ALWAYS call this first before web_search.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string", "description": "2-letter state code (e.g. NC, FL)"},
                    "county": {"type": "string", "description": "County name (e.g. Wake, Miami-Dade)"},
                    "portal_type": {"type": "string", "enum": _PORTAL_TYPE_ENUM},
                },
                "required": ["state", "county", "portal_type"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "remember_portal",
            "description": "Save a portal URL we discovered. Call this AFTER you confirm a working portal so future searches in the same county are faster.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string"},
                    "county": {"type": "string"},
                    "portal_type": {"type": "string", "enum": _PORTAL_TYPE_ENUM},
                    "url": {"type": "string"},
                    "notes": {"type": "string", "description": "Brief notes about the portal (optional)"},
                },
                "required": ["state", "county", "portal_type", "url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "save_document",
            "description": "Save text content to a deal's property folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": _DOCUMENT_CATEGORY_ENUM},
                    "filename": {"type": "string"},
                    "content": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "content"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "download_pdf",
            "description": "Download a PDF (or image) from URL and save to deal's folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": _DOCUMENT_CATEGORY_ENUM},
                    "filename": {"type": "string"},
                    "url": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "url"],
            },
        },
    },
    {
        # geocode_address is defined in this module and listed among the
        # available tools in the module docstring, so it is exposed here too.
        "type": "function",
        "function": {
            "name": "geocode_address",
            "description": "Geocode an address using Census (free).",
            "parameters": {
                "type": "object",
                "properties": {
                    "address": {"type": "string", "description": "Full street address to geocode"},
                },
                "required": ["address"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "finish",
            "description": "Call this when research is complete. Provide a final summary of findings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "summary": {"type": "string", "description": "Brief summary of what was found"},
                    "portals_used": {"type": "array", "items": {"type": "string"}, "description": "List of portal URLs successfully used"},
                    "documents_saved": {"type": "array", "items": {"type": "string"}, "description": "List of paths of documents saved"},
                    "findings": {"type": "object", "description": "Key structured findings (plaintiff, owner, etc)"},
                },
                "required": ["summary"],
            },
        },
    },
]

# Dispatch dict so the loop can call the right Python function by name.
# NOTE(review): "finish" has a spec but no entry here — presumably the agent
# loop handles it itself to terminate; confirm against the loop code.
TOOL_DISPATCH = {
    "web_search": web_search,
    "fetch_url": fetch_url,
    "extract_with_local_llm": extract_with_local_llm,
    "save_document": save_document,
    "download_pdf": download_pdf,
    "lookup_portal": lookup_portal,
    "remember_portal": remember_portal,
    "geocode_address": geocode_address,
}