Files
AR-House/agent_tools.py
2026-07-03 12:24:58 -04:00

500 lines
21 KiB
Python

"""agent_tools.py — Tools the local AI can call to research properties.

PRINCIPLE:
    The local AI (Ollama) runs a reasoning loop that decides which tool to call next.
    Each tool returns structured data that the AI uses to decide the next step.

AVAILABLE TOOLS:
    - web_search(query, max_results)
    - fetch_url(url)
    - extract_with_local_llm(text, schema, instructions)
    - save_document(deal_id, category, filename, content)
    - download_pdf(deal_id, category, filename, url)
    - remember_portal(state, county, portal_type, url, notes)
    - lookup_portal(state, county, portal_type)
    - geocode_address(address)
    - finish(summary)  ← ends the loop

COST:
    - web_search: $0 (DuckDuckGo HTML)
    - fetch_url: $0 (local Playwright)
    - extract_with_local_llm: $0 (local Ollama)
    - save_document, remember_portal, lookup_portal: $0 (filesystem/JSON)
    - geocode_address: $0 (Census)
"""
from __future__ import annotations
import hashlib
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# ────────────────────────────────────────────────────────────────────────────
# Paths
# ────────────────────────────────────────────────────────────────────────────
# Directory that contains this module; the other locations hang off it.
PROJECT_ROOT = Path(__file__).resolve().parents[0]
# JSON file holding the remembered county portals.
PORTAL_DIR_PATH = PROJECT_ROOT.joinpath("portal_directory.json")
# Root folder under which per-deal property folders live.
PROPERTIES_ROOT = PROJECT_ROOT.joinpath("properties")
# ────────────────────────────────────────────────────────────────────────────
# Tool 1: web_search (DuckDuckGo, free)
# ────────────────────────────────────────────────────────────────────────────
def web_search(query: str, max_results: int = 5) -> dict:
    """Search the web via DuckDuckGo (free, no API key).

    Args:
        query: Search string handed to ``DDGS.text``.
        max_results: Maximum number of hits to request. A value that cannot
            be read as an integer falls back to 5; anything below 1 is
            raised to 1 (tool arguments come from an LLM and are not
            guaranteed to be well-typed).

    Returns:
        {"query": ..., "results": [{"title", "url", "snippet"}, ...]}
        On failure an "error" key is added and "results" is empty.
    """
    try:
        from ddgs import DDGS
    except ImportError:
        return {"query": query, "error": "ddgs package not installed", "results": []}

    try:
        limit = max(1, int(max_results))
    except (TypeError, ValueError, OverflowError):
        limit = 5

    try:
        with DDGS() as ddgs:
            raw = ddgs.text(query, max_results=limit, region="us-en")
            # `or ""` (rather than a .get default) also covers keys that are
            # present but None, which would otherwise raise on slicing and
            # throw away every result of the search.
            results = [
                {
                    "title": (r.get("title") or "")[:200],
                    "url": r.get("href") or "",
                    "snippet": (r.get("body") or "")[:300],
                }
                for r in raw or []
            ]
        return {"query": query, "results": results}
    except Exception as e:
        # Tool boundary: the agent loop expects a dict, never an exception.
        return {"query": query, "error": f"{type(e).__name__}: {e}", "results": []}
# ────────────────────────────────────────────────────────────────────────────
# Tool 2: fetch_url (Playwright, free)
# ────────────────────────────────────────────────────────────────────────────
# Desktop Chrome-on-Windows User-Agent string; used for the Playwright browser
# context in fetch_url and as the request header in download_pdf.
_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
def fetch_url(url: str, wait_seconds: int = 5, timeout_seconds: int = 20) -> dict:
    """Fetch a URL via Playwright (free, handles JS).

    Args:
        url: Address to load in a headless Chromium page.
        wait_seconds: Extra time to let client-side JS render after the
            ``domcontentloaded`` event; values <= 0 skip the wait.
        timeout_seconds: Timeout for navigation and for subsequent page
            operations.

    Returns:
        {"url", "status", "title", "text", "links": [{"href","text"}],
         "html_size", "error"}
        "text" is capped at 8000 chars, "title" at 200, "links" at 20 entries.
        Fields collected before a failure are kept; "error" then holds
        "<ExceptionType>: <message>".
    """
    out = {"url": url, "status": None, "title": "", "text": "",
           "links": [], "html_size": 0, "error": None}
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        out["error"] = "playwright not available"
        return out

    try:
        timeout_ms = timeout_seconds * 1000
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=_UA, viewport={"width": 1280, "height": 900})
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                # Record the status right away so it survives a later failure.
                out["status"] = resp.status if resp else None
                if wait_seconds > 0:
                    # Use Playwright's own timer instead of time.sleep(): the
                    # sync API relies on async machinery that a blocking
                    # sleep prevents from being processed.
                    page.wait_for_timeout(wait_seconds * 1000)
                out["title"] = page.title()[:200]
                # document.body can be null (e.g. non-HTML responses).
                body_text = page.evaluate("document.body ? document.body.innerText : ''")
                out["text"] = (body_text or "")[:8000]
                out["html_size"] = len(page.content())
                # Top 20 links: filter BEFORE slicing so skipped javascript:/
                # mailto: anchors do not eat into the 20 returned entries.
                out["links"] = page.evaluate("""
                    Array.from(document.querySelectorAll('a[href]'))
                        .filter(a => a.href && !a.href.startsWith('javascript:') && !a.href.startsWith('mailto:'))
                        .slice(0, 20)
                        .map(a => ({href: a.href, text: (a.innerText || a.textContent || '').trim().substring(0,80)}))
                """)
            finally:
                browser.close()
    except Exception as e:
        # Tool boundary: report the failure in-band instead of raising.
        out["error"] = f"{type(e).__name__}: {e}"
    return out
# ────────────────────────────────────────────────────────────────────────────
# Tool 3: extract_with_local_llm (Ollama JSON mode, free)
# ────────────────────────────────────────────────────────────────────────────
def extract_with_local_llm(
    *,
    text: str,
    schema: dict,
    instructions: str,
    model: str = "llama3.1:8b",
) -> dict:
    """Ask a local Ollama model to pull structured fields out of a text chunk.

    Args:
        text: Raw text or HTML; only the first 6000 characters are sent.
        schema: JSON Schema describing the fields to fill in.
        instructions: Free-form guidance placed at the top of the prompt.
        model: Ollama model name (default llama3.1:8b).

    Returns:
        {"extracted": {<fields>}, "error": None | str}. On any failure
        "extracted" is {} and "error" describes what went wrong.
    """
    try:
        import ollama
    except ImportError:
        return {"extracted": {}, "error": "ollama not installed"}

    # Keep the prompt inside the context window requested below.
    snippet = text[:6000]
    schema_json = json.dumps(schema, indent=2)
    schema_section = f"Schema to fill (JSON):\n```json\n{schema_json}\n```\n\n"
    text_section = f"Text to extract from:\n```\n{snippet}\n```\n\n"
    closing = (
        "Return ONLY valid JSON matching the schema. No prose. No markdown wrapper. "
        "If a field is not present in the text, use null."
    )
    prompt = f"{instructions}\n\n{schema_section}{text_section}{closing}"

    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "format": "json",
        "options": {"temperature": 0.1, "num_ctx": 8192, "num_predict": 800},
    }
    try:
        reply = ollama.chat(**request)
        raw = reply["message"]["content"]
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as exc:
            # Include the head of the reply so the caller can see what came back.
            return {"extracted": {}, "error": f"JSON decode: {exc}: {raw[:200]}"}
        return {"extracted": parsed, "error": None}
    except Exception as exc:
        return {"extracted": {}, "error": f"{type(exc).__name__}: {exc}"}
# Tool 4: save_document (filesystem)
# ────────────────────────────────────────────────────────────────────────────
def _safe_document_name(filename: str, max_len: int = 120) -> str:
    """Reduce *filename* to one safe path component of at most *max_len* chars."""
    cleaned = re.sub(r"[^A-Za-z0-9._\-]+", "_", filename)
    if len(cleaned) <= max_len:
        return cleaned
    # Truncate the stem, not the extension, so a long name stays openable.
    suffix = Path(cleaned).suffix
    if suffix and len(suffix) <= 10:
        return cleaned[: max_len - len(suffix)] + suffix
    return cleaned[:max_len]


def save_document(
    *,
    deal_id: int,
    category: str,
    filename: str,
    content: str | bytes,
) -> dict:
    """Save text/binary content to properties/{deal_folder}/{category}/{filename}.

    Args:
        deal_id: Identifier passed to ``deals_db.get_deal_by_id``.
        category: deeds | liens | court_records | property_appraiser | photos |
            due_diligence | offers
        filename: Desired file name. Characters outside ``[A-Za-z0-9._-]`` are
            replaced with "_" and the result is capped at 120 characters
            (keeping a short extension). Empty or dot-only names are rejected.
        content: ``bytes`` are written as-is; ``str`` is written as UTF-8.

    Returns:
        {"saved_to": <path>, "size": <bytes on disk>} on success,
        {"error": <message>} otherwise. An existing file of the same name is
        overwritten.
    """
    valid_categories = {"deeds", "liens", "court_records", "property_appraiser",
                        "photos", "due_diligence", "offers"}
    if category not in valid_categories:
        return {"error": f"invalid category {category!r}; valid: {sorted(valid_categories)}"}

    # Validate the name before any lookup or I/O. "", "." and ".." would
    # otherwise resolve to a directory rather than a file inside the category.
    safe_name = _safe_document_name(filename) if isinstance(filename, str) else ""
    if not safe_name.strip("."):
        return {"error": f"invalid filename {filename!r}"}

    try:
        from deals_db import get_deal_by_id
        from properties_store import ensure_property_folder
        deal = get_deal_by_id(deal_id)
        if not deal:
            return {"error": f"deal_id={deal_id} not found"}
        folder = ensure_property_folder(deal)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}

    try:
        target = folder / category / safe_name
        # mkdir sits inside the try so a permission/disk error is reported
        # as an error dict like every other failure of this tool.
        target.parent.mkdir(parents=True, exist_ok=True)
        if isinstance(content, bytes):
            target.write_bytes(content)
        else:
            target.write_text(content, encoding="utf-8")
        return {"saved_to": str(target), "size": target.stat().st_size}
    except Exception as e:
        return {"error": f"write failed: {e}"}
def _sniff_document_extension(content: bytes, content_type: str) -> Optional[str]:
    """Return ".pdf"/".png"/".jpg" for a PDF or image payload, else None."""
    head = content[:4]
    # Magic bytes are more trustworthy than the server-declared type.
    if head == b"%PDF":
        return ".pdf"
    if head == b"\x89PNG":
        return ".png"
    if head[:3] == b"\xff\xd8\xff":  # any JPEG flavour, not only JFIF/EXIF
        return ".jpg"
    mime = (content_type or "").split(";", 1)[0].strip().lower()
    if mime.startswith("application/pdf"):
        return ".pdf"
    if mime == "image/png":
        return ".png"
    if mime.startswith("image/"):
        return ".jpg"  # generic fallback for other image types
    return None


def download_pdf(*, deal_id: int, category: str, filename: str, url: str) -> dict:
    """Download a PDF (or image) from URL and save it to the deal's folder.

    Args:
        deal_id: Deal whose property folder receives the file.
        category: Document category, validated by ``save_document``.
        filename: Target file name; an extension matching the detected
            content (.pdf, .png or .jpg) is appended when the name does not
            already end in .pdf/.jpg/.jpeg/.png.
        url: http(s) address of the document.

    Returns:
        The ``save_document`` result on success, otherwise {"error": ...}.
    """
    import urllib.parse
    import urllib.request

    # The URL is chosen by the LLM from scraped pages, so only allow web
    # schemes: urlopen would otherwise also read file:// and ftp:// targets.
    try:
        scheme = urllib.parse.urlsplit(url).scheme.lower() if isinstance(url, str) else ""
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        return {"error": f"unsupported URL scheme in {url!r}; only http/https are allowed"}

    try:
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            # NOTE(review): the body is read in full with no size cap.
            content = resp.read()
            content_type = resp.headers.get("Content-Type", "")
    except Exception as e:
        return {"error": f"download failed: {type(e).__name__}: {e}"}

    # Sanity check: must be a PDF or image
    ext = _sniff_document_extension(content, content_type)
    if ext is None:
        return {"error": f"unexpected content-type {content_type!r}; first bytes: {content[:8].hex()}"}
    if not filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png")):
        filename = filename + ext
    return save_document(deal_id=deal_id, category=category, filename=filename, content=content)
# ────────────────────────────────────────────────────────────────────────────
# Tool 5: portal directory (per-county memory of portals)
# ────────────────────────────────────────────────────────────────────────────
# Portal kinds the directory may store; anything else is rejected by
# lookup_portal / remember_portal.
VALID_PORTAL_TYPES = {
    "building_dept",       # permits, COs
    "clerk_auction",       # foreclosure auction site (realauction.com etc.)
    "code_enforcement",    # violations
    "court_records",       # case docket / lis pendens / civil suit search
    "property_appraiser",  # PA, tax assessor
    "recorder",            # Official Records: deeds, mortgages, liens
    "tax_collector",       # tax bills, delinquencies
}
def _load_portal_directory() -> dict:
    """Read the portal directory JSON stored at PORTAL_DIR_PATH.

    Returns:
        The parsed top-level mapping, or {} when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
        Every case except "missing" is logged as a warning: remember_portal
        saves whatever this function returns plus one entry, so a silently
        ignored corrupt file would be overwritten without a trace.
    """
    import logging

    log = logging.getLogger(__name__)
    try:
        raw = PORTAL_DIR_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}  # first run: nothing remembered yet
    except (OSError, ValueError) as exc:  # ValueError covers UnicodeDecodeError
        log.warning("portal directory %s is unreadable (%s: %s); treating it as empty",
                    PORTAL_DIR_PATH, type(exc).__name__, exc)
        return {}

    try:
        data = json.loads(raw)
    except (ValueError, RecursionError) as exc:  # JSONDecodeError is a ValueError
        log.warning("portal directory %s is not valid JSON (%s); treating it as empty",
                    PORTAL_DIR_PATH, exc)
        return {}

    if not isinstance(data, dict):
        # Callers index the result by state code; a list/str would crash them.
        log.warning("portal directory %s holds a %s, expected an object; treating it as empty",
                    PORTAL_DIR_PATH, type(data).__name__)
        return {}
    return data
def _save_portal_directory(data: dict) -> None:
    """Write *data* to PORTAL_DIR_PATH as pretty-printed, key-sorted UTF-8 JSON.

    The payload is written to a sibling ``.tmp`` file and then renamed over
    the real file, so an interrupted write cannot leave a truncated
    directory behind.

    Raises:
        TypeError / ValueError: if *data* cannot be serialized to JSON.
        OSError: if the temporary file cannot be written or renamed.
    """
    # Serialize first: a serialization error must not touch the disk at all.
    payload = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)
    tmp_path = PORTAL_DIR_PATH.with_name(PORTAL_DIR_PATH.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(PORTAL_DIR_PATH)
    except OSError:
        # Do not leave a stale temp file next to the directory.
        tmp_path.unlink(missing_ok=True)
        raise
def lookup_portal(state: str, county: str, portal_type: str) -> dict:
    """Get known portal URL for (state, county, portal_type).

    The state is matched upper-cased. The county is matched exactly first
    and then ignoring case and surrounding whitespace, because the caller is
    an LLM that does not spell county names consistently.

    Returns:
        Hit:  {"found": True, "state", "county", "portal_type",
               "url": str|None, "notes": str|None, "last_verified": str|None}
        Miss: {"found": False, "state", "county", "portal_type"}
        Invalid portal_type: {"found": False, "error": ...}
        "state" and "county" echo the arguments as given.
    """
    if portal_type not in VALID_PORTAL_TYPES:
        return {"found": False, "error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    miss = {"found": False, "state": state, "county": county, "portal_type": portal_type}
    data = _load_portal_directory()
    state_data = data.get(state.upper(), {})
    if not isinstance(state_data, dict):
        return miss  # hand-edited or damaged file: treat as unknown

    # Exact key first, then case-insensitive variants of the same name.
    candidates = [county]
    if isinstance(county, str):
        wanted = county.strip().casefold()
        candidates.extend(
            key for key in state_data
            if isinstance(key, str) and key != county and key.strip().casefold() == wanted
        )

    for key in candidates:
        county_data = state_data.get(key)
        if not isinstance(county_data, dict):
            continue
        entry = county_data.get(portal_type)
        if isinstance(entry, dict) and entry:
            return {
                "found": True,
                "state": state,
                "county": county,
                "portal_type": portal_type,
                "url": entry.get("url"),
                "notes": entry.get("notes"),
                "last_verified": entry.get("last_verified"),
            }
    return miss
def remember_portal(
    *,
    state: str,
    county: str,
    portal_type: str,
    url: str,
    notes: Optional[str] = None,
) -> dict:
    """Persist: 'For {county}, {state} the {portal_type} is {url}'.

    Lets the agent build up institutional memory over time. The entry is
    stored under the upper-cased state and the county exactly as given, and
    is stamped with today's UTC date as "last_verified". An existing entry
    for the same key is replaced.

    Returns:
        {"ok": True, "state", "county", "portal_type", "url"} on success,
        {"error": <message>} when an argument is invalid or the directory
        cannot be written.
    """
    # An empty URL would shadow the county with a useless "known" portal.
    if not isinstance(url, str) or not url.strip():
        return {"error": f"invalid url {url!r}; expected a non-empty string"}
    if portal_type not in VALID_PORTAL_TYPES:
        return {"error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    data = _load_portal_directory()
    county_entries = data.setdefault(state.upper(), {}).setdefault(county, {})
    county_entries[portal_type] = {
        "url": url,
        "notes": notes,
        "last_verified": datetime.now(timezone.utc).date().isoformat(),
    }
    try:
        _save_portal_directory(data)
    except (OSError, TypeError, ValueError) as e:
        # Report persistence failures in-band, like the other tools do.
        return {"error": f"could not save portal directory: {type(e).__name__}: {e}"}
    return {"ok": True, "state": state, "county": county, "portal_type": portal_type, "url": url}
# ────────────────────────────────────────────────────────────────────────────
# Tool 6: geocode_address (Census, free)
# ────────────────────────────────────────────────────────────────────────────
def geocode_address(address: str) -> dict:
    """Geocode an address using Census (free).

    Returns:
        Whatever ``fetch_geocode`` yields when that is truthy,
        {"error": "no result"} when it is empty/None, and
        {"error": "<ExceptionType>: <message>"} if the import or call fails.
    """
    try:
        from data_fetchers.census_geocode import fetch_geocode
        hit = fetch_geocode(address)
    except Exception as exc:
        return {"error": f"{type(exc).__name__}: {exc}"}
    if hit:
        return hit
    return {"error": "no result"}
# ────────────────────────────────────────────────────────────────────────────
# Tool spec dictionary for Ollama tool calling
# ────────────────────────────────────────────────────────────────────────────
# Document categories offered to the model for save_document / download_pdf.
# Must stay in sync with the valid_categories set inside save_document.
_DOCUMENT_CATEGORIES = (
    "deeds", "liens", "court_records", "property_appraiser",
    "photos", "due_diligence", "offers",
)

# Function-calling specs handed to Ollama. Enum lists built from
# VALID_PORTAL_TYPES are sorted: iterating the set directly yields a
# different order from run to run (string hashing is randomized), which
# makes the tool definitions sent to the model non-reproducible.
OLLAMA_TOOL_SPECS = [
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web (DuckDuckGo, free). Use for discovering county portals, finding case# info, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query in English"},
                    "max_results": {"type": "integer", "description": "Max results to return (default 5)", "default": 5},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Load a URL with Playwright (handles JS). Returns title, text, and top links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string"},
                    "wait_seconds": {"type": "integer", "description": "Seconds to wait for JS to render after page load (default 5)", "default": 5},
                },
                "required": ["url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "extract_with_local_llm",
            "description": "Extract structured JSON from text/HTML using a local LLM. Use after fetch_url to parse the page.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "Text or HTML to extract from (cap 6K chars)"},
                    "schema": {"type": "object", "description": "JSON schema describing fields to extract"},
                    "instructions": {"type": "string", "description": "Prose instructions for the extractor"},
                },
                "required": ["text", "schema", "instructions"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "lookup_portal",
            "description": "Check if we already know the portal URL for a (state, county, portal_type). ALWAYS call this first before web_search.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string", "description": "2-letter state code (e.g. NC, FL)"},
                    "county": {"type": "string", "description": "County name (e.g. Wake, Miami-Dade)"},
                    "portal_type": {"type": "string", "enum": sorted(VALID_PORTAL_TYPES)},
                },
                "required": ["state", "county", "portal_type"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "remember_portal",
            "description": "Save a portal URL we discovered. Call this AFTER you confirm a working portal so future searches in the same county are faster.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string"},
                    "county": {"type": "string"},
                    "portal_type": {"type": "string", "enum": sorted(VALID_PORTAL_TYPES)},
                    "url": {"type": "string"},
                    "notes": {"type": "string", "description": "Brief notes about the portal (optional)"},
                },
                "required": ["state", "county", "portal_type", "url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "save_document",
            "description": "Save text content to a deal's property folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": list(_DOCUMENT_CATEGORIES)},
                    "filename": {"type": "string"},
                    "content": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "content"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "download_pdf",
            "description": "Download a PDF (or image) from URL and save to deal's folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": list(_DOCUMENT_CATEGORIES)},
                    "filename": {"type": "string"},
                    "url": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "url"],
            },
        },
    },
    # geocode_address is listed as an available tool in the module docstring
    # but had no spec, so the model could never call it.
    {
        "type": "function",
        "function": {
            "name": "geocode_address",
            "description": "Geocode an address using Census (free).",
            "parameters": {
                "type": "object",
                "properties": {
                    "address": {"type": "string", "description": "Address to geocode"},
                },
                "required": ["address"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "finish",
            "description": "Call this when research is complete. Provide a final summary of findings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "summary": {"type": "string", "description": "Brief summary of what was found"},
                    "portals_used": {"type": "array", "items": {"type": "string"}, "description": "List of portal URLs successfully used"},
                    "documents_saved": {"type": "array", "items": {"type": "string"}, "description": "List of paths of documents saved"},
                    "findings": {"type": "object", "description": "Key structured findings (plaintiff, owner, etc)"},
                },
                "required": ["summary"],
            },
        },
    },
]

# Dispatch dict so the loop can call the right Python function by name.
# NOTE(review): "finish" has a spec but no entry here — presumably the agent
# loop intercepts it to terminate; confirm against the loop implementation.
TOOL_DISPATCH = {
    "web_search": web_search,
    "fetch_url": fetch_url,
    "extract_with_local_llm": extract_with_local_llm,
    "save_document": save_document,
    "download_pdf": download_pdf,
    "lookup_portal": lookup_portal,
    "remember_portal": remember_portal,
    "geocode_address": geocode_address,
}