# AR-House/agent_tools.py

"""agent_tools.py — Tools that the local AI can call to research properties.

PRINCIPLE:
The local AI (Ollama) runs a reasoning loop that decides which tool to call next.
Each tool returns structured data that the AI uses to decide its next step.

Available tools:
  - web_search(query, max_results)
  - fetch_url(url)
  - extract_with_local_llm(text, schema, instructions)
  - save_document(deal_id, category, filename, content)
  - download_pdf(deal_id, category, filename, url)
  - remember_portal(state, county, portal_type, url, notes)
  - lookup_portal(state, county, portal_type)
  - geocode_address(address)
  - finish(summary)   ← ends the loop

COST:
- web_search: $0 (DuckDuckGo HTML)
- fetch_url: $0 (Playwright local)
- extract_with_local_llm: $0 (Ollama local)
- save_document, remember_portal, lookup_portal: $0 (filesystem/JSON)
- geocode_address: $0 (Census)
"""
from __future__ import annotations

import hashlib
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional


# ────────────────────────────────────────────────────────────────────────────
# Paths
# ────────────────────────────────────────────────────────────────────────────

# Directory that contains this module; the paths below are anchored to it.
PROJECT_ROOT = Path(__file__).resolve().parents[0]
# JSON file backing the portal directory (state -> county -> portal_type).
PORTAL_DIR_PATH = PROJECT_ROOT.joinpath("portal_directory.json")
# `properties/` directory next to this module.
PROPERTIES_ROOT = PROJECT_ROOT.joinpath("properties")


# ────────────────────────────────────────────────────────────────────────────
# Tool 1: web_search (DuckDuckGo, free)
# ────────────────────────────────────────────────────────────────────────────

def web_search(query: str, max_results: int = 5) -> dict:
    """Search the web via DuckDuckGo (free, no API key).

    The arguments come from a model-generated tool call, so they are
    validated defensively: a blank query is rejected without touching the
    network, and ``max_results`` is coerced to an integer >= 1 (falling back
    to 5 when it cannot be parsed).

    Args:
        query: Search terms.
        max_results: Maximum number of hits to request.

    Returns:
        {"query": ..., "results": [{"title", "url", "snippet"}, ...]}
        plus an "error" key (and empty "results") when the search failed.
    """
    cleaned = query.strip() if isinstance(query, str) else ""
    if not cleaned:
        return {"query": query, "error": "empty query", "results": []}

    try:
        limit = max(1, int(max_results))
    except (TypeError, ValueError):
        limit = 5

    try:
        from ddgs import DDGS
    except ImportError:
        return {"query": query, "error": "ddgs package not installed", "results": []}

    try:
        with DDGS() as ddgs:
            raw = ddgs.text(cleaned, max_results=limit, region="us-en") or []
            # `or ""` (rather than a .get default) also covers fields that are
            # present but None, so one odd hit cannot discard the whole search.
            results = [
                {
                    "title": (hit.get("title") or "")[:200],
                    "url": hit.get("href") or "",
                    "snippet": (hit.get("body") or "")[:300],
                }
                for hit in raw
            ]
        return {"query": query, "results": results}
    except Exception as e:
        # Tool boundary: report the failure to the agent instead of raising.
        return {"query": query, "error": f"{type(e).__name__}: {e}", "results": []}


# ────────────────────────────────────────────────────────────────────────────
# Tool 2: fetch_url (Playwright, free)
# ────────────────────────────────────────────────────────────────────────────

_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"


def fetch_url(url: str, wait_seconds: int = 5, timeout_seconds: int = 20) -> dict:
    """Fetch a page with headless Chromium (Playwright) and summarise it.

    Only ``http``/``https`` URLs are accepted: the URL is chosen by the model
    (often from scraped content), so ``file:``, ``javascript:``, ``data:`` and
    similar schemes are refused instead of being handed to the browser.

    Args:
        url: Absolute http(s) URL to load.
        wait_seconds: Extra time to let client-side JS render after
            ``domcontentloaded``; negative values are treated as 0.
        timeout_seconds: Navigation and default Playwright timeout.

    Returns:
        {"url", "status", "title", "text", "links": [{"href","text"}],
         "html_size", "error"} — ``error`` is None on success. When a step
        fails, the fields gathered before the failure are kept.
    """
    out = {"url": url, "status": None, "title": "", "text": "",
           "links": [], "html_size": 0, "error": None}

    if not isinstance(url, str) or not url.strip().lower().startswith(("http://", "https://")):
        out["error"] = f"unsupported URL scheme (only http/https allowed): {url!r}"
        return out
    target_url = url.strip()

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        out["error"] = "playwright not available"
        return out

    try:
        timeout_ms = float(timeout_seconds) * 1000
        settle_ms = max(0.0, float(wait_seconds)) * 1000
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=_UA, viewport={"width": 1280, "height": 900})
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                resp = page.goto(target_url, wait_until="domcontentloaded", timeout=timeout_ms)
                # Playwright's sync API asks for wait_for_timeout rather than
                # time.sleep so its internal event handling keeps running.
                page.wait_for_timeout(settle_ms)

                out["status"] = resp.status if resp else None
                out["title"] = page.title()[:200]
                # document.body can be null (e.g. non-HTML responses).
                body_text = page.evaluate("document.body ? document.body.innerText : ''")
                out["text"] = (body_text or "")[:8000]
                out["html_size"] = len(page.content())

                # First 20 navigable links: filter before slicing so that
                # javascript:/mailto: anchors do not use up the quota.
                links = page.evaluate("""
                    Array.from(document.querySelectorAll('a[href]'))
                        .filter(a => a.href && !a.href.startsWith('javascript:') && !a.href.startsWith('mailto:'))
                        .slice(0, 20)
                        .map(a => ({href: a.href, text: (a.innerText || a.textContent || '').trim().substring(0,80)}))
                """)
                out["links"] = links[:20]
            finally:
                browser.close()
    except Exception as e:
        # Tool boundary: surface the failure to the agent instead of raising.
        out["error"] = f"{type(e).__name__}: {e}"
    return out


# ────────────────────────────────────────────────────────────────────────────
# Tool 3: extract_with_local_llm (Ollama JSON mode, free)
# ────────────────────────────────────────────────────────────────────────────

def extract_with_local_llm(
    *,
    text: str,
    schema: dict,
    instructions: str,
    model: str = "llama3.1:8b",
) -> dict:
    """Use a local Ollama model to extract structured data from text.

    Every failure mode is reported through the returned ``error`` field,
    including malformed arguments (non-string ``text``, a ``schema`` that
    cannot be serialised to JSON), so the calling loop never has to catch.

    Args:
        text: raw text or HTML chunk (truncated to 6000 chars to fit context)
        schema: JSON Schema describing fields to extract
        instructions: prose instructions for the model
        model: Ollama model name (default llama3.1:8b)

    Returns:
        {"extracted": {<fields>}, "error": None | str}
    """
    # Validate arguments first: they originate from a model-generated tool call.
    if not isinstance(text, str):
        return {"extracted": {}, "error": f"text must be a string, got {type(text).__name__}"}
    try:
        schema_json = json.dumps(schema, indent=2)
    except (TypeError, ValueError) as e:
        return {"extracted": {}, "error": f"schema is not JSON-serializable: {e}"}

    try:
        import ollama
    except ImportError:
        return {"extracted": {}, "error": "ollama not installed"}

    # Cap text so the prompt stays inside the num_ctx window requested below.
    text = text[:6000]

    prompt = (
        f"{instructions}\n\n"
        f"Schema to fill (JSON):\n```json\n{schema_json}\n```\n\n"
        f"Text to extract from:\n```\n{text}\n```\n\n"
        f"Return ONLY valid JSON matching the schema. No prose. No markdown wrapper. "
        f"If a field is not present in the text, use null."
    )

    try:
        resp = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            format="json",
            options={"temperature": 0.1, "num_ctx": 8192, "num_predict": 800},
        )
        content = resp["message"]["content"]
        try:
            data = json.loads(content)
            return {"extracted": data, "error": None}
        except json.JSONDecodeError as e:
            # Include the head of the reply so the caller can see what came back.
            return {"extracted": {}, "error": f"JSON decode: {e}: {content[:200]}"}
    except Exception as e:
        # Tool boundary: connection/model errors are reported, not raised.
        return {"extracted": {}, "error": f"{type(e).__name__}: {e}"}


# ────────────────────────────────────────────────────────────────────────────
# Tool 4: save_document (filesystem)
# ────────────────────────────────────────────────────────────────────────────

def save_document(
    *,
    deal_id: int,
    category: str,
    filename: str,
    content: str | bytes,
) -> dict:
    """Save text/binary content to properties/{deal_folder}/{category}/{filename}.

    The filename is reduced to ``[A-Za-z0-9._-]`` (other runs become ``_``)
    and limited to 120 characters while keeping a short extension. Names that
    are empty or consist only of dots (``""``, ``"."``, ``".."``) are rejected
    because they would resolve to a directory instead of a file.

    Args:
        deal_id: Deal whose property folder receives the file.
        category: deeds | liens | court_records | property_appraiser | photos |
                  due_diligence | offers
        filename: Desired file name (sanitised as described above).
        content: ``str`` is written as UTF-8 text; bytes-like data is written
            verbatim.

    Returns:
        {"saved_to": <path>, "size": <bytes>} on success, else {"error": ...}.
    """
    valid_categories = {"deeds", "liens", "court_records", "property_appraiser",
                        "photos", "due_diligence", "offers"}
    if category not in valid_categories:
        return {"error": f"invalid category {category!r}; valid: {sorted(valid_categories)}"}

    # Sanitize the filename before any DB/filesystem work.
    if not isinstance(filename, str):
        return {"error": f"invalid filename {filename!r}"}
    safe_name = re.sub(r"[^A-Za-z0-9._\-]+", "_", filename)
    if len(safe_name) > 120:
        stem, dot, ext = safe_name.rpartition(".")
        if dot and stem and len(ext) <= 10:
            # Trim the stem, not the extension, so the file type survives.
            safe_name = stem[: 120 - len(ext) - 1] + "." + ext
        else:
            safe_name = safe_name[:120]
    if not safe_name.strip("."):
        return {"error": f"invalid filename {filename!r}"}

    try:
        from deals_db import get_deal_by_id
        from properties_store import ensure_property_folder
        deal = get_deal_by_id(deal_id)
        if not deal:
            return {"error": f"deal_id={deal_id} not found"}
        folder = ensure_property_folder(deal)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}

    target = folder / category / safe_name

    try:
        # mkdir sits inside the try so permission/disk errors are reported too.
        target.parent.mkdir(parents=True, exist_ok=True)
        if isinstance(content, (bytes, bytearray, memoryview)):
            target.write_bytes(content)
        else:
            target.write_text(content, encoding="utf-8")
        return {"saved_to": str(target), "size": target.stat().st_size}
    except Exception as e:
        return {"error": f"write failed: {e}"}


def download_pdf(*, deal_id: int, category: str, filename: str, url: str) -> dict:
    """Download a PDF (or JPEG/PNG image) from ``url`` and store it via save_document.

    Only ``http``/``https`` URLs are fetched: the URL is model-supplied and
    ``urllib`` would otherwise also open ``file:`` URLs. The body is read up
    to a fixed size limit so a hostile or mistaken URL cannot exhaust memory.

    Args:
        deal_id: Deal whose folder receives the file.
        category: Document category (validated by save_document).
        filename: Target name; ``.pdf``/``.png``/``.jpg`` is appended when it
            has none of the accepted extensions.
        url: Absolute http(s) URL of the document.

    Returns:
        The save_document result, or {"error": ...} when the download fails
        or the payload is neither a PDF nor an image.
    """
    import urllib.request

    if not isinstance(url, str) or not url.strip().lower().startswith(("http://", "https://")):
        return {"error": f"unsupported URL scheme (only http/https allowed): {url!r}"}

    max_bytes = 100 * 1024 * 1024  # upper bound on what is held in memory
    try:
        req = urllib.request.Request(url.strip(), headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            # Read one byte past the limit so oversize bodies are detectable.
            content = resp.read(max_bytes + 1)
            content_type = resp.headers.get("Content-Type", "")
    except Exception as e:
        return {"error": f"download failed: {type(e).__name__}: {e}"}

    if len(content) > max_bytes:
        return {"error": f"download too large (> {max_bytes} bytes)"}

    # Sanity check: must be a PDF or image (by header or by magic bytes).
    declared = content_type.lower()
    is_pdf = declared.startswith("application/pdf") or content[:4] == b"%PDF"
    is_png = declared.startswith("image/png") or content[:4] == b"\x89PNG"
    # Every JPEG starts with FF D8 FF; the 4th byte varies by variant.
    is_img = is_png or declared.startswith("image/") or content[:3] == b"\xff\xd8\xff"

    if not (is_pdf or is_img):
        return {"error": f"unexpected content-type {content_type!r}; first bytes: {content[:8].hex()}"}

    if not filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png")):
        if is_pdf:
            filename = filename + ".pdf"
        elif is_png:
            filename = filename + ".png"
        else:
            filename = filename + ".jpg"

    return save_document(deal_id=deal_id, category=category, filename=filename, content=content)


# ────────────────────────────────────────────────────────────────────────────
# Tool 5: portal directory (memory of known portals per county)
# ────────────────────────────────────────────────────────────────────────────

# Portal categories accepted by lookup_portal / remember_portal.
VALID_PORTAL_TYPES = {
    # Case docket / lis pendens / civil suit search.
    "court_records",
    # Property appraiser (PA) / tax assessor.
    "property_appraiser",
    # Official Records: deeds, mortgages, liens.
    "recorder",
    # Tax bills, delinquencies.
    "tax_collector",
    # Code violations.
    "code_enforcement",
    # Foreclosure auction site (realauction.com etc.).
    "clerk_auction",
    # Permits, certificates of occupancy.
    "building_dept",
}


def _load_portal_directory() -> dict:
    """Read the portal directory JSON from PORTAL_DIR_PATH.

    Best-effort by design: a missing, unreadable, or malformed file yields an
    empty directory instead of an exception. A file whose top-level JSON
    value is not an object is also treated as empty, because callers
    immediately use dict methods on the result.

    Returns:
        The parsed ``{state: {county: {portal_type: entry}}}`` mapping, or {}.
    """
    try:
        loaded = json.loads(PORTAL_DIR_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file absent/unreadable. ValueError: bad JSON or bad UTF-8
        # (JSONDecodeError and UnicodeDecodeError both derive from it).
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _save_portal_directory(data: dict) -> None:
    """Write the portal directory to PORTAL_DIR_PATH as pretty-printed JSON.

    The payload is written to a sibling ``*.tmp`` file and then moved over
    the real file, so an interrupted write cannot leave a truncated
    directory behind (the loader would treat that as empty).

    Args:
        data: Full directory mapping to persist.

    Raises:
        TypeError / ValueError: if ``data`` is not JSON-serialisable.
        OSError: if the file cannot be written or replaced.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)
    tmp_path = PORTAL_DIR_PATH.with_name(PORTAL_DIR_PATH.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Same-directory rename: replaces the destination in one step.
    tmp_path.replace(PORTAL_DIR_PATH)


def _get_case_insensitive(mapping: Any, key: str) -> Any:
    """Return mapping[key], else the value whose key matches ignoring case/outer spaces."""
    if not isinstance(mapping, dict):
        return None
    if key in mapping:
        return mapping[key]
    wanted = key.strip().casefold()
    for candidate, value in mapping.items():
        if isinstance(candidate, str) and candidate.strip().casefold() == wanted:
            return value
    return None


def lookup_portal(state: str, county: str, portal_type: str) -> dict:
    """Get known portal URL for (state, county, portal_type).

    The exact stored key is tried first; if that misses, state and county are
    matched ignoring case and surrounding whitespace, so "wake" finds an
    entry remembered as "Wake". Nodes that are not JSON objects (e.g. after a
    manual edit of the file) are treated as "not found".

    Args:
        state: State code; upper-cased before lookup.
        county: County name.
        portal_type: One of VALID_PORTAL_TYPES.

    Returns: {"found": bool, "url": str|None, "notes": str|None, "last_verified": str|None}
        (plus the echoed state/county/portal_type; "error" for a bad portal_type).
    """
    if portal_type not in VALID_PORTAL_TYPES:
        return {"found": False, "error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    data = _load_portal_directory()
    state_data = _get_case_insensitive(data, state.upper())
    county_data = _get_case_insensitive(state_data, county)
    entry = county_data.get(portal_type) if isinstance(county_data, dict) else None

    if not entry or not isinstance(entry, dict):
        return {"found": False, "state": state, "county": county, "portal_type": portal_type}
    return {
        "found": True,
        "state": state,
        "county": county,
        "portal_type": portal_type,
        "url": entry.get("url"),
        "notes": entry.get("notes"),
        "last_verified": entry.get("last_verified"),
    }


def remember_portal(
    *,
    state: str,
    county: str,
    portal_type: str,
    url: str,
    notes: Optional[str] = None,
) -> dict:
    """Persist: 'For {county}, {state} the {portal_type} is {url}'.

    Lets the agent build up institutional memory over time. Blank values are
    refused: an entry with an empty URL would later be reported as a known
    portal and stop the agent from searching for the real one.

    Args:
        state: State code; stored upper-cased.
        county: County name; stored as given.
        portal_type: One of VALID_PORTAL_TYPES.
        url: Portal URL (surrounding whitespace is removed before storing).
        notes: Optional free-text notes about the portal.

    Returns:
        {"ok": True, "state", "county", "portal_type", "url"} on success,
        else {"error": ...}.
    """
    if portal_type not in VALID_PORTAL_TYPES:
        return {"error": f"invalid portal_type {portal_type!r}; valid: {sorted(VALID_PORTAL_TYPES)}"}

    required = {"state": state, "county": county, "url": url}
    blank = [name for name, value in required.items()
             if not isinstance(value, str) or not value.strip()]
    if blank:
        return {"error": f"missing or empty field(s): {', '.join(blank)}"}
    clean_url = url.strip()

    data = _load_portal_directory()
    state_key = state.upper()
    county_entries = data.setdefault(state_key, {}).setdefault(county, {})
    county_entries[portal_type] = {
        "url": clean_url,
        "notes": notes,
        # Date only (YYYY-MM-DD), in UTC.
        "last_verified": datetime.now(timezone.utc).date().isoformat(),
    }
    _save_portal_directory(data)
    return {"ok": True, "state": state, "county": county, "portal_type": portal_type, "url": clean_url}


# ────────────────────────────────────────────────────────────────────────────
# Tool 6: geocode_address (Census, free)
# ────────────────────────────────────────────────────────────────────────────

def geocode_address(address: str) -> dict:
    """Resolve an address through the project's Census geocoder (free).

    Returns whatever ``fetch_geocode`` yields when it is truthy; otherwise
    {"error": "no result"}. Import or lookup failures are reported as
    {"error": "<ExceptionType>: <message>"} rather than raised.
    """
    try:
        from data_fetchers.census_geocode import fetch_geocode

        hit = fetch_geocode(address)
        if hit:
            return hit
        return {"error": "no result"}
    except Exception as exc:
        return {"error": f"{type(exc).__name__}: {exc}"}


# ────────────────────────────────────────────────────────────────────────────
# Tool spec dictionary for Ollama tool calling
# ────────────────────────────────────────────────────────────────────────────

# Function-calling specs advertised to the model, one entry per tool.
# The portal_type enums are sorted so the spec is identical on every run
# (iterating the VALID_PORTAL_TYPES set directly would vary with string hash
# randomisation).
# NOTE(review): "finish" has no Python implementation in this file —
# presumably the agent loop intercepts it; confirm against the caller.
OLLAMA_TOOL_SPECS = [
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web (DuckDuckGo, free). Use for discovering county portals, finding case# info, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query in English"},
                    "max_results": {"type": "integer", "description": "Max results to return (default 5)", "default": 5},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Load a URL with Playwright (handles JS). Returns title, text, and top links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string"},
                    "wait_seconds": {"type": "integer", "description": "Seconds to wait for JS to render after page load (default 5)", "default": 5},
                },
                "required": ["url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "extract_with_local_llm",
            "description": "Extract structured JSON from text/HTML using a local LLM. Use after fetch_url to parse the page.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "Text or HTML to extract from (cap 6K chars)"},
                    "schema": {"type": "object", "description": "JSON schema describing fields to extract"},
                    "instructions": {"type": "string", "description": "Prose instructions for the extractor"},
                },
                "required": ["text", "schema", "instructions"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "lookup_portal",
            "description": "Check if we already know the portal URL for a (state, county, portal_type). ALWAYS call this first before web_search.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string", "description": "2-letter state code (e.g. NC, FL)"},
                    "county": {"type": "string", "description": "County name (e.g. Wake, Miami-Dade)"},
                    "portal_type": {"type": "string", "enum": sorted(VALID_PORTAL_TYPES)},
                },
                "required": ["state", "county", "portal_type"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "remember_portal",
            "description": "Save a portal URL we discovered. Call this AFTER you confirm a working portal so future searches in the same county are faster.",
            "parameters": {
                "type": "object",
                "properties": {
                    "state": {"type": "string"},
                    "county": {"type": "string"},
                    "portal_type": {"type": "string", "enum": sorted(VALID_PORTAL_TYPES)},
                    "url": {"type": "string"},
                    "notes": {"type": "string", "description": "Brief notes about the portal (optional)"},
                },
                "required": ["state", "county", "portal_type", "url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "save_document",
            "description": "Save text content to a deal's property folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": ["deeds","liens","court_records","property_appraiser","photos","due_diligence","offers"]},
                    "filename": {"type": "string"},
                    "content": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "content"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "download_pdf",
            "description": "Download a PDF (or image) from URL and save to deal's folder.",
            "parameters": {
                "type": "object",
                "properties": {
                    "deal_id": {"type": "integer"},
                    "category": {"type": "string", "enum": ["deeds","liens","court_records","property_appraiser","photos","due_diligence","offers"]},
                    "filename": {"type": "string"},
                    "url": {"type": "string"},
                },
                "required": ["deal_id", "category", "filename", "url"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "finish",
            "description": "Call this when research is complete. Provide a final summary of findings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "summary": {"type": "string", "description": "Brief summary of what was found"},
                    "portals_used": {"type": "array", "items": {"type": "string"}, "description": "List of portal URLs successfully used"},
                    "documents_saved": {"type": "array", "items": {"type": "string"}, "description": "List of paths of documents saved"},
                    "findings": {"type": "object", "description": "Key structured findings (plaintiff, owner, etc)"},
                },
                "required": ["summary"],
            },
        },
    },
]


# Name -> callable table the agent loop uses to run a tool call.
# "finish" is advertised in OLLAMA_TOOL_SPECS but has no entry here —
# presumably the loop handles it itself (confirm against the caller).
TOOL_DISPATCH = dict(
    web_search=web_search,
    fetch_url=fetch_url,
    extract_with_local_llm=extract_with_local_llm,
    save_document=save_document,
    download_pdf=download_pdf,
    lookup_portal=lookup_portal,
    remember_portal=remember_portal,
)