# AR-House/data_fetchers/pa_photo_lookup.py

"""data_fetchers/pa_photo_lookup.py — Buscar fotos de propiedad en sitios PA (gratis).

PROPÓSITO:
Los County Property Appraisers (PA) de Florida tienen fotos de las propiedades.
Acceso público vía Playwright (cero costo Firecrawl).

Es la alternativa GRATIS a `zillow_photo_lookup` (que cuesta 1 credit por property).

COVERAGE actual:
- Broward (bcpa.net): ✓ tested, 100% hit rate en 3-sample
- Miami-Dade (miamidadepa.gov): ✗ solo aerial, no street photo
- Duval (paopropertysearch.coj.net): pendiente investigar URL correcta
- Otros counties: stub para Phase 3.5.B

USO:
    from data_fetchers.pa_photo_lookup import fetch_pa_photo
    url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
"""
from __future__ import annotations

from typing import Optional


def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Fetch a property photo URL from a County Property Appraiser site.

    The county name is normalized (lowercased, " county" removed, surrounding
    whitespace stripped, remaining spaces replaced by "_") and used to look up
    a fetcher in ``_FETCHERS``. Exceptions raised by that fetcher are caught
    and reported through ``metadata["error"]`` instead of propagating.

    Args:
        county: County name (e.g. "Broward", "Miami-Dade", "Duval").
        parcel_id: County-specific parcel/folio number. Surrounding whitespace
            is ignored; an empty or whitespace-only value is reported as
            "no parcel_id" without calling any fetcher.
        timeout_seconds: Forwarded unchanged to the county fetcher.

    Returns:
        A ``(photo_url, metadata)`` tuple. ``photo_url`` is a string or None.
        ``metadata`` has the keys ``county`` and ``parcel_id`` (both exactly
        as passed in), ``source`` (the name reported by the fetcher, or None)
        and ``error`` (a message string, or None when the fetcher ran without
        raising).
    """
    meta = {"county": county, "parcel_id": parcel_id, "source": None, "error": None}

    # A blank or whitespace-only id cannot match a folio; bail out before the
    # expensive browser-based fetch is started.
    folio = str(parcel_id).strip() if parcel_id else ""
    if not folio:
        meta["error"] = "no parcel_id"
        return None, meta

    cnorm = (county or "").lower().replace(" county", "").strip().replace(" ", "_")

    fetcher = _FETCHERS.get(cnorm)
    if not fetcher:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS)})"
        return None, meta

    # Keep the try body limited to the call that can actually fail.
    try:
        url, source_name = fetcher(folio, timeout_seconds)
    except Exception as e:
        meta["error"] = f"{type(e).__name__}: {e}"
        return None, meta

    meta["source"] = source_name
    return url, meta


# ────────────────────────────────────────────────────────────────────────────
# Per-county implementations
# ────────────────────────────────────────────────────────────────────────────

_CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"


def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Look up the first property photo on the Broward bcpa.net record page.

    The record page is a single-page app addressed as
    ``https://web.bcpa.net/bcpaclient/#/Record-Search?folio=<parcel_id>``.
    Photos are ``<img>`` elements whose ``src`` contains ``/Photographs/``.

    Args:
        parcel_id: Folio number interpolated into the record URL.
        timeout_seconds: Used (in milliseconds) both as the page's default
            timeout and as the navigation timeout.

    Returns:
        ``(photo_url, "bcpa.net")`` where ``photo_url`` is the ``src`` of the
        first matching image, or None when no image matched.
    """
    import time

    from playwright.sync_api import sync_playwright

    timeout_ms = timeout_seconds * 1000
    record_url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
    # Runs in the page: keep only /Photographs/ images whose natural width
    # exceeds 200px and return their src values.
    image_query = (
        "Array.from(document.querySelectorAll('img'))"
        ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
        ".map(i => i.src)"
    )

    found = None
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        try:
            context = chromium.new_context(
                user_agent=_CHROME_UA,
                viewport={"width": 1280, "height": 900},
            )
            tab = context.new_page()
            tab.set_default_timeout(timeout_ms)
            tab.goto(record_url, wait_until="networkidle", timeout=timeout_ms)
            # Fixed pause after network idle to let the SPA finish rendering.
            time.sleep(7)
            matches = tab.evaluate(image_query)
            found = matches[0] if matches else None
        finally:
            # Always release the browser, even when navigation or evaluation fails.
            chromium.close()
    return found, "bcpa.net"


def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
    """Optimized batch fetcher for Broward.

    Re-uses browser across folios (single session) for speed.
    Returns: {parcel_id: photo_url or None}
    """
    from playwright.sync_api import sync_playwright
    import time

    out: dict[str, Optional[str]] = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
        for parcel_id in parcel_ids:
            page = ctx.new_page()
            page.set_default_timeout(timeout_seconds * 1000)
            url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
            try:
                page.goto(url, wait_until="networkidle", timeout=timeout_seconds * 1000)
                time.sleep(7)
                photos = page.evaluate(
                    "Array.from(document.querySelectorAll('img'))"
                    ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
                    ".map(i => i.src)"
                )
                out[parcel_id] = photos[0] if photos else None
            except Exception:
                out[parcel_id] = None
            page.close()
        browser.close()
    return out


# Registry mapping a normalized county key to its fetcher function.
# Keys must match the normalization done in fetch_pa_photo: lowercased,
# " county" removed, surrounding whitespace stripped, remaining spaces
# replaced by "_". Each fetcher is called as fetcher(parcel_id, timeout_seconds)
# and returns (photo_url_or_None, source_name).
_FETCHERS = {
    "broward": _fetch_broward,
    # TODO Phase 3.5.B:
    # "miami-dade": _fetch_miami_dade,  (only aerial, no street photo — skip)
    # "duval": _fetch_duval,
    # "hillsborough": _fetch_hillsborough,
    # "orange": _fetch_orange,
}


def list_supported_counties() -> list[str]:
    """Return the keys of the fetcher registry in ascending sorted order."""
    supported = list(_FETCHERS)
    supported.sort()
    return supported