"""data_fetchers/pa_photo_lookup.py — Look up property photos on PA sites (free).

PURPOSE:
    Florida County Property Appraisers (PA) publish photos of properties.
    They are publicly accessible via Playwright (zero Firecrawl cost). This is
    the FREE alternative to `zillow_photo_lookup` (which costs 1 credit per
    property).

CURRENT COVERAGE:
    - Broward (bcpa.net): ✓ tested, 100% hit rate on a 3-sample
    - Miami-Dade (miamidadepa.gov): ✗ aerial only, no street photo
    - Duval (paopropertysearch.coj.net): correct URL still to be researched
    - Other counties: stub for Phase 3.5.B

USAGE:
    from data_fetchers.pa_photo_lookup import fetch_pa_photo
    url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
"""
from __future__ import annotations

import logging
import time
from typing import Optional

_logger = logging.getLogger(__name__)


def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Fetch a property photo URL from a County Property Appraiser site.

    This function never raises for fetcher failures: any exception raised by
    the per-county fetcher (including a missing Playwright install, since the
    fetchers import it lazily) is reported through ``metadata["error"]``.

    Args:
        county: county name (e.g. "Broward", "Miami-Dade", "Duval"). Matching
            is case-insensitive and ignores a trailing " County".
        parcel_id: county-specific parcel/folio number. Surrounding whitespace
            is stripped before it is handed to the fetcher.
        timeout_seconds: max wait per Playwright call.

    Returns:
        (photo_url, metadata)
        photo_url: str, or None when no photo was found or an error occurred.
        metadata: {county, parcel_id, source, error}. ``county`` and
            ``parcel_id`` echo the arguments exactly as passed in.
    """
    meta = {"county": county, "parcel_id": parcel_id, "source": None, "error": None}

    # Treat whitespace-only IDs like missing ones instead of spending a
    # browser session on a lookup that cannot match a folio.
    folio = str(parcel_id).strip() if parcel_id else ""
    if not folio:
        meta["error"] = "no parcel_id"
        return None, meta

    # "Broward County" -> "broward"; inner spaces become underscores.
    cnorm = (county or "").lower().replace(" county", "").strip().replace(" ", "_")
    fetcher = _FETCHERS.get(cnorm)
    if not fetcher:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS.keys())})"
        return None, meta

    try:
        url, source_name = fetcher(folio, timeout_seconds)
    except Exception as e:  # boundary: failures are reported via meta, not raised
        meta["error"] = f"{type(e).__name__}: {e}"
        return None, meta

    meta["source"] = source_name
    return url, meta


# ────────────────────────────────────────────────────────────────────────────
# Per-county implementations
# ────────────────────────────────────────────────────────────────────────────

_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
)

# Browser viewport used for every Broward page.
_VIEWPORT = {"width": 1280, "height": 900}

# Record page of the Broward SPA; the folio goes into the hash-route query.
_BROWARD_RECORD_URL = "https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

# Collects the src of every <img> served from a "/Photographs/" path whose
# natural width exceeds 200 px (presumably to skip icons/thumbnails — confirm).
_BROWARD_PHOTO_JS = (
    "Array.from(document.querySelectorAll('img'))"
    ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
    ".map(i => i.src)"
)

# Fixed pause after navigation so the SPA can finish rendering its images.
_SPA_RENDER_DELAY_SECONDS = 7


def _scrape_broward_photo(page, folio: str, timeout_seconds: int) -> Optional[str]:
    """Navigate *page* to the Broward record for *folio* and return its photo URL.

    Returns the first matching image URL, or None when the page shows none.
    Exceptions raised by Playwright propagate to the caller.
    """
    timeout_ms = timeout_seconds * 1000  # Playwright calls below take milliseconds
    page.set_default_timeout(timeout_ms)
    page.goto(
        _BROWARD_RECORD_URL.format(folio=folio),
        wait_until="networkidle",
        timeout=timeout_ms,
    )
    time.sleep(_SPA_RENDER_DELAY_SECONDS)  # SPA render delay
    photos = page.evaluate(_BROWARD_PHOTO_JS)
    return photos[0] if photos else None


def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Broward bcpa.net SPA fetcher.

    URL pattern: https://web.bcpa.net/bcpaclient/#/Record-Search?folio=XXX
    The photo is taken from the first <img> on the rendered page whose src
    contains "/Photographs/".

    Returns:
        (photo_url or None, "bcpa.net")
    """
    # Imported lazily so the module can be imported without Playwright.
    from playwright.sync_api import sync_playwright

    photo = None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport=_VIEWPORT)
            page = ctx.new_page()
            photo = _scrape_broward_photo(page, parcel_id, timeout_seconds)
        finally:
            browser.close()
    return photo, "bcpa.net"


def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
    """Optimized batch fetcher for Broward.

    Re-uses one browser (single session) across folios for speed. The lookup
    is best-effort per folio: a failure for one folio is logged and recorded
    as None without aborting the rest of the batch.

    Args:
        parcel_ids: folio numbers to look up. Blank entries map to None
            without a page load.
        timeout_seconds: max wait per Playwright call.

    Returns:
        {parcel_id: photo_url or None}, keyed by the IDs exactly as given.
    """
    out: dict[str, Optional[str]] = {}
    if not parcel_ids:
        # Nothing to do: skip launching a browser altogether.
        return out

    # Imported lazily so the module can be imported without Playwright.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport=_VIEWPORT)
            for parcel_id in parcel_ids:
                folio = str(parcel_id).strip() if parcel_id else ""
                if not folio:
                    out[parcel_id] = None
                    continue

                page = None
                try:
                    page = ctx.new_page()
                    out[parcel_id] = _scrape_broward_photo(page, folio, timeout_seconds)
                except Exception as exc:
                    _logger.warning(
                        "Broward PA photo fetch failed for folio %r: %s: %s",
                        parcel_id, type(exc).__name__, exc,
                    )
                    out[parcel_id] = None
                finally:
                    # Always release the tab, even on KeyboardInterrupt.
                    if page is not None:
                        page.close()
        finally:
            # Guarantees Chromium is shut down on any exit path.
            browser.close()
    return out


_FETCHERS = {
    "broward": _fetch_broward,
    # TODO Phase 3.5.B:
    # "miami-dade": _fetch_miami_dade,  (only aerial, no street photo — skip)
    # "duval": _fetch_duval,
    # "hillsborough": _fetch_hillsborough,
    # "orange": _fetch_orange,
}


def list_supported_counties() -> list[str]:
    """Return the sorted names of counties with an implemented PA photo fetcher."""
    return sorted(_FETCHERS.keys())