Files
AR-House/data_fetchers/pa_photo_lookup.py
T
2026-07-03 12:24:58 -04:00

146 lines
5.5 KiB
Python

"""data_fetchers/pa_photo_lookup.py — Buscar fotos de propiedad en sitios PA (gratis).
PURPOSE:
    Florida's County Property Appraisers (PA) publish photos of the properties.
    They are publicly accessible via Playwright (zero Firecrawl cost).
    This is the FREE alternative to `zillow_photo_lookup` (which costs 1 credit per property).

CURRENT COVERAGE:
    - Broward (bcpa.net): ✓ tested, 100% hit rate on a 3-property sample
    - Miami-Dade (miamidadepa.gov): ✗ aerial imagery only, no street photo
    - Duval (paopropertysearch.coj.net): pending, the correct URL still has to be investigated
    - Other counties: stubs planned for Phase 3.5.B

USAGE:
    from data_fetchers.pa_photo_lookup import fetch_pa_photo
    url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
"""
from __future__ import annotations
from typing import Optional
def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Look up a property photo URL on a County Property Appraiser site.

    Failures are reported through the returned metadata rather than raised:
    a missing parcel id, an unsupported county, or any exception thrown by
    the county fetcher all yield ``(None, meta)`` with ``meta["error"]`` set.

    Args:
        county: County name, e.g. "Broward", "Miami-Dade", "Duval". A
            trailing " County" and letter case are ignored.
        parcel_id: County-specific parcel/folio number.
        timeout_seconds: Timeout forwarded to the county fetcher.

    Returns:
        ``(photo_url, metadata)`` where ``photo_url`` is a string or None and
        ``metadata`` is a dict with the keys ``county``, ``parcel_id``,
        ``source`` and ``error``.
    """
    meta = dict(county=county, parcel_id=parcel_id, source=None, error=None)

    # Guard: nothing to look up without a parcel id.
    if not parcel_id:
        meta["error"] = "no parcel_id"
        return None, meta

    # Normalize the county into a registry key:
    # lowercase, drop " county", trim, spaces -> underscores.
    registry_key = (county or "").lower()
    registry_key = registry_key.replace(" county", "").strip()
    registry_key = registry_key.replace(" ", "_")

    fetcher = _FETCHERS.get(registry_key)
    if not fetcher:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS.keys())})"
        return None, meta

    # Boundary: any fetcher failure is converted into an error string.
    try:
        photo_url, source_name = fetcher(parcel_id, timeout_seconds)
    except Exception as exc:
        meta["error"] = f"{type(exc).__name__}: {exc}"
        return None, meta

    meta["source"] = source_name
    return photo_url, meta
# ────────────────────────────────────────────────────────────────────────────
# Per-county implementations
# ────────────────────────────────────────────────────────────────────────────
# Desktop-Chrome-style User-Agent string; passed as `user_agent` when the
# Broward fetchers create their Playwright browser context.
_CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Fetch the first property photo URL from the Broward (bcpa.net) SPA.

    Page URL pattern:
        https://web.bcpa.net/bcpaclient/#/Record-Search?folio=XXX
    The photo is expected in an
    <img src=".../Photographs/<first6>/<next2>/<next4>/<file>.jpg"> element.

    Args:
        parcel_id: Broward folio number, interpolated into the page URL.
        timeout_seconds: Used (in milliseconds) both as the page's default
            timeout and as the navigation timeout.

    Returns:
        ``(photo_url_or_None, "bcpa.net")``.

    Raises:
        Whatever Playwright raises (launch, navigation, evaluation); the
        browser is closed before the exception propagates.
    """
    from playwright.sync_api import sync_playwright
    import time

    record_url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
    # Runs in the page: collect the src of every <img> under /Photographs/
    # whose rendered natural width exceeds 200px.
    collect_photos_js = (
        "Array.from(document.querySelectorAll('img'))"
        ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
        ".map(i => i.src)"
    )

    first_photo = None
    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=_CHROME_UA,
                viewport={"width": 1280, "height": 900},
            )
            tab = context.new_page()
            timeout_ms = timeout_seconds * 1000
            tab.set_default_timeout(timeout_ms)
            tab.goto(record_url, wait_until="networkidle", timeout=timeout_ms)
            time.sleep(7)  # SPA render delay
            matches = tab.evaluate(collect_photos_js)
            first_photo = matches[0] if matches else None
        finally:
            browser.close()
    return first_photo, "bcpa.net"
def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
"""Optimized batch fetcher for Broward.
Re-uses browser across folios (single session) for speed.
Returns: {parcel_id: photo_url or None}
"""
from playwright.sync_api import sync_playwright
import time
out: dict[str, Optional[str]] = {}
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
for parcel_id in parcel_ids:
page = ctx.new_page()
page.set_default_timeout(timeout_seconds * 1000)
url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
try:
page.goto(url, wait_until="networkidle", timeout=timeout_seconds * 1000)
time.sleep(7)
photos = page.evaluate(
"Array.from(document.querySelectorAll('img'))"
".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
".map(i => i.src)"
)
out[parcel_id] = photos[0] if photos else None
except Exception:
out[parcel_id] = None
page.close()
browser.close()
return out
# Registry of per-county photo fetchers, keyed by normalized county name.
# fetch_pa_photo builds the lookup key by lowercasing the county, removing
# " county", trimming, and replacing spaces with underscores, then calls the
# fetcher as fetcher(parcel_id, timeout_seconds) and unpacks its result as
# (photo_url, source_name).
_FETCHERS = {
    "broward": _fetch_broward,
    # TODO Phase 3.5.B: register fetchers for the counties below once implemented.
    # "miami-dade": _fetch_miami_dade, (only aerial, no street photo — skip)
    # "duval": _fetch_duval,
    # "hillsborough": _fetch_hillsborough,
    # "orange": _fetch_orange,
}
def list_supported_counties() -> list[str]:
    """Return the registered county keys of ``_FETCHERS`` in sorted order."""
    supported = list(_FETCHERS)
    supported.sort()
    return supported