146 lines
5.5 KiB
Python
146 lines
5.5 KiB
Python
"""data_fetchers/pa_photo_lookup.py — Buscar fotos de propiedad en sitios PA (gratis).
|
|
|
|
PURPOSE:
  Florida County Property Appraisers (PA) publish photos of properties.
  They are publicly accessible via Playwright (zero Firecrawl cost).

  This is the FREE alternative to `zillow_photo_lookup` (which costs 1 credit per property).

CURRENT COVERAGE:
  - Broward (bcpa.net): ✓ tested, 100% hit rate on a 3-property sample
  - Miami-Dade (miamidadepa.gov): ✗ aerial imagery only, no street photo
  - Duval (paopropertysearch.coj.net): correct URL still to be investigated
  - Other counties: stub for Phase 3.5.B

USAGE:
  from data_fetchers.pa_photo_lookup import fetch_pa_photo
  url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Optional
|
|
|
|
|
|
def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Fetch a property photo URL from a County Property Appraiser site.

    This function is a dispatcher: it normalizes the county name, looks up the
    matching per-county fetcher in ``_FETCHERS`` and delegates to it. Failures
    are reported through the returned metadata instead of being raised.

    Args:
        county: county name (e.g. "Broward", "Miami-Dade", "Duval"). A
            trailing " county" is ignored and matching is case-insensitive.
        parcel_id: county-specific parcel/folio number. Surrounding whitespace
            is stripped before it is handed to the fetcher; a missing or
            whitespace-only value is rejected without launching a browser.
        timeout_seconds: max wait per Playwright call, forwarded to the
            per-county fetcher.

    Returns:
        (photo_url, metadata)
          photo_url: str, or None when no photo was found or an error occurred
          metadata: {county, parcel_id, source, error} — ``county`` and
              ``parcel_id`` echo the caller's arguments unchanged, ``source``
              is the name reported by the fetcher, ``error`` is a message or
              None.
    """
    meta = {"county": county, "parcel_id": parcel_id, "source": None, "error": None}

    # Normalize the id before validating it: a blank or padded id would
    # otherwise trigger a slow browser fetch for a folio that cannot exist.
    folio = str(parcel_id).strip() if parcel_id else ""
    if not folio:
        meta["error"] = "no parcel_id"
        return None, meta

    # str() keeps a non-string county (e.g. NaN from a DataFrame) from raising
    # here; it simply fails the registry lookup below.
    cnorm = str(county or "").lower().replace(" county", "").strip().replace(" ", "_")

    fetcher = _FETCHERS.get(cnorm)
    if fetcher is None:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS.keys())})"
        return None, meta

    try:
        url, source_name = fetcher(folio, timeout_seconds)
    except Exception as e:
        # Best-effort boundary: any fetcher failure (timeout, missing
        # Playwright, unexpected page layout) becomes an error entry.
        meta["error"] = f"{type(e).__name__}: {e}"
        return None, meta

    meta["source"] = source_name
    return url, meta
|
|
|
|
|
|
# ────────────────────────────────────────────────────────────────────────────
|
|
# Per-county implementations
|
|
# ────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Desktop Chrome User-Agent string handed to the Playwright browser contexts
# created by the Broward fetchers in this module.
_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Fetch the first property photo URL for one Broward folio from bcpa.net.

    URL pattern: https://web.bcpa.net/bcpaclient/#/Record-Search?folio=XXX
    Photo embedded in <img src=".../Photographs/<first6>/<next2>/<next4>/<file>.jpg">

    Args:
        parcel_id: Broward folio number, interpolated into the record URL.
        timeout_seconds: timeout applied to navigation and to the page's
            default Playwright timeout.

    Returns:
        (photo_url, "bcpa.net") — photo_url is None when no matching image is
        present on the rendered page.

    Raises:
        Whatever Playwright raises (import failure, navigation timeout, ...);
        the caller is expected to handle it.
    """
    # Imported lazily so the module can be imported without Playwright installed.
    from playwright.sync_api import sync_playwright

    timeout_ms = timeout_seconds * 1000
    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
    photo = None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
            page = ctx.new_page()
            page.set_default_timeout(timeout_ms)
            page.goto(url, wait_until="networkidle", timeout=timeout_ms)
            # SPA render delay. wait_for_timeout is used instead of time.sleep
            # because the Playwright sync API advises against blocking sleeps.
            page.wait_for_timeout(7000)
            # Keep only images served from /Photographs/ that are wider than
            # 200px (presumably to skip icons/thumbnails — confirm on the site).
            photos = page.evaluate(
                "Array.from(document.querySelectorAll('img'))"
                ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
                ".map(i => i.src)"
            )
            if photos:
                photo = photos[0]
        finally:
            # Always release the browser, even when navigation fails.
            browser.close()
    return photo, "bcpa.net"
|
|
|
|
|
|
def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
    """Fetch Broward photo URLs for many folios using one browser session.

    Re-uses a single browser and context across folios for speed; each folio
    gets its own page, which is closed once that folio has been processed.

    Args:
        parcel_ids: Broward folio numbers to look up. An empty list returns an
            empty dict without starting a browser.
        timeout_seconds: timeout applied to each page's navigation and default
            Playwright timeout.

    Returns:
        {parcel_id: photo_url or None} — None means no photo was found or the
        lookup for that folio raised (per-folio errors are deliberately
        swallowed so one bad folio does not abort the batch).
    """
    # Nothing to do: skip the expensive browser launch entirely.
    if not parcel_ids:
        return {}

    # Imported lazily so the module can be imported without Playwright installed.
    from playwright.sync_api import sync_playwright

    timeout_ms = timeout_seconds * 1000
    out: dict[str, Optional[str]] = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
            for parcel_id in parcel_ids:
                page = ctx.new_page()
                try:
                    page.set_default_timeout(timeout_ms)
                    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
                    page.goto(url, wait_until="networkidle", timeout=timeout_ms)
                    # SPA render delay. wait_for_timeout is used instead of
                    # time.sleep, which the Playwright sync API advises against.
                    page.wait_for_timeout(7000)
                    photos = page.evaluate(
                        "Array.from(document.querySelectorAll('img'))"
                        ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
                        ".map(i => i.src)"
                    )
                    out[parcel_id] = photos[0] if photos else None
                except Exception:
                    # Best-effort: record the miss and continue with the next folio.
                    out[parcel_id] = None
                finally:
                    page.close()
        finally:
            # Matches _fetch_broward: the browser is released on every exit path.
            browser.close()
    return out
|
|
|
|
|
|
# Registry of per-county fetchers, keyed by the normalized county name that
# fetch_pa_photo computes (lower-case, " county" removed, spaces -> "_").
#
# TODO Phase 3.5.B — fetchers still to be added:
#   "miami-dade": _fetch_miami_dade   (only aerial, no street photo — skip)
#   "duval": _fetch_duval
#   "hillsborough": _fetch_hillsborough
#   "orange": _fetch_orange
_FETCHERS = {"broward": _fetch_broward}
|
|
|
|
|
|
def list_supported_counties() -> list[str]:
    """Return the county keys that have a registered PA photo fetcher, sorted."""
    supported = list(_FETCHERS)
    supported.sort()
    return supported
|