Files
AR-House/data_fetchers/pa_broward.py
T
2026-07-03 12:24:58 -04:00

462 lines
19 KiB
Python

"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor.
Extrae TODO lo publico de bcpa.net para construir un Property Snapshot Report ($15):
- Owner + mailing address
- Property address + neighborhood
- Year built, sqft, use code, units
- Just/Market value, Assessed/SOH value, by year (3 anios)
- Taxes paid (3 anios)
- Tax breakdown por district (County / School Board / Municipal / Independent)
- Exemptions (homestead, senior, vet, disabled, etc.)
- Photo URL
- Legal description
USAGE:
from data_fetchers.pa_broward import fetch_broward_pa_record
record = fetch_broward_pa_record(parcel_id="484226062150")
# record["owner_name"], record["just_value"], record["sales_history"]...
TECHNICAL:
- bcpa.net es Angular SPA — usar Playwright, NO requests/curl
- wait_until="domcontentloaded" + 25s sleep (NO networkidle, nunca termina)
- Element IDs son ESTABLES (data-bound by Angular, NO autogenerados como JSF)
- Per-folio latency: ~28-32s
- Free (Playwright local, no API cost)
"""
from __future__ import annotations
import re
import time
from datetime import datetime
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# Field ID mapping — confirmed via probe on folio 484226062150
# ════════════════════════════════════════════════════════════════════════════
# Single-value scalar fields
_SCALAR_IDS = {
"folio_number": "folioNumberId",
"owner_name": "ownerNameId",
"owner_name_2": "ownerName2Id",
"mailing_address": "mailingAddressId",
"situs_address": "situsAddressId",
"neighborhood": "neighborhood",
"use_code": "useCodeId",
"millage_code": "millageCodeId",
"adj_bldg_sqft": "bldgSqFTId",
"under_air_sqft": "bldgUnderAirFootageId",
"effective_year": "effectiveAgeId",
"year_built": "actualAgeId",
"units_beds_baths": "unitsBedsBathsId",
"legal_description": "legalDescId",
"homestead_flag": "homesteadFlagId",
# Current year values (auto-current year, e.g. 2026)
"current_tax_year": "currentTaxYearId",
"land_value_current": "landCurrentYearId",
"bldg_value_current": "bldgCurrentYearId",
"just_value_current": "justCurrentYearId",
"assessed_value_current": "sohCurrentYearId",
# Last year (e.g. 2025)
"last_tax_year": "lastTaxYearId",
"land_value_last": "landLastYearId",
"bldg_value_last": "bldgLastYearId",
"just_value_last": "justLastYearId",
"assessed_value_last": "sohLastYearId",
"taxes_paid_last": "assessedLastYearId",
# Two years ago (e.g. 2024)
"two_years_ago_tax_year": "lastTwoTaxYearId",
"land_value_2yr": "landLasttwoYearsId",
"bldg_value_2yr": "bldgLasttwoYearsId",
"just_value_2yr": "justLasttwoYearsId",
"assessed_value_2yr": "sohLasttwoYearsId",
"taxes_paid_2yr": "assessedLasttwoYearsId",
}
# Tax breakdown by district (current year)
_DISTRICT_IDS = {
# district name: {field: id}
"county": {
"just_value": "justValueCounty",
"portability": "portabilityValueCounty",
"assessed_soh": "sohValueCounty",
"homestead": "he1AmountCounty",
"add_homestead": "he2AmountCounty",
"widow_vet_dis": "wvdAmountCounty",
"senior": "seniorExemptionCounty",
"exemption_type": "mexAmountCounty",
"affordable_housing": "ahAmountCounty",
"taxable": "taxableAmountCounty",
},
"school_board": {
"just_value": "justValueSchoolBoard",
"portability": "portabilityValueSchoolBoard",
"assessed_soh": "sohValueSchoolBoard",
"homestead": "he1AmountSchoolBoard",
"add_homestead": "he2AmountSchoolBoard",
"widow_vet_dis": "wvdAmountSchoolBoard",
"exemption_type": "mexAmountSchoolBoard",
"affordable_housing": "ahAmountSchoolBoard",
"taxable": "taxableAmountSchoolBoard",
},
"municipal": {
"just_value": "justValueMunicipal",
"portability": "portabilityValueMunicipal",
"assessed_soh": "sohValueMunicipal",
"homestead": "he1AmountMunicipal",
"add_homestead": "he2AmountMunicipal",
"widow_vet_dis": "wvdAmountMunicipal",
"senior": "seniorExemptionMunicipal",
"exemption_type": "mexAmountMunicipal",
"affordable_housing": "ahAmountMunicipal",
"taxable": "taxableAmountMunicipal",
},
"independent": {
"just_value": "justValueIndependent",
"portability": "portabilityValueIndependent",
"assessed_soh": "sohValueIndependent",
"homestead": "he1AmountIndependent",
"add_homestead": "he2AmountIndependent",
"widow_vet_dis": "wvdAmountIndependent",
"exemption_type": "mexAmountIndependent",
"affordable_housing": "ahAmountIndependent",
"taxable": "taxableAmountIndependent",
},
}
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def fetch_broward_pa_record(
parcel_id: str,
timeout_seconds: int = 45,
wait_after_load: int = 25,
) -> dict:
"""Fetch full Broward PA record for a parcel_id.
Args:
parcel_id: bcpa folio (e.g., "484226062150")
timeout_seconds: max wait per Playwright operation
wait_after_load: SPA settle time after domcontentloaded (default 25s)
Returns:
{
"folio_number": str,
"owner_name": str (may include " % " corp marker),
"owner_name_2": str (continuation line),
"mailing_address": str,
"situs_address": str,
"neighborhood": str,
"use_code": str (e.g. "01-01 Single Family"),
"year_built": int,
"effective_year": int,
"adj_bldg_sqft": int,
"under_air_sqft": int,
"millage_code": str,
"legal_description": str,
"homestead_active": bool,
"current_year": {
"tax_year": int,
"land_value": int,
"bldg_value": int,
"just_value": int,
"assessed_value": int,
},
"last_year": {
"tax_year": int,
"land_value": int,
"bldg_value": int,
"just_value": int,
"assessed_value": int,
"taxes_paid": float,
},
"two_years_ago": {... same ...},
"tax_breakdown": {
"county": {just_value, portability, assessed_soh,
homestead, add_homestead, widow_vet_dis, senior,
exemption_type, affordable_housing, taxable},
"school_board": {...},
"municipal": {...},
"independent": {...},
},
"sales_history": [
{date, type, qualified_disqualified, price, book_page_or_cin}, ...
],
"photo_url": str | None,
"source_url": str,
"fetched_at": ISO timestamp,
"errors": [str],
}
"""
fetched_at = datetime.utcnow().isoformat() + "Z"
result = {
"folio_number": parcel_id,
"errors": [],
"source_url": f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}",
"source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={parcel_id}",
"fetched_at": fetched_at,
}
if not parcel_id or not parcel_id.strip():
result["errors"].append("no parcel_id provided")
return result
try:
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
except ImportError:
result["errors"].append("playwright not installed")
return result
url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
ctx = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
viewport={"width": 1400, "height": 1000},
)
page = ctx.new_page()
page.set_default_timeout(timeout_seconds * 1000)
page.goto(url, wait_until="domcontentloaded")
time.sleep(wait_after_load)
# Wait until at least one scalar populates (sentinel: actualAgeId = year built)
try:
page.wait_for_function(
"() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }",
timeout=10000,
)
except Exception:
# If sentinel didn't populate, try anyway — maybe extra time helps
time.sleep(5)
# Extract all scalar fields in one JS call (faster than per-locator)
scalar_values = page.evaluate(
"""(ids) => {
const out = {};
for (const [key, id] of Object.entries(ids)) {
const el = document.getElementById(id);
out[key] = el ? (el.textContent || '').trim() : '';
}
return out;
}""",
_SCALAR_IDS,
)
district_values = page.evaluate(
"""(districts) => {
const out = {};
for (const [name, fields] of Object.entries(districts)) {
out[name] = {};
for (const [field, id] of Object.entries(fields)) {
const el = document.getElementById(id);
out[name][field] = el ? (el.textContent || '').trim() : '';
}
}
return out;
}""",
_DISTRICT_IDS,
)
# Extract sales history table (PrimaryProperty Sales Information)
sales_history = page.evaluate("""
() => {
const out = [];
// The sales table has rows with class containing dates/types
// Look for the table with header "Date | Type | Qualified | Price | Book/Page"
const tables = document.querySelectorAll('table');
for (const tbl of tables) {
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
if (!hdrCells || hdrCells.length < 4) continue;
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
const isSalesHdr = hdrText.some(h => h.includes('date')) &&
hdrText.some(h => h.includes('type')) &&
hdrText.some(h => h.includes('price')) &&
hdrText.some(h => h.includes('qualified'));
if (!isSalesHdr) continue;
// Parse data rows
const rows = tbl.querySelectorAll('tr');
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 4) continue;
const r = {};
cells.forEach((c, idx) => {
const h = hdrText[idx] || `col${idx}`;
r[h] = (c.textContent || '').trim();
});
// Skip empty rows
if (Object.values(r).some(v => v && v.length > 0)) {
out.push(r);
}
}
if (out.length > 0) break;
}
return out;
}
""")
# Photo URL
photo_urls = page.evaluate("""
() => Array.from(document.querySelectorAll('img'))
.filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)
.map(i => i.src)
""")
browser.close()
# ─── Post-process scalars ─────────────────────────────────────
result.update({k: _clean(v) for k, v in scalar_values.items()})
# Coerce numeric fields
for k in ("year_built", "effective_year", "current_tax_year", "last_tax_year",
"two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"):
v = result.get(k, "")
if v:
result[k] = _to_int(v)
for k in ("land_value_current", "bldg_value_current", "just_value_current",
"assessed_value_current", "land_value_last", "bldg_value_last",
"just_value_last", "assessed_value_last",
"land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"):
v = result.get(k, "")
if v:
result[k] = _money_to_int(v)
for k in ("taxes_paid_last", "taxes_paid_2yr"):
v = result.get(k, "")
if v:
result[k] = _money_to_float(v)
# ─── Structured groupings for downstream consumers ───────────
result["current_year"] = {
"tax_year": result.get("current_tax_year"),
"land_value": result.get("land_value_current"),
"bldg_value": result.get("bldg_value_current"),
"just_value": result.get("just_value_current"),
"assessed_value": result.get("assessed_value_current"),
}
result["last_year"] = {
"tax_year": result.get("last_tax_year"),
"land_value": result.get("land_value_last"),
"bldg_value": result.get("bldg_value_last"),
"just_value": result.get("just_value_last"),
"assessed_value": result.get("assessed_value_last"),
"taxes_paid": result.get("taxes_paid_last"),
}
result["two_years_ago"] = {
"tax_year": result.get("two_years_ago_tax_year"),
"land_value": result.get("land_value_2yr"),
"bldg_value": result.get("bldg_value_2yr"),
"just_value": result.get("just_value_2yr"),
"assessed_value": result.get("assessed_value_2yr"),
"taxes_paid": result.get("taxes_paid_2yr"),
}
# Process tax breakdown — clean & convert
result["tax_breakdown"] = {}
for district, fields in district_values.items():
result["tax_breakdown"][district] = {
k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v)
for k, v in fields.items()
}
# Sales history cleanup
result["sales_history"] = []
for s in sales_history:
# Normalize key names from possibly varied headers
norm = {
"date": _clean(s.get("date", "")),
"type": _clean(s.get("type", "")),
"qualified_disqualified": _clean(s.get("qualified/disqualified", s.get("qualified", ""))),
"price": _money_to_int(s.get("price", "")) if s.get("price") else None,
"book_page_or_cin": _clean(s.get("book/page or cin", s.get("book/page", ""))),
}
if any(norm.values()):
result["sales_history"].append(norm)
# Homestead boolean (flag is " , N" or " , Y")
hf = result.get("homestead_flag", "")
result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper()
# Photo
result["photo_url"] = photo_urls[0] if photo_urls else None
except PWTimeout as e:
result["errors"].append(f"timeout: {e}")
except Exception as e:
import traceback
result["errors"].append(f"{type(e).__name__}: {e}")
result["_trace"] = traceback.format_exc()[:600]
return result
# ════════════════════════════════════════════════════════════════════════════
# Helpers
# ════════════════════════════════════════════════════════════════════════════
def _clean(s: str) -> str:
"""Collapse whitespace and strip."""
if not isinstance(s, str):
return s
return re.sub(r"\s+", " ", s).strip()
def _to_int(s: str) -> Optional[int]:
"""Parse '1969' or '1,199' → int. Returns None if unparseable."""
if not s:
return None
cleaned = re.sub(r"[^\d-]", "", s)
try:
return int(cleaned) if cleaned else None
except ValueError:
return None
def _money_to_int(s: str) -> Optional[int]:
"""Parse '$322,580' → 322580. Returns None if unparseable."""
if not s:
return None
cleaned = re.sub(r"[^\d.-]", "", s)
if not cleaned or cleaned == "-":
return None
try:
return int(float(cleaned))
except ValueError:
return None
def _money_to_float(s: str) -> Optional[float]:
"""Parse '$5,256.59' → 5256.59."""
if not s:
return None
cleaned = re.sub(r"[^\d.-]", "", s)
if not cleaned or cleaned == "-":
return None
try:
return float(cleaned)
except ValueError:
return None
# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
import argparse
import json
parser = argparse.ArgumentParser(description="Broward PA full record fetcher")
parser.add_argument("parcel_id", help="Folio number (e.g. 484226062150)")
parser.add_argument("--wait", type=int, default=25, help="SPA settle seconds (default 25)")
args = parser.parse_args()
record = fetch_broward_pa_record(args.parcel_id, wait_after_load=args.wait)
print(json.dumps(record, indent=2, default=str))