462 lines
19 KiB
Python
462 lines
19 KiB
Python
"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor.
|
|
|
|
Extrae TODO lo publico de bcpa.net para construir un Property Snapshot Report ($15):
|
|
- Owner + mailing address
|
|
- Property address + neighborhood
|
|
- Year built, sqft, use code, units
|
|
- Just/Market value, Assessed/SOH value, by year (3 anios)
|
|
- Taxes paid (3 anios)
|
|
- Tax breakdown por district (County / School Board / Municipal / Independent)
|
|
- Exemptions (homestead, senior, vet, disabled, etc.)
|
|
- Photo URL
|
|
- Legal description
|
|
|
|
USAGE:
|
|
from data_fetchers.pa_broward import fetch_broward_pa_record
|
|
record = fetch_broward_pa_record(parcel_id="484226062150")
|
|
# record["owner_name"], record["just_value"], record["sales_history"]...
|
|
|
|
TECHNICAL:
|
|
- bcpa.net es Angular SPA — usar Playwright, NO requests/curl
|
|
- wait_until="domcontentloaded" + 25s sleep (NO networkidle, nunca termina)
|
|
- Element IDs son ESTABLES (data-bound by Angular, NO autogenerados como JSF)
|
|
- Per-folio latency: ~28-32s
|
|
- Free (Playwright local, no API cost)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Field ID mapping — confirmed via probe on folio 484226062150
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Single-value scalar fields
|
|
_SCALAR_IDS = {
|
|
"folio_number": "folioNumberId",
|
|
"owner_name": "ownerNameId",
|
|
"owner_name_2": "ownerName2Id",
|
|
"mailing_address": "mailingAddressId",
|
|
"situs_address": "situsAddressId",
|
|
"neighborhood": "neighborhood",
|
|
"use_code": "useCodeId",
|
|
"millage_code": "millageCodeId",
|
|
"adj_bldg_sqft": "bldgSqFTId",
|
|
"under_air_sqft": "bldgUnderAirFootageId",
|
|
"effective_year": "effectiveAgeId",
|
|
"year_built": "actualAgeId",
|
|
"units_beds_baths": "unitsBedsBathsId",
|
|
"legal_description": "legalDescId",
|
|
"homestead_flag": "homesteadFlagId",
|
|
# Current year values (auto-current year, e.g. 2026)
|
|
"current_tax_year": "currentTaxYearId",
|
|
"land_value_current": "landCurrentYearId",
|
|
"bldg_value_current": "bldgCurrentYearId",
|
|
"just_value_current": "justCurrentYearId",
|
|
"assessed_value_current": "sohCurrentYearId",
|
|
# Last year (e.g. 2025)
|
|
"last_tax_year": "lastTaxYearId",
|
|
"land_value_last": "landLastYearId",
|
|
"bldg_value_last": "bldgLastYearId",
|
|
"just_value_last": "justLastYearId",
|
|
"assessed_value_last": "sohLastYearId",
|
|
"taxes_paid_last": "assessedLastYearId",
|
|
# Two years ago (e.g. 2024)
|
|
"two_years_ago_tax_year": "lastTwoTaxYearId",
|
|
"land_value_2yr": "landLasttwoYearsId",
|
|
"bldg_value_2yr": "bldgLasttwoYearsId",
|
|
"just_value_2yr": "justLasttwoYearsId",
|
|
"assessed_value_2yr": "sohLasttwoYearsId",
|
|
"taxes_paid_2yr": "assessedLasttwoYearsId",
|
|
}
|
|
|
|
# Tax breakdown by district (current year)
|
|
_DISTRICT_IDS = {
|
|
# district name: {field: id}
|
|
"county": {
|
|
"just_value": "justValueCounty",
|
|
"portability": "portabilityValueCounty",
|
|
"assessed_soh": "sohValueCounty",
|
|
"homestead": "he1AmountCounty",
|
|
"add_homestead": "he2AmountCounty",
|
|
"widow_vet_dis": "wvdAmountCounty",
|
|
"senior": "seniorExemptionCounty",
|
|
"exemption_type": "mexAmountCounty",
|
|
"affordable_housing": "ahAmountCounty",
|
|
"taxable": "taxableAmountCounty",
|
|
},
|
|
"school_board": {
|
|
"just_value": "justValueSchoolBoard",
|
|
"portability": "portabilityValueSchoolBoard",
|
|
"assessed_soh": "sohValueSchoolBoard",
|
|
"homestead": "he1AmountSchoolBoard",
|
|
"add_homestead": "he2AmountSchoolBoard",
|
|
"widow_vet_dis": "wvdAmountSchoolBoard",
|
|
"exemption_type": "mexAmountSchoolBoard",
|
|
"affordable_housing": "ahAmountSchoolBoard",
|
|
"taxable": "taxableAmountSchoolBoard",
|
|
},
|
|
"municipal": {
|
|
"just_value": "justValueMunicipal",
|
|
"portability": "portabilityValueMunicipal",
|
|
"assessed_soh": "sohValueMunicipal",
|
|
"homestead": "he1AmountMunicipal",
|
|
"add_homestead": "he2AmountMunicipal",
|
|
"widow_vet_dis": "wvdAmountMunicipal",
|
|
"senior": "seniorExemptionMunicipal",
|
|
"exemption_type": "mexAmountMunicipal",
|
|
"affordable_housing": "ahAmountMunicipal",
|
|
"taxable": "taxableAmountMunicipal",
|
|
},
|
|
"independent": {
|
|
"just_value": "justValueIndependent",
|
|
"portability": "portabilityValueIndependent",
|
|
"assessed_soh": "sohValueIndependent",
|
|
"homestead": "he1AmountIndependent",
|
|
"add_homestead": "he2AmountIndependent",
|
|
"widow_vet_dis": "wvdAmountIndependent",
|
|
"exemption_type": "mexAmountIndependent",
|
|
"affordable_housing": "ahAmountIndependent",
|
|
"taxable": "taxableAmountIndependent",
|
|
},
|
|
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Public API
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def fetch_broward_pa_record(
|
|
parcel_id: str,
|
|
timeout_seconds: int = 45,
|
|
wait_after_load: int = 25,
|
|
) -> dict:
|
|
"""Fetch full Broward PA record for a parcel_id.
|
|
|
|
Args:
|
|
parcel_id: bcpa folio (e.g., "484226062150")
|
|
timeout_seconds: max wait per Playwright operation
|
|
wait_after_load: SPA settle time after domcontentloaded (default 25s)
|
|
|
|
Returns:
|
|
{
|
|
"folio_number": str,
|
|
"owner_name": str (may include " % " corp marker),
|
|
"owner_name_2": str (continuation line),
|
|
"mailing_address": str,
|
|
"situs_address": str,
|
|
"neighborhood": str,
|
|
"use_code": str (e.g. "01-01 Single Family"),
|
|
"year_built": int,
|
|
"effective_year": int,
|
|
"adj_bldg_sqft": int,
|
|
"under_air_sqft": int,
|
|
"millage_code": str,
|
|
"legal_description": str,
|
|
"homestead_active": bool,
|
|
"current_year": {
|
|
"tax_year": int,
|
|
"land_value": int,
|
|
"bldg_value": int,
|
|
"just_value": int,
|
|
"assessed_value": int,
|
|
},
|
|
"last_year": {
|
|
"tax_year": int,
|
|
"land_value": int,
|
|
"bldg_value": int,
|
|
"just_value": int,
|
|
"assessed_value": int,
|
|
"taxes_paid": float,
|
|
},
|
|
"two_years_ago": {... same ...},
|
|
"tax_breakdown": {
|
|
"county": {just_value, portability, assessed_soh,
|
|
homestead, add_homestead, widow_vet_dis, senior,
|
|
exemption_type, affordable_housing, taxable},
|
|
"school_board": {...},
|
|
"municipal": {...},
|
|
"independent": {...},
|
|
},
|
|
"sales_history": [
|
|
{date, type, qualified_disqualified, price, book_page_or_cin}, ...
|
|
],
|
|
"photo_url": str | None,
|
|
"source_url": str,
|
|
"fetched_at": ISO timestamp,
|
|
"errors": [str],
|
|
}
|
|
"""
|
|
fetched_at = datetime.utcnow().isoformat() + "Z"
|
|
result = {
|
|
"folio_number": parcel_id,
|
|
"errors": [],
|
|
"source_url": f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}",
|
|
"source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={parcel_id}",
|
|
"fetched_at": fetched_at,
|
|
}
|
|
|
|
if not parcel_id or not parcel_id.strip():
|
|
result["errors"].append("no parcel_id provided")
|
|
return result
|
|
|
|
try:
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
|
except ImportError:
|
|
result["errors"].append("playwright not installed")
|
|
return result
|
|
|
|
url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
|
|
|
|
try:
|
|
with sync_playwright() as p:
|
|
browser = p.chromium.launch(headless=True)
|
|
ctx = browser.new_context(
|
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
|
|
viewport={"width": 1400, "height": 1000},
|
|
)
|
|
page = ctx.new_page()
|
|
page.set_default_timeout(timeout_seconds * 1000)
|
|
|
|
page.goto(url, wait_until="domcontentloaded")
|
|
time.sleep(wait_after_load)
|
|
|
|
# Wait until at least one scalar populates (sentinel: actualAgeId = year built)
|
|
try:
|
|
page.wait_for_function(
|
|
"() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }",
|
|
timeout=10000,
|
|
)
|
|
except Exception:
|
|
# If sentinel didn't populate, try anyway — maybe extra time helps
|
|
time.sleep(5)
|
|
|
|
# Extract all scalar fields in one JS call (faster than per-locator)
|
|
scalar_values = page.evaluate(
|
|
"""(ids) => {
|
|
const out = {};
|
|
for (const [key, id] of Object.entries(ids)) {
|
|
const el = document.getElementById(id);
|
|
out[key] = el ? (el.textContent || '').trim() : '';
|
|
}
|
|
return out;
|
|
}""",
|
|
_SCALAR_IDS,
|
|
)
|
|
|
|
district_values = page.evaluate(
|
|
"""(districts) => {
|
|
const out = {};
|
|
for (const [name, fields] of Object.entries(districts)) {
|
|
out[name] = {};
|
|
for (const [field, id] of Object.entries(fields)) {
|
|
const el = document.getElementById(id);
|
|
out[name][field] = el ? (el.textContent || '').trim() : '';
|
|
}
|
|
}
|
|
return out;
|
|
}""",
|
|
_DISTRICT_IDS,
|
|
)
|
|
|
|
# Extract sales history table (PrimaryProperty Sales Information)
|
|
sales_history = page.evaluate("""
|
|
() => {
|
|
const out = [];
|
|
// The sales table has rows with class containing dates/types
|
|
// Look for the table with header "Date | Type | Qualified | Price | Book/Page"
|
|
const tables = document.querySelectorAll('table');
|
|
for (const tbl of tables) {
|
|
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
|
if (!hdrCells || hdrCells.length < 4) continue;
|
|
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
|
|
const isSalesHdr = hdrText.some(h => h.includes('date')) &&
|
|
hdrText.some(h => h.includes('type')) &&
|
|
hdrText.some(h => h.includes('price')) &&
|
|
hdrText.some(h => h.includes('qualified'));
|
|
if (!isSalesHdr) continue;
|
|
// Parse data rows
|
|
const rows = tbl.querySelectorAll('tr');
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 4) continue;
|
|
const r = {};
|
|
cells.forEach((c, idx) => {
|
|
const h = hdrText[idx] || `col${idx}`;
|
|
r[h] = (c.textContent || '').trim();
|
|
});
|
|
// Skip empty rows
|
|
if (Object.values(r).some(v => v && v.length > 0)) {
|
|
out.push(r);
|
|
}
|
|
}
|
|
if (out.length > 0) break;
|
|
}
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Photo URL
|
|
photo_urls = page.evaluate("""
|
|
() => Array.from(document.querySelectorAll('img'))
|
|
.filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)
|
|
.map(i => i.src)
|
|
""")
|
|
|
|
browser.close()
|
|
|
|
# ─── Post-process scalars ─────────────────────────────────────
|
|
result.update({k: _clean(v) for k, v in scalar_values.items()})
|
|
|
|
# Coerce numeric fields
|
|
for k in ("year_built", "effective_year", "current_tax_year", "last_tax_year",
|
|
"two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"):
|
|
v = result.get(k, "")
|
|
if v:
|
|
result[k] = _to_int(v)
|
|
|
|
for k in ("land_value_current", "bldg_value_current", "just_value_current",
|
|
"assessed_value_current", "land_value_last", "bldg_value_last",
|
|
"just_value_last", "assessed_value_last",
|
|
"land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"):
|
|
v = result.get(k, "")
|
|
if v:
|
|
result[k] = _money_to_int(v)
|
|
|
|
for k in ("taxes_paid_last", "taxes_paid_2yr"):
|
|
v = result.get(k, "")
|
|
if v:
|
|
result[k] = _money_to_float(v)
|
|
|
|
# ─── Structured groupings for downstream consumers ───────────
|
|
result["current_year"] = {
|
|
"tax_year": result.get("current_tax_year"),
|
|
"land_value": result.get("land_value_current"),
|
|
"bldg_value": result.get("bldg_value_current"),
|
|
"just_value": result.get("just_value_current"),
|
|
"assessed_value": result.get("assessed_value_current"),
|
|
}
|
|
result["last_year"] = {
|
|
"tax_year": result.get("last_tax_year"),
|
|
"land_value": result.get("land_value_last"),
|
|
"bldg_value": result.get("bldg_value_last"),
|
|
"just_value": result.get("just_value_last"),
|
|
"assessed_value": result.get("assessed_value_last"),
|
|
"taxes_paid": result.get("taxes_paid_last"),
|
|
}
|
|
result["two_years_ago"] = {
|
|
"tax_year": result.get("two_years_ago_tax_year"),
|
|
"land_value": result.get("land_value_2yr"),
|
|
"bldg_value": result.get("bldg_value_2yr"),
|
|
"just_value": result.get("just_value_2yr"),
|
|
"assessed_value": result.get("assessed_value_2yr"),
|
|
"taxes_paid": result.get("taxes_paid_2yr"),
|
|
}
|
|
|
|
# Process tax breakdown — clean & convert
|
|
result["tax_breakdown"] = {}
|
|
for district, fields in district_values.items():
|
|
result["tax_breakdown"][district] = {
|
|
k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v)
|
|
for k, v in fields.items()
|
|
}
|
|
|
|
# Sales history cleanup
|
|
result["sales_history"] = []
|
|
for s in sales_history:
|
|
# Normalize key names from possibly varied headers
|
|
norm = {
|
|
"date": _clean(s.get("date", "")),
|
|
"type": _clean(s.get("type", "")),
|
|
"qualified_disqualified": _clean(s.get("qualified/disqualified", s.get("qualified", ""))),
|
|
"price": _money_to_int(s.get("price", "")) if s.get("price") else None,
|
|
"book_page_or_cin": _clean(s.get("book/page or cin", s.get("book/page", ""))),
|
|
}
|
|
if any(norm.values()):
|
|
result["sales_history"].append(norm)
|
|
|
|
# Homestead boolean (flag is " , N" or " , Y")
|
|
hf = result.get("homestead_flag", "")
|
|
result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper()
|
|
|
|
# Photo
|
|
result["photo_url"] = photo_urls[0] if photo_urls else None
|
|
|
|
except PWTimeout as e:
|
|
result["errors"].append(f"timeout: {e}")
|
|
except Exception as e:
|
|
import traceback
|
|
result["errors"].append(f"{type(e).__name__}: {e}")
|
|
result["_trace"] = traceback.format_exc()[:600]
|
|
|
|
return result
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Helpers
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def _clean(s: str) -> str:
|
|
"""Collapse whitespace and strip."""
|
|
if not isinstance(s, str):
|
|
return s
|
|
return re.sub(r"\s+", " ", s).strip()
|
|
|
|
|
|
def _to_int(s: str) -> Optional[int]:
|
|
"""Parse '1969' or '1,199' → int. Returns None if unparseable."""
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d-]", "", s)
|
|
try:
|
|
return int(cleaned) if cleaned else None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _money_to_int(s: str) -> Optional[int]:
|
|
"""Parse '$322,580' → 322580. Returns None if unparseable."""
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d.-]", "", s)
|
|
if not cleaned or cleaned == "-":
|
|
return None
|
|
try:
|
|
return int(float(cleaned))
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _money_to_float(s: str) -> Optional[float]:
|
|
"""Parse '$5,256.59' → 5256.59."""
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d.-]", "", s)
|
|
if not cleaned or cleaned == "-":
|
|
return None
|
|
try:
|
|
return float(cleaned)
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# CLI for manual testing
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
if __name__ == "__main__":
|
|
import argparse
|
|
import json
|
|
|
|
parser = argparse.ArgumentParser(description="Broward PA full record fetcher")
|
|
parser.add_argument("parcel_id", help="Folio number (e.g. 484226062150)")
|
|
parser.add_argument("--wait", type=int, default=25, help="SPA settle seconds (default 25)")
|
|
args = parser.parse_args()
|
|
|
|
record = fetch_broward_pa_record(args.parcel_id, wait_after_load=args.wait)
|
|
print(json.dumps(record, indent=2, default=str))
|