feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,461 @@
|
||||
"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor.

Extracts all publicly available data from bcpa.net to build a Property Snapshot Report ($15):
- Owner + mailing address
- Property address + neighborhood
- Year built, sqft, use code, units
- Just/Market value, Assessed/SOH value, by year (3 years)
- Taxes paid (last two tax years)
- Tax breakdown by district (County / School Board / Municipal / Independent)
- Exemptions (homestead, senior, vet, disabled, etc.)
- Photo URL
- Legal description

USAGE:
    from data_fetchers.pa_broward import fetch_broward_pa_record
    record = fetch_broward_pa_record(parcel_id="484226062150")
    # record["owner_name"], record["current_year"]["just_value"], record["sales_history"]...

TECHNICAL:
- bcpa.net is an Angular SPA — use Playwright, NOT requests/curl
- wait_until="domcontentloaded" + 25s settle wait (NOT networkidle — it never completes)
- Element IDs are STABLE (data-bound by Angular, NOT auto-generated as in JSF)
- Per-folio latency: ~28-32s
- Free (Playwright local, no API cost)
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Field ID mapping — confirmed via probe on folio 484226062150
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Single-value scalar fields
|
||||
_SCALAR_IDS = {
|
||||
"folio_number": "folioNumberId",
|
||||
"owner_name": "ownerNameId",
|
||||
"owner_name_2": "ownerName2Id",
|
||||
"mailing_address": "mailingAddressId",
|
||||
"situs_address": "situsAddressId",
|
||||
"neighborhood": "neighborhood",
|
||||
"use_code": "useCodeId",
|
||||
"millage_code": "millageCodeId",
|
||||
"adj_bldg_sqft": "bldgSqFTId",
|
||||
"under_air_sqft": "bldgUnderAirFootageId",
|
||||
"effective_year": "effectiveAgeId",
|
||||
"year_built": "actualAgeId",
|
||||
"units_beds_baths": "unitsBedsBathsId",
|
||||
"legal_description": "legalDescId",
|
||||
"homestead_flag": "homesteadFlagId",
|
||||
# Current year values (auto-current year, e.g. 2026)
|
||||
"current_tax_year": "currentTaxYearId",
|
||||
"land_value_current": "landCurrentYearId",
|
||||
"bldg_value_current": "bldgCurrentYearId",
|
||||
"just_value_current": "justCurrentYearId",
|
||||
"assessed_value_current": "sohCurrentYearId",
|
||||
# Last year (e.g. 2025)
|
||||
"last_tax_year": "lastTaxYearId",
|
||||
"land_value_last": "landLastYearId",
|
||||
"bldg_value_last": "bldgLastYearId",
|
||||
"just_value_last": "justLastYearId",
|
||||
"assessed_value_last": "sohLastYearId",
|
||||
"taxes_paid_last": "assessedLastYearId",
|
||||
# Two years ago (e.g. 2024)
|
||||
"two_years_ago_tax_year": "lastTwoTaxYearId",
|
||||
"land_value_2yr": "landLasttwoYearsId",
|
||||
"bldg_value_2yr": "bldgLasttwoYearsId",
|
||||
"just_value_2yr": "justLasttwoYearsId",
|
||||
"assessed_value_2yr": "sohLasttwoYearsId",
|
||||
"taxes_paid_2yr": "assessedLasttwoYearsId",
|
||||
}
|
||||
|
||||
# Tax breakdown by district (current year)
|
||||
_DISTRICT_IDS = {
|
||||
# district name: {field: id}
|
||||
"county": {
|
||||
"just_value": "justValueCounty",
|
||||
"portability": "portabilityValueCounty",
|
||||
"assessed_soh": "sohValueCounty",
|
||||
"homestead": "he1AmountCounty",
|
||||
"add_homestead": "he2AmountCounty",
|
||||
"widow_vet_dis": "wvdAmountCounty",
|
||||
"senior": "seniorExemptionCounty",
|
||||
"exemption_type": "mexAmountCounty",
|
||||
"affordable_housing": "ahAmountCounty",
|
||||
"taxable": "taxableAmountCounty",
|
||||
},
|
||||
"school_board": {
|
||||
"just_value": "justValueSchoolBoard",
|
||||
"portability": "portabilityValueSchoolBoard",
|
||||
"assessed_soh": "sohValueSchoolBoard",
|
||||
"homestead": "he1AmountSchoolBoard",
|
||||
"add_homestead": "he2AmountSchoolBoard",
|
||||
"widow_vet_dis": "wvdAmountSchoolBoard",
|
||||
"exemption_type": "mexAmountSchoolBoard",
|
||||
"affordable_housing": "ahAmountSchoolBoard",
|
||||
"taxable": "taxableAmountSchoolBoard",
|
||||
},
|
||||
"municipal": {
|
||||
"just_value": "justValueMunicipal",
|
||||
"portability": "portabilityValueMunicipal",
|
||||
"assessed_soh": "sohValueMunicipal",
|
||||
"homestead": "he1AmountMunicipal",
|
||||
"add_homestead": "he2AmountMunicipal",
|
||||
"widow_vet_dis": "wvdAmountMunicipal",
|
||||
"senior": "seniorExemptionMunicipal",
|
||||
"exemption_type": "mexAmountMunicipal",
|
||||
"affordable_housing": "ahAmountMunicipal",
|
||||
"taxable": "taxableAmountMunicipal",
|
||||
},
|
||||
"independent": {
|
||||
"just_value": "justValueIndependent",
|
||||
"portability": "portabilityValueIndependent",
|
||||
"assessed_soh": "sohValueIndependent",
|
||||
"homestead": "he1AmountIndependent",
|
||||
"add_homestead": "he2AmountIndependent",
|
||||
"widow_vet_dis": "wvdAmountIndependent",
|
||||
"exemption_type": "mexAmountIndependent",
|
||||
"affordable_housing": "ahAmountIndependent",
|
||||
"taxable": "taxableAmountIndependent",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_broward_pa_record(
|
||||
parcel_id: str,
|
||||
timeout_seconds: int = 45,
|
||||
wait_after_load: int = 25,
|
||||
) -> dict:
|
||||
"""Fetch full Broward PA record for a parcel_id.
|
||||
|
||||
Args:
|
||||
parcel_id: bcpa folio (e.g., "484226062150")
|
||||
timeout_seconds: max wait per Playwright operation
|
||||
wait_after_load: SPA settle time after domcontentloaded (default 25s)
|
||||
|
||||
Returns:
|
||||
{
|
||||
"folio_number": str,
|
||||
"owner_name": str (may include " % " corp marker),
|
||||
"owner_name_2": str (continuation line),
|
||||
"mailing_address": str,
|
||||
"situs_address": str,
|
||||
"neighborhood": str,
|
||||
"use_code": str (e.g. "01-01 Single Family"),
|
||||
"year_built": int,
|
||||
"effective_year": int,
|
||||
"adj_bldg_sqft": int,
|
||||
"under_air_sqft": int,
|
||||
"millage_code": str,
|
||||
"legal_description": str,
|
||||
"homestead_active": bool,
|
||||
"current_year": {
|
||||
"tax_year": int,
|
||||
"land_value": int,
|
||||
"bldg_value": int,
|
||||
"just_value": int,
|
||||
"assessed_value": int,
|
||||
},
|
||||
"last_year": {
|
||||
"tax_year": int,
|
||||
"land_value": int,
|
||||
"bldg_value": int,
|
||||
"just_value": int,
|
||||
"assessed_value": int,
|
||||
"taxes_paid": float,
|
||||
},
|
||||
"two_years_ago": {... same ...},
|
||||
"tax_breakdown": {
|
||||
"county": {just_value, portability, assessed_soh,
|
||||
homestead, add_homestead, widow_vet_dis, senior,
|
||||
exemption_type, affordable_housing, taxable},
|
||||
"school_board": {...},
|
||||
"municipal": {...},
|
||||
"independent": {...},
|
||||
},
|
||||
"sales_history": [
|
||||
{date, type, qualified_disqualified, price, book_page_or_cin}, ...
|
||||
],
|
||||
"photo_url": str | None,
|
||||
"source_url": str,
|
||||
"fetched_at": ISO timestamp,
|
||||
"errors": [str],
|
||||
}
|
||||
"""
|
||||
fetched_at = datetime.utcnow().isoformat() + "Z"
|
||||
result = {
|
||||
"folio_number": parcel_id,
|
||||
"errors": [],
|
||||
"source_url": f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}",
|
||||
"source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={parcel_id}",
|
||||
"fetched_at": fetched_at,
|
||||
}
|
||||
|
||||
if not parcel_id or not parcel_id.strip():
|
||||
result["errors"].append("no parcel_id provided")
|
||||
return result
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
except ImportError:
|
||||
result["errors"].append("playwright not installed")
|
||||
return result
|
||||
|
||||
url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
|
||||
|
||||
try:
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=True)
|
||||
ctx = browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
|
||||
viewport={"width": 1400, "height": 1000},
|
||||
)
|
||||
page = ctx.new_page()
|
||||
page.set_default_timeout(timeout_seconds * 1000)
|
||||
|
||||
page.goto(url, wait_until="domcontentloaded")
|
||||
time.sleep(wait_after_load)
|
||||
|
||||
# Wait until at least one scalar populates (sentinel: actualAgeId = year built)
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }",
|
||||
timeout=10000,
|
||||
)
|
||||
except Exception:
|
||||
# If sentinel didn't populate, try anyway — maybe extra time helps
|
||||
time.sleep(5)
|
||||
|
||||
# Extract all scalar fields in one JS call (faster than per-locator)
|
||||
scalar_values = page.evaluate(
|
||||
"""(ids) => {
|
||||
const out = {};
|
||||
for (const [key, id] of Object.entries(ids)) {
|
||||
const el = document.getElementById(id);
|
||||
out[key] = el ? (el.textContent || '').trim() : '';
|
||||
}
|
||||
return out;
|
||||
}""",
|
||||
_SCALAR_IDS,
|
||||
)
|
||||
|
||||
district_values = page.evaluate(
|
||||
"""(districts) => {
|
||||
const out = {};
|
||||
for (const [name, fields] of Object.entries(districts)) {
|
||||
out[name] = {};
|
||||
for (const [field, id] of Object.entries(fields)) {
|
||||
const el = document.getElementById(id);
|
||||
out[name][field] = el ? (el.textContent || '').trim() : '';
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}""",
|
||||
_DISTRICT_IDS,
|
||||
)
|
||||
|
||||
# Extract sales history table (PrimaryProperty Sales Information)
|
||||
sales_history = page.evaluate("""
|
||||
() => {
|
||||
const out = [];
|
||||
// The sales table has rows with class containing dates/types
|
||||
// Look for the table with header "Date | Type | Qualified | Price | Book/Page"
|
||||
const tables = document.querySelectorAll('table');
|
||||
for (const tbl of tables) {
|
||||
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
||||
if (!hdrCells || hdrCells.length < 4) continue;
|
||||
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
|
||||
const isSalesHdr = hdrText.some(h => h.includes('date')) &&
|
||||
hdrText.some(h => h.includes('type')) &&
|
||||
hdrText.some(h => h.includes('price')) &&
|
||||
hdrText.some(h => h.includes('qualified'));
|
||||
if (!isSalesHdr) continue;
|
||||
// Parse data rows
|
||||
const rows = tbl.querySelectorAll('tr');
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 4) continue;
|
||||
const r = {};
|
||||
cells.forEach((c, idx) => {
|
||||
const h = hdrText[idx] || `col${idx}`;
|
||||
r[h] = (c.textContent || '').trim();
|
||||
});
|
||||
// Skip empty rows
|
||||
if (Object.values(r).some(v => v && v.length > 0)) {
|
||||
out.push(r);
|
||||
}
|
||||
}
|
||||
if (out.length > 0) break;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Photo URL
|
||||
photo_urls = page.evaluate("""
|
||||
() => Array.from(document.querySelectorAll('img'))
|
||||
.filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)
|
||||
.map(i => i.src)
|
||||
""")
|
||||
|
||||
browser.close()
|
||||
|
||||
# ─── Post-process scalars ─────────────────────────────────────
|
||||
result.update({k: _clean(v) for k, v in scalar_values.items()})
|
||||
|
||||
# Coerce numeric fields
|
||||
for k in ("year_built", "effective_year", "current_tax_year", "last_tax_year",
|
||||
"two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"):
|
||||
v = result.get(k, "")
|
||||
if v:
|
||||
result[k] = _to_int(v)
|
||||
|
||||
for k in ("land_value_current", "bldg_value_current", "just_value_current",
|
||||
"assessed_value_current", "land_value_last", "bldg_value_last",
|
||||
"just_value_last", "assessed_value_last",
|
||||
"land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"):
|
||||
v = result.get(k, "")
|
||||
if v:
|
||||
result[k] = _money_to_int(v)
|
||||
|
||||
for k in ("taxes_paid_last", "taxes_paid_2yr"):
|
||||
v = result.get(k, "")
|
||||
if v:
|
||||
result[k] = _money_to_float(v)
|
||||
|
||||
# ─── Structured groupings for downstream consumers ───────────
|
||||
result["current_year"] = {
|
||||
"tax_year": result.get("current_tax_year"),
|
||||
"land_value": result.get("land_value_current"),
|
||||
"bldg_value": result.get("bldg_value_current"),
|
||||
"just_value": result.get("just_value_current"),
|
||||
"assessed_value": result.get("assessed_value_current"),
|
||||
}
|
||||
result["last_year"] = {
|
||||
"tax_year": result.get("last_tax_year"),
|
||||
"land_value": result.get("land_value_last"),
|
||||
"bldg_value": result.get("bldg_value_last"),
|
||||
"just_value": result.get("just_value_last"),
|
||||
"assessed_value": result.get("assessed_value_last"),
|
||||
"taxes_paid": result.get("taxes_paid_last"),
|
||||
}
|
||||
result["two_years_ago"] = {
|
||||
"tax_year": result.get("two_years_ago_tax_year"),
|
||||
"land_value": result.get("land_value_2yr"),
|
||||
"bldg_value": result.get("bldg_value_2yr"),
|
||||
"just_value": result.get("just_value_2yr"),
|
||||
"assessed_value": result.get("assessed_value_2yr"),
|
||||
"taxes_paid": result.get("taxes_paid_2yr"),
|
||||
}
|
||||
|
||||
# Process tax breakdown — clean & convert
|
||||
result["tax_breakdown"] = {}
|
||||
for district, fields in district_values.items():
|
||||
result["tax_breakdown"][district] = {
|
||||
k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v)
|
||||
for k, v in fields.items()
|
||||
}
|
||||
|
||||
# Sales history cleanup
|
||||
result["sales_history"] = []
|
||||
for s in sales_history:
|
||||
# Normalize key names from possibly varied headers
|
||||
norm = {
|
||||
"date": _clean(s.get("date", "")),
|
||||
"type": _clean(s.get("type", "")),
|
||||
"qualified_disqualified": _clean(s.get("qualified/disqualified", s.get("qualified", ""))),
|
||||
"price": _money_to_int(s.get("price", "")) if s.get("price") else None,
|
||||
"book_page_or_cin": _clean(s.get("book/page or cin", s.get("book/page", ""))),
|
||||
}
|
||||
if any(norm.values()):
|
||||
result["sales_history"].append(norm)
|
||||
|
||||
# Homestead boolean (flag is " , N" or " , Y")
|
||||
hf = result.get("homestead_flag", "")
|
||||
result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper()
|
||||
|
||||
# Photo
|
||||
result["photo_url"] = photo_urls[0] if photo_urls else None
|
||||
|
||||
except PWTimeout as e:
|
||||
result["errors"].append(f"timeout: {e}")
|
||||
except Exception as e:
|
||||
import traceback
|
||||
result["errors"].append(f"{type(e).__name__}: {e}")
|
||||
result["_trace"] = traceback.format_exc()[:600]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _clean(s: str) -> str:
|
||||
"""Collapse whitespace and strip."""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
|
||||
def _to_int(s: str) -> Optional[int]:
|
||||
"""Parse '1969' or '1,199' → int. Returns None if unparseable."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", s)
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s: str) -> Optional[int]:
|
||||
"""Parse '$322,580' → 322580. Returns None if unparseable."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", s)
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_float(s: str) -> Optional[float]:
|
||||
"""Parse '$5,256.59' → 5256.59."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", s)
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI for manual testing
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: fetch one folio and dump the record as indented JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Broward PA full record fetcher")
    cli.add_argument("parcel_id", help="Folio number (e.g. 484226062150)")
    cli.add_argument(
        "--wait",
        type=int,
        default=25,
        help="SPA settle seconds (default 25)",
    )
    opts = cli.parse_args()

    fetched = fetch_broward_pa_record(opts.parcel_id, wait_after_load=opts.wait)
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(fetched, indent=2, default=str))
|
||||
Reference in New Issue
Block a user