"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor. Extrae TODO lo publico de bcpa.net para construir un Property Snapshot Report ($15): - Owner + mailing address - Property address + neighborhood - Year built, sqft, use code, units - Just/Market value, Assessed/SOH value, by year (3 anios) - Taxes paid (3 anios) - Tax breakdown por district (County / School Board / Municipal / Independent) - Exemptions (homestead, senior, vet, disabled, etc.) - Photo URL - Legal description USAGE: from data_fetchers.pa_broward import fetch_broward_pa_record record = fetch_broward_pa_record(parcel_id="484226062150") # record["owner_name"], record["just_value"], record["sales_history"]... TECHNICAL: - bcpa.net es Angular SPA — usar Playwright, NO requests/curl - wait_until="domcontentloaded" + 25s sleep (NO networkidle, nunca termina) - Element IDs son ESTABLES (data-bound by Angular, NO autogenerados como JSF) - Per-folio latency: ~28-32s - Free (Playwright local, no API cost) """ from __future__ import annotations import re import time from datetime import datetime from typing import Optional # ════════════════════════════════════════════════════════════════════════════ # Field ID mapping — confirmed via probe on folio 484226062150 # ════════════════════════════════════════════════════════════════════════════ # Single-value scalar fields _SCALAR_IDS = { "folio_number": "folioNumberId", "owner_name": "ownerNameId", "owner_name_2": "ownerName2Id", "mailing_address": "mailingAddressId", "situs_address": "situsAddressId", "neighborhood": "neighborhood", "use_code": "useCodeId", "millage_code": "millageCodeId", "adj_bldg_sqft": "bldgSqFTId", "under_air_sqft": "bldgUnderAirFootageId", "effective_year": "effectiveAgeId", "year_built": "actualAgeId", "units_beds_baths": "unitsBedsBathsId", "legal_description": "legalDescId", "homestead_flag": "homesteadFlagId", # Current year values (auto-current year, e.g. 2026) "current_tax_year": "currentTaxYearId", "land_value_current": "landCurrentYearId", "bldg_value_current": "bldgCurrentYearId", "just_value_current": "justCurrentYearId", "assessed_value_current": "sohCurrentYearId", # Last year (e.g. 2025) "last_tax_year": "lastTaxYearId", "land_value_last": "landLastYearId", "bldg_value_last": "bldgLastYearId", "just_value_last": "justLastYearId", "assessed_value_last": "sohLastYearId", "taxes_paid_last": "assessedLastYearId", # Two years ago (e.g. 2024) "two_years_ago_tax_year": "lastTwoTaxYearId", "land_value_2yr": "landLasttwoYearsId", "bldg_value_2yr": "bldgLasttwoYearsId", "just_value_2yr": "justLasttwoYearsId", "assessed_value_2yr": "sohLasttwoYearsId", "taxes_paid_2yr": "assessedLasttwoYearsId", } # Tax breakdown by district (current year) _DISTRICT_IDS = { # district name: {field: id} "county": { "just_value": "justValueCounty", "portability": "portabilityValueCounty", "assessed_soh": "sohValueCounty", "homestead": "he1AmountCounty", "add_homestead": "he2AmountCounty", "widow_vet_dis": "wvdAmountCounty", "senior": "seniorExemptionCounty", "exemption_type": "mexAmountCounty", "affordable_housing": "ahAmountCounty", "taxable": "taxableAmountCounty", }, "school_board": { "just_value": "justValueSchoolBoard", "portability": "portabilityValueSchoolBoard", "assessed_soh": "sohValueSchoolBoard", "homestead": "he1AmountSchoolBoard", "add_homestead": "he2AmountSchoolBoard", "widow_vet_dis": "wvdAmountSchoolBoard", "exemption_type": "mexAmountSchoolBoard", "affordable_housing": "ahAmountSchoolBoard", "taxable": "taxableAmountSchoolBoard", }, "municipal": { "just_value": "justValueMunicipal", "portability": "portabilityValueMunicipal", "assessed_soh": "sohValueMunicipal", "homestead": "he1AmountMunicipal", "add_homestead": "he2AmountMunicipal", "widow_vet_dis": "wvdAmountMunicipal", "senior": "seniorExemptionMunicipal", "exemption_type": "mexAmountMunicipal", "affordable_housing": "ahAmountMunicipal", "taxable": "taxableAmountMunicipal", }, "independent": { "just_value": "justValueIndependent", "portability": "portabilityValueIndependent", "assessed_soh": "sohValueIndependent", "homestead": "he1AmountIndependent", "add_homestead": "he2AmountIndependent", "widow_vet_dis": "wvdAmountIndependent", "exemption_type": "mexAmountIndependent", "affordable_housing": "ahAmountIndependent", "taxable": "taxableAmountIndependent", }, } # ════════════════════════════════════════════════════════════════════════════ # Public API # ════════════════════════════════════════════════════════════════════════════ def fetch_broward_pa_record( parcel_id: str, timeout_seconds: int = 45, wait_after_load: int = 25, ) -> dict: """Fetch full Broward PA record for a parcel_id. Args: parcel_id: bcpa folio (e.g., "484226062150") timeout_seconds: max wait per Playwright operation wait_after_load: SPA settle time after domcontentloaded (default 25s) Returns: { "folio_number": str, "owner_name": str (may include " % " corp marker), "owner_name_2": str (continuation line), "mailing_address": str, "situs_address": str, "neighborhood": str, "use_code": str (e.g. "01-01 Single Family"), "year_built": int, "effective_year": int, "adj_bldg_sqft": int, "under_air_sqft": int, "millage_code": str, "legal_description": str, "homestead_active": bool, "current_year": { "tax_year": int, "land_value": int, "bldg_value": int, "just_value": int, "assessed_value": int, }, "last_year": { "tax_year": int, "land_value": int, "bldg_value": int, "just_value": int, "assessed_value": int, "taxes_paid": float, }, "two_years_ago": {... same ...}, "tax_breakdown": { "county": {just_value, portability, assessed_soh, homestead, add_homestead, widow_vet_dis, senior, exemption_type, affordable_housing, taxable}, "school_board": {...}, "municipal": {...}, "independent": {...}, }, "sales_history": [ {date, type, qualified_disqualified, price, book_page_or_cin}, ... ], "photo_url": str | None, "source_url": str, "fetched_at": ISO timestamp, "errors": [str], } """ fetched_at = datetime.utcnow().isoformat() + "Z" result = { "folio_number": parcel_id, "errors": [], "source_url": f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}", "source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={parcel_id}", "fetched_at": fetched_at, } if not parcel_id or not parcel_id.strip(): result["errors"].append("no parcel_id provided") return result try: from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout except ImportError: result["errors"].append("playwright not installed") return result url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}" try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) ctx = browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36", viewport={"width": 1400, "height": 1000}, ) page = ctx.new_page() page.set_default_timeout(timeout_seconds * 1000) page.goto(url, wait_until="domcontentloaded") time.sleep(wait_after_load) # Wait until at least one scalar populates (sentinel: actualAgeId = year built) try: page.wait_for_function( "() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }", timeout=10000, ) except Exception: # If sentinel didn't populate, try anyway — maybe extra time helps time.sleep(5) # Extract all scalar fields in one JS call (faster than per-locator) scalar_values = page.evaluate( """(ids) => { const out = {}; for (const [key, id] of Object.entries(ids)) { const el = document.getElementById(id); out[key] = el ? (el.textContent || '').trim() : ''; } return out; }""", _SCALAR_IDS, ) district_values = page.evaluate( """(districts) => { const out = {}; for (const [name, fields] of Object.entries(districts)) { out[name] = {}; for (const [field, id] of Object.entries(fields)) { const el = document.getElementById(id); out[name][field] = el ? (el.textContent || '').trim() : ''; } } return out; }""", _DISTRICT_IDS, ) # Extract sales history table (PrimaryProperty Sales Information) sales_history = page.evaluate(""" () => { const out = []; // The sales table has rows with class containing dates/types // Look for the table with header "Date | Type | Qualified | Price | Book/Page" const tables = document.querySelectorAll('table'); for (const tbl of tables) { const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td'); if (!hdrCells || hdrCells.length < 4) continue; const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase()); const isSalesHdr = hdrText.some(h => h.includes('date')) && hdrText.some(h => h.includes('type')) && hdrText.some(h => h.includes('price')) && hdrText.some(h => h.includes('qualified')); if (!isSalesHdr) continue; // Parse data rows const rows = tbl.querySelectorAll('tr'); for (let i = 1; i < rows.length; i++) { const cells = rows[i].querySelectorAll('td'); if (cells.length < 4) continue; const r = {}; cells.forEach((c, idx) => { const h = hdrText[idx] || `col${idx}`; r[h] = (c.textContent || '').trim(); }); // Skip empty rows if (Object.values(r).some(v => v && v.length > 0)) { out.push(r); } } if (out.length > 0) break; } return out; } """) # Photo URL photo_urls = page.evaluate(""" () => Array.from(document.querySelectorAll('img')) .filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200) .map(i => i.src) """) browser.close() # ─── Post-process scalars ───────────────────────────────────── result.update({k: _clean(v) for k, v in scalar_values.items()}) # Coerce numeric fields for k in ("year_built", "effective_year", "current_tax_year", "last_tax_year", "two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"): v = result.get(k, "") if v: result[k] = _to_int(v) for k in ("land_value_current", "bldg_value_current", "just_value_current", "assessed_value_current", "land_value_last", "bldg_value_last", "just_value_last", "assessed_value_last", "land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"): v = result.get(k, "") if v: result[k] = _money_to_int(v) for k in ("taxes_paid_last", "taxes_paid_2yr"): v = result.get(k, "") if v: result[k] = _money_to_float(v) # ─── Structured groupings for downstream consumers ─────────── result["current_year"] = { "tax_year": result.get("current_tax_year"), "land_value": result.get("land_value_current"), "bldg_value": result.get("bldg_value_current"), "just_value": result.get("just_value_current"), "assessed_value": result.get("assessed_value_current"), } result["last_year"] = { "tax_year": result.get("last_tax_year"), "land_value": result.get("land_value_last"), "bldg_value": result.get("bldg_value_last"), "just_value": result.get("just_value_last"), "assessed_value": result.get("assessed_value_last"), "taxes_paid": result.get("taxes_paid_last"), } result["two_years_ago"] = { "tax_year": result.get("two_years_ago_tax_year"), "land_value": result.get("land_value_2yr"), "bldg_value": result.get("bldg_value_2yr"), "just_value": result.get("just_value_2yr"), "assessed_value": result.get("assessed_value_2yr"), "taxes_paid": result.get("taxes_paid_2yr"), } # Process tax breakdown — clean & convert result["tax_breakdown"] = {} for district, fields in district_values.items(): result["tax_breakdown"][district] = { k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v) for k, v in fields.items() } # Sales history cleanup result["sales_history"] = [] for s in sales_history: # Normalize key names from possibly varied headers norm = { "date": _clean(s.get("date", "")), "type": _clean(s.get("type", "")), "qualified_disqualified": _clean(s.get("qualified/disqualified", s.get("qualified", ""))), "price": _money_to_int(s.get("price", "")) if s.get("price") else None, "book_page_or_cin": _clean(s.get("book/page or cin", s.get("book/page", ""))), } if any(norm.values()): result["sales_history"].append(norm) # Homestead boolean (flag is " , N" or " , Y") hf = result.get("homestead_flag", "") result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper() # Photo result["photo_url"] = photo_urls[0] if photo_urls else None except PWTimeout as e: result["errors"].append(f"timeout: {e}") except Exception as e: import traceback result["errors"].append(f"{type(e).__name__}: {e}") result["_trace"] = traceback.format_exc()[:600] return result # ════════════════════════════════════════════════════════════════════════════ # Helpers # ════════════════════════════════════════════════════════════════════════════ def _clean(s: str) -> str: """Collapse whitespace and strip.""" if not isinstance(s, str): return s return re.sub(r"\s+", " ", s).strip() def _to_int(s: str) -> Optional[int]: """Parse '1969' or '1,199' → int. Returns None if unparseable.""" if not s: return None cleaned = re.sub(r"[^\d-]", "", s) try: return int(cleaned) if cleaned else None except ValueError: return None def _money_to_int(s: str) -> Optional[int]: """Parse '$322,580' → 322580. Returns None if unparseable.""" if not s: return None cleaned = re.sub(r"[^\d.-]", "", s) if not cleaned or cleaned == "-": return None try: return int(float(cleaned)) except ValueError: return None def _money_to_float(s: str) -> Optional[float]: """Parse '$5,256.59' → 5256.59.""" if not s: return None cleaned = re.sub(r"[^\d.-]", "", s) if not cleaned or cleaned == "-": return None try: return float(cleaned) except ValueError: return None # ════════════════════════════════════════════════════════════════════════════ # CLI for manual testing # ════════════════════════════════════════════════════════════════════════════ if __name__ == "__main__": import argparse import json parser = argparse.ArgumentParser(description="Broward PA full record fetcher") parser.add_argument("parcel_id", help="Folio number (e.g. 484226062150)") parser.add_argument("--wait", type=int, default=25, help="SPA settle seconds (default 25)") args = parser.parse_args() record = fetch_broward_pa_record(args.parcel_id, wait_after_load=args.wait) print(json.dumps(record, indent=2, default=str))