"""Map ALL extractable fields from the bcpa.net Broward PA SPA.

Uses real folio: 484226062150 (id=143, 31 NW 17 CT).
"""
import json
from pathlib import Path
from urllib.parse import urlsplit

# Folio probed when the caller does not supply one.
DEFAULT_FOLIO = "484226062150"

# Path suffixes of responses that are static assets rather than data.
# ".map" and ".woff2" are listed explicitly because the suffix match below
# (unlike a substring match on ".js" / ".woff") would not cover them.
_STATIC_SUFFIXES = (
    ".js", ".map", ".css", ".png", ".jpg", ".jpeg", ".woff", ".woff2", ".ico",
)

# Lower-case keywords searched for in the rendered page text.
_KEYWORDS = (
    "owner", "mailing", "property address", "site address",
    "assessed", "market", "just value", "taxable",
    "year built", "living area", "adj bldg", "lot size",
    "sale", "deed", "qualification", "doc#", "instrument",
    "tax", "exemption", "homestead", "millage", "mill",
    "improvement", "feature", "use code", "land",
    "permit", "building", "construction",
)


def _is_static_asset(url: str) -> bool:
    """Return True when *url* is a static asset or an analytics call.

    The check is made against the URL *path* suffix, so an endpoint such as
    ``/api/parcel.json`` is not mistaken for a ``.js`` file, while
    ``/main.js?v=3`` is still recognised as one.
    """
    path = urlsplit(url).path.lower()
    return path.endswith(_STATIC_SUFFIXES) or "google-analytics" in url


def _keyword_hits(body: str, keywords) -> list[tuple[str, int, str]]:
    """Return ``(keyword, count, snippet)`` for each keyword found in *body*.

    Matching is case-insensitive. The snippet spans 30 characters before and
    150 characters after the first occurrence, with newlines replaced by
    spaces.
    """
    lowered = body.lower()  # computed once instead of twice per keyword
    hits = []
    for kw in keywords:
        count = lowered.count(kw)
        if not count:
            continue
        idx = lowered.find(kw)
        snippet = body[max(0, idx - 30):idx + 150].replace("\n", " ")
        hits.append((kw, count, snippet))
    return hits


def _unique_endpoints(captured: list[dict]) -> list[dict]:
    """Return the first captured entry for each distinct query-less URL."""
    seen = set()
    unique = []
    for api in captured:
        key = api["url"].split("?")[0]
        if key in seen:
            continue
        seen.add(key)
        unique.append(api)
    return unique


def _print_tables(page, max_tables: int = 15, max_headers: int = 8) -> None:
    """Print row count and first-row cell texts for the first tables on *page*."""
    tables = page.locator("table").all()
    for i, tbl in enumerate(tables[:max_tables]):
        try:
            rows = tbl.locator("tr").count()
            if rows < 1:
                continue
            hdr_cells = tbl.locator("tr").first.locator("th, td").all()
            hdrs = [
                (cell.inner_text() or "").strip()[:30]
                for cell in hdr_cells[:max_headers]
            ]
            print(f" [{i}] rows={rows} headers={hdrs}")
        except Exception as exc:
            # Best-effort: one unreadable table must not abort the probe,
            # but the failure is reported instead of being hidden.
            print(f" [{i}] skipped: {exc}")


def probe(folio: str = DEFAULT_FOLIO) -> None:
    """Load the bcpa.net record page for *folio* and dump what it exposes.

    Writes into ``<parent of this file's directory>/_probe_out/bcpa``:

    * ``01_record.html`` / ``01_record.png`` - rendered HTML and screenshot
    * ``02_text.txt`` - visible text of ``<body>``
    * ``03_apis.json`` - non-asset JSON/XML/text responses seen while loading

    A summary (title, keyword hits, tables, endpoints) is printed to stdout.

    Args:
        folio: Folio number placed in the Record-Search URL.
    """
    # Imported inside the function, so importing this module does not itself
    # import Playwright.
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    out_dir.mkdir(parents=True, exist_ok=True)

    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

    # Also capture all XHR/fetch - the bcpa SPA may load data via API endpoints.
    captured_apis: list[dict] = []

    def on_response(resp) -> None:
        try:
            resp_url = resp.url
            if _is_static_asset(resp_url):
                return
            content_type = resp.headers.get("content-type", "")
            if any(tag in content_type for tag in ("json", "xml", "text")):
                captured_apis.append({
                    "url": resp_url[:200],
                    "status": resp.status,
                    "content_type": content_type,
                })
        except Exception as exc:
            # Best-effort hook: report the failure and keep the page loading.
            print(f" [warn] response hook failed: {exc}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 1000},
            )
            page = ctx.new_page()
            page.on("response", on_response)

            print(f"[1] Loading {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(12000)  # let the Angular SPA render full data

            # Save the fully rendered HTML and a full-page screenshot.
            html = page.content()
            (out_dir / "01_record.html").write_text(html, encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_record.png"), full_page=True)

            print(f"\n[2] Page title: {page.title()}")
            print(f" URL: {page.url}")

            # Dump all visible text (sections).
            body = page.inner_text("body")
            print(f"\n[3] Body length: {len(body)} chars")
            (out_dir / "02_text.txt").write_text(body, encoding="utf-8")

            # First 3000 chars of body.
            print(f"\n[4] First 3000 chars of rendered text:\n{body[:3000]}")

            # Look for key sections by keyword.
            print("\n[5] Key data sections detected (search by keyword):")
            for kw, cnt, snippet in _keyword_hits(body, _KEYWORDS):
                print(f" '{kw}' ({cnt}x): ...{snippet}...")

            # All tables on page.
            print("\n[6] Tables on page:")
            _print_tables(page)

            # API endpoints discovered.
            print(f"\n[7] API/XHR responses captured ({len(captured_apis)}):")
            for api in _unique_endpoints(captured_apis):
                print(f" {api['status']} {api['url'][:130]}")

            # Save every captured response (not just the unique ones) to JSON.
            (out_dir / "03_apis.json").write_text(
                json.dumps(captured_apis, indent=2), encoding="utf-8"
            )
        finally:
            # Close the browser even when navigation or a dump step raises.
            browser.close()

    print(f"\n[OK] Saved to {out_dir}/")


if __name__ == "__main__":
    probe()