Files
AR-House/scripts/probe_bcpa_full.py
2026-07-03 12:24:58 -04:00

120 lines
4.3 KiB
Python

"""Map ALL extractable fields from bcpa.net Broward PA SPA.
Uses real folio: 484226062150 (id=143, 31 NW 17 CT).
"""
import json
from pathlib import Path
from urllib.parse import urlsplit
# Path suffixes of static assets that are never data endpoints. Matching is
# done on the URL *path* suffix, so ".js" no longer hides ".json" endpoints
# or hosts such as "cdn.jsdelivr.net" the way a substring test would.
_STATIC_SUFFIXES = (
    ".js", ".css", ".png", ".jpg", ".woff", ".woff2", ".ico", ".map",
)

# Lower-case labels searched for in the rendered page text.
_KEYWORDS = (
    "owner", "mailing", "property address", "site address",
    "assessed", "market", "just value", "taxable",
    "year built", "living area", "adj bldg", "lot size",
    "sale", "deed", "qualification", "doc#", "instrument",
    "tax", "exemption", "homestead", "millage", "mill",
    "improvement", "feature", "use code", "land",
    "permit", "building", "construction",
)


def _is_static_asset(url: str) -> bool:
    """Return True when *url* is a static asset or an analytics request."""
    if "google-analytics" in url:
        return True
    return urlsplit(url).path.lower().endswith(_STATIC_SUFFIXES)


def _keyword_hits(
    body: str, keywords: tuple[str, ...] = _KEYWORDS
) -> list[tuple[str, int, str]]:
    """Return ``(keyword, count, snippet)`` for every keyword found in *body*.

    Matching is case-insensitive. The snippet is cut from the original text
    around the first occurrence (30 chars before, 150 after) with newlines
    flattened to spaces.
    """
    lowered = body.lower()  # computed once instead of twice per keyword
    hits = []
    for kw in keywords:
        count = lowered.count(kw)
        if not count:
            continue
        idx = lowered.find(kw)
        snippet = body[max(0, idx - 30):idx + 150].replace("\n", " ")
        hits.append((kw, count, snippet))
    return hits


def probe(folio: str = "484226062150") -> None:
    """Load one bcpa.net record page and dump everything useful about it.

    Writes the rendered HTML, a full-page screenshot, the visible body text
    and the list of captured JSON/XML/text responses into
    ``<repo>/_probe_out/bcpa/`` and prints a summary to stdout.

    Args:
        folio: Folio number placed in the Record-Search URL. Defaults to the
            folio this probe was originally written against.
    """
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    out_dir.mkdir(parents=True, exist_ok=True)
    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

    # Also capture all XHR/fetch — the SPA may load data via API endpoints.
    captured_apis: list[dict] = []

    def on_response(resp):
        """Record non-asset responses whose content type looks like data."""
        try:
            u = resp.url
            if _is_static_asset(u):
                return
            ct = resp.headers.get("content-type", "")
            if "json" in ct or "xml" in ct or "text" in ct:
                captured_apis.append({
                    "url": u[:200],
                    "status": resp.status,
                    "content_type": ct,
                })
        except Exception as exc:
            # Best effort: one bad response must not abort the probe,
            # but it should not disappear silently either.
            print(f"    [warn] response handler failed: {exc!r}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 1000},
            )
            page = ctx.new_page()
            page.on("response", on_response)

            print(f"[1] Loading {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Fixed wait so the Angular SPA can finish rendering its data.
            page.wait_for_timeout(12000)

            # Save the fully rendered HTML and a screenshot.
            html = page.content()
            (out_dir / "01_record.html").write_text(html, encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_record.png"), full_page=True)
            print(f"\n[2] Page title: {page.title()}")
            print(f"    URL: {page.url}")

            # Dump all visible text.
            body = page.inner_text("body")
            print(f"\n[3] Body length: {len(body)} chars")
            (out_dir / "02_text.txt").write_text(body, encoding="utf-8")
            # First 3000 chars of body.
            print(f"\n[4] First 3000 chars of rendered text:\n{body[:3000]}")

            # Look for key sections by keyword.
            print("\n[5] Key data sections detected (search by keyword):")
            for kw, count, snippet in _keyword_hits(body):
                print(f"    '{kw}' ({count}x): ...{snippet[:180]}...")

            # Summarize the first 15 tables: row count and first-row cells.
            print("\n[6] Tables on page:")
            tables = page.locator("table").all()
            for i, tbl in enumerate(tables[:15]):
                try:
                    rows = tbl.locator("tr").count()
                    if rows < 1:
                        continue
                    hdr_cells = tbl.locator("tr").first.locator("th, td").all()
                    hdrs = [(h.inner_text() or "").strip()[:30] for h in hdr_cells[:8]]
                    print(f"    [{i}] rows={rows} headers={hdrs}")
                except Exception as exc:
                    # Keep going: a single unreadable table is not fatal.
                    print(f"    [{i}] skipped: {exc!r}")

            # API endpoints discovered, de-duplicated by URL without query.
            print(f"\n[7] API/XHR responses captured ({len(captured_apis)}):")
            seen = set()
            for api in captured_apis:
                url_short = api["url"].split("?")[0]
                if url_short in seen:
                    continue
                seen.add(url_short)
                print(f"    {api['status']} {api['url'][:130]}")

            # Save the full (non-de-duplicated) capture list to JSON.
            (out_dir / "03_apis.json").write_text(
                json.dumps(captured_apis, indent=2), encoding="utf-8"
            )
        finally:
            # Close the browser even when navigation or scraping fails.
            browser.close()

    print(f"\n[OK] Saved to {out_dir}/")
# Script entry point: run the probe only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    probe()