120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
"""Map ALL extractable fields from bcpa.net Broward PA SPA.

Uses real folio: 484226062150 (id=143, 31 NW 17 CT).
"""
|
|
from pathlib import Path
|
|
import json
|
|
|
|
|
|
# Response URLs whose path ends in one of these are static assets, not data.
_STATIC_ASSET_SUFFIXES = (
    ".js", ".js.map", ".css", ".png", ".jpg", ".woff", ".woff2", ".ico",
)

# Lower-case labels searched for in the rendered page text.
_SECTION_KEYWORDS = (
    "owner", "mailing", "property address", "site address",
    "assessed", "market", "just value", "taxable",
    "year built", "living area", "adj bldg", "lot size",
    "sale", "deed", "qualification", "doc#", "instrument",
    "tax", "exemption", "homestead", "millage", "mill",
    "improvement", "feature", "use code", "land",
    "permit", "building", "construction",
)


def _is_static_asset(url: str) -> bool:
    """Return True when *url* is a static asset or an analytics request."""
    if "google-analytics" in url:
        return True
    # Compare against the path suffix only. A plain substring test for ".js"
    # would also discard every ".json" endpoint, which is exactly the kind of
    # response this probe is trying to discover.
    path = url.split("?", 1)[0].split("#", 1)[0].lower()
    return path.endswith(_STATIC_ASSET_SUFFIXES)


def _print_keyword_hits(body: str) -> None:
    """Print the hit count and first-occurrence snippet for each keyword."""
    lowered = body.lower()  # lower-case once instead of twice per keyword
    for kw in _SECTION_KEYWORDS:
        cnt = lowered.count(kw)
        if not cnt:
            continue
        idx = lowered.find(kw)
        # 30 chars of leading context plus 150 from the match start.
        snippet = body[max(0, idx - 30):idx + 150].replace("\n", " ")
        print(f" '{kw}' ({cnt}x): ...{snippet}...")


def _print_table_summaries(page, max_tables: int = 15, max_cols: int = 8) -> None:
    """Print row count and first-row cell texts for the first tables on *page*."""
    for i, tbl in enumerate(page.locator("table").all()[:max_tables]):
        try:
            row_locator = tbl.locator("tr")
            rows = row_locator.count()
            if rows < 1:
                continue
            hdr_cells = row_locator.first.locator("th, td").all()
            hdrs = [(h.inner_text() or "").strip()[:30] for h in hdr_cells[:max_cols]]
            print(f" [{i}] rows={rows} headers={hdrs}")
        except Exception as exc:
            # Best-effort: one unreadable table must not abort the probe,
            # but say so instead of hiding it.
            print(f" [{i}] skipped: {exc}")


def probe(folio: str = "484226062150") -> None:
    """Load the bcpa.net record page for *folio* and dump what it exposes.

    Writes into ``<parent of this file's directory>/_probe_out/bcpa``:

    * ``01_record.html`` - rendered page HTML
    * ``01_record.png``  - full-page screenshot
    * ``02_text.txt``    - visible text of ``<body>``
    * ``03_apis.json``   - URL, status and content type of captured responses

    A summary (title, text excerpt, keyword hits, tables, captured
    responses) is printed to stdout.

    Args:
        folio: Folio number placed in the ``folio`` query parameter of the
            record-search URL. Defaults to the folio this probe was written
            against.
    """
    # Imported lazily so the module can be imported without Playwright.
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    out_dir.mkdir(parents=True, exist_ok=True)

    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

    # Also capture all XHR/fetch - the SPA may load its data via API endpoints.
    captured_apis: list[dict] = []

    def on_response(resp) -> None:
        """Record json/xml/text responses that are not static assets."""
        try:
            u = resp.url
            if _is_static_asset(u):
                return
            ct = resp.headers.get("content-type", "")
            if any(marker in ct for marker in ("json", "xml", "text")):
                captured_apis.append({
                    "url": u[:200],
                    "status": resp.status,
                    "content_type": ct,
                })
        except Exception as exc:
            # Best-effort listener: never let one bad response break the
            # page load, but report it rather than dropping it silently.
            print(f" [warn] response skipped: {exc}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 1000},
            )
            page = ctx.new_page()
            page.on("response", on_response)

            print(f"[1] Loading {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Fixed pause so the SPA can finish rendering its data.
            page.wait_for_timeout(12000)

            # Save the fully rendered HTML and a screenshot.
            html = page.content()
            (out_dir / "01_record.html").write_text(html, encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_record.png"), full_page=True)

            print(f"\n[2] Page title: {page.title()}")
            print(f" URL: {page.url}")

            # Dump all visible text.
            body = page.inner_text("body")
            print(f"\n[3] Body length: {len(body)} chars")
            (out_dir / "02_text.txt").write_text(body, encoding="utf-8")

            # First 3000 chars of body.
            print(f"\n[4] First 3000 chars of rendered text:\n{body[:3000]}")

            print("\n[5] Key data sections detected (search by keyword):")
            _print_keyword_hits(body)

            print("\n[6] Tables on page:")
            _print_table_summaries(page)

            # List each captured endpoint once, ignoring its query string.
            print(f"\n[7] API/XHR responses captured ({len(captured_apis)}):")
            seen = set()
            for api in captured_apis:
                url_short = api["url"].split("?")[0]
                if url_short in seen:
                    continue
                seen.add(url_short)
                print(f" {api['status']} {api['url'][:130]}")

            (out_dir / "03_apis.json").write_text(
                json.dumps(captured_apis, indent=2), encoding="utf-8"
            )
        finally:
            # Close the browser even when navigation or scraping fails.
            browser.close()

    print(f"\n[OK] Saved to {out_dir}/")
|
|
|
|
|
|
# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    probe()
|