feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
"""Map all extractable fields from the bcpa.net Broward County Property Appraiser single-page app (SPA).
|
||||
|
||||
Uses real folio: 484226062150 (id=143, 31 NW 17 CT).
|
||||
"""
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
|
||||
# Path suffixes of static assets that can never be data endpoints.
# ".woff2" and ".map" are listed explicitly because the previous substring
# test on ".woff" / ".js" used to cover them implicitly.
_STATIC_SUFFIXES = (".js", ".map", ".css", ".png", ".jpg", ".woff", ".woff2", ".ico")


def _is_static_asset(url: str) -> bool:
    """Return True when *url* is a static asset or an analytics beacon.

    Only the URL *path* is inspected, so data endpoints such as
    ``/api/record.json`` or ``/search.jsp`` are no longer mistaken for
    ``.js`` files, and an extension appearing in the query string is ignored.
    """
    from urllib.parse import urlsplit

    if "google-analytics" in url:
        return True
    return urlsplit(url).path.lower().endswith(_STATIC_SUFFIXES)


def probe() -> None:
    """Load one record page in headless Chromium and dump what it exposes.

    Artifacts written to ``<parent of this file's directory>/_probe_out/bcpa``:

    * ``01_record.html`` - rendered DOM
    * ``01_record.png``  - full-page screenshot
    * ``02_text.txt``    - visible text of ``<body>``
    * ``03_apis.json``   - url / status / content-type of every captured
      JSON, XML or text response

    A keyword scan, a summary of the first 15 tables and the de-duplicated
    list of captured endpoints are printed to stdout.
    """
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    out_dir.mkdir(parents=True, exist_ok=True)

    folio = "484226062150"
    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

    # Also capture all XHR/fetch - the SPA may load its data via API endpoints.
    captured_apis: list[dict] = []

    def on_response(resp):
        # Best effort: a failing hook must never abort the page load,
        # but the failure is reported instead of being silently dropped.
        try:
            u = resp.url
            if _is_static_asset(u):
                return
            ct = resp.headers.get("content-type", "")
            if "json" in ct or "xml" in ct or "text" in ct:
                captured_apis.append({
                    "url": u[:200],
                    "status": resp.status,
                    "content_type": ct,
                })
        except Exception as exc:
            print(f" [warn] response hook failed: {exc}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 1000},
            )
            page = ctx.new_page()
            page.on("response", on_response)

            print(f"[1] Loading {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(12000)  # give the Angular SPA time to render all data

            # Save the fully rendered HTML and a screenshot.
            html = page.content()
            (out_dir / "01_record.html").write_text(html, encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_record.png"), full_page=True)

            print(f"\n[2] Page title: {page.title()}")
            print(f" URL: {page.url}")

            # Dump all visible text (sections).
            body = page.inner_text("body")
            print(f"\n[3] Body length: {len(body)} chars")

            # Save the full text.
            (out_dir / "02_text.txt").write_text(body, encoding="utf-8")

            # First 3000 chars of body.
            print(f"\n[4] First 3000 chars of rendered text:\n{body[:3000]}")

            # Look for key sections by keyword.
            print("\n[5] Key data sections detected (search by keyword):")
            keywords = [
                "owner", "mailing", "property address", "site address",
                "assessed", "market", "just value", "taxable",
                "year built", "living area", "adj bldg", "lot size",
                "sale", "deed", "qualification", "doc#", "instrument",
                "tax", "exemption", "homestead", "millage", "mill",
                "improvement", "feature", "use code", "land",
                "permit", "building", "construction",
            ]
            body_lower = body.lower()  # lower-case once, not twice per keyword
            for kw in keywords:
                cnt = body_lower.count(kw)
                if cnt == 0:
                    continue
                # Show some context around the first occurrence.
                idx = body_lower.find(kw)
                snippet = body[max(0, idx - 30):idx + 150].replace("\n", " ")
                print(f" '{kw}' ({cnt}x): ...{snippet[:180]}...")

            # All tables on page (first 15 only).
            print("\n[6] Tables on page:")
            tables = page.locator("table").all()
            for i, tbl in enumerate(tables[:15]):
                try:
                    tr_loc = tbl.locator("tr")
                    rows = tr_loc.count()
                    if rows < 1:
                        continue
                    hdr_cells = tr_loc.first.locator("th, td").all()
                    hdrs = [(h.inner_text() or "").strip()[:30] for h in hdr_cells[:8]]
                    print(f" [{i}] rows={rows} headers={hdrs}")
                except Exception as exc:
                    # A table may detach while the SPA re-renders; report and move on.
                    print(f" [{i}] skipped: {exc}")

            # API endpoints discovered, de-duplicated on the URL without its query string.
            print(f"\n[7] API/XHR responses captured ({len(captured_apis)}):")
            seen = set()
            for api in captured_apis:
                url_short = api["url"].split("?")[0]
                if url_short in seen:
                    continue
                seen.add(url_short)
                print(f" {api['status']} {api['url'][:130]}")

            # Save APIs to JSON.
            (out_dir / "03_apis.json").write_text(json.dumps(captured_apis, indent=2), encoding="utf-8")
        finally:
            # Close the browser even when a step above raised.
            browser.close()

    print(f"\n[OK] Saved to {out_dir}/")
|
||||
|
||||
|
||||
# Script entry point: run the probe only when this file is executed directly,
# never as a side effect of importing it.
if __name__ == "__main__":
    probe()
|
||||
Reference in New Issue
Block a user