# AR-House/scripts/probe_bcpa_full.py

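"""Map all extractable fields from the Broward County Property Appraiser SPA.

Loads a record page on web.bcpa.net in headless Chromium, then saves the
rendered HTML, a screenshot, the visible text and the list of captured
XHR/fetch responses under ``_probe_out/bcpa``.

Sample folio used for the probe: 484226062150 (id=143, 31 NW 17 CT).
"""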
import json
from pathlib import Path
from urllib.parse import urlsplit


# Folio probed when the caller does not supply one (id=143, 31 NW 17 CT).
_DEFAULT_FOLIO = "484226062150"

# URL path suffixes of static assets; such responses are skipped when
# recording API traffic.
_STATIC_SUFFIXES = (".js", ".css", ".png", ".jpg", ".woff", ".woff2", ".ico")

# Content-type fragments that mark a response as worth recording.
_DATA_CONTENT_TYPES = ("json", "xml", "text")

# Labels searched for (case-insensitively) in the rendered page text.
_SECTION_KEYWORDS = (
    "owner", "mailing", "property address", "site address",
    "assessed", "market", "just value", "taxable",
    "year built", "living area", "adj bldg", "lot size",
    "sale", "deed", "qualification", "doc#", "instrument",
    "tax", "exemption", "homestead", "millage", "mill",
    "improvement", "feature", "use code", "land",
    "permit", "building", "construction",
)


def _is_static_asset(url: str) -> bool:
    """Return True if *url* is a static asset or an analytics request.

    Only the suffix of the URL path is compared, so ``/api/data.json`` is not
    mistaken for a ``.js`` file and query strings cannot trigger a match.
    """
    if "google-analytics" in url:
        return True
    return urlsplit(url).path.lower().endswith(_STATIC_SUFFIXES)


def _report_keywords(body: str) -> None:
    """Print the hit count and first surrounding snippet for each keyword."""
    lowered = body.lower()  # lower-case once instead of twice per keyword
    for kw in _SECTION_KEYWORDS:
        cnt = lowered.count(kw)
        if not cnt:
            continue
        idx = lowered.find(kw)
        # 30 chars of leading context plus 150 chars from the match start.
        snippet = body[max(0, idx - 30):idx + 150].replace("\n", " ")
        print(f"    '{kw}' ({cnt}x): ...{snippet}...")


def _report_tables(page, limit: int = 15) -> None:
    """Print the row count and first-row cell texts of up to *limit* tables."""
    for i, tbl in enumerate(page.locator("table").all()[:limit]):
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            if row_count < 1:
                continue
            cells = rows.first.locator("th, td").all()
            hdrs = [(c.inner_text() or "").strip()[:30] for c in cells[:8]]
            print(f"    [{i}] rows={row_count} headers={hdrs}")
        except Exception as exc:
            # Best effort: one unreadable table must not abort the probe,
            # but the failure is reported instead of silently dropped.
            print(f"    [{i}] skipped ({type(exc).__name__}: {exc})")


def _report_apis(captured: list) -> None:
    """Print each captured response once per distinct URL (query ignored)."""
    seen: set[str] = set()
    for api in captured:
        base = api["url"].split("?")[0]
        if base in seen:
            continue
        seen.add(base)
        print(f"    {api['status']} {api['url'][:130]}")


def probe(folio: str = _DEFAULT_FOLIO) -> None:
    """Load the bcpa.net record page for *folio* and dump what it exposes.

    Writes ``01_record.html``, ``01_record.png``, ``02_text.txt`` and
    ``03_apis.json`` into ``<parent of this script's directory>/_probe_out/bcpa``
    (existing files are overwritten) and prints a summary of keywords, tables
    and captured responses to stdout.

    Args:
        folio: Folio number inserted into the Record-Search URL. Defaults to
            the sample folio the script was originally written against.

    Raises:
        Any Playwright or filesystem error raised while loading the page or
        writing the output files; the browser is closed before it propagates.
    """
    # Function-scope import: the module can be imported without Playwright.
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    out_dir.mkdir(parents=True, exist_ok=True)

    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"

    # Also capture all XHR/fetch — bcpa SPA may load data via API endpoints
    captured_apis: list[dict] = []

    def on_response(resp) -> None:
        """Record the URL, status and content type of data-like responses."""
        try:
            resp_url = resp.url
            if _is_static_asset(resp_url):
                return
            content_type = resp.headers.get("content-type", "")
            if any(kind in content_type for kind in _DATA_CONTENT_TYPES):
                captured_apis.append({
                    "url": resp_url[:200],
                    "status": resp.status,
                    "content_type": content_type,
                })
        except Exception as exc:
            # Best effort: a failing handler must not break page loading,
            # but the failure is reported instead of silently dropped.
            print(f"    [warn] response not recorded ({type(exc).__name__}: {exc})")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 1000},
            )
            page = ctx.new_page()
            page.on("response", on_response)

            print(f"[1] Loading {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(12000)  # SPA Angular render full data

            # Save the fully rendered HTML and a full-page screenshot.
            html = page.content()
            (out_dir / "01_record.html").write_text(html, encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_record.png"), full_page=True)

            print(f"\n[2] Page title: {page.title()}")
            print(f"    URL: {page.url}")

            # Dump all visible text and keep a full copy on disk.
            body = page.inner_text("body")
            print(f"\n[3] Body length: {len(body)} chars")
            (out_dir / "02_text.txt").write_text(body, encoding="utf-8")

            # First 3000 chars of body
            print(f"\n[4] First 3000 chars of rendered text:\n{body[:3000]}")

            print("\n[5] Key data sections detected (search by keyword):")
            _report_keywords(body)

            print("\n[6] Tables on page:")
            _report_tables(page)

            print(f"\n[7] API/XHR responses captured ({len(captured_apis)}):")
            _report_apis(captured_apis)

            # Save the complete (non-deduplicated) capture list as JSON.
            (out_dir / "03_apis.json").write_text(
                json.dumps(captured_apis, indent=2), encoding="utf-8"
            )
        finally:
            # Close the browser even when a step above raised.
            browser.close()

    print(f"\n[OK] Saved to {out_dir}/")


# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    probe()