# AR-House/scripts/explore_miamidade_auction_calendar.py

"""Deep-dive on the Miami-Dade Realforeclose auction calendar page."""
from __future__ import annotations
import io, sys, time
# Force UTF-8 console output, replacing anything that cannot be encoded, so
# scraped text with non-ASCII characters (e.g. the "→" marker printed below)
# cannot raise UnicodeEncodeError on consoles with a narrower default encoding.
# reconfigure() adjusts the existing stream in place: settings that are not
# passed (such as line buffering) are kept, and no second text wrapper is
# stacked on the same underlying buffer.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")

from playwright.sync_api import sync_playwright

# User-agent string handed to the Playwright browser context (a desktop
# Chrome 131 on Windows 10 signature).
REAL_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Page this script loads and inspects: the AUCTION / PREVIEW action of the
# Miami-Dade Realforeclose site.
PREVIEW_URL = (
    "https://www.miamidade.realforeclose.com/index.cfm"
    "?zaction=AUCTION&Zmethod=PREVIEW"
)

# Load the PREVIEW page in headless Chromium and print a structural summary of
# the rendered DOM (tables, candidate container divs, relevant links and body
# text snippets), then dump the rendered HTML to disk for offline inspection.
# Every probe is best-effort: a failure is reported and the script moves on.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        context = browser.new_context(
            user_agent=REAL_UA,
            viewport={"width": 1280, "height": 800},
            locale="en-US",
            timezone_id="America/New_York",
        )
        page = context.new_page()
        page.set_default_timeout(20_000)

        print("Loading PREVIEW page...")
        response = page.goto(PREVIEW_URL, wait_until="networkidle", timeout=25_000)
        # page.goto() is documented to return None for some navigations, so do
        # not assume a response object is available.
        status = response.status if response is not None else "n/a"
        print(f"Status: {status}, URL: {page.url}")
        print(f"Title: {page.title()}")

        # Wait extra for any JS rendering. Playwright's own timer is used
        # rather than time.sleep(), as the Playwright docs recommend for the
        # sync API.
        page.wait_for_timeout(3_000)

        # Inspect what's on the page after JS rendering
        print()
        print("--- ALL TABLES ---")
        tables = page.locator("table").all()
        print(f"Found {len(tables)} tables")
        # Only the first 8 tables / first 4 rows each, to keep output readable.
        for i, table in enumerate(tables[:8]):
            try:
                rows = table.locator("tr").all()
                print(f"\n  Table [{i}]: {len(rows)} rows")
                for j, row in enumerate(rows[:4]):
                    cells = [
                        (cell.text_content() or "").strip()[:40]
                        for cell in row.locator("td, th").all()
                    ]
                    print(f"    Row {j}: {cells}")
            except Exception as exc:
                print(f"  Table [{i}] error: {exc}")

        print()
        print("--- DIVs with id or class containing 'auction' / 'sale' / 'calendar' ---")
        selectors_to_probe = [
            "div[id*='auction']", "div[id*='sale']", "div[id*='calendar']",
            "div[class*='auction']", "div[class*='sale']", "div[class*='calendar']",
            "div[class*='content']", "div.AUCTION_DETAILS_DIV",
        ]
        for sel in selectors_to_probe:
            try:
                els = page.locator(sel).all()
                if not els:
                    continue
                print(f"  {sel}: {len(els)} elements")
                for el in els[:3]:
                    text = (el.text_content() or "").strip()[:200]
                    if text:
                        print(f"    → {text!r}")
            except Exception as exc:
                # Report instead of silently dropping the selector.
                print(f"  {sel}: error: {exc}")

        print()
        print("--- All links with 'auction', 'sale', 'case' in href or text ---")
        href_keywords = ("auction", "sale", "case")
        text_keywords = ("auction", "calendar", "sale", "scheduled")
        for link in page.locator("a").all()[:80]:
            try:
                href = link.get_attribute("href") or ""
                text = (link.text_content() or "").strip()[:80]
            except Exception as exc:
                print(f"  link error: {exc}")
                continue
            href_lower = href.lower()
            # Skip script pseudo-links and in-page anchors (case-insensitively).
            if href_lower.startswith(("javascript:", "#")):
                continue
            text_lower = text.lower()
            if (any(kw in href_lower for kw in href_keywords)
                    or any(kw in text_lower for kw in text_keywords)):
                print(f"  href={href} | text='{text}'")

        # Save the rendered HTML. The path is relative to the current working
        # directory; a write failure is reported without aborting the rest of
        # the report.
        html = page.content()
        html_path = "scripts/_miamidade_preview_rendered.html"
        try:
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(html)
        except OSError as exc:
            print(f"\nCould not save rendered HTML to {html_path}: {exc}")
        else:
            print(f"\nRendered HTML saved: {html_path} ({len(html):,} chars)")

        # Show key body text snippets
        print()
        print("--- BODY TEXT SNIPPETS (around 'auction' or 'calendar' keywords) ---")
        body_text = page.locator("body").inner_text()
        for kw in ["Auction Calendar", "Today's Auctions", "Upcoming Auctions", "View Auction", "Scheduled Sales", "Number of Auctions"]:
            idx = body_text.find(kw)
            if idx >= 0:
                # 50 characters of leading context, 300 from the keyword onward.
                print(f"  '{kw}' at pos {idx}: ...{body_text[max(0,idx-50):idx+300]!r}")
    finally:
        # Always release the browser, even when a step above raised.
        browser.close()