"""Probe Duval Property Appraiser detail page — map ALL extractable fields. Test address: 2352 Scenic View Ct, Jacksonville, FL 32218 (user's bug report). """ from pathlib import Path import time def probe(): from playwright.sync_api import sync_playwright out_dir = Path(__file__).parent.parent / "_probe_out" / "duval_pa" out_dir.mkdir(parents=True, exist_ok=True) with sync_playwright() as p: browser = p.chromium.launch(headless=True) ctx = browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131", ) page = ctx.new_page() # Step 1: search page print("[1] Loading search page...") page.goto("https://paopropertysearch.coj.net/Basic/Search.aspx", wait_until="networkidle", timeout=20000) print(f" URL: {page.url}") # Step 2: Fill address — 2352 SCENIC VIEW CT print("[2] Filling form (2352 SCENIC VIEW CT 32218)...") page.locator("#ctl00_cphBody_tbStreetNumber").fill("2352") # No prefix page.locator("#ctl00_cphBody_tbStreetName").fill("SCENIC VIEW") # Suffix CT try: page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(value="CT") except Exception: try: page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(label="CT") except Exception: pass try: page.locator("#ctl00_cphBody_tbZipCode").fill("32218") except Exception: pass # Submit page.locator("#ctl00_cphBody_bSearch").click() page.wait_for_timeout(4000) print(f"[3] After submit URL: {page.url}") body_text = page.inner_text("body")[:500] print(f" Body preview: {body_text[:400].encode('ascii', 'replace').decode('ascii')}") (out_dir / "01_results.html").write_text(page.content(), encoding="utf-8") page.screenshot(path=str(out_dir / "01_results.png"), full_page=True) # If results table, click first row to get detail page results_table = page.locator("table:has(tr:has(td))").first try: tables = page.locator("table").all() for t in tables[:10]: rows = t.locator("tr").count() if rows < 2: continue hdrs = [(h.inner_text() or "").strip().lower() for h in t.locator("tr").first.locator("th, td").all()] if any("re" in h or "parcel" in h or "owner" in h or "address" in h for h in hdrs): # First data row → click first link first_row = t.locator("tr").nth(1) link = first_row.locator("a").first if link.count() > 0: href = link.get_attribute("href") link_text = link.inner_text() print(f"[4] Clicking result link: text={link_text!r} href={href}") link.click() page.wait_for_timeout(5000) break except Exception as e: print(f" Click result error: {e}") print(f"\n[5] Detail page URL: {page.url}") (out_dir / "02_detail.html").write_text(page.content(), encoding="utf-8") page.screenshot(path=str(out_dir / "02_detail.png"), full_page=True) # Dump ALL element IDs that have text elements = page.evaluate(""" () => { const out = []; const all = document.querySelectorAll('[id]'); for (const el of all) { const txt = (el.textContent || '').trim(); if (txt && txt.length < 300 && el.children.length < 4) { out.push({id: el.id, text: txt.substring(0, 200)}); } } return out; } """) print(f"\n[6] Elements with text content: {len(elements)}\n") for e in elements: tid = e["id"] # Skip nav/utility if tid.startswith("uw-") or tid.startswith("__"): continue txt_safe = e['text'][:120].encode('ascii', 'replace').decode('ascii') print(f" #{tid:50s} = {txt_safe!r}") # All tables data print("\n\n===== TABLES =====") tables_data = page.evaluate(""" () => { const out = []; document.querySelectorAll('table').forEach((tbl, idx) => { const rows = []; for (const tr of tbl.querySelectorAll('tr')) { const cells = []; for (const c of tr.querySelectorAll('td, th')) { cells.push((c.textContent || '').trim()); } if (cells.some(c => c && c.length > 0)) rows.push(cells); } if (rows.length > 0) out.push({idx, rows}); }); return out; } """) for t in tables_data: rows = t["rows"] if not rows or len(rows) < 1: continue print(f"\n--- Table {t['idx']} ({len(rows)} rows) ---") for r in rows[:8]: line = " | ".join(c[:50] for c in r[:8])[:200] line_safe = line.encode('ascii', 'replace').decode('ascii') print(f" {line_safe}") browser.close() print(f"\n[OK] saved to {out_dir}/") if __name__ == "__main__": probe()