feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,141 @@
|
||||
"""Probe Duval Property Appraiser detail page — map ALL extractable fields.
|
||||
|
||||
Test address: 2352 Scenic View Ct, Jacksonville, FL 32218 (user's bug report).
|
||||
"""
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
|
||||
def probe():
    """Drive the Duval Property Appraiser search and dump the detail page.

    Steps, mirrored by the numbered console output:

    1. Load the basic search page.
    2. Fill the address form for 2352 SCENIC VIEW CT, 32218.
    3. Submit, then save the results page as ``01_results.html`` / ``.png``.
    4. Click the first link in the first data row of a results-like table.
    5. Save the resulting page as ``02_detail.html`` / ``.png``.
    6. Print every element that has an ``id`` and short text, then the
       first rows of every table on the page.

    Artifacts are written to ``_probe_out/duval_pa`` under the parent of this
    file's directory; the directory is created when missing.

    Returns:
        None. Results are reported on stdout and as files on disk.

    Raises:
        playwright.sync_api.Error: if loading the search page, filling the
            mandatory street fields, or submitting the form fails. Optional
            fields and the result click are best-effort and only print a
            notice.
    """
    from playwright.sync_api import Error as PlaywrightError
    from playwright.sync_api import sync_playwright

    out_dir = Path(__file__).parent.parent / "_probe_out" / "duval_pa"
    out_dir.mkdir(parents=True, exist_ok=True)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # try/finally so the browser is closed even when a step below raises.
        try:
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131",
            )
            page = ctx.new_page()

            # Step 1: search page
            print("[1] Loading search page...")
            page.goto(
                "https://paopropertysearch.coj.net/Basic/Search.aspx",
                wait_until="networkidle",
                timeout=20000,
            )
            print(f" URL: {page.url}")

            # Step 2: fill address -- 2352 SCENIC VIEW CT (no street prefix)
            print("[2] Filling form (2352 SCENIC VIEW CT 32218)...")
            page.locator("#ctl00_cphBody_tbStreetNumber").fill("2352")
            page.locator("#ctl00_cphBody_tbStreetName").fill("SCENIC VIEW")

            # Suffix CT: try the option value first, then the visible label.
            # Best-effort: the search is still submitted without a suffix.
            suffix = page.locator("#ctl00_cphBody_ddStreetSuffix")
            try:
                suffix.select_option(value="CT")
            except PlaywrightError:
                try:
                    suffix.select_option(label="CT")
                except PlaywrightError as exc:
                    print(f" Suffix CT not selected (continuing): {_ascii_safe(str(exc))}")

            # ZIP code is optional as well; report instead of hiding a failure.
            try:
                page.locator("#ctl00_cphBody_tbZipCode").fill("32218")
            except PlaywrightError as exc:
                print(f" Zip code not filled (continuing): {_ascii_safe(str(exc))}")

            # Submit
            page.locator("#ctl00_cphBody_bSearch").click()
            page.wait_for_timeout(4000)
            print(f"[3] After submit URL: {page.url}")
            body_text = page.inner_text("body")[:400]
            print(f" Body preview: {_ascii_safe(body_text)}")
            (out_dir / "01_results.html").write_text(page.content(), encoding="utf-8")
            page.screenshot(path=str(out_dir / "01_results.png"), full_page=True)

            # If there is a results table, click the first row's link to reach
            # the detail page. Only the first 10 tables are inspected.
            try:
                for table in page.locator("table").all()[:10]:
                    table_rows = table.locator("tr")
                    if table_rows.count() < 2:
                        continue
                    headers = [
                        (cell.inner_text() or "").strip().lower()
                        for cell in table_rows.first.locator("th, td").all()
                    ]
                    # NOTE(review): "re" is a plain substring test, so it also
                    # matches headers such as "street"; kept as in the original
                    # probe -- tighten once the real header text is confirmed.
                    if not any(
                        key in header
                        for header in headers
                        for key in ("re", "parcel", "owner", "address")
                    ):
                        continue
                    # First data row -> first link
                    link = table_rows.nth(1).locator("a").first
                    if link.count() == 0:
                        continue
                    href = link.get_attribute("href")
                    link_text = link.inner_text()
                    print(f"[4] Clicking result link: text={link_text!r} href={href}")
                    link.click()
                    page.wait_for_timeout(5000)
                    break
            except Exception as e:
                # Best-effort boundary: whatever page we ended on is still
                # dumped below, so report the problem and carry on.
                print(f" Click result error: {e}")

            print(f"\n[5] Detail page URL: {page.url}")
            (out_dir / "02_detail.html").write_text(page.content(), encoding="utf-8")
            page.screenshot(path=str(out_dir / "02_detail.png"), full_page=True)

            # Dump ALL element IDs that have text: non-empty text shorter than
            # 300 characters on elements with fewer than 4 child elements.
            elements = page.evaluate("""
                () => {
                    const out = [];
                    const all = document.querySelectorAll('[id]');
                    for (const el of all) {
                        const txt = (el.textContent || '').trim();
                        if (txt && txt.length < 300 && el.children.length < 4) {
                            out.push({id: el.id, text: txt.substring(0, 200)});
                        }
                    }
                    return out;
                }
            """)

            print(f"\n[6] Elements with text content: {len(elements)}\n")
            for e in elements:
                tid = e["id"]
                # Skip nav/utility
                if tid.startswith(("uw-", "__")):
                    continue
                txt_safe = _ascii_safe(e["text"][:120])
                print(f" #{tid:50s} = {txt_safe!r}")

            # All tables data: one entry per table that has a non-empty row.
            print("\n\n===== TABLES =====")
            tables_data = page.evaluate("""
                () => {
                    const out = [];
                    document.querySelectorAll('table').forEach((tbl, idx) => {
                        const rows = [];
                        for (const tr of tbl.querySelectorAll('tr')) {
                            const cells = [];
                            for (const c of tr.querySelectorAll('td, th')) {
                                cells.push((c.textContent || '').trim());
                            }
                            if (cells.some(c => c && c.length > 0)) rows.push(cells);
                        }
                        if (rows.length > 0) out.push({idx, rows});
                    });
                    return out;
                }
            """)

            for t in tables_data:
                rows = t["rows"]
                if not rows:
                    continue
                print(f"\n--- Table {t['idx']} ({len(rows)} rows) ---")
                # Preview only: first 8 rows, first 8 cells, 50 chars per cell,
                # 200 chars per printed line.
                for r in rows[:8]:
                    line = " | ".join(c[:50] for c in r[:8])[:200]
                    print(f" {_ascii_safe(line)}")
        finally:
            browser.close()

    print(f"\n[OK] saved to {out_dir}/")


def _ascii_safe(text):
    """Return *text* with every non-ASCII character replaced by ``?``.

    Presumably used so that printing page content cannot fail on consoles
    with a narrow output encoding -- confirm against the target terminal.
    """
    return text.encode("ascii", "replace").decode("ascii")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the probe only when this file is executed as a script, not on import.
    probe()
|
||||
Reference in New Issue
Block a user