feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
"""Wait 25s + dump all element IDs that have text content."""
|
||||
from playwright.sync_api import sync_playwright
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
|
||||
# Folio used when probe() is called without arguments.
_DEFAULT_FOLIO = "484226062150"

# JS run in the page: collect every element that has an id and short text,
# together with the text of the first label-like element in its enclosing row.
_ELEMENTS_WITH_TEXT_JS = """
() => {
    const out = [];
    const all = document.querySelectorAll('[id]');
    for (const el of all) {
        const txt = (el.textContent || '').trim();
        // Only collect leaf-like elements with reasonable text
        if (txt && txt.length < 300 && el.children.length < 3) {
            // Find the closest visible label (preceding sibling td.lblRecinfoNew or label)
            let label = '';
            const parent = el.closest('tr, div.row, p, .info-row');
            if (parent) {
                const labelEl = parent.querySelector('.lblRecinfoNew, .searchTblCategory, .searchTblCategory2, label, .info-label');
                if (labelEl) label = (labelEl.textContent || '').trim().substring(0, 80);
            }
            out.push({id: el.id, text: txt.substring(0, 200), label: label});
        }
    }
    return out;
}
"""

# JS run in the page: for each <table>, keep rows that have at least one
# non-empty cell (first 15 such rows per table), tagged with the table index.
_TABLE_ROWS_JS = """
() => {
    const out = [];
    const tables = document.querySelectorAll('table');
    tables.forEach((tbl, idx) => {
        const rows = [];
        for (const tr of tbl.querySelectorAll('tr')) {
            const cells = [];
            for (const c of tr.querySelectorAll('td, th')) {
                cells.push((c.textContent || '').trim());
            }
            if (cells.some(c => c && c.length > 0)) {
                rows.push(cells);
            }
        }
        if (rows.length > 0) {
            // First row sometimes has the table identifier
            out.push({idx, rows: rows.slice(0, 15)});
        }
    });
    return out;
}
"""


def _print_elements(elements):
    """Print one line per collected element: id, label (max 30 chars), text (max 80)."""
    print(f"Elements with text content: {len(elements)}\n")
    for e in elements:
        label = (e["label"] or "")[:30]
        print(f" #{e['id']:35s} [{label:30s}] = {e['text'][:80]!r}")


def _print_tables(tables_data):
    """Print a truncated preview (first row + up to five more) of each table."""
    print("\n\n===== TABLE DATA (rows with non-empty cells) =====")
    for t in tables_data:
        rows = t["rows"]
        if not rows:
            continue
        first_row = " | ".join(rows[0][:6])[:120]
        print(f"\n--- Table {t['idx']} ({len(rows)} rows) ---")
        print(f" Header/R0: {first_row}")
        for r in rows[1:6]:
            line = " | ".join(c[:35] for c in r[:6])[:140]
            print(f" R: {line}")


def probe(folio: str = _DEFAULT_FOLIO, wait_seconds: float = 25.0) -> None:
    """Load the web.bcpa.net Record-Search page for *folio* and dump what rendered.

    After navigating, the function waits a fixed ``wait_seconds`` so the
    client-side page can finish rendering, then:

    * writes the rendered HTML to ``FINAL_rendered.html`` and a full-page
      screenshot to ``FINAL_rendered.png`` under ``<repo>/_probe_out/bcpa``
      (two directories above this file);
    * prints every element that has an ``id`` and short text content;
    * prints a preview of every table that has non-empty rows.

    Args:
        folio: Value placed in the ``folio`` query parameter of the URL.
        wait_seconds: How long to wait after ``domcontentloaded`` before
            capturing the page.

    Raises:
        Whatever Playwright raises on launch/navigation failure (for example
        a timeout if the page does not reach ``domcontentloaded`` in 20 s).
    """
    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"
    out_dir = Path(__file__).parent.parent / "_probe_out" / "bcpa"
    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError only after the long wait below.
    out_dir.mkdir(parents=True, exist_ok=True)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131"
            ).new_page()

            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Playwright's sync API recommends wait_for_timeout over
            # time.sleep so its internal event processing keeps running.
            page.wait_for_timeout(wait_seconds * 1000)

            # Save full rendered HTML
            (out_dir / "FINAL_rendered.html").write_text(page.content(), encoding="utf-8")
            page.screenshot(path=str(out_dir / "FINAL_rendered.png"), full_page=True)

            # Extract ALL elements with id attribute that have text content
            _print_elements(page.evaluate(_ELEMENTS_WITH_TEXT_JS))

            # Also extract table data — populated cells
            _print_tables(page.evaluate(_TABLE_ROWS_JS))
        finally:
            # Close the browser even if navigation, capture or evaluation fails.
            browser.close()
|
||||
|
||||
|
||||
# Script entry point: run the probe only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    probe()
|
||||
Reference in New Issue
Block a user