Files
AR-House/scripts/explore_hud_card_structure.py
2026-07-03 12:24:58 -04:00

120 lines
5.3 KiB
Python

"""Find the exact DOM structure for HUD property cards."""
from __future__ import annotations
import io, sys, time, re
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
from playwright.sync_api import sync_playwright
REAL_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
# Pauses (milliseconds) after each navigation, on top of the "networkidle" wait.
HOME_SETTLE_MS = 2_000
RESULTS_SETTLE_MS = 6_000

# How many example elements each probe sends back for printing.
CARD_SAMPLE_LIMIT = 5
CASE_SAMPLE_LIMIT = 10

# Probe 1: every non-leaf element whose text is short (< 600 chars) and mentions
# a case number, a dollar amount and a bed count.  Returns the real match count
# plus the first `limit` matches, so a single DOM scan serves both uses.
CARD_LIKE_PROBE_JS = r"""
(limit) => {
    const matches = [];
    for (const el of document.querySelectorAll('*')) {
        if (!el.children.length) continue;
        const text = el.textContent || '';
        if (text.length >= 600) continue;
        if (/Case #:\s*\d/.test(text) && /\$[\d,]+/.test(text) && /\d+\s+Beds/.test(text)) {
            matches.push(el);
        }
    }
    return {
        total: matches.length,
        samples: matches.slice(0, limit).map(el => ({
            tagName: el.tagName,
            id: el.id || null,
            // getAttribute: `className` is not a plain string on SVG elements.
            className: el.getAttribute('class') || null,
            textPreview: (el.textContent || '').replace(/\s+/g, ' ').trim().slice(0, 200),
        })),
    };
}
"""

# Probe 2: for each distinct "Case #: ddd-dddddd" string, keep the element with
# the shortest text among the non-leaf elements that contain exactly one case
# number.  Ties keep the element seen first in document order.  Returns the
# number of distinct cases plus the first `limit` of them.
CARD_PER_CASE_PROBE_JS = r"""
(limit) => {
    const caseRe = /Case #:\s*\d{3}-\d{6}/g;
    const best = new Map();  // case string -> {el, text}
    for (const el of document.querySelectorAll('*')) {
        if (!el.children.length) continue;
        const text = el.textContent || '';
        const found = text.match(caseRe) || [];
        if (found.length !== 1) continue;
        const current = best.get(found[0]);
        if (!current || text.length < current.text.length) {
            best.set(found[0], {el, text});
        }
    }
    return {
        total: best.size,
        samples: Array.from(best).slice(0, limit).map(([caseNum, {el, text}]) => ({
            case_number: caseNum,
            tagName: el.tagName,
            id: el.id || null,
            className: el.getAttribute('class') || null,
            text_length: text.length,
            text_preview: text.replace(/\s+/g, ' ').trim().slice(0, 250),
        })),
    };
}
"""

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_context(
            user_agent=REAL_UA,
            viewport={"width": 1400, "height": 900},
        ).new_page()
        page.set_default_timeout(30_000)

        # Load the home page before the search URL.
        # NOTE(review): presumably this primes session cookies - confirm.
        page.goto("https://www.hudhomestore.gov/", wait_until="networkidle")
        # wait_for_timeout rather than time.sleep: Playwright advises against
        # time.sleep with the sync API.
        page.wait_for_timeout(HOME_SETTLE_MS)
        page.goto(
            "https://www.hudhomestore.gov/searchresult?citystate=FL",
            wait_until="networkidle",
        )
        page.wait_for_timeout(RESULTS_SETTLE_MS)

        print("--- Finding all cards via case # text ---")
        card_like = page.evaluate(CARD_LIKE_PROBE_JS, CARD_SAMPLE_LIMIT)
        samples = card_like["samples"]
        print(f"Showing first {len(samples)} card-like elements:")
        for i, c in enumerate(samples):
            print(f" [{i}] tag={c['tagName']} class={c['className']!r}")
            print(f" preview: {c['textPreview']!r}")
        print(f"\nTotal card-like elements: {card_like['total']}")

        print("\n--- Finding the most specific 'card' element per property ---")
        per_case = page.evaluate(CARD_PER_CASE_PROBE_JS, CASE_SAMPLE_LIMIT)
        cards_details = per_case["samples"]
        # Report the true number of distinct cases, not the size of the
        # truncated sample list.
        print(
            f"\nUnique cards found: {per_case['total']} "
            f"(showing first {len(cards_details)})"
        )
        for i, c in enumerate(cards_details):
            print(f"\n [{i}] {c['case_number']}")
            print(f" tag={c['tagName']} class={c['className']!r}")
            print(f" text_length={c['text_length']}")
            print(f" preview: {c['text_preview']!r}")
    finally:
        # Close the browser even when a navigation or probe raises.
        browser.close()