120 lines
5.3 KiB
Python
120 lines
5.3 KiB
Python
"""Find the exact DOM structure for HUD property cards."""
|
|
from __future__ import annotations
|
|
import io, sys, time, re
|
|
# Force UTF-8 output, substituting characters that cannot be encoded, so that
# print() of scraped text never raises UnicodeEncodeError on consoles with a
# narrower encoding.
# reconfigure() changes the existing stream in place and leaves its other
# settings (such as line buffering) as they were. Wrapping the raw buffer in a
# new TextIOWrapper is kept only as a fallback for stdout objects that do not
# provide reconfigure().
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
except AttributeError:
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", errors="replace"
    )
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Desktop Chrome-on-Windows user-agent string, supplied as `user_agent` when
# the browser context is created.
# NOTE(review): presumably used so the site does not see the default headless
# user agent -- confirm.
REAL_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
|
|
|
|
HOME_URL = "https://www.hudhomestore.gov/"
SEARCH_URL = "https://www.hudhomestore.gov/searchresult?citystate=FL"

# How many entries each report prints. Totals are always computed over every
# matching element on the page, not just the printed ones.
CANDIDATE_SAMPLE_SIZE = 5
CARD_DETAIL_LIMIT = 10

# Scans every element that has child elements and keeps those whose text
# contains a "Case #:" label followed by a digit, a dollar amount and an
# "N Beds" figure, and whose text is shorter than 600 characters.
# Called with the sample size; returns {total, sample: [...]}.
CARD_SCAN_JS = r"""
(sampleSize) => {
    const matches = [];
    for (const el of document.querySelectorAll('*')) {
        if (!el.children.length) continue;
        const text = el.textContent || '';
        // Heuristic size cap: likely a card-sized element.
        if (text.length >= 600) continue;
        if (/Case #:\s*\d/.test(text)
                && /\$[\d,]+/.test(text)
                && /\d+\s+Beds/.test(text)) {
            matches.push(el);
        }
    }
    return {
        total: matches.length,
        sample: matches.slice(0, sampleSize).map(el => ({
            tagName: el.tagName,
            id: el.id || null,
            className: el.className || null,
            textPreview: (el.textContent || '').replace(/\s+/g, ' ').trim().slice(0, 200),
        })),
    };
}
"""

# For every element whose text mentions exactly one "Case #: ddd-dddddd"
# label, remembers the element with the shortest text per label. On equal
# length the element that comes first in document order wins.
# Called with the output limit; returns {total, cards: [...]}.
SMALLEST_CARD_JS = r"""
(limit) => {
    // case label -> {el, text} of the shortest-text element seen so far
    const smallestByCase = new Map();
    for (const el of document.querySelectorAll('*')) {
        if (!el.children.length) continue;
        const text = el.textContent || '';
        const labels = text.match(/Case #:\s*\d{3}-\d{6}/g) || [];
        if (labels.length !== 1) continue;
        const best = smallestByCase.get(labels[0]);
        if (!best || text.length < best.text.length) {
            smallestByCase.set(labels[0], {el, text});
        }
    }
    const cards = Array.from(smallestByCase, ([caseLabel, {el, text}]) => ({
        case_number: caseLabel,
        tagName: el.tagName,
        id: el.id || null,
        className: el.className || null,
        text_length: text.length,
        text_preview: text.replace(/\s+/g, ' ').trim().slice(0, 250),
    }));
    return {total: cards.length, cards: cards.slice(0, limit)};
}
"""


def truncation_note(shown: int, total: int) -> str:
    """Return a " (showing first N)" suffix when fewer than *total* items are shown.

    Returns an empty string when nothing was cut off.
    """
    if shown < total:
        return f" (showing first {shown})"
    return ""


def format_candidate(index: int, card: dict[str, object]) -> list[str]:
    """Return the report lines for one card-like element from ``CARD_SCAN_JS``.

    *card* must provide the keys ``tagName``, ``className`` and
    ``textPreview``.
    """
    return [
        f" [{index}] tag={card['tagName']} class={card['className']!r}",
        f" preview: {card['textPreview']!r}",
    ]


def format_card_detail(index: int, card: dict[str, object]) -> list[str]:
    """Return the report lines for one per-case card from ``SMALLEST_CARD_JS``.

    *card* must provide the keys ``case_number``, ``tagName``, ``className``,
    ``text_length`` and ``text_preview``. The first line starts with a
    newline so consecutive entries are separated by a blank line.
    """
    return [
        f"\n [{index}] {card['case_number']}",
        f" tag={card['tagName']} class={card['className']!r}",
        f" text_length={card['text_length']}",
        f" preview: {card['text_preview']!r}",
    ]


def report_card_structure(page) -> None:
    """Print what the property-card elements on *page* look like.

    *page* is a Playwright page that already shows the search results. Two
    reports are printed: a sample of all "card-like" elements, then the
    smallest enclosing element for each distinct case number.
    """
    print("--- Finding all cards via case # text ---")
    # One DOM scan yields both the sample and the total, so the two numbers
    # always describe the same page state.
    scan = page.evaluate(CARD_SCAN_JS, CANDIDATE_SAMPLE_SIZE)
    sample = scan["sample"]
    total = scan["total"]
    print(f"Found {total} card-like elements{truncation_note(len(sample), total)}:")
    for i, card in enumerate(sample):
        print("\n".join(format_candidate(i, card)))
    print(f"\nTotal card-like elements: {total}")

    print("\n--- Finding the most specific 'card' element per property ---")
    result = page.evaluate(SMALLEST_CARD_JS, CARD_DETAIL_LIMIT)
    details = result["cards"]
    unique_total = result["total"]
    print(f"\nUnique cards found: {unique_total}{truncation_note(len(details), unique_total)}")
    for i, card in enumerate(details):
        print("\n".join(format_card_detail(i, card)))


def main() -> None:
    """Load the HUD Home Store search results for FL and report the card DOM."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=REAL_UA,
                viewport={"width": 1400, "height": 900},
            )
            page = context.new_page()
            page.set_default_timeout(30_000)

            # The landing page is loaded before the search page.
            # NOTE(review): presumably to establish a session first -- confirm.
            page.goto(HOME_URL, wait_until="networkidle")
            # wait_for_timeout() rather than time.sleep(): the sync API keeps
            # processing its internal events while it waits.
            page.wait_for_timeout(2_000)
            page.goto(SEARCH_URL, wait_until="networkidle")
            page.wait_for_timeout(6_000)

            report_card_structure(page)
        finally:
            # Close the browser even when navigation or evaluation fails.
            browser.close()


if __name__ == "__main__":
    main()
|