"""Find the exact DOM structure for HUD property cards.""" from __future__ import annotations import io, sys, time, re sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") from playwright.sync_api import sync_playwright REAL_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_context( user_agent=REAL_UA, viewport={"width": 1400, "height": 900}, ).new_page() page.set_default_timeout(30_000) page.goto("https://www.hudhomestore.gov/", wait_until="networkidle") time.sleep(2) page.goto("https://www.hudhomestore.gov/searchresult?citystate=FL", wait_until="networkidle") time.sleep(6) # Find each property card via its case # link pattern # Look for ALL elements containing a 'Case #:' label print("--- Finding all cards via case # text ---") # Use Playwright to find parents of "Case #" text # Each property card seems to contain the price, address, and case # near each other # Find via XPath the nearest div ancestor of "Case #:" text cards = page.evaluate(""" () => { const allElements = Array.from(document.querySelectorAll('*')); const cardLikeElements = new Set(); for (const el of allElements) { if (!el.children.length) continue; const text = el.textContent || ''; const hasCase = /Case #:\\s*\\d/.test(text); const hasPrice = /\\$[\\d,]+/.test(text); const hasBeds = /\\d+\\s+Beds/.test(text); if (hasCase && hasPrice && hasBeds && text.length < 600) { // Likely a card-sized element cardLikeElements.add(el); } } // Return their tag/class info return Array.from(cardLikeElements).slice(0, 5).map(el => ({ tagName: el.tagName, id: el.id || null, className: el.className || null, textPreview: (el.textContent || '').replace(/\\s+/g, ' ').trim().slice(0, 200), })); } """) print(f"Found {len(cards)} card-like elements:") for i, c in enumerate(cards): print(f" [{i}] tag={c['tagName']} class={c['className']!r}") print(f" preview: {c['textPreview']!r}") # Now find ALL such cards (no slice) all_cards_count = page.evaluate(""" () => { const allElements = Array.from(document.querySelectorAll('*')); const seen = new Set(); for (const el of allElements) { if (!el.children.length) continue; const text = el.textContent || ''; if (/Case #:\\s*\\d/.test(text) && /\\$[\\d,]+/.test(text) && /\\d+\\s+Beds/.test(text) && text.length < 600) { seen.add(el); } } return seen.size; } """) print(f"\nTotal card-like elements: {all_cards_count}") # Now look at SMALL elements (deepest containers of one card) # Find ones where text is unique to one case print("\n--- Finding the most specific 'card' element per property ---") cards_details = page.evaluate(""" () => { const allElements = Array.from(document.querySelectorAll('*')); // For each element with Case #, find the SMALLEST ancestor (deepest) that contains exactly ONE case # const caseMatches = {}; // caseNumber -> array of elements for (const el of allElements) { if (!el.children || !el.children.length) continue; const text = el.textContent || ''; const cases = (text.match(/Case #:\\s*\\d{3}-\\d{6}/g) || []); if (cases.length === 1) { const num = cases[0]; if (!caseMatches[num]) caseMatches[num] = []; caseMatches[num].push(el); } } // For each case, the smallest element is the card const cards = []; for (const [caseNum, elements] of Object.entries(caseMatches)) { // Sort by element size (smallest = most specific) elements.sort((a, b) => (a.textContent || '').length - (b.textContent || '').length); const smallest = elements[0]; cards.push({ case_number: caseNum, tagName: smallest.tagName, id: smallest.id || null, className: smallest.className || null, text_length: (smallest.textContent || '').length, text_preview: (smallest.textContent || '').replace(/\\s+/g, ' ').trim().slice(0, 250), }); } return cards.slice(0, 10); // first 10 } """) print(f"\nUnique cards found: {len(cards_details)}") for i, c in enumerate(cards_details): print(f"\n [{i}] {c['case_number']}") print(f" tag={c['tagName']} class={c['className']!r}") print(f" text_length={c['text_length']}") print(f" preview: {c['text_preview']!r}") browser.close()