feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
"""Find the exact DOM structure for HUD property cards."""
|
||||
from __future__ import annotations
|
||||
import io, sys, time, re
|
||||
# Emit UTF-8 on stdout and replace anything that cannot be encoded, so printing
# scraped page text does not raise UnicodeEncodeError on consoles whose default
# encoding is narrower. reconfigure() (Python 3.7+) adjusts the existing stream
# in place and keeps its other settings such as line buffering, which wrapping
# sys.stdout.buffer in a brand-new TextIOWrapper would discard.
# The hasattr guard skips streams that are not TextIOWrapper instances
# (e.g. a replaced or missing stdout) instead of crashing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# Desktop Chrome-on-Windows user-agent string passed to the browser context
# (user_agent=REAL_UA) in place of Playwright's default one.
REAL_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
|
||||
|
||||
# JavaScript evaluated in the page. It scans every non-leaf element and keeps
# those whose text looks like one property card: it contains a "Case #:" label
# followed by a digit, a dollar amount and an "N Beds" count, and the whole
# text is shorter than 600 characters. A single scan returns both the total
# number of matches and a description of the first five.
# Raw string: the regex backslashes reach the browser exactly as written.
_CARD_LIKE_JS = r"""
() => {
    const MAX_CARD_TEXT = 600;
    const matches = [];
    for (const el of document.querySelectorAll('*')) {
        if (!el.children.length) continue;
        const text = el.textContent || '';
        if (/Case #:\s*\d/.test(text)
                && /\$[\d,]+/.test(text)
                && /\d+\s+Beds/.test(text)
                && text.length < MAX_CARD_TEXT) {
            matches.push(el);
        }
    }
    return {
        total: matches.length,
        sample: matches.slice(0, 5).map(el => ({
            tagName: el.tagName,
            id: el.id || null,
            className: el.className || null,
            textPreview: (el.textContent || '').replace(/\s+/g, ' ').trim().slice(0, 200),
        })),
    };
}
"""

# JavaScript evaluated in the page. For every distinct "Case #: ddd-dddddd"
# string it finds the non-leaf element with the shortest text that mentions
# exactly one case number, i.e. the most specific container of that card.
# Returns the number of distinct case numbers and details of the first ten.
_CARD_PER_CASE_JS = r"""
() => {
    // caseNumber -> element with the shortest text seen so far
    const smallestByCase = new Map();
    for (const el of document.querySelectorAll('*')) {
        if (!el.children || !el.children.length) continue;
        const text = el.textContent || '';
        const found = text.match(/Case #:\s*\d{3}-\d{6}/g) || [];
        if (found.length !== 1) continue;
        const current = smallestByCase.get(found[0]);
        // Strict '<' keeps the earliest element in document order on ties.
        if (!current || text.length < (current.textContent || '').length) {
            smallestByCase.set(found[0], el);
        }
    }
    const cards = [];
    for (const [caseNum, el] of smallestByCase) {
        const text = el.textContent || '';
        cards.push({
            case_number: caseNum,
            tagName: el.tagName,
            id: el.id || null,
            className: el.className || null,
            text_length: text.length,
            text_preview: text.replace(/\s+/g, ' ').trim().slice(0, 250),
        });
    }
    return { total: cards.length, sample: cards.slice(0, 10) };
}
"""

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        context = browser.new_context(
            user_agent=REAL_UA,
            viewport={"width": 1400, "height": 900},
        )
        page = context.new_page()
        page.set_default_timeout(30_000)

        # Visit the home page before the search results.
        # NOTE(review): presumably this primes cookies/session state - confirm.
        page.goto("https://www.hudhomestore.gov/", wait_until="networkidle")
        # wait_for_timeout (milliseconds) is used instead of time.sleep so
        # Playwright keeps processing browser events while waiting.
        page.wait_for_timeout(2_000)
        page.goto(
            "https://www.hudhomestore.gov/searchresult?citystate=FL",
            wait_until="networkidle",
        )
        page.wait_for_timeout(6_000)

        # Pass 1: every element that looks card-sized (may include nested
        # ancestors/descendants of the same card).
        print("--- Finding all cards via case # text ---")
        card_like = page.evaluate(_CARD_LIKE_JS)
        cards = card_like["sample"]
        print(f"Found {len(cards)} card-like elements:")
        for i, c in enumerate(cards):
            print(f" [{i}] tag={c['tagName']} class={c['className']!r}")
            print(f" preview: {c['textPreview']!r}")

        print(f"\nTotal card-like elements: {card_like['total']}")

        # Pass 2: the smallest container per distinct case number.
        print("\n--- Finding the most specific 'card' element per property ---")
        per_case = page.evaluate(_CARD_PER_CASE_JS)
        cards_details = per_case["sample"]
        # Report the real number of distinct cases, not the truncated sample size.
        print(
            f"\nUnique cards found: {per_case['total']} "
            f"(showing first {len(cards_details)})"
        )
        for i, c in enumerate(cards_details):
            print(f"\n [{i}] {c['case_number']}")
            print(f" tag={c['tagName']} class={c['className']!r}")
            print(f" text_length={c['text_length']}")
            print(f" preview: {c['text_preview']!r}")
    finally:
        # Close the browser even when navigation or evaluation raises.
        browser.close()
|
||||
Reference in New Issue
Block a user