"""Find the deep-link URL pattern for HUD property detail pages."""
from __future__ import annotations
import io, sys, time, re
# Re-wrap stdout's underlying byte buffer as UTF-8 with errors="replace", so
# print() emits a replacement marker for any character that cannot be encoded
# instead of raising UnicodeEncodeError.
# NOTE(review): presumably a workaround for consoles whose default encoding is
# not UTF-8 (e.g. Windows code pages) — confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
from playwright.sync_api import sync_playwright
# User-agent string (Chrome 131 on 64-bit Windows 10) passed to the browser
# context below in place of the automation default.
# NOTE(review): presumably chosen so the site serves the same markup it would
# to a regular desktop browser — the motivation is not stated in this file.
REAL_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)
# Exploration script body: open the HUD Home Store Florida search results in
# headless Chromium, dump the first property card (HTML, anchors, clickable
# attributes), then scan the whole page source for URL patterns that could be
# deep links to a property detail page. Everything is printed to stdout.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_context(
            user_agent=REAL_UA,
            viewport={"width": 1400, "height": 900},
        ).new_page()
        page.set_default_timeout(30_000)

        # Visit the home page before the search results.
        # NOTE(review): presumably to pick up session state/cookies first — confirm.
        # Fixed pauses use page.wait_for_timeout() rather than time.sleep():
        # the sync API relies on Playwright's own event processing, which a
        # plain sleep blocks.
        page.goto("https://www.hudhomestore.gov/", wait_until="networkidle")
        page.wait_for_timeout(2_000)
        page.goto(
            "https://www.hudhomestore.gov/searchresult?citystate=FL",
            wait_until="networkidle",
        )
        page.wait_for_timeout(6_000)

        # Collect every property card rendered on the results page.
        print("=== ALL elements inside property cards ===")
        cards = page.locator("div.topMap-card.card-body").all()
        print(f"Found {len(cards)} cards")

        # Inspect only the FIRST card in detail.
        if cards:
            first_card = cards[0]

            # The header label is derived from the same limit as the slice so
            # the two cannot disagree.
            preview_chars = 2500
            print()
            print(f"--- CARD #1 HTML structure (first {preview_chars} chars) ---")
            card_html = first_card.evaluate("(el) => el.outerHTML")
            # Collapse whitespace runs to single spaces so the dump stays compact.
            print(re.sub(r"\s+", " ", card_html)[:preview_chars])

            print()
            print("--- CARD #1 ALL hrefs ---")
            anchors = first_card.locator("a").all()
            for i, a in enumerate(anchors[:15]):
                href = a.get_attribute("href") or ""
                text = (a.text_content() or "").strip()[:60]
                print(f" [{i}] href={href} | text='{text}'")

            # Links may be wired up through onclick handlers or data-*
            # attributes instead of plain anchors, so dump those as well.
            print()
            print("--- CARD #1 elements with onclick / data-* ---")
            clickables = first_card.locator(
                "[onclick], [data-href], [data-url], [data-link], [data-property]"
            ).all()
            for el in clickables[:10]:
                # One round trip returns both the tag name and all attributes.
                tag, attrs = el.evaluate(
                    """(el) => {
                        const out = {};
                        for (const a of el.attributes) out[a.name] = a.value;
                        return [el.tagName, out];
                    }"""
                )
                print(f" {tag}: {attrs}")

        # Search the full page source for hrefs that look like detail-page URLs.
        print()
        print("=== Looking for '/propertydetails' / '/Listing' anywhere in page ===")
        full_html = page.content()
        urls = re.findall(
            r'href="([^"]*(?:propertydetail|propertyDetail|listing/PropertyDetail|case[Nn]umber)[^"]*)"',
            full_html, re.IGNORECASE,
        )
        # De-duplicate first (keeping first-seen order), then cap at 10, so up
        # to 10 *distinct* URLs are shown in a stable order.
        for u in list(dict.fromkeys(urls))[:10]:
            print(f" {u}")

        # Any href or data-* attribute whose value contains "093", an optional
        # hyphen, and six digits.
        # NOTE(review): the "093" prefix is hard-coded; presumably it is the
        # case-number prefix seen for these listings — confirm.
        case_links = re.findall(
            r'(href|data-[a-z]+)="([^"]*093-?\d{6}[^"]*)"',
            full_html, re.IGNORECASE,
        )
        print("\nLinks containing a case number (093-XXXXXX):")
        for attr, url in case_links[:8]:
            print(f" {attr}={url}")
    finally:
        # Close the browser explicitly even if a navigation or locator call raised.
        browser.close()