"""Find the deep-link URL pattern for HUD property detail pages.""" from __future__ import annotations import io, sys, time, re sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") from playwright.sync_api import sync_playwright REAL_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_context( user_agent=REAL_UA, viewport={"width": 1400, "height": 900}, ).new_page() page.set_default_timeout(30_000) page.goto("https://www.hudhomestore.gov/", wait_until="networkidle") time.sleep(2) page.goto("https://www.hudhomestore.gov/searchresult?citystate=FL", wait_until="networkidle") time.sleep(6) # Find all links inside cards print("=== ALL elements inside property cards ===") cards = page.locator("div.topMap-card.card-body").all() print(f"Found {len(cards)} cards") # Inspect the FIRST card in detail if cards: c1 = cards[0] print() print("--- CARD #1 HTML structure (first 2000 chars) ---") html = c1.evaluate("(el) => el.outerHTML") # Filter out script/style noise cleaned = re.sub(r"\s+", " ", html)[:2500] print(cleaned) print() print("--- CARD #1 ALL hrefs ---") anchors = c1.locator("a").all() for i, a in enumerate(anchors[:15]): href = a.get_attribute("href") or "" text = (a.text_content() or "").strip()[:60] print(f" [{i}] href={href} | text='{text}'") # Also look for onclick handlers + data attributes print() print("--- CARD #1 elements with onclick / data-* ---") clickables = c1.locator("[onclick], [data-href], [data-url], [data-link], [data-property]").all() for el in clickables[:10]: attrs = page.evaluate("""(el) => { const out = {}; for (const a of el.attributes) out[a.name] = a.value; return out; }""", el.element_handle()) print(f" {el.evaluate('(el) => el.tagName')}: {attrs}") # Check if there's a global pattern for property detail URLs in the page print() print("=== Looking for '/propertydetails' / '/Listing' anywhere in page ===") full_html = page.content() # Find href patterns urls = re.findall( r'href="([^"]*(?:propertydetail|propertyDetail|listing/PropertyDetail|case[Nn]umber)[^"]*)"', full_html, re.IGNORECASE, ) for u in set(urls[:10]): print(f" {u}") # Also look for data attribs with case# case_links = re.findall( r'(href|data-[a-z]+)="([^"]*093-?\d{6}[^"]*)"', full_html, re.IGNORECASE, ) print(f"\nLinks containing a case number (093-XXXXXX):") for attr, url in case_links[:8]: print(f" {attr}={url}") browser.close()