79 lines
2.9 KiB
Python
79 lines
2.9 KiB
Python
"""Find the deep-link URL pattern for HUD property detail pages."""
|
|
from __future__ import annotations
|
|
import io, sys, time, re
|
|
# Force UTF-8 on stdout and substitute characters that cannot be encoded
# instead of raising UnicodeEncodeError when printing scraped page text.
# Reconfiguring the existing stream in place keeps its buffering mode (line
# buffered on a terminal), so progress output appears as it is printed; the
# re-wrap is kept only as a fallback for streams without reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# User-agent string of a desktop Chrome 131 on 64-bit Windows; it is handed to
# the browser context as its user_agent. Assembled from its space-separated
# product tokens.
REAL_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/131.0.0.0",
        "Safari/537.36",
    )
)
|
|
|
|
# Diagnostic probe: load the HUD Home Store search results for Florida in
# headless Chromium and print everything that could reveal how a property card
# links to its detail page -- the first card's markup, its anchors, any
# onclick/data-* attributes, and matching attribute values in the page source.

# Number of characters of the first card's (whitespace-collapsed) markup to
# print; the section header is derived from it so the two cannot drift apart.
_HTML_PREVIEW_CHARS = 2500

# hrefs that mention a property-detail page or a case number. Matching is
# case-insensitive, so one spelling of each keyword covers every variant.
_DETAIL_HREF_RE = re.compile(
    r'href="([^"]*(?:propertydetail|casenumber)[^"]*)"',
    re.IGNORECASE,
)

# href / data-* attributes whose value contains a case number of the form
# 093-XXXXXX (hyphen optional). The attribute name may itself contain hyphens
# or digits (e.g. data-case-number), hence [\w-]+ rather than letters only.
_CASE_ATTR_RE = re.compile(
    r'(href|data-[\w-]+)="([^"]*093-?\d{6}[^"]*)"',
    re.IGNORECASE,
)

# Runs in the page: returns [tagName, {attribute name: value}] for one element
# so both pieces of information arrive in a single round-trip.
_DESCRIBE_ELEMENT_JS = """(el) => {
    const attrs = {};
    for (const a of el.attributes) attrs[a.name] = a.value;
    return [el.tagName, attrs];
}"""

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        context = browser.new_context(
            user_agent=REAL_UA,
            viewport={"width": 1400, "height": 900},
        )
        page = context.new_page()
        page.set_default_timeout(30_000)

        # Open the home page before the search results (presumably so the site
        # can set up its session first -- TODO confirm this is required).
        # wait_for_timeout() is used instead of time.sleep() because the sync
        # API needs its own event loop to keep running while we wait.
        page.goto("https://www.hudhomestore.gov/", wait_until="networkidle")
        page.wait_for_timeout(2_000)
        page.goto("https://www.hudhomestore.gov/searchresult?citystate=FL", wait_until="networkidle")
        page.wait_for_timeout(6_000)

        # Find all links inside cards
        print("=== ALL <a> elements inside property cards ===")
        cards = page.locator("div.topMap-card.card-body").all()
        print(f"Found {len(cards)} cards")

        # Inspect the FIRST card in detail
        if cards:
            first_card = cards[0]

            print()
            print(f"--- CARD #1 HTML structure (first {_HTML_PREVIEW_CHARS} chars) ---")
            card_html = first_card.evaluate("(el) => el.outerHTML")
            # Collapse runs of whitespace so the preview is one compact line.
            print(re.sub(r"\s+", " ", card_html)[:_HTML_PREVIEW_CHARS])

            print()
            print("--- CARD #1 ALL <a> hrefs ---")
            for i, anchor in enumerate(first_card.locator("a").all()[:15]):
                href = anchor.get_attribute("href") or ""
                text = (anchor.text_content() or "").strip()[:60]
                print(f" [{i}] href={href} | text='{text}'")

            # Also look for onclick handlers + data attributes
            print()
            print("--- CARD #1 elements with onclick / data-* ---")
            clickables = first_card.locator(
                "[onclick], [data-href], [data-url], [data-link], [data-property]"
            ).all()
            for el in clickables[:10]:
                tag, attrs = el.evaluate(_DESCRIBE_ELEMENT_JS)
                print(f" {tag}: {attrs}")

        # Check if there's a global pattern for property detail URLs in the page
        print()
        print("=== Looking for '/propertydetails' / '/Listing' anywhere in page ===")
        full_html = page.content()

        # De-duplicate first (keeping first-seen order), then cap at 10, so
        # repeated links do not crowd distinct ones out of the listing.
        unique_urls = list(dict.fromkeys(_DETAIL_HREF_RE.findall(full_html)))
        for u in unique_urls[:10]:
            print(f" {u}")

        # Also look for data attribs with case#
        case_links = _CASE_ATTR_RE.findall(full_html)
        print("\nLinks containing a case number (093-XXXXXX):")
        for attr, url in case_links[:8]:
            print(f" {attr}={url}")
    finally:
        # Close the browser even if navigation or a locator call raised.
        browser.close()
|