191 lines
7.6 KiB
Python
191 lines
7.6 KiB
Python
"""Explore HUD Homestore (hudhomestore.gov) site structure with Playwright.
|
|
|
|
Goals:
|
|
1. Find URL pattern for FL state results
|
|
2. Find the search form input names/IDs
|
|
3. Document what fields each listing shows in results
|
|
4. Find URL pattern for individual property detail pages
|
|
"""
|
|
from __future__ import annotations
|
|
import io, sys, time
|
|
# Force UTF-8 output with replacement of unencodable characters, so printing
# scraped page text cannot raise UnicodeEncodeError on consoles that use a
# legacy code page.
#
# Reconfigure the existing stream in place where possible: a second
# TextIOWrapper over the same buffer is not line-buffered (progress output
# would appear late) and closes the shared buffer if it is ever
# garbage-collected, e.g. after a harness swaps sys.stdout back.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
else:
    # Fallback for stream objects without reconfigure(): wrap the raw buffer.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# User-Agent string of a real desktop Chrome build, sent instead of the
# browser's default one. Rationale from the original author: federal sites
# often block non-standard UAs.
REAL_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
# JavaScript evaluated in the page: copy every attribute of one element into
# a plain object so it can be returned to Python as a dict.
_ELEMENT_ATTRS_JS = """(el) => {
    const out = {};
    for (const a of el.attributes) out[a.name] = a.value;
    return out;
}"""

_LANDING_URL = "https://www.hudhomestore.gov/"


def _banner(title):
    """Print *title* framed by two 70-character rules."""
    print("=" * 70)
    print(title)
    print("=" * 70)


def _status_of(response):
    """Return the HTTP status of a navigation response, or None if there is none.

    Playwright documents that ``page.goto`` may return ``None`` (navigation to
    ``about:blank`` or to the same URL with a different hash), so the status
    must not be read unconditionally.
    """
    return None if response is None else response.status


def _save_html(path, html):
    """Write *html* to *path* as UTF-8 text."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(html)


def _dump_elements(page, selector, limit, count_options=False):
    """Print the attributes of the first *limit* elements matching *selector*.

    When *count_options* is true, the number of ``<option>`` children is
    appended to each line (used for ``<select>`` elements).
    """
    for i, element in enumerate(page.locator(selector).all()[:limit]):
        try:
            attrs = page.evaluate(_ELEMENT_ATTRS_JS, element.element_handle())
            if count_options:
                opts_count = element.locator("option").count()
                print(f" [{i}] {attrs} ({opts_count} options)")
            else:
                print(f" [{i}] {attrs}")
        except Exception as exc:
            # Best-effort dump: one unreadable element must not stop the
            # survey, but say so instead of dropping it silently.
            print(f" [{i}] skipped ({type(exc).__name__})")


def _print_key_links(page):
    """Print landing-page links whose href or text hints at search/results pages."""
    href_keywords = ("search", "result", "state", "property")
    text_keywords = ("search", "browse", "list", "view all")
    for link in page.locator("a").all()[:80]:
        try:
            href = link.get_attribute("href") or ""
            text = (link.text_content() or "").strip()[:80]
        except Exception as exc:
            print(f" link skipped ({type(exc).__name__})")
            continue
        # Ignore empty, script-only and in-page anchors: they carry no URL pattern.
        if not href or href.startswith(("javascript:", "#")):
            continue
        href_lower = href.lower()
        text_lower = text.lower()
        if (any(kw in href_lower for kw in href_keywords)
                or any(kw in text_lower for kw in text_keywords)):
            print(f" href={href} | text='{text}'")


def _print_detail_links(page):
    """Print links on the results page that look like property detail URLs."""
    for link in page.locator("a").all()[:50]:
        try:
            href = link.get_attribute("href") or ""
            href_lower = href.lower()
            if not any(kw in href_lower for kw in ("property", "case", "listing")):
                continue
            if "javascript" in href:
                continue
            text = (link.text_content() or "").strip()[:60]
            print(f" href={href} | text='{text}'")
        except Exception as exc:
            print(f" link skipped ({type(exc).__name__})")


def _select_florida(page):
    """Try to choose Florida in the state dropdown; return True on success.

    First tries selects whose name/id mentions "State" using the option value
    ``FL``; if none works, tries every ``<select>`` using the option label
    ``Florida``.
    """
    for selector in ("select[name*='State']", "select[id*='State']", "select[id*='state']"):
        try:
            if page.locator(selector).count() > 0:
                page.locator(selector).first.select_option(value="FL")
                print(f" State filled with 'FL' via {selector}")
                return True
        except Exception as exc:
            print(f" could not select 'FL' via {selector} ({type(exc).__name__})")

    try:
        dropdowns = page.locator("select").all()
    except Exception as exc:
        print(f" could not enumerate selects ({type(exc).__name__})")
        return False
    for dropdown in dropdowns:
        try:
            dropdown.select_option(label="Florida")
        except Exception:
            # Expected for any dropdown that is not the state selector.
            continue
        print(" State filled with 'Florida' label")
        return True
    return False


def _inspect_results(page):
    """Summarise the results page: table/card counts, sample rows, detail links."""
    tables = page.locator("table").all()
    print(f" tables found: {len(tables)}")
    cards = page.locator(".propertyResult, .property-result, .listing, [class*='listing']").all()
    print(f" card-like elements: {len(cards)}")

    # If the page is table-based, show the first 3 rows of the first 3 tables.
    for i, table in enumerate(tables[:3]):
        rows = table.locator("tr").all()
        print(f" Table [{i}]: {len(rows)} rows")
        for j, row in enumerate(rows[:3]):
            cells = [(c.text_content() or "").strip()[:50] for c in row.locator("td, th").all()]
            non_empty = [c for c in cells if c]
            if non_empty:
                print(f" Row {j}: {non_empty}")

    print()
    print(" Property detail links (sample):")
    _print_detail_links(page)


def main() -> None:
    """Run the three-step exploration of hudhomestore.gov and print findings.

    Step 1 loads the landing page, saves its HTML to
    ``scripts/_hud_landing.html`` and lists inputs, selects and promising
    links. Step 2 probes a few guessed URLs. Step 3 reloads the landing page,
    tries to select Florida, submits the search form, saves the result to
    ``scripts/_hud_results_fl.html`` and summarises it.

    Everything is reported on stdout; nothing is returned. Errors in Step 1
    navigation or file writing propagate; later steps report errors and
    continue.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=REAL_UA,
                viewport={"width": 1400, "height": 900},
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = context.new_page()
            page.set_default_timeout(30_000)

            # Step 1: load the landing page.
            _banner("Step 1: landing page")
            response = page.goto(_LANDING_URL, wait_until="networkidle", timeout=30_000)
            print(f" status: {_status_of(response)}")
            print(f" url: {page.url}")
            print(f" title: {page.title()}")
            time.sleep(2)  # extra settle time after network idle

            landing_html = page.content()
            _save_html("scripts/_hud_landing.html", landing_html)
            print(f" HTML saved: scripts/_hud_landing.html ({len(landing_html):,} chars)")

            print()
            print("--- INPUTS (first 30) ---")
            _dump_elements(page, "input", 30)

            # The state selector is likely among these.
            print()
            print("--- SELECTS ---")
            _dump_elements(page, "select", 10, count_options=True)

            print()
            print("--- KEY LINKS (search, results, etc) ---")
            _print_key_links(page)

            # Step 2: probe guessed URL paths directly.
            print()
            _banner("Step 2: probe direct URLs")
            urls_to_try = [
                "https://www.hudhomestore.gov/searchresult",
                "https://www.hudhomestore.gov/SearchResult",
                "https://www.hudhomestore.gov/Search",
                "https://www.hudhomestore.gov/Listing/PropertyDetails",
            ]
            for url in urls_to_try:
                try:
                    r = page.goto(url, wait_until="domcontentloaded", timeout=15_000)
                    print(f" {url}: status={_status_of(r)}, final={page.url}, title={page.title()}")
                except Exception as e:
                    print(f" {url}: ERROR {e}")

            # Step 3: search for FL through the landing-page form.
            print()
            _banner("Step 3: search FL via form")
            page.goto(_LANDING_URL, wait_until="networkidle", timeout=30_000)
            time.sleep(2)

            if not _select_florida(page):
                # The search is still submitted, as before; make the gap visible.
                print(" State selector not filled; submitting search without it")

            try:
                page.locator("button:has-text('Search'), input[type='submit']").first.click()
                page.wait_for_load_state("networkidle", timeout=15_000)
                time.sleep(2)
                print(f" Search submitted. URL after: {page.url}")
                results_html = page.content()
                _save_html("scripts/_hud_results_fl.html", results_html)
                print(f" Results HTML saved: scripts/_hud_results_fl.html ({len(results_html):,} chars)")
                _inspect_results(page)
            except Exception as e:
                print(f" Search submit ERROR: {e}")
        finally:
            # Close the browser even when a step above raises.
            browser.close()
|
|
|
|
|
|
# Run the exploration only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|