# File: AR-House/scripts/explore_hudhomestore.py
# Snapshot: 2026-07-03 12:24:58 -04:00 (191 lines, 7.6 KiB, Python)

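"""Explore the HUD Homestore (hudhomestore.gov) site structure with Playwright.

Goals:
1. Find the URL pattern for FL state results
2. Find the search form's input names/IDs
3. Document which fields each listing shows in the results
4. Find the URL pattern for individual property detail pages

HTML snapshots of the landing page and of the search results are written to
scripts/_hud_landing.html and scripts/_hud_results_fl.html for offline review.
"""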
from __future__ import annotations
import io, sys, time
# Emit UTF-8 with replacement characters so that printing scraped page text
# cannot raise UnicodeEncodeError on consoles with a narrower encoding.
# reconfigure() changes the existing stream in place and leaves its other
# settings (notably line buffering on a terminal) untouched, so progress
# output still appears promptly; re-wrapping the buffer is only a fallback
# for stdout replacements that lack reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", errors="replace"
    )
from playwright.sync_api import sync_playwright
# User-Agent string of a real desktop Chrome build. The original author notes
# that federal sites often block non-standard UAs, hence the impersonation.
REAL_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0",
        "Safari/537.36",
    )
)
_BASE_URL = "https://www.hudhomestore.gov/"

# Grace period (ms) for late-running page scripts after a load completes.
_SETTLE_MS = 2_000

# JavaScript callback that flattens an element's attributes into a plain object.
_ATTRS_JS = """(el) => {
    const out = {};
    for (const a of el.attributes) out[a.name] = a.value;
    return out;
}"""

_NAV_HREF_KEYWORDS = ("search", "result", "state", "property")
_NAV_TEXT_KEYWORDS = ("search", "browse", "list", "view all")
_DETAIL_HREF_KEYWORDS = ("property", "case", "listing")


def _banner(title: str) -> None:
    """Print *title* framed by two 70-character rules."""
    print("=" * 70)
    print(title)
    print("=" * 70)


def _status_of(response):
    """Return the HTTP status of *response*, or "n/a" if goto() gave None."""
    return response.status if response is not None else "n/a"


def _save_html(page, path: str) -> int:
    """Write the page's current HTML to *path*; return its length in chars."""
    from pathlib import Path  # local import keeps this block self-contained

    html = page.content()
    target = Path(path)
    # The path is relative to the working directory; create the folder so a
    # run from an unexpected directory does not abort after the page loaded.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(html, encoding="utf-8")
    return len(html)


def _dump_elements(page, selector: str, limit: int,
                   count_options: bool = False) -> None:
    """Print the attributes of the first *limit* elements matching *selector*."""
    for idx, element in enumerate(page.locator(selector).all()[:limit]):
        try:
            attrs = element.evaluate(_ATTRS_JS)
            suffix = ""
            if count_options:
                n_options = element.locator("option").count()
                suffix = f" ({n_options} options)"
            print(f" [{idx}] {attrs}{suffix}")
        except Exception as exc:
            # Best effort: one unreadable element must not stop the dump,
            # but report it instead of hiding it.
            print(f" [{idx}] <attributes unavailable: {exc}>")


def _is_nav_link(href: str, text: str) -> bool:
    """Return True for real links whose href or text hints at search/results."""
    if not href or href.startswith(("javascript:", "#")):
        return False
    href_l = href.lower()
    text_l = text.lower()
    return (any(kw in href_l for kw in _NAV_HREF_KEYWORDS)
            or any(kw in text_l for kw in _NAV_TEXT_KEYWORDS))


def _is_detail_link(href: str, _text: str = "") -> bool:
    """Return True for non-javascript hrefs that look like property pages."""
    if "javascript" in href:
        return False
    href_l = href.lower()
    return any(kw in href_l for kw in _DETAIL_HREF_KEYWORDS)


def _dump_links(page, limit: int, text_width: int, keep) -> None:
    """Print href/text of the first *limit* anchors accepted by *keep*."""
    for link in page.locator("a").all()[:limit]:
        try:
            href = link.get_attribute("href") or ""
            text = (link.text_content() or "").strip()[:text_width]
        except Exception as exc:
            print(f" <link skipped: {exc}>")
            continue
        if keep(href, text):
            print(f" href={href} | text='{text}'")


def _dump_tables(page) -> None:
    """Print table/card counts and the first rows of up to three tables."""
    tables = page.locator("table").all()
    print(f" tables found: {len(tables)}")
    card_count = page.locator(
        ".propertyResult, .property-result, .listing, [class*='listing']"
    ).count()
    print(f" card-like elements: {card_count}")
    for t_idx, table in enumerate(tables[:3]):
        rows = table.locator("tr").all()
        print(f" Table [{t_idx}]: {len(rows)} rows")
        for r_idx, row in enumerate(rows[:3]):
            cells = [(cell.text_content() or "").strip()[:50]
                     for cell in row.locator("td, th").all()]
            filled = [cell for cell in cells if cell]
            if filled:
                print(f" Row {r_idx}: {filled}")


def _select_florida(page) -> bool:
    """Try to choose Florida in a state dropdown; return True on success."""
    # First attempt: dropdowns whose name/id mentions "state", by value "FL".
    for selector in ("select[name*='State']", "select[id*='State']",
                     "select[id*='state']"):
        try:
            dropdown = page.locator(selector)
            if dropdown.count() > 0:
                dropdown.first.select_option(value="FL")
                print(f" State filled with 'FL' via {selector}")
                return True
        except Exception as exc:
            print(f" State select via {selector} failed: {exc}")

    # Fallback: any dropdown that offers an option labelled "Florida".
    try:
        dropdowns = page.locator("select").all()
    except Exception as exc:
        print(f" Could not enumerate select elements: {exc}")
        return False
    for dropdown in dropdowns:
        try:
            dropdown.select_option(label="Florida")
        except Exception:
            # Expected for every dropdown that is not the state selector.
            continue
        print(" State filled with 'Florida' label")
        return True
    return False


def _explore_landing(page) -> None:
    """Step 1: load the landing page, snapshot it, list inputs/selects/links."""
    _banner("Step 1: landing page")
    response = page.goto(_BASE_URL, wait_until="networkidle", timeout=30_000)
    print(f" status: {_status_of(response)}")
    print(f" url: {page.url}")
    print(f" title: {page.title()}")
    # Use Playwright's own timer rather than time.sleep() so its event loop
    # keeps running while we wait.
    page.wait_for_timeout(_SETTLE_MS)

    size = _save_html(page, "scripts/_hud_landing.html")
    print(f" HTML saved: scripts/_hud_landing.html ({size:,} chars)")

    print()
    print("--- INPUTS (first 30) ---")
    _dump_elements(page, "input", 30)

    print()
    print("--- SELECTS ---")
    _dump_elements(page, "select", 10, count_options=True)

    print()
    print("--- KEY LINKS (search, results, etc) ---")
    _dump_links(page, 80, 80, _is_nav_link)


def _probe_direct_urls(page) -> None:
    """Step 2: request a few guessed URLs and report status/redirect/title."""
    print()
    _banner("Step 2: probe direct URLs")
    candidates = (
        "https://www.hudhomestore.gov/searchresult",
        "https://www.hudhomestore.gov/SearchResult",
        "https://www.hudhomestore.gov/Search",
        "https://www.hudhomestore.gov/Listing/PropertyDetails",
    )
    for url in candidates:
        try:
            resp = page.goto(url, wait_until="domcontentloaded", timeout=15_000)
            print(f" {url}: status={_status_of(resp)}, final={page.url}, "
                  f"title={page.title()}")
        except Exception as exc:
            print(f" {url}: ERROR {exc}")


def _search_florida(page) -> None:
    """Step 3: submit the landing-page search for FL and inspect the results."""
    print()
    _banner("Step 3: search FL via form")
    page.goto(_BASE_URL, wait_until="networkidle", timeout=30_000)
    page.wait_for_timeout(_SETTLE_MS)

    if not _select_florida(page):
        # The results file is named "_fl", so make it obvious when the
        # search was actually submitted without a state filter.
        print(" WARNING: no state dropdown accepted FL/Florida; "
              "submitting the search unfiltered")

    try:
        page.locator(
            "button:has-text('Search'), input[type='submit']"
        ).first.click()
        page.wait_for_load_state("networkidle", timeout=15_000)
        page.wait_for_timeout(_SETTLE_MS)
        print(f" Search submitted. URL after: {page.url}")

        size = _save_html(page, "scripts/_hud_results_fl.html")
        print(f" Results HTML saved: scripts/_hud_results_fl.html "
              f"({size:,} chars)")

        _dump_tables(page)

        print()
        print(" Property detail links (sample):")
        _dump_links(page, 50, 60, _is_detail_link)
    except Exception as exc:
        print(f" Search submit ERROR: {exc}")


def main() -> None:
    """Run the three exploration steps against hudhomestore.gov.

    Launches headless Chromium with a desktop-Chrome user agent, then:
    1. loads the landing page, saves its HTML and lists inputs, selects and
       search-related links;
    2. probes a handful of guessed URLs;
    3. tries to select Florida, submits the search form, saves the results
       HTML and prints tables and candidate property-detail links.

    All findings are printed to stdout. The browser is closed even if a
    step raises.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=REAL_UA,
                viewport={"width": 1400, "height": 900},
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = context.new_page()
            page.set_default_timeout(30_000)

            _explore_landing(page)
            _probe_direct_urls(page)
            _search_florida(page)
        finally:
            browser.close()


if __name__ == "__main__":
    main()