148 lines
5.1 KiB
Python
148 lines
5.1 KiB
Python
"""Validate B3 source_url bugfix.
|
|
|
|
1. Unit test: build_deep_link() with sample case numbers.
|
|
2. Scrape fresh HUD listings → verify each deal_record has unique deep-link.
|
|
3. HTTP-verify (via Playwright) that 3 sample deep-links (first, middle, last) actually open the property.
|
|
"""
|
|
from __future__ import annotations
|
|
import io, sys, time
|
|
from pathlib import Path
|
|
|
|
# Re-wrap stdout as UTF-8 so the non-ASCII status marks printed by this script
# are encoded explicitly; characters that cannot be encoded are replaced
# rather than raising.
sys.stdout = io.TextIOWrapper(
    sys.stdout.buffer,
    encoding="utf-8",
    errors="replace",
)

# ROOT is the directory two levels above this file. It is pushed to the front
# of the import path -- presumably so the `scrapers` package imported below
# resolves from the project root (confirm against the repository layout).
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
|
|
|
|
from scrapers.hud_homestore import build_deep_link, scrape_hud_homestore
|
|
|
|
|
|
def test_unit():
    """Run the table-driven checks for build_deep_link().

    Each (input, expected) pair is passed through build_deep_link() and the
    outcome is printed with a pass/fail mark.

    Returns:
        The number of inputs whose result differed from the expected value.
    """
    banner = "=" * 60
    print(banner)
    print("UNIT TESTS — build_deep_link()")
    print(banner)

    expectations = [
        ("093-676572", "https://www.hudhomestore.gov/PropertyDetails?caseNumber=093-676572"),
        ("093-612260", "https://www.hudhomestore.gov/PropertyDetails?caseNumber=093-612260"),
        (None, None),
        ("", None),
        (" 093-727486 ", "https://www.hudhomestore.gov/PropertyDetails?caseNumber=093-727486"),
    ]

    mismatches = 0
    for raw_case, wanted in expectations:
        produced = build_deep_link(raw_case)
        passed = produced == wanted
        mark = "✅" if passed else "❌"
        print(f" {mark} build_deep_link({raw_case!r}) → {produced}")
        if passed:
            continue
        # Only mismatches are counted and get the extra "expected" line.
        mismatches += 1
        print(f" expected: {wanted}")
    return mismatches
|
|
|
|
|
|
def test_scrape_unique_urls():
    """Scrape FL listings and report how many distinct source_url values they carry.

    Prints the total number of deals, the number of distinct source_url
    values next to the count it should equal, and a short sample of the first
    five deals showing whether each case number appears inside its URL.
    The comparison is informational only; nothing is counted as a failure here.

    Returns:
        The deals returned by scrape_hud_homestore(), unchanged.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("SCRAPE TEST — fresh scrape, verify unique deep-links")
    print(rule)

    def _echo(m):
        # Progress callback handed to the scraper: echo each status message.
        print(f" {m}")

    # NOTE(review): the banner says "fresh scrape" but use_cache=True is
    # passed -- confirm what the scraper does with that flag.
    deals = scrape_hud_homestore(states=["FL"], status_cb=_echo, use_cache=True)
    total = len(deals)
    print(f"\n Total: {total} deals scraped")

    # Distinct source_url values across all deals (a missing URL counts as None).
    distinct_urls = {deal.get("source_url") for deal in deals}
    print(f" Unique source_urls: {len(distinct_urls)} (should be == {total})")

    print()
    print(" Sample 5 deal source_urls:")
    for deal in deals[:5]:
        link = deal.get("source_url")
        case_no = deal.get("case_number")
        short_addr = (deal.get("address") or "?")[:50]
        # A deal without a case number can never have it in the URL.
        if case_no:
            case_in_link = case_no in (link or "")
        else:
            case_in_link = False
        print(f" case={case_no} url={link}")
        print(f" addr={short_addr} | case_in_url={case_in_link}")

    return deals
|
|
|
|
|
|
def test_http_verify(deals, n_samples=3):
|
|
"""HTTP-load 3 random deep-links via Playwright; verify page renders the property."""
|
|
print()
|
|
print("=" * 60)
|
|
print(f"HTTP VERIFY — {n_samples} random deep-links open correct property")
|
|
print("=" * 60)
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
REAL_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
|
|
|
# Pick first, middle, last
|
|
samples = [deals[0], deals[len(deals)//2], deals[-1]] if len(deals) >= 3 else deals
|
|
|
|
failures = 0
|
|
with sync_playwright() as p:
|
|
browser = p.chromium.launch(headless=True)
|
|
page = browser.new_context(user_agent=REAL_UA, viewport={"width": 1400, "height": 900}).new_page()
|
|
page.set_default_timeout(20_000)
|
|
|
|
# Set session
|
|
page.goto("https://www.hudhomestore.gov/", wait_until="networkidle")
|
|
time.sleep(1.5)
|
|
|
|
for d in samples:
|
|
url = d.get("source_url")
|
|
case = d.get("case_number")
|
|
addr = d.get("address") or ""
|
|
print(f"\n Testing case={case}")
|
|
print(f" URL: {url}")
|
|
print(f" Expected address: {addr[:60]}")
|
|
try:
|
|
page.goto(url, wait_until="networkidle", timeout=20_000)
|
|
time.sleep(3)
|
|
body = page.locator("body").inner_text()
|
|
# Verify body contains the case number AND part of the expected address
|
|
# Use the street number as a strong signal
|
|
import re
|
|
street_num_match = re.match(r"^(\d+)", addr)
|
|
street_num = street_num_match.group(1) if street_num_match else None
|
|
has_case = case in body
|
|
has_street_num = (street_num in body) if street_num else False
|
|
has_price = (str(int(d.get("listing_price") or 0)) in body.replace(",", ""))
|
|
print(f" has_case#={has_case}, has_street_num={has_street_num}, has_price={has_price}")
|
|
if has_case and (has_street_num or has_price):
|
|
print(f" ✅ Deep-link renders the correct property")
|
|
else:
|
|
print(f" ❌ Deep-link does NOT render the expected property")
|
|
failures += 1
|
|
except Exception as e:
|
|
print(f" ❌ ERROR: {e}")
|
|
failures += 1
|
|
|
|
browser.close()
|
|
return failures
|
|
|
|
|
|
def main():
    """Run the unit, scrape and HTTP stages and return the total failure count.

    The scrape stage itself only prints its uniqueness comparison, so the
    comparison is repeated here and a mismatch is counted as one failure;
    otherwise duplicated source_url values would still end in
    "ALL TESTS PASSED" with exit status 0. The uniqueness and HTTP stages are
    skipped when the scrape returns no deals.

    Returns:
        The number of failures across all stages (0 means everything passed).
    """
    rc = 0
    rc += test_unit()

    deals = test_scrape_unique_urls()
    if deals:
        # Same criterion the scrape stage prints: one distinct source_url per deal.
        distinct_urls = {d.get("source_url") for d in deals}
        if len(distinct_urls) != len(deals):
            surplus = len(deals) - len(distinct_urls)
            print(f" ❌ source_url not unique: {surplus} duplicate value(s) among {len(deals)} deals")
            rc += 1
        rc += test_http_verify(deals, n_samples=3)

    print()
    print("=" * 60)
    if rc == 0:
        print("✅ ALL TESTS PASSED")
    else:
        print(f"❌ {rc} failures")
    return rc
|
|
|
|
|
|
# Script entry point: the failure count returned by main() becomes the
# process exit status, so any failure yields a non-zero exit.
if __name__ == "__main__":
    raise SystemExit(main())
|