Files
2026-07-03 12:24:58 -04:00

524 lines
19 KiB
Python

"""scrapers/hud_homestore.py — HUD Homestore federal REO listings (FHA defaults).
SOURCE: https://www.hudhomestore.gov/searchresult?citystate={STATE_CODE}
STACK: local Playwright (Chromium) — heavy SPA, requires a full JS render
URL PATTERN:
https://www.hudhomestore.gov/searchresult?citystate=FL
→ renders ~30-50 FL properties after ~6s of SPA load
CARD STRUCTURE (DOM):
Each property card = <div class="topMap-card card-body col-12 col-md-7 px-2 pl-md-4">
Text content (raw, no semantic tags):
[optional] "Price Reduced" | "New Listing" (badge)
"BIDS OPEN MM/DD/YYYY"
"Listing Period: Extended" | "Exclusive" | etc.
"$XXX,XXX"
"<street address>"
"<city>, FL, <zip>"
"<beds> Beds <baths> Baths <county> County"
"Case #: <agency>-<number>" (e.g. 093-676572)
DEAL TYPE: 'reo' (Real Estate Owned — HUD post-foreclosure of FHA loans)
ANTI-BOT: real Chrome UA. No additional special headers are needed.
LISTING PERIOD significance:
- "Exclusive": owner-occupants, nonprofits and governments only (first 10-30 days)
- "Extended": available to investors (post-exclusive)
- "Lottery": randomized offer selection due to high demand
- Investor-eligible deals have "Extended" or no listing period
"""
from __future__ import annotations
import re
import time
from datetime import datetime, timezone
from typing import Callable, Optional
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
# Value written to each deal record's "source" field and used to tag scraper runs.
SOURCE = "hud_homestore"

# Real Chrome UA — HUD uses the Yardi Systems SPA framework; its anti-bot
# protection is light but present.
_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)

# Rate limit: one request every 3 seconds to the HUD site.
_REQUEST_INTERVAL_SECONDS = 3.0

# How long to wait for the SPA to render the property cards
# (found via exploration: 6s is safe).
_SPA_RENDER_WAIT_SECONDS = 6.5

# Deep-link pattern found via probing (B3 bugfix):
#   /PropertyDetails?caseNumber=XXX renders the specific property
#   (verified: status 200 plus address and price).
# Other patterns returned 404: /Listing/PropertyDetails, /Property/Details,
# /listing/{case}, etc.
_PROPERTY_DETAIL_URL_TEMPLATE = (
    "https://www.hudhomestore.gov"
    "/PropertyDetails?caseNumber={case_number}"
)
def build_deep_link(case_number: Optional[str]) -> Optional[str]:
    """Return the canonical HUD property-detail URL for a case number.

    Args:
        case_number: HUD case number, expected as 'XXX-XXXXXX'
            (e.g. '093-676572'). Surrounding whitespace is ignored.

    Returns:
        The deep-link URL, or None when ``case_number`` is not a string or is
        empty/blank.
    """
    if isinstance(case_number, str):
        trimmed = case_number.strip()
        if trimmed:
            # HUD case numbers look like AAA-NNNNNN; hyphens are URL-safe, so
            # the value is inserted without URL-encoding.
            return _PROPERTY_DETAIL_URL_TEMPLATE.format(case_number=trimmed)
    return None
def _parse_money(s: str) -> Optional[float]:
    """Convert a money string such as '$446,000' into a float (446000.0).

    Everything except digits and '.' is discarded before conversion.

    Returns:
        The parsed amount, or None when the input is empty, holds no digits,
        or what remains is not a valid number (e.g. '1.2.3').
    """
    numeric = re.sub(r"[^\d.]", "", s) if s else ""
    if not numeric:
        return None
    try:
        amount = float(numeric)
    except ValueError:
        return None
    return amount
# Street-type abbreviations used to find where the street ends when a card
# gives "<street> <city>" with no comma between the two.
_STREET_SUFFIXES = frozenset({
    "DR", "ST", "AVE", "RD", "BLVD", "LN", "WAY", "CT", "PL",
    "CIR", "TER", "PKWY", "HWY", "TRL", "XING", "PT", "LOOP",
})


def _split_street_city(pre_zip: str) -> tuple[str, str]:
    """Split a comma-less '<street> <city>' string into (street, city).

    The street is assumed to end at the LAST street-suffix token
    (Dr, St, Ave, ...), e.g.:
        "4641 Samoset Dr Sarasota"           -> ("4641 Samoset Dr", "Sarasota")
        "8342 N Pine Haven Pt Crystal River" -> ("8342 N Pine Haven Pt", "Crystal River")

    "St" is ambiguous (Street vs. Saint). When the last suffix is "St", more
    words follow it, and another street suffix sits at most two tokens before
    it, the "St" is treated as the start of the city name:
        "1200 4th Ave N St Petersburg"      -> ("1200 4th Ave N", "St Petersburg")
        "2100 SW Gatlin Blvd Port St Lucie" -> ("2100 SW Gatlin Blvd", "Port St Lucie")
        "55 King St St Augustine"           -> ("55 King St", "St Augustine")

    Known limitation: a street literally named "<suffix> St" (e.g. "Way St")
    is mis-split by the Saint rule. When no suffix is found at all the tokens
    are split in half as a last resort.
    """
    tokens = pre_zip.split()
    normalized = [tok.upper().rstrip(".") for tok in tokens]
    suffix_positions = [
        idx for idx, tok in enumerate(normalized) if tok in _STREET_SUFFIXES
    ]

    if not suffix_positions:
        # Fallback: no recognizable suffix, split half and half.
        half = len(tokens) // 2
        return " ".join(tokens[:half]), " ".join(tokens[half:])

    cut = suffix_positions[-1]
    if (
        normalized[cut] == "ST"
        and cut + 1 < len(tokens)
        and len(suffix_positions) >= 2
        and cut - suffix_positions[-2] <= 2
    ):
        prev = suffix_positions[-2]
        city_start = cut
        lead = cut - 1
        # A real word between the street suffix and "St" belongs to the city
        # ("Port" in "Port St Lucie"); directionals ("N", "SW") and numbers
        # ("Hwy 19") stay with the street.
        if lead > prev and tokens[lead].isalpha() and len(tokens[lead]) > 2:
            city_start = lead
        return " ".join(tokens[:city_start]), " ".join(tokens[city_start:])

    return " ".join(tokens[:cut + 1]), " ".join(tokens[cut + 1:])


def _parse_card_text(text: str) -> Optional[dict]:
    """Parse the whitespace-collapsed text of one HUD property card.

    Args:
        text: raw text content of a single card.

    Returns:
        None when the text is empty or carries no valid 'Case #: NNN-NNNNNN'.
        Otherwise a dict that always has ``badges`` (list) and ``case_number``,
        plus whichever of these could be parsed: ``bid_open_date``
        (ISO YYYY-MM-DD, or None if the date is invalid), ``listing_period``,
        ``price``, ``address_street``, ``city``, ``state``, ``zip``, ``beds``,
        ``baths``, ``county`` and ``address`` (the joined full address).
    """
    if not text or "Case #:" not in text:
        return None

    out: dict = {}

    # Badges (optional). The listing-period keywords are reported here too.
    out["badges"] = [
        kw
        for kw in ("New Listing", "Price Reduced", "Extended", "Exclusive", "Lottery")
        if kw in text
    ]

    # Bid open date
    m = re.search(r"BIDS OPEN (\d{2}/\d{2}/\d{4})", text)
    if m:
        try:
            out["bid_open_date"] = (
                datetime.strptime(m.group(1), "%m/%d/%Y").date().isoformat()
            )
        except ValueError:
            out["bid_open_date"] = None

    # Listing period
    m = re.search(r"Listing Period:\s*(\w+)", text)
    if m:
        out["listing_period"] = m.group(1)

    # Price: the first dollar amount in the card.
    m = re.search(r"\$([\d,]+)", text)
    if m:
        out["price"] = _parse_money(m.group(0))

    # Case #: without one the card is not valid.
    m = re.search(r"Case #:\s*(\d{3}-\d{6})", text)
    if not m:
        return None
    out["case_number"] = m.group(1)

    # Address block: everything between the price and "<n> Beds", i.e.
    # "<street> <city>, <ST>, <zip>", followed by beds / baths / county.
    addr_match = re.search(
        r"\$[\d,]+\s+(.+?)\s+(\d+)\s+Beds\s+([\d.]+)\s+Baths\s+(.+?)\s+County",
        text, re.DOTALL,
    )
    if addr_match:
        addr_block = addr_match.group(1).strip()
        # Accept any two-letter state code, not just FL, because the scraper
        # can be run for other states. "F.L." is kept for backward
        # compatibility and normalized to "FL".
        zip_m = re.search(r",\s*([A-Z]{2}|F\.L\.)\s*,?\s*(\d{5})", addr_block)
        if zip_m:
            out["state"] = zip_m.group(1).replace(".", "")
            out["zip"] = zip_m.group(2)
            # Drop the ", <ST>, <zip>" tail; what is left is street + city.
            pre_zip = addr_block[:zip_m.start()].strip().rstrip(",").strip()
            if "," in pre_zip:
                parts = [p.strip() for p in pre_zip.split(",")]
                out["city"] = parts[-1]
                out["address_street"] = ", ".join(parts[:-1])
            else:
                # Street and city were on separate lines that are now joined
                # by a single space; split heuristically.
                street, city = _split_street_city(pre_zip)
                out["address_street"] = street
                out["city"] = city
        else:
            # No state/zip found — keep the raw block as the street.
            out["address_street"] = addr_block

        out["beds"] = int(addr_match.group(2))
        try:
            out["baths"] = float(addr_match.group(3))
        except ValueError:
            out["baths"] = None
        out["county"] = addr_match.group(4).strip()

    # Full address: the non-empty components joined with ", ".
    full_addr_parts = [
        out[key]
        for key in ("address_street", "city", "state", "zip")
        if out.get(key)
    ]
    if full_addr_parts:
        out["address"] = ", ".join(full_addr_parts)
    return out
def _build_deal_record(card_data: dict, state: str) -> dict:
    """Turn a parsed HUD card into the deal record passed to deals_db.insert_deal.

    Bugfix B3: ``source_url`` is the deep link to the specific property
    (https://www.hudhomestore.gov/PropertyDetails?caseNumber=XXX), not the
    generic search-results URL. It is None when the case number is missing.

    Args:
        card_data: dict produced by ``_parse_card_text``.
        state: state code used when the card itself yielded no state.

    Returns:
        A dict of deal fields; values the card did not provide are None.
    """
    hud_case_id = card_data.get("case_number")
    list_price = card_data.get("price")
    bids_open = card_data.get("bid_open_date")
    period = card_data.get("listing_period")
    badge_names = card_data.get("badges", [])

    # Optional description fragments, in display order; absent ones are None.
    optional_bits = (
        "Badges: " + ", ".join(badge_names) if badge_names else None,
        f"Listing Period: {period}" if period else None,
        f"Bids Open: {bids_open}" if bids_open else None,
    )
    description = [bit for bit in optional_bits if bit]
    description.append(f"HUD Case #: {hud_case_id}")
    description.append("Source: HUD Homestore (FHA-default REO)")

    record = {
        "source": SOURCE,
        "source_url": build_deep_link(hud_case_id),  # deep link per case (B3)
        "address": card_data.get("address"),
        "city": card_data.get("city"),
        "state": card_data.get("state") or state,
        "zip": card_data.get("zip"),
        "county": card_data.get("county"),
        "listing_price": list_price,
        "deal_type": "reo",  # HUD properties are post-foreclosure REO
        "starting_bid": list_price,  # HUD: list price ~ bid floor
        "estimated_arv": None,  # not provided by HUD
        "beds": card_data.get("beds"),
        "baths": card_data.get("baths"),
        # year_built and sqft are not on the results card — they would need a
        # detail-page scrape.
        # HUD "case_number" is a tracking ID, NOT a court case, so it goes in
        # external_id. case_number stays NULL (HUD listings are REO
        # post-foreclosure, no active court proceeding from the buyer's
        # perspective).
        "case_number": None,
        "external_id": hud_case_id,
        "auction_date": bids_open,
        "listing_description": " | ".join(description),
    }
    return record
def _search_url(state: str) -> str:
    """Return the HUD Homestore search-results URL for a state code."""
    return f"https://www.hudhomestore.gov/searchresult?citystate={state}"


def _fetch_rendered_pages(
    states_to_fetch: list[str],
    log: Callable[[str], None],
    *,
    cache_namespace: str,
    use_cache: bool,
    cache_ttl_seconds: int,
) -> dict[str, str]:
    """Render the search page of each state in headless Chromium.

    Returns a mapping state -> rendered HTML. States whose navigation fails,
    times out, or does not answer HTTP 200 are logged and left out. Pages that
    were fetched are written to the cache when ``use_cache`` is true.
    """
    # Imported here so that runs served entirely from cache do not need
    # Playwright to be installed.
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    pages: dict[str, str] = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=_CHROME_UA,
                viewport={"width": 1400, "height": 900},
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = context.new_page()
            page.set_default_timeout(30_000)

            # Load the landing page first to set cookies + session. A failure
            # here is logged but does not abort the run (best effort).
            try:
                page.goto("https://www.hudhomestore.gov/", wait_until="networkidle", timeout=30_000)
                time.sleep(2)
            except Exception as e:
                log(f" HUD landing load failed: {e}")

            # Monotonic clock: interval measurement must not be affected by
            # wall-clock adjustments. None = no search request sent yet.
            last_request_at: Optional[float] = None
            for state in states_to_fetch:
                if last_request_at is not None:
                    elapsed = time.monotonic() - last_request_at
                    if elapsed < _REQUEST_INTERVAL_SECONDS:
                        time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
                last_request_at = time.monotonic()

                url = _search_url(state)
                log(f" Fetching {state}...")
                try:
                    response = page.goto(url, wait_until="networkidle", timeout=30_000)
                except PlaywrightTimeout:
                    log(f" timeout for {state} — skip")
                    continue
                except Exception as e:
                    log(f" error for {state}: {e}")
                    continue
                # page.goto may return None (no main-resource response).
                if response is None:
                    log(f" no response for {state} — skip")
                    continue
                if response.status != 200:
                    log(f" HTTP {response.status} for {state} — skip")
                    continue

                # Extra wait for the SPA to render the cards.
                time.sleep(_SPA_RENDER_WAIT_SECONDS)
                html = page.content()
                pages[state] = html
                if use_cache:
                    save_cache(cache_namespace, url, html,
                               status_code=200, ttl_seconds=cache_ttl_seconds)
        finally:
            # Always release the browser, even if rendering or caching raised.
            browser.close()
    return pages


def scrape_hud_homestore(
    *,
    states: Optional[list[str]] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape HUD Homestore listings for the given states (default: FL only).

    Args:
        states: state codes to scrape (default ["FL"]).
        status_cb: optional callback that receives progress/log messages.
        use_cache: when true, reuse cached search pages and cache fresh ones.
        cache_ttl_seconds: TTL passed to the cache helpers.

    Returns:
        list[dict] of deal records as built by ``_build_deal_record``.

    Raises:
        ImportError: if at least one state must be fetched and Playwright is
            not installed.
    """
    if states is None:
        states = ["FL"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    cache_namespace = "hud_homestore"
    deals: list[dict] = []

    # Step 1: cache check per state.
    cached_pages: dict[str, str] = {}
    states_to_fetch: list[str] = []
    cache_hits = 0
    for state in states:
        if use_cache:
            cached = get_cached(cache_namespace, _search_url(state), ttl_seconds=cache_ttl_seconds)
            if cached:
                cached_pages[state] = cached
                cache_hits += 1
                continue
        states_to_fetch.append(state)
    _log(f"HUD Homestore: states={states}, cache hits {cache_hits}/{len(states)}, fetching {len(states_to_fetch)}")

    # Step 2: fetch fresh HTML for the states that were not cached.
    fresh_pages: dict[str, str] = {}
    if states_to_fetch:
        fresh_pages = _fetch_rendered_pages(
            states_to_fetch,
            _log,
            cache_namespace=cache_namespace,
            use_cache=use_cache,
            cache_ttl_seconds=cache_ttl_seconds,
        )

    # Step 3: parse every page (cached or fresh) from its rendered HTML.
    for state in states:
        html = cached_pages.get(state) or fresh_pages.get(state)
        if not html:
            continue
        cards_text = _extract_card_texts_from_html(html, _log)
        _log(f" {state}: extracted {len(cards_text)} card texts from HTML")
        for card_text in cards_text:
            card = _parse_card_text(card_text)
            if not card or not card.get("case_number"):
                continue
            deal = _build_deal_record(card, state)
            # The HUD case id lives in "external_id"; the record's
            # "case_number" field is always None, so testing it here used to
            # drop every deal whose address could not be parsed.
            if deal.get("address") or deal.get("external_id"):
                deals.append(deal)

    _log(f"HUD Homestore: scraped {len(deals)} total deals across {len(states)} states")
    return deals
def _extract_card_texts_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[str]:
    """Extract the text content of each property card from raw HTML.

    A card is a ``<div>`` whose class attribute contains "topMap-card". The
    text of everything inside it is concatenated and runs of whitespace are
    collapsed to single spaces. Cards with no text are skipped.

    Card boundaries are tracked by counting ``<div>`` nesting only. Counting
    every start tag is wrong because void elements (``<img>``, ``<br>``,
    ``<input>``) never produce an end tag, which makes the depth drift and
    cards either never close or swallow their neighbours.

    Args:
        html: the rendered page HTML.
        log_fn: optional callback that receives a message if parsing fails.

    Returns:
        One string per card, in document order; [] if parsing raised.
    """
    from html.parser import HTMLParser

    target_class_marker = "topMap-card"

    class CardExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            # Number of <div> elements currently open inside the card being
            # read, counting the card's own div. 0 means "not inside a card".
            self.div_depth = 0
            self.text_parts: list[str] = []
            self.cards_texts: list[str] = []

        def handle_starttag(self, tag, attrs):
            if tag != "div":
                return
            if self.div_depth:
                # Nested div inside the current card.
                self.div_depth += 1
                return
            if any(
                name == "class" and val and target_class_marker in val
                for name, val in attrs
            ):
                self.div_depth = 1
                self.text_parts = []

        def handle_endtag(self, tag):
            if tag != "div" or not self.div_depth:
                return
            self.div_depth -= 1
            if self.div_depth == 0:
                # The card's own </div>: emit the collapsed text.
                text = " ".join("".join(self.text_parts).split())
                if text:
                    self.cards_texts.append(text)
                self.text_parts = []

        def handle_data(self, data):
            if self.div_depth:
                self.text_parts.append(data)

    parser = CardExtractor()
    try:
        parser.feed(html)
        # Flush any text the parser is still buffering.
        parser.close()
    except Exception as e:
        if log_fn:
            log_fn(f" HTML parse error: {e}")
        return []
    return parser.cards_texts
def run_scraper_to_db(
    *,
    states: Optional[list[str]] = None,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
) -> dict:
    """Full pipeline: scrape HUD, persist to deals.db, optionally classify.

    Failures in scraping, inserting or classifying are collected in the
    returned ``errors`` list instead of being raised.

    Args:
        states: state codes forwarded to ``scrape_hud_homestore``.
        auto_classify: when true, classify every newly inserted deal.
        status_cb: optional callback that receives progress/log messages.

    Returns:
        Summary dict with ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run

    init_db()
    run_id = record_scraper_run(SOURCE)
    errors: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    try:
        deals = scrape_hud_homestore(states=states, status_cb=status_cb)
    except Exception as e:
        errors.append(f"scrape failed: {e}")
        deals = []

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []
    for deal in deals:
        try:
            deal_id, is_new = insert_deal(deal)
            if is_new:
                deals_new += 1
                new_deal_ids.append(deal_id)
            else:
                deals_updated += 1
        except Exception as e:
            # HUD deals carry their id in "external_id"; "case_number" is
            # always None for this source, so fall back to it only as a
            # last resort.
            deal_ref = deal.get("external_id") or deal.get("case_number")
            errors.append(f"insert fail for {deal_ref}: {e}")

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new HUD deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification

        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for {did}: {e}")

    # "success" = no errors; "partial" = errors but some deals were scraped;
    # "failed" = errors and nothing scraped.
    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=0,
        status="success" if not errors else ("partial" if deals else "failed"),
    )
    return {
        "source": SOURCE,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "errors_count": len(errors),
        "errors": errors,
    }