524 lines
19 KiB
Python
524 lines
19 KiB
Python
"""scrapers/hud_homestore.py — HUD Homestore federal REO listings (FHA defaults).
|
|
|
|
SOURCE: https://www.hudhomestore.gov/searchresult?citystate={STATE_CODE}
|
|
STACK: local Playwright (Chromium) — heavy SPA, requires full JS rendering
|
|
|
|
URL PATTERN:
|
|
https://www.hudhomestore.gov/searchresult?citystate=FL
|
|
  → renders ~30-50 FL properties after ~6s of SPA load
|
|
|
|
CARD STRUCTURE (DOM):
|
|
Each property card = <div class="topMap-card card-body col-12 col-md-7 px-2 pl-md-4">
|
|
Text content (raw, no semantic tags):
|
|
[optional] "Price Reduced" | "New Listing" (badge)
|
|
"BIDS OPEN MM/DD/YYYY"
|
|
"Listing Period: Extended" | "Exclusive" | etc
|
|
"$XXX,XXX"
|
|
"<street address>"
|
|
"<city>, FL, <zip>"
|
|
"<beds> Beds <baths> Baths <county> County"
|
|
"Case #: <agency>-<number>" (ej: 093-676572)
|
|
|
|
DEAL TYPE: 'reo' (Real Estate Owned — HUD post-foreclosure of FHA loans)
|
|
|
|
ANTI-BOT: real Chrome UA. No additional special headers are needed.
|
|
|
|
LISTING PERIOD significance:
  - "Exclusive": owner-occupants, nonprofits and governments only (first 10-30 days)
  - "Extended": available to investors (post-exclusive)
  - "Lottery": random offer selection due to high demand
  - Investor-eligible deals have "Extended" or no period
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Callable, Optional
|
|
|
|
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
|
|
|
|
# Real Chrome UA — HUD runs on the Yardi Systems SPA framework; anti-bot is light but present.
_CHROME_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)

# Rate limit: one request every 3 seconds against the HUD site.
_REQUEST_INTERVAL_SECONDS = 3.0

# Identifier stored in the "source" field of every deal record built by this module.
SOURCE = "hud_homestore"

# Time to wait for the SPA to render property cards (found via exploration: 6s is safe).
_SPA_RENDER_WAIT_SECONDS = 6.5
|
|
|
|
# Deep-link pattern discovered via probing (B3 bugfix):
# /PropertyDetails?caseNumber=XXX renders the specific property
# (verified: status 200 + address + price).
# Other patterns returned 404 (do not work):
# /Listing/PropertyDetails, /Property/Details, /listing/{case}, etc.
_PROPERTY_DETAIL_URL_TEMPLATE = (
    "https://www.hudhomestore.gov/PropertyDetails?caseNumber={case_number}"
)


def build_deep_link(case_number: Optional[str]) -> Optional[str]:
    """Build the canonical HUD property-detail URL for a case number.

    Args:
        case_number: HUD case number, normally formatted 'XXX-XXXXXX'
            (e.g. '093-676572'). Surrounding whitespace is ignored.

    Returns:
        The deep-link URL, or None when ``case_number`` is not a string or is
        empty/blank.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    if not isinstance(case_number, str):
        return None
    case_number = case_number.strip()
    if not case_number:
        return None
    # Percent-encode the value so stray characters (spaces, '&', '=') cannot
    # corrupt the query string. Digits and hyphens are unreserved, so a
    # well-formed case number is emitted unchanged.
    return _PROPERTY_DETAIL_URL_TEMPLATE.format(
        case_number=quote(case_number, safe="")
    )
|
|
|
|
|
|
def _parse_money(s: Optional[str]) -> Optional[float]:
    """Parse a money string such as '$446,000' into a float (446000.0).

    Every character other than a digit or '.' is discarded before conversion.
    Returns None when the input is empty, when nothing is left after cleaning,
    or when the cleaned text is not a valid number (e.g. '1.2.3').
    """
    if not s:
        return None
    cleaned = re.sub(r"[^\d.]", "", s)
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        return None


# Keywords reported in the "badges" list when they appear anywhere in the card text.
_BADGE_KEYWORDS = ("New Listing", "Price Reduced", "Extended", "Exclusive", "Lottery")

# ", <STATE>, <ZIP>" tail of the address block, e.g. ", FL, 34428". Any
# two-letter state code is accepted; the dotted "F.L." spelling is kept for
# backward compatibility with the former FL-only pattern.
_STATE_ZIP_RE = re.compile(r",\s*([A-Z]{2}|F\.L\.)\s*,?\s*(\d{5})")

# Street-type tokens used to locate the street/city boundary when the card
# text has no comma between them.
_STREET_SUFFIXES = frozenset({
    "DR", "ST", "AVE", "RD", "BLVD", "LN", "WAY", "CT", "PL",
    "CIR", "TER", "PKWY", "HWY", "TRL", "XING", "PT", "LOOP",
})


def _split_street_city(pre_zip: str) -> tuple[str, str]:
    """Split '<street> <city>' (state/ZIP already removed) into (street, city).

    Best-effort heuristic:
      1. If there is a comma, the last comma-separated part is the city.
      2. Otherwise the LAST street-suffix token (Dr, St, Ave, ...) ends the
         street and whatever follows is the city, e.g.
         "8342 N Pine Haven Pt Crystal River" -> ("8342 N Pine Haven Pt", "Crystal River").
      3. With no suffix at all, the tokens are split in half.
    """
    if "," in pre_zip:
        parts = [p.strip() for p in pre_zip.split(",")]
        return ", ".join(parts[:-1]), parts[-1]

    tokens = pre_zip.split()
    for i in range(len(tokens) - 1, -1, -1):
        if tokens[i].upper().rstrip(".") in _STREET_SUFFIXES:
            return " ".join(tokens[: i + 1]), " ".join(tokens[i + 1:])

    half = len(tokens) // 2
    return " ".join(tokens[:half]), " ".join(tokens[half:])


def _parse_card_text(text: str) -> Optional[dict]:
    """Parse the whitespace-collapsed text of a single property card.

    Returns a dict that always contains ``badges`` (list) and ``case_number``,
    plus whichever of these could be extracted: ``bid_open_date``
    (ISO YYYY-MM-DD, or None when the date is not a valid calendar date),
    ``listing_period``, ``price``, ``address_street``, ``city``, ``state``,
    ``zip``, ``beds``, ``baths``, ``county`` and ``address`` (the available
    address parts joined with ", ").

    Returns None when the text has no well-formed "Case #: NNN-NNNNNN".
    """
    if not text or "Case #:" not in text:
        return None

    out: dict = {}

    # Badges (optional)
    out["badges"] = [kw for kw in _BADGE_KEYWORDS if kw in text]

    # Bid open date
    m = re.search(r"BIDS OPEN (\d{2}/\d{2}/\d{4})", text)
    if m:
        try:
            out["bid_open_date"] = (
                datetime.strptime(m.group(1), "%m/%d/%Y").date().isoformat()
            )
        except ValueError:
            out["bid_open_date"] = None

    # Listing period
    m = re.search(r"Listing Period:\s*(\w+)", text)
    if m:
        out["listing_period"] = m.group(1)

    # Price: the first dollar amount in the card
    m = re.search(r"\$([\d,]+)", text)
    if m:
        out["price"] = _parse_money(m.group(0))

    # Case # — without it this is not a valid card
    m = re.search(r"Case #:\s*(\d{3}-\d{6})", text)
    if not m:
        return None
    out["case_number"] = m.group(1)

    # Address block: everything between a "$NNN,NNN" amount and "<n> Beds",
    # followed by "<n[.n]> Baths <county> County". The card text is dense and
    # tag-free, so this is matched with a single regex.
    addr_match = re.search(
        r"\$[\d,]+\s+(.+?)\s+(\d+)\s+Beds\s+([\d.]+)\s+Baths\s+(.+?)\s+County",
        text, re.DOTALL,
    )
    if addr_match:
        addr_block = addr_match.group(1).strip()
        zip_m = _STATE_ZIP_RE.search(addr_block)
        if zip_m:
            # Use the state actually present on the card instead of assuming
            # FL, so listings for other states get city/state/zip as well.
            out["state"] = zip_m.group(1).replace(".", "")
            out["zip"] = zip_m.group(2)
            # Drop the ", <state>, <zip>" suffix to isolate street + city.
            pre_zip = addr_block[: zip_m.start()].strip().rstrip(",").strip()
            out["address_street"], out["city"] = _split_street_city(pre_zip)
        else:
            # No state/ZIP tail found — keep the raw block as the street.
            out["address_street"] = addr_block

        out["beds"] = int(addr_match.group(2))
        try:
            out["baths"] = float(addr_match.group(3))
        except ValueError:
            out["baths"] = None
        out["county"] = addr_match.group(4).strip()

    # Full address: the non-empty parts, in street/city/state/zip order.
    full_addr_parts = [
        out[key]
        for key in ("address_street", "city", "state", "zip")
        if out.get(key)
    ]
    if full_addr_parts:
        out["address"] = ", ".join(full_addr_parts)

    return out
|
|
|
|
|
|
def _build_deal_record(card_data: dict, state: str) -> dict:
    """Turn a parsed HUD card into a deal record for deals_db.insert_deal.

    Bugfix B3: ``source_url`` is the deep link to the specific property
    (https://www.hudhomestore.gov/PropertyDetails?caseNumber=XXX), not the
    generic search-results URL. It is None when the card has no case number.

    Args:
        card_data: dict as produced by ``_parse_card_text``.
        state: state code used when the card itself carries no state.

    Returns:
        The deal record dict.
    """
    hud_case = card_data.get("case_number")
    opens_on = card_data.get("bid_open_date")
    period = card_data.get("listing_period")
    tags = card_data.get("badges", [])
    asking_price = card_data.get("price")

    # Human-readable description: optional bits first, then the fixed trailer.
    description = []
    if tags:
        description.append("Badges: " + ", ".join(tags))
    if period:
        description.append(f"Listing Period: {period}")
    if opens_on:
        description.append(f"Bids Open: {opens_on}")
    description.extend(
        [
            f"HUD Case #: {hud_case}",
            "Source: HUD Homestore (FHA-default REO)",
        ]
    )

    record = {
        "source": SOURCE,
        "source_url": build_deep_link(hud_case),  # deep link per case
        "address": card_data.get("address"),
        "city": card_data.get("city"),
        "state": card_data.get("state") or state,
        "zip": card_data.get("zip"),
        "county": card_data.get("county"),
        "listing_price": asking_price,
        "deal_type": "reo",  # HUD properties are post-foreclosure REO
        "starting_bid": asking_price,  # HUD: list price approximates the bid floor
        "estimated_arv": None,  # not provided by HUD
        "beds": card_data.get("beds"),
        "baths": card_data.get("baths"),
        # year_built and sqft are not on the results card; they would need a
        # detail-page scrape.
        # The HUD "case number" is a tracking ID, NOT a court case, so it goes
        # in external_id and case_number stays NULL (HUD listings are REO
        # post-foreclosure, with no active court proceeding from the buyer's
        # perspective).
        "case_number": None,
        "external_id": hud_case,
        "auction_date": opens_on,
        "listing_description": " | ".join(description),
    }
    return record
|
|
|
|
|
|
def _search_url(state: str) -> str:
    """Return the HUD Homestore search-results URL for a state code."""
    return f"https://www.hudhomestore.gov/searchresult?citystate={state}"


def scrape_hud_homestore(
    *,
    states: Optional[list[str]] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape HUD Homestore for the given states (default: FL only).

    Args:
        states: state codes to scrape (default ``["FL"]``).
        status_cb: optional callback that receives progress/log messages.
        use_cache: when True, rendered HTML is read from and written to the
            cache keyed by the search URL.
        cache_ttl_seconds: TTL passed to the cache helpers.

    Returns:
        list[dict] of deal records (see ``_build_deal_record``).

    Raises:
        ImportError: if Playwright is not installed and at least one state
            has to be fetched (fully cached runs do not need Playwright).
    """
    if states is None:
        states = ["FL"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    cache_namespace = "hud_homestore"
    deals: list[dict] = []

    # Step 1: cache check per state
    cached_pages: dict[str, str] = {}
    states_to_fetch: list[str] = []
    cache_hits = 0
    for state in states:
        if use_cache:
            cached = get_cached(
                cache_namespace, _search_url(state), ttl_seconds=cache_ttl_seconds
            )
            if cached:
                cached_pages[state] = cached
                cache_hits += 1
                continue
        states_to_fetch.append(state)
    _log(f"HUD Homestore: states={states}, cache hits {cache_hits}/{len(states)}, fetching {len(states_to_fetch)}")

    # Step 2: fetch fresh HTML for non-cached states
    fresh_pages: dict[str, str] = {}
    if states_to_fetch:
        # Imported lazily so a fully cached run works without Playwright.
        from playwright.sync_api import TimeoutError as PlaywrightTimeout
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1400, "height": 900},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(30_000)

                # Load the landing page first to set cookies + session.
                # Best effort: a failure here is logged and the run continues.
                try:
                    page.goto(
                        "https://www.hudhomestore.gov/",
                        wait_until="networkidle",
                        timeout=30_000,
                    )
                    time.sleep(2)
                except Exception as e:
                    _log(f" HUD landing load failed: {e}")

                # Rate limiting between search requests. A monotonic clock is
                # used so wall-clock adjustments cannot shorten the interval.
                last_request_at: Optional[float] = None
                for state in states_to_fetch:
                    if last_request_at is not None:
                        elapsed = time.monotonic() - last_request_at
                        if elapsed < _REQUEST_INTERVAL_SECONDS:
                            time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
                    last_request_at = time.monotonic()

                    url = _search_url(state)
                    _log(f" Fetching {state}...")
                    try:
                        response = page.goto(
                            url, wait_until="networkidle", timeout=30_000
                        )
                    except PlaywrightTimeout:
                        _log(f" timeout for {state} — skip")
                        continue
                    except Exception as e:
                        _log(f" error for {state}: {e}")
                        continue

                    # goto() can return None; treat that as a failed fetch
                    # instead of relying on an AttributeError.
                    if response is None:
                        _log(f" no response for {state} — skip")
                        continue
                    if response.status != 200:
                        _log(f" HTTP {response.status} for {state} — skip")
                        continue

                    # Wait extra for the SPA to render the property cards.
                    time.sleep(_SPA_RENDER_WAIT_SECONDS)

                    html = page.content()
                    fresh_pages[state] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Always release the browser, even if a fetch step raised.
                browser.close()

    # Step 3: parse all pages (cached + fresh). Both are rendered HTML, so the
    # same stdlib-based card extraction is used for each.
    for state in states:
        html = cached_pages.get(state) or fresh_pages.get(state)
        if not html:
            continue

        cards_text = _extract_card_texts_from_html(html, _log)
        _log(f" {state}: extracted {len(cards_text)} card texts from HTML")

        for card_text in cards_text:
            card = _parse_card_text(card_text)
            if not card or not card.get("case_number"):
                continue
            deal = _build_deal_record(card, state)
            # The HUD case number lives in "external_id" (the record's
            # "case_number" is always None), so that is the field to check;
            # otherwise deals whose address failed to parse would be dropped.
            if deal.get("address") or deal.get("external_id"):
                deals.append(deal)

    _log(f"HUD Homestore: scraped {len(deals)} total deals across {len(states)} states")
    return deals
|
|
|
|
|
|
def _extract_card_texts_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[str]:
    """Extract the whitespace-collapsed text of each property card from raw HTML.

    Uses the stdlib HTML parser. A card starts at a ``<div>`` whose class
    attribute contains "topMap-card" and ends at that div's matching
    ``</div>``. Only ``<div>`` nesting is counted: void elements such as
    ``<br>`` or ``<img>`` produce a start tag but no end tag, so counting
    every tag would desynchronize the depth and make cards close late or
    never.

    Args:
        html: rendered page HTML.
        log_fn: optional callback that receives a message if parsing fails.

    Returns:
        One string per non-empty card, in document order; an empty list when
        parsing raises.
    """
    from html.parser import HTMLParser

    target_class_marker = "topMap-card"

    class CardExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            # Number of open <div>s inside the current card, counting the
            # card's own div; 0 means "not inside a card".
            self.div_depth = 0
            self.text_parts: list[str] = []
            self.cards_texts: list[str] = []

        def handle_starttag(self, tag, attrs):
            if tag != "div":
                return
            if self.div_depth:
                # Nested div inside the current card.
                self.div_depth += 1
                return
            is_card = any(
                name == "class" and val and target_class_marker in val
                for name, val in attrs
            )
            if is_card:
                self.div_depth = 1
                self.text_parts = []

        def handle_endtag(self, tag):
            if tag != "div" or not self.div_depth:
                return
            self.div_depth -= 1
            if self.div_depth == 0:
                # The card's own div just closed: emit its collapsed text.
                text = " ".join("".join(self.text_parts).split())
                if text:
                    self.cards_texts.append(text)
                self.text_parts = []

        def handle_data(self, data):
            if self.div_depth:
                self.text_parts.append(data)

    parser = CardExtractor()
    try:
        parser.feed(html)
        parser.close()
    except Exception as e:
        if log_fn:
            log_fn(f" HTML parse error: {e}")
        return []

    return parser.cards_texts
|
|
|
|
|
|
def run_scraper_to_db(
    *,
    states: Optional[list[str]] = None,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
) -> dict:
    """Full pipeline: scrape HUD, persist to deals.db, optionally classify.

    Args:
        states: state codes forwarded to ``scrape_hud_homestore``.
        auto_classify: when True, newly inserted deals are classified.
        status_cb: optional callback that receives progress/log messages.

    Returns:
        Summary dict with ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``. Scrape, insert and classify failures
        are collected in ``errors`` rather than raised.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run
    init_db()

    run_id = record_scraper_run(SOURCE)
    errors: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    try:
        deals = scrape_hud_homestore(states=states, status_cb=status_cb)
    except Exception as e:
        errors.append(f"scrape failed: {e}")
        deals = []

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []

    for deal in deals:
        try:
            deal_id, is_new = insert_deal(deal)
            if is_new:
                deals_new += 1
                new_deal_ids.append(deal_id)
            else:
                deals_updated += 1
        except Exception as e:
            # The HUD case number is stored under "external_id"; the record's
            # "case_number" is always None and would make this message useless.
            errors.append(f"insert fail for {deal.get('external_id')}: {e}")

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new HUD deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification
        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for {did}: {e}")

    # "success" with no errors; otherwise "partial" if any deals were scraped,
    # else "failed".
    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=0,
        status="success" if not errors else ("partial" if deals else "failed"),
    )

    return {
        "source": SOURCE,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "errors_count": len(errors),
        "errors": errors,
    }
|