feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,523 @@
|
||||
"""scrapers/hud_homestore.py — HUD Homestore federal REO listings (FHA defaults).
|
||||
|
||||
SOURCE: https://www.hudhomestore.gov/searchresult?citystate={STATE_CODE}
|
||||
STACK: Playwright local (Chromium) — SPA pesado, requiere render JS completo
|
||||
|
||||
URL PATTERN:
|
||||
https://www.hudhomestore.gov/searchresult?citystate=FL
|
||||
→ renderiza ~30-50 properties FL después de ~6s de SPA load
|
||||
|
||||
CARD STRUCTURE (DOM):
|
||||
Each property card = <div class="topMap-card card-body col-12 col-md-7 px-2 pl-md-4">
|
||||
Text content (raw, no semantic tags):
|
||||
[optional] "Price Reduced" | "New Listing" (badge)
|
||||
"BIDS OPEN MM/DD/YYYY"
|
||||
"Listing Period: Extended" | "Exclusive" | etc
|
||||
"$XXX,XXX"
|
||||
"<street address>"
|
||||
"<city>, FL, <zip>"
|
||||
"<beds> Beds <baths> Baths <county> County"
|
||||
"Case #: <agency>-<number>" (ej: 093-676572)
|
||||
|
||||
DEAL TYPE: 'reo' (Real Estate Owned — HUD post-foreclosure de loans FHA)
|
||||
|
||||
ANTI-BOT: real Chrome UA. Sin headers especiales adicionales necesarios.
|
||||
|
||||
LISTING PERIOD significance:
|
||||
- "Exclusive": solo Owner-Occupants, nonprofits, gobiernos (primeros 10-30 dias)
|
||||
- "Extended": disponible para investors (post-exclusive)
|
||||
- "Lottery": offer aleatorio por demanda alta
|
||||
- Investor-eligible deals tienen "Extended" o no-period
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Callable, Optional
|
||||
|
||||
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
|
||||
|
||||
# Desktop Chrome user agent. HUD uses the Yardi Systems SPA framework, which
# has light (but present) anti-bot checks.
_CHROME_UA = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/131.0.0.0 Safari/537.36",
))

# Rate limit: at most one request every 3 seconds against the HUD site.
_REQUEST_INTERVAL_SECONDS = 3.0

SOURCE = "hud_homestore"

# Extra wait for the SPA to render property cards (exploration showed ~6s is safe).
_SPA_RENDER_WAIT_SECONDS = 6.5

# Deep-link pattern found by probing (B3 bugfix):
# /PropertyDetails?caseNumber=XXX renders the specific property (verified
# status 200 + address + price). Other patterns returned 404:
# /Listing/PropertyDetails, /Property/Details, /listing/{case}, etc.
_PROPERTY_DETAIL_URL_TEMPLATE = (
    "https://www.hudhomestore.gov"
    "/PropertyDetails?caseNumber={case_number}"
)
|
||||
|
||||
|
||||
def build_deep_link(case_number: Optional[str]) -> Optional[str]:
    """Return the canonical HUD property-detail URL for a case number.

    Args:
        case_number: HUD case number in 'XXX-XXXXXX' form (e.g. '093-676572').

    Returns:
        The detail-page URL, or None when ``case_number`` is not a string or
        is empty / whitespace-only.
    """
    if isinstance(case_number, str):
        trimmed = case_number.strip()
        if trimmed:
            # Case numbers look like AAA-NNNNNN; hyphens are URL-safe, so the
            # value is inserted without percent-encoding.
            return _PROPERTY_DETAIL_URL_TEMPLATE.format(case_number=trimmed)
    return None
|
||||
|
||||
|
||||
def _parse_money(s: str) -> Optional[float]:
    """Parse a money string such as '$446,000' into a float (446000.0).

    Every character other than a digit or '.' is stripped first. Returns None
    when the input is empty, nothing numeric remains, or the remainder is not
    a valid float (e.g. '1.2.3').
    """
    if not s:
        return None
    cleaned = re.sub(r"[^\d.]", "", s)
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        return None


def _parse_card_text(text: str) -> Optional[dict]:
    """Parse the whitespace-collapsed text of a single HUD property card.

    Args:
        text: raw text content of one card (no markup).

    Returns:
        None when the text has no well-formed ``Case #: NNN-NNNNNN``.
        Otherwise a dict that always has ``badges`` (list) and ``case_number``
        and, when the corresponding fragment is found, ``bid_open_date``
        (ISO YYYY-MM-DD, or None if the date is invalid), ``listing_period``,
        ``price``, ``state``, ``zip``, ``city``, ``address_street``, ``beds``,
        ``baths``, ``county`` and ``address`` (the parts joined with ", ").
    """
    if not text or "Case #:" not in text:
        return None

    # A card without a well-formed case number is not a valid listing, so
    # bail out before doing any other work.
    case_m = re.search(r"Case #:\s*(\d{3}-\d{6})", text)
    if case_m is None:
        return None

    out: dict = {}

    # Badges / listing-period keywords (plain substring checks).
    out["badges"] = [
        kw
        for kw in ("New Listing", "Price Reduced", "Extended", "Exclusive", "Lottery")
        if kw in text
    ]

    # Bid open date
    m = re.search(r"BIDS OPEN (\d{2}/\d{2}/\d{4})", text)
    if m:
        try:
            out["bid_open_date"] = (
                datetime.strptime(m.group(1), "%m/%d/%Y").date().isoformat()
            )
        except ValueError:
            # Looks like a date but is not a real calendar day (e.g. 13/45/2025).
            out["bid_open_date"] = None

    # Listing period
    m = re.search(r"Listing Period:\s*(\w+)", text)
    if m:
        out["listing_period"] = m.group(1)

    # Price: the first "$NNN,NNN" in the card.
    m = re.search(r"\$([\d,]+)", text)
    if m:
        out["price"] = _parse_money(m.group(0))

    out["case_number"] = case_m.group(1)

    # Address block sits between the price and "<n> Beds <n[.n]> Baths <county> County".
    addr_match = re.search(
        r"\$[\d,]+\s+(.+?)\s+(\d+)\s+Beds\s+([\d.]+)\s+Baths\s+(.+?)\s+County",
        text, re.DOTALL,
    )
    if addr_match:
        addr_block = addr_match.group(1).strip()
        # "<street> <city>, <ST>, <zip>" or "<street>, <city>, <ST>, <zip>".
        # Any two-letter state code is accepted (the scraper can be run for
        # states other than FL); the legacy "F.L." spelling is still honoured.
        zip_m = re.search(r",\s*(F\.L\.|[A-Z]{2})\s*,?\s*(\d{5})", addr_block)
        if zip_m:
            out["state"] = zip_m.group(1).replace(".", "")
            out["zip"] = zip_m.group(2)
            # Drop the ", ST, zip" tail; what remains is street + city.
            pre_zip = addr_block[:zip_m.start()].strip().rstrip(",").strip()
            if "," in pre_zip:
                parts = [p.strip() for p in pre_zip.split(",")]
                out["city"] = parts[-1]
                out["address_street"] = ", ".join(parts[:-1])
            else:
                # No comma: street and city were separated only by a line
                # break that has already been collapsed. Best effort: the
                # street ends at the last street-suffix token.
                # E.g. "4641 Samoset Dr Sarasota" -> "4641 Samoset Dr" / "Sarasota"
                #      "8342 N Pine Haven Pt Crystal River" -> "...Pt" / "Crystal River"
                tokens = pre_zip.split()
                street_suffixes = {"DR", "ST", "AVE", "RD", "BLVD", "LN", "WAY", "CT", "PL",
                                   "CIR", "TER", "PKWY", "HWY", "TRL", "XING", "PT", "LOOP"}
                suffix_idx = [
                    i for i, tok in enumerate(tokens)
                    if tok.upper().rstrip(".") in street_suffixes
                ]
                if suffix_idx:
                    split_at = suffix_idx[-1]
                    # "St" followed by more words, shortly after a real street
                    # suffix, is "Saint" in a city name ("St Petersburg",
                    # "Port St Lucie"), not the end of the street. The previous
                    # suffix must be at index >= 2 (house number + name before
                    # it) and at most one word away, which keeps streets such
                    # as "Dr Martin Luther King St" intact.
                    if (
                        len(suffix_idx) > 1
                        and tokens[split_at].upper().rstrip(".") == "ST"
                        and split_at < len(tokens) - 1
                        and suffix_idx[-2] >= 2
                        and split_at - suffix_idx[-2] <= 2
                    ):
                        split_at = suffix_idx[-2]
                    out["address_street"] = " ".join(tokens[:split_at + 1])
                    out["city"] = " ".join(tokens[split_at + 1:])
                else:
                    # No recognisable suffix: fall back to splitting in half.
                    half = len(tokens) // 2
                    out["address_street"] = " ".join(tokens[:half])
                    out["city"] = " ".join(tokens[half:])
        else:
            # No "<ST>, <zip>" tail found: keep the raw block as the street.
            out["address_street"] = addr_block

        out["beds"] = int(addr_match.group(2))
        try:
            out["baths"] = float(addr_match.group(3))
        except ValueError:
            # "[\d.]+" can match strings like "2.5.1" that are not floats.
            out["baths"] = None
        out["county"] = addr_match.group(4).strip()

    # Compose the full address from whichever parts were found.
    full_addr_parts = [
        out[key]
        for key in ("address_street", "city", "state", "zip")
        if out.get(key)
    ]
    if full_addr_parts:
        out["address"] = ", ".join(full_addr_parts)

    return out
|
||||
|
||||
|
||||
def _build_deal_record(card_data: dict, state: str) -> dict:
    """Turn a parsed HUD card into a deal record for deals_db.insert_deal.

    Bugfix B3: ``source_url`` is the deep link to the specific property
    (https://www.hudhomestore.gov/PropertyDetails?caseNumber=XXX), not the
    generic search-results URL. It is None when the card has no case number.

    Args:
        card_data: dict produced by ``_parse_card_text``.
        state: state code used when the card itself did not yield one.

    Returns:
        The deal record dict.
    """
    hud_case = card_data.get("case_number")
    bids_open = card_data.get("bid_open_date")
    period = card_data.get("listing_period")
    badge_list = card_data.get("badges", [])
    asking_price = card_data.get("price")

    # Human-readable description, pieces joined with " | ".
    summary = []
    if badge_list:
        summary.append("Badges: " + ", ".join(badge_list))
    if period:
        summary.append(f"Listing Period: {period}")
    if bids_open:
        summary.append(f"Bids Open: {bids_open}")
    summary.extend([
        f"HUD Case #: {hud_case}",
        "Source: HUD Homestore (FHA-default REO)",
    ])

    record = {
        "source": SOURCE,
        "source_url": build_deep_link(hud_case),  # per-case deep link
        "address": card_data.get("address"),
        "city": card_data.get("city"),
        "state": card_data.get("state") or state,
        "zip": card_data.get("zip"),
        "county": card_data.get("county"),
        "listing_price": asking_price,
        "deal_type": "reo",  # HUD properties are post-foreclosure REO
        "starting_bid": asking_price,  # HUD: list price ~ bid floor
        "estimated_arv": None,  # not provided by HUD
        "beds": card_data.get("beds"),
        "baths": card_data.get("baths"),
        # year_built / sqft are not on the results card; they would need a
        # detail-page scrape.
        # The HUD "case number" is a tracking ID, NOT a court case, so it is
        # stored in external_id and case_number stays NULL (HUD listings are
        # REO post-foreclosure, with no active court proceeding from the
        # buyer's perspective).
        "case_number": None,
        "external_id": hud_case,
        "auction_date": bids_open,
        "listing_description": " | ".join(summary),
    }
    return record
|
||||
|
||||
|
||||
def scrape_hud_homestore(
    *,
    states: Optional[list[str]] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape HUD Homestore search results for the given states.

    For each state the rendered search-results HTML is taken from the cache
    when available, otherwise fetched with a headless Chromium (Playwright),
    and then parsed into deal records.

    Args:
        states: state codes to scrape (default ``["FL"]``).
        status_cb: optional callback that receives progress/log messages.
        use_cache: read from / write to the page cache when True.
        cache_ttl_seconds: TTL passed to the cache helpers.

    Returns:
        list[dict] of deal records (see ``_build_deal_record``). States that
        fail to load are logged through ``status_cb`` and skipped.
    """
    # Imported lazily so the module can be imported without Playwright installed.
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    if states is None:
        states = ["FL"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _search_url(state_code: str) -> str:
        # Also used as the cache key, so it must be built identically for
        # the cache lookup and the cache write.
        return f"https://www.hudhomestore.gov/searchresult?citystate={state_code}"

    cache_namespace = "hud_homestore"
    deals: list[dict] = []

    # Step 1: cache check per state
    cached_pages: dict[str, str] = {}
    states_to_fetch: list[str] = []
    cache_hits = 0
    for state in states:
        if use_cache:
            cached = get_cached(cache_namespace, _search_url(state), ttl_seconds=cache_ttl_seconds)
            if cached:
                cached_pages[state] = cached
                cache_hits += 1
                continue
        states_to_fetch.append(state)
    _log(f"HUD Homestore: states={states}, cache hits {cache_hits}/{len(states)}, fetching {len(states_to_fetch)}")

    # Step 2: fetch fresh HTML for non-cached states
    fresh_pages: dict[str, str] = {}
    if states_to_fetch:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1400, "height": 900},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(30_000)

                # Load the landing page first to set cookies + session.
                # Best effort: a failure here is logged and the run continues.
                try:
                    page.goto("https://www.hudhomestore.gov/", wait_until="networkidle", timeout=30_000)
                    time.sleep(2)
                except Exception as e:
                    _log(f" HUD landing load failed: {e}")

                last_request_at = 0.0
                for state in states_to_fetch:
                    # Rate limit between search-result requests.
                    elapsed = time.time() - last_request_at
                    if elapsed < _REQUEST_INTERVAL_SECONDS:
                        time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
                    last_request_at = time.time()

                    url = _search_url(state)
                    _log(f" Fetching {state}...")
                    try:
                        response = page.goto(url, wait_until="networkidle", timeout=30_000)
                        # goto() can return None (no main-resource response),
                        # so guard before reading .status.
                        if response is None:
                            _log(f" no response for {state} — skip")
                            continue
                        if response.status != 200:
                            _log(f" HTTP {response.status} for {state} — skip")
                            continue

                        # Wait extra for the SPA to render the property cards.
                        time.sleep(_SPA_RENDER_WAIT_SECONDS)
                        html = page.content()
                    except PlaywrightTimeout:
                        _log(f" timeout for {state} — skip")
                        continue
                    except Exception as e:
                        # One failing state must not abort the remaining ones.
                        _log(f" error for {state}: {e}")
                        continue

                    fresh_pages[state] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Always release the browser, even if something above raised.
                browser.close()

    # Step 3: parse all pages (cached + fresh). Both are rendered HTML
    # strings, so the same stdlib-based extractor handles them.
    for state in states:
        html = cached_pages.get(state) or fresh_pages.get(state)
        if not html:
            continue

        cards_text = _extract_card_texts_from_html(html, _log)
        _log(f" {state}: extracted {len(cards_text)} card texts from HTML")

        for card_text in cards_text:
            card = _parse_card_text(card_text)
            if card and card.get("case_number"):
                deal = _build_deal_record(card, state)
                # NOTE(review): _build_deal_record stores the HUD case in
                # "external_id" and sets "case_number" to None, so this check
                # effectively requires an address — confirm that is intended.
                if deal.get("address") or deal.get("case_number"):
                    deals.append(deal)

    _log(f"HUD Homestore: scraped {len(deals)} total deals across {len(states)} states")
    return deals
|
||||
|
||||
|
||||
def _extract_card_texts_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[str]:
    """Extract the text content of each property card from rendered HTML.

    A card is a ``<div>`` whose ``class`` attribute contains ``topMap-card``;
    it ends at the matching ``</div>``. Only ``<div>`` nesting is counted to
    find that match, so void elements inside a card (``<br>``, ``<img>``, ...),
    which never produce an end tag, cannot desynchronise the depth.

    Args:
        html: page HTML.
        log_fn: optional callback that receives a message if parsing fails.

    Returns:
        One whitespace-collapsed string per card, in document order. Cards
        with no text are omitted. Returns [] if the parser raises.
    """
    from html.parser import HTMLParser

    target_class_marker = "topMap-card"
    # Tags that visually separate text. A space is inserted at their
    # boundaries so "<div>$446,000</div><div>4641 ...</div>" or "a<br>b" do
    # not get glued together; runs of whitespace are collapsed afterwards.
    break_tags = frozenset({
        "br", "hr", "div", "p", "li", "ul", "ol", "table", "tr", "td", "th",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "section", "article", "header", "footer",
    })

    class CardExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            # Number of currently open <div>s counted from the card's own
            # <div> (which counts as 1); 0 means "not inside a card".
            self.div_depth = 0
            self.text_parts: list[str] = []
            self.cards_texts: list[str] = []

        def handle_starttag(self, tag, attrs):
            if self.div_depth:
                # Inside a card: nested card-class divs are treated as
                # ordinary content, only the nesting level is tracked.
                if tag == "div":
                    self.div_depth += 1
                if tag in break_tags:
                    self.text_parts.append(" ")
                return
            if tag != "div":
                return
            if any(
                name == "class" and val and target_class_marker in val
                for name, val in attrs
            ):
                self.div_depth = 1
                self.text_parts = []

        def handle_endtag(self, tag):
            if not self.div_depth:
                return
            if tag in break_tags:
                self.text_parts.append(" ")
            if tag != "div":
                return
            self.div_depth -= 1
            if self.div_depth == 0:
                # The card's own </div>: emit its collapsed text.
                text = " ".join("".join(self.text_parts).split())
                if text:
                    self.cards_texts.append(text)
                self.text_parts = []

        def handle_data(self, data):
            if self.div_depth:
                self.text_parts.append(data)

    parser = CardExtractor()
    try:
        parser.feed(html)
    except Exception as e:
        if log_fn:
            log_fn(f" HTML parse error: {e}")
        return []

    return parser.cards_texts
|
||||
|
||||
|
||||
def run_scraper_to_db(
    *,
    states: Optional[list[str]] = None,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
) -> dict:
    """Full pipeline: scrape HUD, persist to the deals DB, optionally classify.

    Failures in any stage (scrape, per-deal insert, per-deal classification)
    are collected as strings instead of being raised, and are reported both
    to ``finish_scraper_run`` and in the returned summary.

    Args:
        states: state codes forwarded to ``scrape_hud_homestore``.
        auto_classify: classify newly inserted deals when True.
        status_cb: optional callback that receives progress/log messages.

    Returns:
        Summary dict with keys ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run
    init_db()

    run_id = record_scraper_run(SOURCE)
    errors: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    try:
        deals = scrape_hud_homestore(states=states, status_cb=status_cb)
    except Exception as e:
        # A scrape failure is recorded on the run rather than propagated.
        errors.append(f"scrape failed: {e}")
        deals = []

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []

    for deal in deals:
        try:
            deal_id, is_new = insert_deal(deal)
            if is_new:
                deals_new += 1
                new_deal_ids.append(deal_id)
            else:
                deals_updated += 1
        except Exception as e:
            # HUD records carry their identifier in "external_id"
            # ("case_number" is reserved for court cases and is None here),
            # so report that one to make the error traceable.
            ident = deal.get("external_id") or deal.get("case_number")
            errors.append(f"insert fail for {ident}: {e}")

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new HUD deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification
        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for {did}: {e}")

    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=0,
        # No errors -> success; errors but some deals scraped -> partial.
        status="success" if not errors else ("partial" if deals else "failed"),
    )

    return {
        "source": SOURCE,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "errors_count": len(errors),
        "errors": errors,
    }
|
||||
Reference in New Issue
Block a user