feat: AR-House initial commit

2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
@@ -0,0 +1,523 @@
+"""scrapers/hud_homestore.py — HUD Homestore federal REO listings (FHA defaults).
+
+SOURCE: https://www.hudhomestore.gov/searchresult?citystate={STATE_CODE}
+STACK:  Playwright local (Chromium) — heavy SPA, requires full JavaScript rendering
+
+URL PATTERN:
+    https://www.hudhomestore.gov/searchresult?citystate=FL
+    → renders ~30-50 FL properties after ~6s of SPA load
+
+CARD STRUCTURE (DOM):
+    Each property card = <div class="topMap-card card-body col-12 col-md-7 px-2 pl-md-4">
+    Text content (raw, no semantic tags):
+        [optional] "Price Reduced" | "New Listing" (badge)
+        "BIDS OPEN MM/DD/YYYY"
+        "Listing Period: Extended" | "Exclusive" | etc
+        "$XXX,XXX"
+        "<street address>"
+        "<city>, FL, <zip>"
+        "<beds> Beds <baths> Baths <county> County"
+        "Case #: <agency>-<number>"  (ej: 093-676572)
+
+DEAL TYPE: 'reo' (Real Estate Owned — held by HUD after foreclosure on FHA loans)
+
+ANTI-BOT: real Chrome UA. No additional special headers are needed.
+
+LISTING PERIOD significance:
+    - "Exclusive": Owner-Occupants, nonprofits, and government entities only (first 10-30 days)
+    - "Extended": available to investors (post-exclusive)
+    - "Lottery": offer chosen at random because of high demand
+    - Investor-eligible deals have "Extended" or no listing period
+"""
+from __future__ import annotations
+
+import re
+import time
+from datetime import datetime, timezone
+from typing import Callable, Optional
+
+from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
+
+# Real Chrome user agent — HUD uses the Yardi Systems SPA framework; anti-bot is light but present.
+_CHROME_UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/131.0.0.0 Safari/537.36"
+)
+
+# Rate limit: at most one request every 3 seconds to the HUD site.
+_REQUEST_INTERVAL_SECONDS = 3.0
+
+# Identifier used as the "source" field of each deal record and as the scraper-run label.
+SOURCE = "hud_homestore"
+
+# Seconds to wait for the SPA to render the property cards (found via exploration: ~6s is safe).
+_SPA_RENDER_WAIT_SECONDS = 6.5
+
+# Deep-link pattern discovered via probing (B3 bugfix):
+# /PropertyDetails?caseNumber=XXX renders the specific property (verified: status 200 + address + price).
+# Other patterns returned 404 (do not work): /Listing/PropertyDetails, /Property/Details, /listing/{case}, etc.
+_PROPERTY_DETAIL_URL_TEMPLATE = (
+    "https://www.hudhomestore.gov/PropertyDetails?caseNumber={case_number}"
+)
+
+
+def build_deep_link(case_number: Optional[str]) -> Optional[str]:
+    """Build the canonical HUD property-detail URL for a case number.
+
+    Args:
+        case_number: HUD case number, normally 'XXX-XXXXXX' (e.g. '093-676572').
+            Surrounding whitespace is ignored.
+
+    Returns:
+        The deep-link URL, or None when ``case_number`` is not a string or is
+        empty/blank. The 'XXX-XXXXXX' shape itself is not validated here.
+    """
+    # Function-scope import so the block does not depend on file-level imports.
+    from urllib.parse import quote
+
+    if not isinstance(case_number, str):
+        return None
+    case_number = case_number.strip()
+    if not case_number:
+        return None
+    # Percent-encode the value so a malformed case number (spaces, '&', '#', ...)
+    # cannot corrupt the query string. Digits and hyphens are never escaped, so
+    # well-formed case numbers produce exactly the same URL as before.
+    return _PROPERTY_DETAIL_URL_TEMPLATE.format(case_number=quote(case_number, safe=""))
+
+
+def _parse_money(s: str) -> Optional[float]:
+    """Convert a money string such as '$446,000' into a float (446000.0).
+
+    Every character other than a digit or '.' is discarded before conversion.
+    Returns None for empty input, for input with no digits or dots, and for a
+    remainder that is not a valid float literal (e.g. '1.2.3').
+    """
+    # Keep only the numeric characters, in their original order.
+    numeric = "".join(re.findall(r"[\d.]", s)) if s else ""
+    if numeric == "":
+        return None
+    try:
+        amount = float(numeric)
+    except ValueError:
+        amount = None
+    return amount
+
+
+# Upper-cased street-type abbreviations used to locate the end of the street part.
+_STREET_SUFFIXES = frozenset({
+    "DR", "ST", "AVE", "RD", "BLVD", "LN", "WAY", "CT", "PL",
+    "CIR", "TER", "PKWY", "HWY", "TRL", "XING", "PT", "LOOP",
+})
+
+
+def _is_street_suffix(token: str) -> bool:
+    """Return True when *token* (case-insensitive, optional trailing '.') is a street suffix."""
+    return token.upper().rstrip(".") in _STREET_SUFFIXES
+
+
+def _split_street_city(pre_zip: str) -> tuple[str, str]:
+    """Split a comma-less '<street> <city>' string into (street, city).
+
+    The street ends at the last street suffix, except that an "St" which is
+    followed by more words AND preceded by another street suffix is read as
+    "Saint" (part of the city), e.g. "123 Main St St Petersburg" or
+    "9 Oak Dr Port St Lucie". With no suffix at all the tokens are split in half.
+    """
+    tokens = pre_zip.split()
+    for i in range(len(tokens) - 1, -1, -1):
+        if not _is_street_suffix(tokens[i]):
+            continue
+        reads_as_saint = (
+            tokens[i].upper().rstrip(".") == "ST"
+            and i + 1 < len(tokens)
+            and any(_is_street_suffix(t) for t in tokens[:i])
+        )
+        if reads_as_saint:
+            # Keep scanning left for the real street suffix.
+            continue
+        return " ".join(tokens[:i + 1]), " ".join(tokens[i + 1:])
+    # Fallback: no recognizable suffix — best-effort split in half.
+    half = len(tokens) // 2
+    return " ".join(tokens[:half]), " ".join(tokens[half:])
+
+
+def _parse_card_text(text: str) -> Optional[dict]:
+    """Parse the whitespace-collapsed text of a single property card.
+
+    Args:
+        text: raw text of one card, e.g.
+            "BIDS OPEN 07/10/2026 Listing Period: Extended $446,000
+            4641 Samoset Dr Sarasota, FL, 34241 3 Beds 2 Baths Sarasota County
+            Case #: 093-676572".
+
+    Returns:
+        None when the text is empty or has no well-formed "Case #: NNN-NNNNNN".
+        Otherwise a dict that always has ``badges`` (list) and ``case_number``,
+        and — only when the corresponding piece was found — ``bid_open_date``
+        (ISO YYYY-MM-DD, or None if the date is invalid), ``listing_period``,
+        ``price``, ``address_street``, ``city``, ``state``, ``zip``, ``beds``,
+        ``baths``, ``county`` and ``address`` (the found parts joined by ", ").
+        Callers should read optional keys with ``.get``.
+    """
+    if not text or "Case #:" not in text:
+        return None
+
+    out: dict = {}
+
+    # Badges (optional): plain substring checks against the card text.
+    out["badges"] = [
+        kw
+        for kw in ("New Listing", "Price Reduced", "Extended", "Exclusive", "Lottery")
+        if kw in text
+    ]
+
+    # Bid open date
+    m = re.search(r"BIDS OPEN (\d{2}/\d{2}/\d{4})", text)
+    if m:
+        try:
+            out["bid_open_date"] = datetime.strptime(m.group(1), "%m/%d/%Y").date().isoformat()
+        except ValueError:
+            # Right shape but not a real calendar date (e.g. 13/45/2026).
+            out["bid_open_date"] = None
+
+    # Listing period
+    m = re.search(r"Listing Period:\s*(\w+)", text)
+    if m:
+        out["listing_period"] = m.group(1)
+
+    # Price: the first "$N,NNN" occurrence in the card.
+    m = re.search(r"\$([\d,]+)", text)
+    if m:
+        out["price"] = _parse_money(m.group(0))
+
+    # Case # — a card without a well-formed case number is not a valid card.
+    m = re.search(r"Case #:\s*(\d{3}-\d{6})", text)
+    if not m:
+        return None
+    out["case_number"] = m.group(1)
+
+    # Address block sits between the price and "<n> Beds <n[.n]> Baths <county> County".
+    addr_match = re.search(
+        r"\$[\d,]+\s+(.+?)\s+(\d+)\s+Beds\s+([\d.]+)\s+Baths\s+(.+?)\s+County",
+        text, re.DOTALL,
+    )
+    if addr_match:
+        addr_block = addr_match.group(1).strip()
+        # Expected tail: ", <ST>, <zip>" with any two-letter state code (the
+        # scraper accepts arbitrary states, not only FL). "F.L." is still
+        # tolerated as a spelling of FL.
+        zip_m = re.search(r",\s*([A-Z]{2}|F\.L\.)\s*,?\s*(\d{5})", addr_block)
+        if zip_m:
+            state_code = zip_m.group(1)
+            out["state"] = "FL" if state_code == "F.L." else state_code
+            out["zip"] = zip_m.group(2)
+            # Drop the ", <ST>, <zip>" tail; what remains is street + city.
+            pre_zip = addr_block[:zip_m.start()].strip().rstrip(",").strip()
+            if "," in pre_zip:
+                # "<street>, <city>": the last comma-separated part is the city.
+                parts = [p.strip() for p in pre_zip.split(",")]
+                out["city"] = parts[-1]
+                out["address_street"] = ", ".join(parts[:-1])
+            else:
+                # Street and city were on separate lines, now collapsed to spaces.
+                out["address_street"], out["city"] = _split_street_city(pre_zip)
+        else:
+            # No recognizable "<ST>, <zip>" tail — keep the raw block as the street.
+            out["address_street"] = addr_block
+
+        out["beds"] = int(addr_match.group(2))
+        try:
+            out["baths"] = float(addr_match.group(3))
+        except ValueError:
+            # "[\d.]+" can match things like "1.5.2".
+            out["baths"] = None
+        out["county"] = addr_match.group(4).strip()
+
+    # Full address: whichever of street / city / state / zip were found.
+    full_addr_parts = [
+        out[key] for key in ("address_street", "city", "state", "zip") if out.get(key)
+    ]
+    if full_addr_parts:
+        out["address"] = ", ".join(full_addr_parts)
+
+    return out
+
+
+def _build_deal_record(card_data: dict, state: str) -> dict:
+    """Turn a parsed HUD card into a deal record for deals_db.insert_deal.
+
+    Args:
+        card_data: dict produced by the card parser; every key is read with
+            ``.get`` so missing keys become None.
+        state: state code used when the card itself carries no state.
+
+    Returns:
+        The deal record dict. ``source_url`` is the per-property deep link
+        (bugfix B3: no longer the generic search-results URL), or None when the
+        card has no case number.
+    """
+    hud_case = card_data.get("case_number")
+    opening_date = card_data.get("bid_open_date")
+    period = card_data.get("listing_period")
+    badge_list = card_data.get("badges", [])
+    price = card_data.get("price")
+
+    # Optional description fragments first, then the two that are always present.
+    optional_bits = (
+        ("Badges: " + ", ".join(badge_list)) if badge_list else None,
+        f"Listing Period: {period}" if period else None,
+        f"Bids Open: {opening_date}" if opening_date else None,
+    )
+    description = " | ".join(
+        [bit for bit in optional_bits if bit is not None]
+        + [f"HUD Case #: {hud_case}", "Source: HUD Homestore (FHA-default REO)"]
+    )
+
+    record = {
+        "source": SOURCE,
+        "source_url": build_deep_link(hud_case),
+        "address": card_data.get("address"),
+        "city": card_data.get("city"),
+        "state": card_data.get("state") or state,
+        "zip": card_data.get("zip"),
+        "county": card_data.get("county"),
+        "listing_price": price,
+        # HUD properties are REO (post-foreclosure).
+        "deal_type": "reo",
+        # HUD: the list price approximates the bid floor.
+        "starting_bid": price,
+        # Not provided by HUD.
+        "estimated_arv": None,
+        "beds": card_data.get("beds"),
+        "baths": card_data.get("baths"),
+        # year_built and sqft are not on the results card; they would need a
+        # detail-page scrape.
+        # The HUD "case number" is a tracking ID, NOT a court case, so it is
+        # stored in external_id and case_number stays NULL.
+        "case_number": None,
+        "external_id": hud_case,
+        "auction_date": opening_date,
+        "listing_description": description,
+    }
+    return record
+
+
+def scrape_hud_homestore(
+    *,
+    states: Optional[list[str]] = None,
+    status_cb: Optional[Callable[[str], None]] = None,
+    use_cache: bool = True,
+    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
+) -> list[dict]:
+    """Scrape HUD Homestore search results for the given states.
+
+    For each state the rendered search-results HTML is taken from the cache
+    when available, otherwise fetched with a headless Chromium (Playwright),
+    then split into property cards and converted into deal records.
+
+    Args:
+        states: state codes to scrape (default ``["FL"]``).
+        status_cb: optional callback that receives progress/log messages.
+        use_cache: read from and write to the page cache when True.
+        cache_ttl_seconds: TTL passed to the cache for reads and writes.
+
+    Returns:
+        list[dict] of deal records. States that time out, error, or return a
+        non-200 status are logged and skipped.
+    """
+    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
+
+    if states is None:
+        states = ["FL"]
+
+    def _log(msg: str) -> None:
+        if status_cb:
+            status_cb(msg)
+
+    def _search_url(state_code: str) -> str:
+        # Also used as the cache key, so it must be built identically everywhere.
+        return f"https://www.hudhomestore.gov/searchresult?citystate={state_code}"
+
+    cache_namespace = "hud_homestore"
+    deals: list[dict] = []
+
+    # Step 1: cache check per state
+    cached_pages: dict[str, str] = {}
+    states_to_fetch: list[str] = []
+    cache_hits = 0
+    for state in states:
+        if use_cache:
+            cached = get_cached(cache_namespace, _search_url(state), ttl_seconds=cache_ttl_seconds)
+            if cached:
+                cached_pages[state] = cached
+                cache_hits += 1
+                continue
+        states_to_fetch.append(state)
+    _log(f"HUD Homestore: states={states}, cache hits {cache_hits}/{len(states)}, fetching {len(states_to_fetch)}")
+
+    # Step 2: fetch fresh HTML for non-cached states
+    fresh_pages: dict[str, str] = {}
+    if states_to_fetch:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            try:
+                context = browser.new_context(
+                    user_agent=_CHROME_UA,
+                    viewport={"width": 1400, "height": 900},
+                    locale="en-US",
+                    timezone_id="America/New_York",
+                )
+                page = context.new_page()
+                page.set_default_timeout(30_000)
+
+                # Load the landing page first to set cookies + session.
+                # Best effort: a failure here is logged and the scrape continues.
+                try:
+                    page.goto("https://www.hudhomestore.gov/", wait_until="networkidle", timeout=30_000)
+                    time.sleep(2)
+                except Exception as e:
+                    _log(f"  HUD landing load failed: {e}")
+
+                last_request_at = 0.0
+                for state in states_to_fetch:
+                    # Rate limit: keep at least _REQUEST_INTERVAL_SECONDS between requests.
+                    elapsed = time.time() - last_request_at
+                    if elapsed < _REQUEST_INTERVAL_SECONDS:
+                        time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
+                    last_request_at = time.time()
+
+                    url = _search_url(state)
+                    _log(f"  Fetching {state}...")
+                    try:
+                        response = page.goto(url, wait_until="networkidle", timeout=30_000)
+                    except PlaywrightTimeout:
+                        _log(f"    timeout for {state} — skip")
+                        continue
+                    except Exception as e:
+                        _log(f"    error for {state}: {e}")
+                        continue
+                    # goto() can return None instead of a response; treat it as a skip
+                    # explicitly rather than relying on an AttributeError.
+                    if response is None:
+                        _log(f"    no response for {state} — skip")
+                        continue
+                    if response.status != 200:
+                        _log(f"    HTTP {response.status} for {state} — skip")
+                        continue
+
+                    # Wait extra for the SPA to render the cards.
+                    time.sleep(_SPA_RENDER_WAIT_SECONDS)
+
+                    html = page.content()
+                    fresh_pages[state] = html
+                    if use_cache:
+                        save_cache(cache_namespace, url, html,
+                                   status_code=200, ttl_seconds=cache_ttl_seconds)
+            finally:
+                # Always release the browser, even if a fetch/caching step raised.
+                browser.close()
+
+    # Step 3: parse all pages (cached + fresh). Both are rendered HTML strings,
+    # so the same stdlib-based card extractor handles them.
+    for state in states:
+        html = cached_pages.get(state) or fresh_pages.get(state)
+        if not html:
+            continue
+
+        cards_text = _extract_card_texts_from_html(html, _log)
+        _log(f"  {state}: extracted {len(cards_text)} card texts from HTML")
+
+        for card_text in cards_text:
+            card = _parse_card_text(card_text)
+            if card and card.get("case_number"):
+                deal = _build_deal_record(card, state)
+                # The HUD case number lives in "external_id" (the record's
+                # "case_number" is always None), so that is the key to check:
+                # a card with an id but an unparsed address is still kept.
+                if deal.get("address") or deal.get("external_id"):
+                    deals.append(deal)
+
+    _log(f"HUD Homestore: scraped {len(deals)} total deals across {len(states)} states")
+    return deals
+
+
+def _extract_card_texts_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[str]:
+    """Extract the visible text of each property card from rendered HTML.
+
+    A card is a ``<div>`` whose class attribute contains "topMap-card"; its text
+    is everything up to the matching ``</div>``, with whitespace collapsed to
+    single spaces. Cards with no text are dropped.
+
+    Args:
+        html: rendered page HTML.
+        log_fn: optional callback that receives a message if parsing raises.
+
+    Returns:
+        One text string per card, in document order; [] on a parse error.
+    """
+    from html.parser import HTMLParser
+
+    target_class_marker = "topMap-card"
+    # Block-level tags: a space is inserted at their boundaries so that text of
+    # adjacent elements ("<p>$100,000</p><p>123 Main St</p>") is not fused when
+    # the markup has no whitespace between tags. Inline tags add nothing, so
+    # "<span>$</span>446,000" still reads "$446,000".
+    block_tags = frozenset({
+        "div", "p", "br", "li", "ul", "ol", "tr", "td", "th",
+        "h1", "h2", "h3", "h4", "h5", "h6", "section", "article",
+    })
+
+    class CardExtractor(HTMLParser):
+        def __init__(self):
+            super().__init__()
+            self.in_card = False
+            # Only <div> nesting is tracked: void elements such as <img> or <br>
+            # produce a start tag with no end tag, so counting every tag would
+            # drift and the card's closing </div> would be missed.
+            self.div_depth = 0
+            self.card_div_depth = 0
+            self.text_parts: list[str] = []
+            self.cards_texts: list[str] = []
+
+        def handle_starttag(self, tag, attrs):
+            if tag == "div":
+                self.div_depth += 1
+            if self.in_card:
+                if tag in block_tags:
+                    self.text_parts.append(" ")
+                return
+            if tag != "div":
+                return
+            is_card = any(
+                name == "class" and val and target_class_marker in val
+                for name, val in attrs
+            )
+            if is_card:
+                self.in_card = True
+                self.card_div_depth = self.div_depth
+                self.text_parts = []
+
+        def handle_endtag(self, tag):
+            if self.in_card and tag in block_tags:
+                self.text_parts.append(" ")
+            if tag != "div":
+                return
+            if self.in_card and self.div_depth == self.card_div_depth:
+                # This </div> closes the card itself: emit its collapsed text.
+                text = " ".join("".join(self.text_parts).split())
+                if text:
+                    self.cards_texts.append(text)
+                self.in_card = False
+                self.card_div_depth = 0
+                self.text_parts = []
+            # Never go negative on a stray </div>.
+            self.div_depth = max(0, self.div_depth - 1)
+
+        def handle_data(self, data):
+            if self.in_card:
+                self.text_parts.append(data)
+
+    parser = CardExtractor()
+    try:
+        parser.feed(html)
+        # Flush any text still buffered by the parser.
+        parser.close()
+    except Exception as e:
+        if log_fn:
+            log_fn(f"    HTML parse error: {e}")
+        return []
+
+    return parser.cards_texts
+
+
+def run_scraper_to_db(
+    *,
+    states: Optional[list[str]] = None,
+    auto_classify: bool = True,
+    status_cb: Optional[Callable[[str], None]] = None,
+) -> dict:
+    """Full pipeline: scrape HUD → persist to the deals DB → optionally classify.
+
+    Args:
+        states: state codes forwarded to scrape_hud_homestore (None = its default).
+        auto_classify: when True, newly inserted deals are run through
+            deal_classifier.classify_deal and the result is stored.
+        status_cb: optional callback that receives progress/log messages.
+
+    Returns:
+        Summary dict with ``source``, ``scraper_run_id``, ``deals_found``,
+        ``deals_new``, ``deals_updated``, ``deals_classified``,
+        ``errors_count`` and ``errors`` (list of messages). Scrape, insert and
+        classify failures are collected in ``errors`` rather than raised.
+    """
+    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run
+    init_db()
+
+    run_id = record_scraper_run(SOURCE)
+    errors: list[str] = []
+
+    def _log(m: str) -> None:
+        if status_cb:
+            status_cb(m)
+
+    try:
+        deals = scrape_hud_homestore(states=states, status_cb=status_cb)
+    except Exception as e:
+        # A failed scrape is recorded on the run instead of propagating.
+        errors.append(f"scrape failed: {e}")
+        deals = []
+
+    deals_new = 0
+    deals_updated = 0
+    new_deal_ids: list[int] = []
+
+    for deal in deals:
+        try:
+            deal_id, is_new = insert_deal(deal)
+            if is_new:
+                deals_new += 1
+                new_deal_ids.append(deal_id)
+            else:
+                deals_updated += 1
+        except Exception as e:
+            # HUD deal records carry their id in "external_id" ("case_number"
+            # is always None for them), so label the error with that id and
+            # fall back to the address.
+            label = deal.get("external_id") or deal.get("address")
+            errors.append(f"insert fail for {label}: {e}")
+
+    classified_count = 0
+    if auto_classify and new_deal_ids:
+        _log(f"Auto-classifying {len(new_deal_ids)} new HUD deals...")
+        from deal_classifier import classify_deal
+        from deals_db import get_deal_by_id, update_classification
+        for did in new_deal_ids:
+            try:
+                d = get_deal_by_id(did)
+                if not d:
+                    continue
+                result = classify_deal(d)
+                update_classification(
+                    deal_id=did,
+                    status=result["classification_status"],
+                    score=result["score"],
+                    reasons=result["reasons"],
+                    strategy=result["strategy"],
+                )
+                classified_count += 1
+            except Exception as e:
+                errors.append(f"classify fail for {did}: {e}")
+
+    # success = no errors; partial = errors but some deals scraped; failed = errors and no deals.
+    finish_scraper_run(
+        run_id,
+        deals_found=len(deals),
+        deals_new=deals_new,
+        deals_updated=deals_updated,
+        errors_count=len(errors),
+        errors_summary=errors if errors else None,
+        firecrawl_credits_used=0,
+        status="success" if not errors else ("partial" if deals else "failed"),
+    )
+
+    return {
+        "source": SOURCE,
+        "scraper_run_id": run_id,
+        "deals_found": len(deals),
+        "deals_new": deals_new,
+        "deals_updated": deals_updated,
+        "deals_classified": classified_count,
+        "errors_count": len(errors),
+        "errors": errors,
+    }