feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
"""scrapers — Phase 3B DealFinder source adapters.
|
||||
|
||||
Cada scraper devuelve list[dict] con campos compatibles con deals_db.insert_deal:
|
||||
source, deal_type, address, city, state, zip, county,
|
||||
listing_price, starting_bid, estimated_arv,
|
||||
beds, baths, sqft, year_built, lot_sqft,
|
||||
photos_urls, listing_description, case_number, auction_date
|
||||
|
||||
Plus el scraper anota:
|
||||
- record_scraper_run() al inicio
|
||||
- insert_deal() per result
|
||||
- record_firecrawl_usage() si consumio credits
|
||||
- finish_scraper_run() al final con metricas
|
||||
|
||||
Stack:
|
||||
- Playwright local primero (gratis) — para sitios sin anti-bot pesado
|
||||
- Firecrawl fallback — solo cuando Playwright se bloquea
|
||||
- Cada scraper documenta cual usa en su docstring
|
||||
"""
|
||||
@@ -0,0 +1,145 @@
|
||||
"""scrapers/_cache.py — File-based HTTP response cache para scrapers.
|
||||
|
||||
Cada URL fetched se guarda con TTL configurable. Evita re-fetchar contenido
|
||||
no-cambiable durante la TTL window (default 24h para auction calendars).
|
||||
|
||||
Aplicacion: scraper Miami-Dade Clerk + futuros scrapers de fuentes que
|
||||
actualicen diariamente.
|
||||
|
||||
KEY: SHA256 del URL. VALUE: response body (text/html).
|
||||
PATH: .cache/scrapers/{namespace}/{hash}.html + .meta.json
|
||||
|
||||
Schema meta:
|
||||
{
|
||||
"url": str,
|
||||
"fetched_at": iso8601,
|
||||
"ttl_seconds": int,
|
||||
"content_length": int,
|
||||
"status_code": int
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Absolute cache root: "<project root>/.cache/scrapers", where the project
# root is the directory two levels above this file.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "scrapers")

# TTL presets, in seconds.
DEFAULT_TTL_SECONDS_HOURLY = 60 * 60  # 1h — live listings
DEFAULT_TTL_SECONDS_DAILY = 24 * 60 * 60  # 24h — auction calendars update daily
DEFAULT_TTL_SECONDS_WEEKLY = 7 * 24 * 60 * 60  # 7d — slow data (court records, official records)
|
||||
|
||||
|
||||
def _url_hash(url: str) -> str:
|
||||
return hashlib.sha256(url.encode("utf-8")).hexdigest()[:24]
|
||||
|
||||
|
||||
def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Return ``(html_path, meta_path)`` for *url* inside *namespace*.

    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the URL
    hash as their stem; only the suffix differs.
    """
    key = _url_hash(url)
    directory = DEFAULT_CACHE_DIR / namespace
    html_path = directory / (key + ".html")
    meta_path = directory / (key + ".meta.json")
    return html_path, meta_path
|
||||
|
||||
|
||||
def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached HTML for *url*, or None on a miss or expired entry.

    Args:
        namespace: cache sub-directory the entry was stored under.
        url: URL whose response body is wanted.
        ttl_seconds: maximum accepted age of the entry, in seconds. The TTL
            stored in the meta file is not consulted; only this argument is.

    Returns:
        The cached body when both cache files are readable, the meta file
        holds a parseable ``fetched_at`` timestamp, and the entry is not
        older than *ttl_seconds*. None in every other case — a damaged cache
        entry is treated as a miss and never raises.
    """
    html_path, meta_path = _paths(namespace, url)

    # A missing or unreadable meta file and malformed JSON are all plain misses.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    # Valid JSON that is not an object (e.g. a list) has no ``.get``.
    if not isinstance(meta, dict):
        return None

    fetched_at_str = meta.get("fetched_at")
    if not fetched_at_str or not isinstance(fetched_at_str, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_str)
    except ValueError:
        return None
    # Subtracting a naive datetime from an aware one raises TypeError.
    # save_cache() writes UTC timestamps, so a naive value is read as UTC.
    if fetched_at.tzinfo is None:
        fetched_at = fetched_at.replace(tzinfo=timezone.utc)

    age = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age > ttl_seconds:
        return None

    # A missing or undecodable body file is also a miss.
    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None
|
||||
|
||||
|
||||
def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist *html* and its metadata for *url* under *namespace*.

    The namespace directory is created if needed. The body is written first,
    then a JSON meta file recording the URL, the UTC fetch time, the TTL,
    the body length in characters and the HTTP status code.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)
    html_path.write_text(html, encoding="utf-8")

    fetched_at = datetime.now(timezone.utc).isoformat()
    meta_json = json.dumps(
        {
            "url": url,
            "fetched_at": fetched_at,
            "ttl_seconds": ttl_seconds,
            "content_length": len(html),
            "status_code": status_code,
        },
        indent=2,
    )
    meta_path.write_text(meta_json, encoding="utf-8")
|
||||
|
||||
|
||||
def cache_stats(namespace: Optional[str] = None) -> dict:
    """Return cache usage statistics, for debugging.

    Args:
        namespace: restrict the scan to this namespace; None scans all.

    Returns:
        A dict with the same keys whether or not the cache directory exists:
        ``entries`` (number of meta files found), ``total_bytes`` (size of
        the meta files plus their HTML siblings), ``total_mb`` (the same,
        in MiB rounded to 2 decimals) and ``namespaces`` (sorted names of the
        directories holding the entries).
    """
    meta_suffix = ".meta.json"
    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR

    entries = 0
    total_bytes = 0
    namespaces: set[str] = set()

    if root.exists():
        for meta_file in root.rglob("*" + meta_suffix):
            entries += 1
            try:
                total_bytes += meta_file.stat().st_size
                # Swap only the trailing suffix; str.replace would rewrite
                # every occurrence in the name.
                html_sibling = meta_file.with_name(
                    meta_file.name[: -len(meta_suffix)] + ".html"
                )
                if html_sibling.exists():
                    total_bytes += html_sibling.stat().st_size
                namespaces.add(meta_file.parent.name)
            except OSError:
                # Best effort: a file that vanished mid-scan still counts as
                # an entry but contributes no size.
                continue

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }
|
||||
|
||||
|
||||
def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cache files and return how many were removed.

    With *namespace* only that sub-directory is cleared; otherwise the whole
    cache root is. Directories are left in place, and files that cannot be
    deleted are skipped silently (best effort).
    """
    root = DEFAULT_CACHE_DIR if not namespace else DEFAULT_CACHE_DIR / namespace
    if not root.exists():
        return 0

    deleted = 0
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        try:
            entry.unlink()
        except Exception:
            continue
        deleted += 1
    return deleted
|
||||
@@ -0,0 +1,523 @@
|
||||
"""scrapers/hud_homestore.py — HUD Homestore federal REO listings (FHA defaults).
|
||||
|
||||
SOURCE: https://www.hudhomestore.gov/searchresult?citystate={STATE_CODE}
|
||||
STACK: Playwright local (Chromium) — SPA pesado, requiere render JS completo
|
||||
|
||||
URL PATTERN:
|
||||
https://www.hudhomestore.gov/searchresult?citystate=FL
|
||||
→ renderiza ~30-50 properties FL después de ~6s de SPA load
|
||||
|
||||
CARD STRUCTURE (DOM):
|
||||
Each property card = <div class="topMap-card card-body col-12 col-md-7 px-2 pl-md-4">
|
||||
Text content (raw, no semantic tags):
|
||||
[optional] "Price Reduced" | "New Listing" (badge)
|
||||
"BIDS OPEN MM/DD/YYYY"
|
||||
"Listing Period: Extended" | "Exclusive" | etc
|
||||
"$XXX,XXX"
|
||||
"<street address>"
|
||||
"<city>, FL, <zip>"
|
||||
"<beds> Beds <baths> Baths <county> County"
|
||||
"Case #: <agency>-<number>" (ej: 093-676572)
|
||||
|
||||
DEAL TYPE: 'reo' (Real Estate Owned — HUD post-foreclosure de loans FHA)
|
||||
|
||||
ANTI-BOT: real Chrome UA. Sin headers especiales adicionales necesarios.
|
||||
|
||||
LISTING PERIOD significance:
|
||||
- "Exclusive": solo Owner-Occupants, nonprofits, gobiernos (primeros 10-30 dias)
|
||||
- "Extended": disponible para investors (post-exclusive)
|
||||
- "Lottery": offer aleatorio por demanda alta
|
||||
- Investor-eligible deals tienen "Extended" o no-period
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Callable, Optional
|
||||
|
||||
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
|
||||
|
||||
# Real Chrome UA — HUD usa Yardi Systems SPA framework, anti-bot leve pero presente
|
||||
_CHROME_UA = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/131.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
# Rate limit: 1 request c/3s al sitio HUD
|
||||
_REQUEST_INTERVAL_SECONDS = 3.0
|
||||
|
||||
SOURCE = "hud_homestore"
|
||||
|
||||
# Tiempo de espera para SPA renderize property cards (descubierto via exploration: 6s seguro)
|
||||
_SPA_RENDER_WAIT_SECONDS = 6.5
|
||||
|
||||
# Deep-link pattern descubierto via probe (B3 bugfix):
|
||||
# /PropertyDetails?caseNumber=XXX rendera el property especifico (verified status 200 + address+price)
|
||||
# Otros patterns devolvieron 404 (no funciona): /Listing/PropertyDetails, /Property/Details, /listing/{case}, etc.
|
||||
_PROPERTY_DETAIL_URL_TEMPLATE = (
|
||||
"https://www.hudhomestore.gov/PropertyDetails?caseNumber={case_number}"
|
||||
)
|
||||
|
||||
|
||||
def build_deep_link(case_number: Optional[str]) -> Optional[str]:
    """Build the HUD property-detail URL for *case_number*.

    The case number is expected in the form 'XXX-XXXXXX' (e.g. '093-676572');
    surrounding whitespace is stripped and the value is inserted into the URL
    as-is, without URL-encoding.

    Returns None when *case_number* is not a string or is blank.
    """
    if not isinstance(case_number, str):
        return None
    trimmed = case_number.strip()
    return _PROPERTY_DETAIL_URL_TEMPLATE.format(case_number=trimmed) if trimmed else None
|
||||
|
||||
|
||||
def _parse_money(s: str) -> Optional[float]:
|
||||
"""Parse '$446,000' → 446000.0"""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.]", "", s)
|
||||
if not cleaned:
|
||||
return None
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _parse_card_text(text: str) -> Optional[dict]:
|
||||
"""
|
||||
Parse the raw text content of a single property card.
|
||||
|
||||
Returns dict with:
|
||||
bid_open_date (ISO YYYY-MM-DD), listing_period, price, street_address,
|
||||
city, state, zip, beds, baths, county, case_number, badges (list).
|
||||
|
||||
Returns None si parse fails.
|
||||
"""
|
||||
if not text or "Case #:" not in text:
|
||||
return None
|
||||
|
||||
out: dict = {}
|
||||
|
||||
# Badges (optional)
|
||||
badges = []
|
||||
for kw in ("New Listing", "Price Reduced", "Extended", "Exclusive", "Lottery"):
|
||||
if kw in text:
|
||||
badges.append(kw)
|
||||
out["badges"] = badges
|
||||
|
||||
# Bid open date
|
||||
m = re.search(r"BIDS OPEN (\d{2}/\d{2}/\d{4})", text)
|
||||
if m:
|
||||
try:
|
||||
d = datetime.strptime(m.group(1), "%m/%d/%Y").date()
|
||||
out["bid_open_date"] = d.isoformat()
|
||||
except ValueError:
|
||||
out["bid_open_date"] = None
|
||||
|
||||
# Listing period
|
||||
m = re.search(r"Listing Period:\s*(\w+)", text)
|
||||
if m:
|
||||
out["listing_period"] = m.group(1)
|
||||
|
||||
# Price
|
||||
m = re.search(r"\$([\d,]+)", text)
|
||||
if m:
|
||||
out["price"] = _parse_money(m.group(0))
|
||||
|
||||
# Case #
|
||||
m = re.search(r"Case #:\s*(\d{3}-\d{6})", text)
|
||||
if m:
|
||||
out["case_number"] = m.group(1)
|
||||
else:
|
||||
return None # Sin case # no es un card valido
|
||||
|
||||
# Address — pattern: "<street>" then "<city>, FL, <zip>" then "<n> Beds <n[.n]> Baths <county> County"
|
||||
# The card text es muy denso, sin tags. Parse via regex multi-line.
|
||||
# Match address block:
|
||||
# Capture lines between "Listing Period" or "$NNN,NNN" and "X Beds"
|
||||
addr_match = re.search(
|
||||
r"\$[\d,]+\s+(.+?)\s+(\d+)\s+Beds\s+([\d.]+)\s+Baths\s+(.+?)\s+County",
|
||||
text, re.DOTALL,
|
||||
)
|
||||
if addr_match:
|
||||
addr_block = addr_match.group(1).strip()
|
||||
# The addr_block has format: "<street>\n<city>, FL, <zip>"
|
||||
# Try to split: last comma-separated part should be zip, before should be "city, state"
|
||||
# Pattern: "<street> <city>, FL, <zip>" or "<street>, <city>, FL, <zip>"
|
||||
zip_m = re.search(r",\s*(FL|F\.L\.)\s*,?\s*(\d{5})", addr_block)
|
||||
if zip_m:
|
||||
out["state"] = "FL"
|
||||
out["zip"] = zip_m.group(2)
|
||||
# Remove the ", FL, zip" suffix to find street + city
|
||||
pre_zip = addr_block[:zip_m.start()].strip().rstrip(",").strip()
|
||||
# Heuristic: last word group before zip is city (often 1-2 words)
|
||||
# Use comma split first
|
||||
if "," in pre_zip:
|
||||
parts = [p.strip() for p in pre_zip.split(",")]
|
||||
out["city"] = parts[-1]
|
||||
out["address_street"] = ", ".join(parts[:-1])
|
||||
else:
|
||||
# No comma — city/street separated by newline (already collapsed). Best effort.
|
||||
# Take last 1-3 words as city, rest as street
|
||||
tokens = pre_zip.split()
|
||||
# FL cities: last 1-3 tokens typically
|
||||
# E.g., "4641 Samoset Dr Sarasota" → street="4641 Samoset Dr", city="Sarasota"
|
||||
# E.g., "8342 N Pine Haven Pt Crystal River" → street="...Pt", city="Crystal River"
|
||||
# Heuristic: city is at most 3 words; if last token looks like a street suffix
|
||||
# (Dr, St, Ave, etc), then the part before is street and we need to be careful
|
||||
street_suffixes = {"DR", "ST", "AVE", "RD", "BLVD", "LN", "WAY", "CT", "PL",
|
||||
"CIR", "TER", "PKWY", "HWY", "TRL", "XING", "PT", "LOOP"}
|
||||
# Find the LAST street suffix; city is what's after
|
||||
for i in range(len(tokens) - 1, -1, -1):
|
||||
if tokens[i].upper().rstrip(".") in street_suffixes:
|
||||
out["address_street"] = " ".join(tokens[:i+1])
|
||||
out["city"] = " ".join(tokens[i+1:])
|
||||
break
|
||||
else:
|
||||
# Fallback: split half
|
||||
half = len(tokens) // 2
|
||||
out["address_street"] = " ".join(tokens[:half])
|
||||
out["city"] = " ".join(tokens[half:])
|
||||
else:
|
||||
# No FL match — store raw
|
||||
out["address_street"] = addr_block
|
||||
|
||||
out["beds"] = int(addr_match.group(2))
|
||||
try:
|
||||
out["baths"] = float(addr_match.group(3))
|
||||
except ValueError:
|
||||
out["baths"] = None
|
||||
out["county"] = addr_match.group(4).strip()
|
||||
|
||||
# Build full address
|
||||
full_addr_parts = []
|
||||
if out.get("address_street"):
|
||||
full_addr_parts.append(out["address_street"])
|
||||
if out.get("city"):
|
||||
full_addr_parts.append(out["city"])
|
||||
if out.get("state"):
|
||||
full_addr_parts.append(out["state"])
|
||||
if out.get("zip"):
|
||||
full_addr_parts.append(out["zip"])
|
||||
if full_addr_parts:
|
||||
out["address"] = ", ".join(full_addr_parts)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _build_deal_record(card_data: dict, state: str) -> dict:
    """Turn a parsed HUD card into a deal record for deals_db.insert_deal.

    Bugfix B3: ``source_url`` is the deep link to the specific property
    (https://www.hudhomestore.gov/PropertyDetails?caseNumber=XXX), not the
    generic search-results URL; it is None when the case number is missing.

    *state* is used only as a fallback when the card itself carries no state.
    """
    case_number = card_data.get("case_number")
    bid_date = card_data.get("bid_open_date")
    listing_period = card_data.get("listing_period")
    badges = card_data.get("badges", [])
    price = card_data.get("price")

    # Description: optional bits first, then the two fixed trailer lines.
    optional_bits = (
        "Badges: " + ", ".join(badges) if badges else "",
        f"Listing Period: {listing_period}" if listing_period else "",
        f"Bids Open: {bid_date}" if bid_date else "",
    )
    desc_bits = [bit for bit in optional_bits if bit]
    desc_bits += [
        f"HUD Case #: {case_number}",
        "Source: HUD Homestore (FHA-default REO)",
    ]
    description = " | ".join(desc_bits)

    return {
        "source": SOURCE,
        "source_url": build_deep_link(case_number),  # deep link per case
        "address": card_data.get("address"),
        "city": card_data.get("city"),
        "state": card_data.get("state") or state,
        "zip": card_data.get("zip"),
        "county": card_data.get("county"),
        "listing_price": price,
        "deal_type": "reo",  # HUD properties are post-foreclosure REO
        "starting_bid": price,  # HUD: list price approximates the bid floor
        "estimated_arv": None,  # not provided by HUD
        "beds": card_data.get("beds"),
        "baths": card_data.get("baths"),
        # year_built and sqft are not on the results card; they would need a
        # detail-page scrape.
        # The HUD "case number" is a tracking ID, not a court case, so it is
        # stored in external_id and case_number stays None.
        "case_number": None,
        "external_id": case_number,
        "auction_date": bid_date,
        "listing_description": description,
    }
|
||||
|
||||
|
||||
def scrape_hud_homestore(
    *,
    states: Optional[list[str]] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape HUD Homestore for the given states (default: FL only).

    Args:
        states: state codes to scrape (default ``["FL"]``).
        status_cb: optional callback that receives progress messages.
        use_cache: read and write the file cache for the search pages.
        cache_ttl_seconds: maximum age of a cached page, in seconds.

    Returns:
        list[dict] of deal records as built by ``_build_deal_record``.

    States whose page cannot be fetched (timeout, non-200 status, any other
    navigation error) are logged through *status_cb* and skipped.
    """
    if states is None:
        states = ["FL"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _search_url(state_code: str) -> str:
        return f"https://www.hudhomestore.gov/searchresult?citystate={state_code}"

    cache_namespace = "hud_homestore"
    deals: list[dict] = []

    # Step 1: cache check per state
    cached_pages: dict[str, str] = {}
    states_to_fetch: list[str] = []
    cache_hits = 0
    for state in states:
        cached = (
            get_cached(cache_namespace, _search_url(state), ttl_seconds=cache_ttl_seconds)
            if use_cache
            else None
        )
        if cached:
            cached_pages[state] = cached
            cache_hits += 1
        else:
            states_to_fetch.append(state)
    _log(f"HUD Homestore: states={states}, cache hits {cache_hits}/{len(states)}, fetching {len(states_to_fetch)}")

    # Step 2: fetch fresh HTML for non-cached states
    fresh_pages: dict[str, str] = {}
    if states_to_fetch:
        # Imported only when a fetch is needed, so a fully cached run does
        # not require Playwright to be installed.
        from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1400, "height": 900},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(30_000)

                # Load the landing page first to set cookies + session.
                # A failure here is logged but does not abort the run.
                try:
                    page.goto("https://www.hudhomestore.gov/", wait_until="networkidle", timeout=30_000)
                    time.sleep(2)
                except Exception as e:
                    _log(f" HUD landing load failed: {e}")

                last_request_at = 0.0
                for state in states_to_fetch:
                    # Rate limit between search-page requests.
                    elapsed = time.time() - last_request_at
                    if elapsed < _REQUEST_INTERVAL_SECONDS:
                        time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
                    last_request_at = time.time()

                    url = _search_url(state)
                    _log(f" Fetching {state}...")
                    try:
                        response = page.goto(url, wait_until="networkidle", timeout=30_000)
                    except PlaywrightTimeout:
                        _log(f" timeout for {state} — skip")
                        continue
                    except Exception as e:
                        _log(f" error for {state}: {e}")
                        continue

                    # goto() can return None; treat that like a bad status.
                    status = response.status if response is not None else None
                    if status != 200:
                        _log(f" HTTP {status} for {state} — skip")
                        continue

                    # Wait extra for the SPA to render the cards.
                    time.sleep(_SPA_RENDER_WAIT_SECONDS)

                    html = page.content()
                    fresh_pages[state] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Close the browser even if something above raised.
                browser.close()

    # Step 3: parse all pages (cached + fresh). Both are rendered HTML, so
    # the same stdlib-based card extraction works for either.
    for state in states:
        html = cached_pages.get(state) or fresh_pages.get(state)
        if not html:
            continue

        cards_text = _extract_card_texts_from_html(html, _log)
        _log(f" {state}: extracted {len(cards_text)} card texts from HTML")

        for card_text in cards_text:
            card = _parse_card_text(card_text)
            if card and card.get("case_number"):
                deal = _build_deal_record(card, state)
                # _build_deal_record stores the HUD case number in
                # external_id and always sets case_number to None, so the
                # "has an identifier" check must read external_id.
                if deal.get("address") or deal.get("external_id"):
                    deals.append(deal)

    _log(f"HUD Homestore: scraped {len(deals)} total deals across {len(states)} states")
    return deals
|
||||
|
||||
|
||||
def _extract_card_texts_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[str]:
|
||||
"""
|
||||
Extract the inner text content of each property card from raw HTML.
|
||||
|
||||
Uses stdlib html parser. Card boundary: <div class="topMap-card card-body ...">.
|
||||
"""
|
||||
from html.parser import HTMLParser
|
||||
|
||||
target_class_marker = "topMap-card"
|
||||
|
||||
class CardExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_card = 0
|
||||
self.depth_when_entered = 0
|
||||
self.current_depth = 0
|
||||
self.text_parts: list[str] = []
|
||||
self.cards_texts: list[str] = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self.current_depth += 1
|
||||
if not self.in_card:
|
||||
# Look for div with class containing topMap-card
|
||||
if tag == "div":
|
||||
for name, val in attrs:
|
||||
if name == "class" and val and target_class_marker in val:
|
||||
self.in_card = 1
|
||||
self.depth_when_entered = self.current_depth
|
||||
self.text_parts = []
|
||||
return
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self.in_card and tag == "div" and self.current_depth == self.depth_when_entered:
|
||||
# Closing tag matches the depth where we entered card
|
||||
text = " ".join("".join(self.text_parts).split()).strip()
|
||||
if text:
|
||||
self.cards_texts.append(text)
|
||||
self.in_card = 0
|
||||
self.depth_when_entered = 0
|
||||
self.text_parts = []
|
||||
self.current_depth -= 1
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_card:
|
||||
self.text_parts.append(data)
|
||||
|
||||
parser = CardExtractor()
|
||||
try:
|
||||
parser.feed(html)
|
||||
except Exception as e:
|
||||
if log_fn:
|
||||
log_fn(f" HTML parse error: {e}")
|
||||
return []
|
||||
|
||||
return parser.cards_texts
|
||||
|
||||
|
||||
def run_scraper_to_db(
    *,
    states: Optional[list[str]] = None,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
) -> dict:
    """Full pipeline: scrape HUD, persist to deals.db, optionally classify.

    Args:
        states: state codes passed through to ``scrape_hud_homestore``.
        auto_classify: classify each newly inserted deal after the insert.
        status_cb: optional callback that receives progress messages.

    Returns:
        A summary dict with ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``.

    Scrape, insert and classify failures are collected into ``errors``
    rather than raised. The run is recorded as 'success' with no errors,
    'partial' with errors but some deals, and 'failed' otherwise.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run

    init_db()

    run_id = record_scraper_run(SOURCE)
    errors: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    try:
        deals = scrape_hud_homestore(states=states, status_cb=status_cb)
    except Exception as e:
        errors.append(f"scrape failed: {e}")
        deals = []

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []

    for deal in deals:
        try:
            deal_id, is_new = insert_deal(deal)
            if is_new:
                deals_new += 1
                new_deal_ids.append(deal_id)
            else:
                deals_updated += 1
        except Exception as e:
            # HUD records keep their identifier in external_id; the
            # case_number field is always None for this source.
            deal_label = deal.get("external_id") or deal.get("case_number")
            errors.append(f"insert fail for {deal_label}: {e}")

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new HUD deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification

        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for {did}: {e}")

    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=0,
        status="success" if not errors else ("partial" if deals else "failed"),
    )

    return {
        "source": SOURCE,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "errors_count": len(errors),
        "errors": errors,
    }
|
||||
@@ -0,0 +1,92 @@
|
||||
"""scrapers/miami_dade_clerk.py — Backward-compat shim.
|
||||
|
||||
ESTE MODULO ES UN ALIAS DE realauction_clerk.py para Miami-Dade.
|
||||
|
||||
La logica generica fue extraida a `realauction_clerk.py` cuando se agregaron
|
||||
soporte para Duval, Broward, Palm Beach, Hillsborough, Orange (todos comparten
|
||||
el mismo white-label platform de realauction.com).
|
||||
|
||||
Mantener este shim mientras:
|
||||
- Tests legacy importen de `scrapers.miami_dade_clerk`
|
||||
- Migration scripts referencien `_parse_cases_from_html`, `_is_status_dead`
|
||||
- El source_id 'miami_dade_clerk' siga en deals.db
|
||||
|
||||
API publica preservada:
|
||||
SOURCE — string 'miami_dade_clerk'
|
||||
scrape_miami_dade_auctions(...) — scrape function
|
||||
run_scraper_to_db(...) — pipeline
|
||||
_parse_cases_from_html(...) — exposed for tests
|
||||
_is_status_dead(...) — exposed for tests
|
||||
_DEAD_STATUS_SUBSTRINGS — exposed for tests
|
||||
_build_deal_record(case, date) — exposed for tests
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable, Optional
|
||||
|
||||
from scrapers.realauction_clerk import (
|
||||
REALAUCTION_COUNTIES,
|
||||
get_county_config,
|
||||
scrape_realauction_county,
|
||||
run_scraper_to_db as _generic_run_scraper_to_db,
|
||||
# Re-exports for backward compat
|
||||
_parse_cases_from_html, # noqa: F401
|
||||
_is_status_dead, # noqa: F401
|
||||
_DEAD_STATUS_SUBSTRINGS, # noqa: F401
|
||||
_extract_case_from_table_rows, # noqa: F401
|
||||
_parse_address, # noqa: F401
|
||||
_parse_money, # noqa: F401
|
||||
_build_description, # noqa: F401
|
||||
)
|
||||
|
||||
# County key looked up in REALAUCTION_COUNTIES and passed to the generic scraper.
_COUNTY = "Miami-Dade"

# Source identifier exposed by this shim.
SOURCE = "miami_dade_clerk"
|
||||
|
||||
|
||||
def scrape_miami_dade_auctions(
    *,
    days_ahead: int = 14,
    days_back: int = 0,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = 86400,
) -> list[dict]:
    """Scrape the Miami-Dade auction calendar.

    Thin wrapper: forwards every argument unchanged to
    ``scrape_realauction_county`` with the county fixed to Miami-Dade.
    """
    delegate_kwargs = {
        "county": _COUNTY,
        "days_ahead": days_ahead,
        "days_back": days_back,
        "status_cb": status_cb,
        "max_dates": max_dates,
        "use_cache": use_cache,
        "cache_ttl_seconds": cache_ttl_seconds,
    }
    return scrape_realauction_county(**delegate_kwargs)
|
||||
|
||||
|
||||
def run_scraper_to_db(
    *,
    days_ahead: int = 14,
    days_back: int = 0,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
) -> dict:
    """Run the full scrape-and-persist pipeline for Miami-Dade.

    Thin wrapper: forwards every argument unchanged to the generic
    ``realauction_clerk.run_scraper_to_db`` with the county fixed to
    Miami-Dade.
    """
    pipeline_kwargs = {
        "county": _COUNTY,
        "days_ahead": days_ahead,
        "days_back": days_back,
        "auto_classify": auto_classify,
        "status_cb": status_cb,
        "max_dates": max_dates,
    }
    return _generic_run_scraper_to_db(**pipeline_kwargs)
|
||||
|
||||
|
||||
def _build_deal_record(case: dict, auction_date_iso: str) -> dict:
    """Backward-compat builder with the original ``(case, date)`` signature.

    The generic builder also takes a county config; this shim supplies the
    Miami-Dade entry of ``REALAUCTION_COUNTIES``. Kept because tests call it
    with two arguments.
    """
    from scrapers import realauction_clerk as _generic

    county_config = REALAUCTION_COUNTIES[_COUNTY]
    return _generic._build_deal_record(case, auction_date_iso, county_config)
|
||||
@@ -0,0 +1,854 @@
|
||||
"""scrapers/realauction_clerk.py — Generic scraper para 5+ Florida counties.
|
||||
|
||||
REALAUCTION.COM WHITE-LABEL PLATFORM:
|
||||
Multiples county clerks usan el mismo SaaS de realforeclose.com con subdominios
|
||||
distintos. Comparten ~95% del HTML structure → un solo scraper sirve para todos.
|
||||
|
||||
Counties soportados (FL):
|
||||
Miami-Dade miamidade.realforeclose.com
|
||||
Duval duval.realforeclose.com
|
||||
Broward broward.realforeclose.com
|
||||
Palm Beach palmbeach.realforeclose.com
|
||||
Hillsborough hillsborough.realforeclose.com
|
||||
Orange orange.realforeclose.com
|
||||
|
||||
URL PATTERN:
|
||||
https://www.{subdomain}.realforeclose.com/index.cfm
|
||||
?zaction=AUCTION
|
||||
&zmethod=PREVIEW
|
||||
&AuctionDate=MM/DD/YYYY
|
||||
|
||||
ESTRUCTURA del page (descubierta via DOM inspection en Miami-Dade, asumida igual
|
||||
en otros county subdomains porque comparten platform):
|
||||
- <div class="AUCTION_ITEM"> wrapper por listing
|
||||
- <div class="ASTAT_MSGB Astat_DATA">{STATUS}</div> con auction status
|
||||
- <table> con rows key/value (Case#, Parcel ID, Property Address, etc.)
|
||||
- Status posibles dead: Redeemed, Canceled per *, Sold, Title Transferred,
|
||||
Withdrawn, Dismissed (parser filtra estos)
|
||||
|
||||
ANTI-BOT: Chrome UA real bypassa el 403 que tira el sitio para UA no-standard.
|
||||
RATE LIMIT: 2.5s entre requests del mismo county.
|
||||
|
||||
USO:
|
||||
from scrapers.realauction_clerk import scrape_realauction_county
|
||||
deals = scrape_realauction_county(county="Duval", days_ahead=7)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Callable, Optional
|
||||
|
||||
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Configuration: county registry
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Subdomain mapping. Si un county no esta aqui, scrape_realauction_county() raises.
|
||||
# IMPORTANT: subdomains validados via DOM inspection. Cuando se agrega un county
|
||||
# nuevo, hacer un curl al URL pattern para confirmar HTTP 200.
|
||||
# Configuracion per (county, platform) tuple. The 'domain' field defaults to
|
||||
# 'realforeclose.com' if omitted (backward compat). For tax deed sales it's
|
||||
# 'realtaxdeed.com' instead. Both share the same Realauction.com platform with
|
||||
# identical HTML structure — only the URL host differs.
|
||||
REALAUCTION_DEFAULT_DOMAIN = "realforeclose.com"
|
||||
|
||||
REALAUCTION_COUNTIES = {
|
||||
# ALL 41 FL counties confirmed working en realforeclose.com (probe 2026-05-14)
|
||||
# Detection: title='RealForeclose- {County} County -Splash Page'
|
||||
# 26 small FL counties NOT en realforeclose (usan plataformas distintas)
|
||||
# ─── Major urban counties ──────────────────────────────────────────────
|
||||
"Miami-Dade": {"subdomain": "miamidade", "source_id": "miami_dade_clerk", "state": "FL", "label": "Miami-Dade"},
|
||||
"Duval": {"subdomain": "duval", "source_id": "duval_clerk", "state": "FL", "label": "Duval (Jacksonville)"},
|
||||
"Broward": {"subdomain": "broward", "source_id": "broward_clerk", "state": "FL", "label": "Broward (Fort Lauderdale)"},
|
||||
"Palm Beach": {"subdomain": "palmbeach", "source_id": "palm_beach_clerk", "state": "FL", "label": "Palm Beach (West Palm)"},
|
||||
"Hillsborough": {"subdomain": "hillsborough","source_id": "hillsborough_clerk", "state": "FL", "label": "Hillsborough (Tampa)"},
|
||||
"Orange": {"subdomain": "orange", "source_id": "orange_clerk", "state": "FL", "label": "Orange (Orlando)"},
|
||||
"Pinellas": {"subdomain": "pinellas", "source_id": "pinellas_clerk", "state": "FL", "label": "Pinellas (St Petersburg)"},
|
||||
"Lee": {"subdomain": "lee", "source_id": "lee_clerk", "state": "FL", "label": "Lee (Fort Myers)"},
|
||||
"Polk": {"subdomain": "polk", "source_id": "polk_clerk", "state": "FL", "label": "Polk (Lakeland)"},
|
||||
"Brevard": {"subdomain": "brevard", "source_id": "brevard_clerk", "state": "FL", "label": "Brevard (Cocoa, Melbourne)"},
|
||||
"Volusia": {"subdomain": "volusia", "source_id": "volusia_clerk", "state": "FL", "label": "Volusia (Daytona Beach)"},
|
||||
"Pasco": {"subdomain": "pasco", "source_id": "pasco_clerk", "state": "FL", "label": "Pasco"},
|
||||
"Sarasota": {"subdomain": "sarasota", "source_id": "sarasota_clerk", "state": "FL", "label": "Sarasota"},
|
||||
"Manatee": {"subdomain": "manatee", "source_id": "manatee_clerk", "state": "FL", "label": "Manatee (Bradenton)"},
|
||||
"Seminole": {"subdomain": "seminole", "source_id": "seminole_clerk", "state": "FL", "label": "Seminole (Sanford)"},
|
||||
# ─── Mid-size counties ─────────────────────────────────────────────────
|
||||
"Marion": {"subdomain": "marion", "source_id": "marion_clerk", "state": "FL", "label": "Marion (Ocala)"},
|
||||
"Lake": {"subdomain": "lake", "source_id": "lake_clerk", "state": "FL", "label": "Lake (Mt Dora, Tavares)"},
|
||||
"Osceola": {"subdomain": "osceola", "source_id": "osceola_clerk", "state": "FL", "label": "Osceola (Kissimmee)"},
|
||||
"Leon": {"subdomain": "leon", "source_id": "leon_clerk", "state": "FL", "label": "Leon (Tallahassee)"},
|
||||
"Alachua": {"subdomain": "alachua", "source_id": "alachua_clerk", "state": "FL", "label": "Alachua (Gainesville)"},
|
||||
"Escambia": {"subdomain": "escambia", "source_id": "escambia_clerk", "state": "FL", "label": "Escambia (Pensacola)"},
|
||||
"Santa Rosa": {"subdomain": "santarosa", "source_id": "santa_rosa_clerk", "state": "FL", "label": "Santa Rosa (Milton)"},
|
||||
"Bay": {"subdomain": "bay", "source_id": "bay_clerk", "state": "FL", "label": "Bay (Panama City)"},
|
||||
"St Lucie": {"subdomain": "stlucie", "source_id": "st_lucie_clerk", "state": "FL", "label": "St Lucie (Port St Lucie)"},
|
||||
"Indian River": {"subdomain": "indianriver", "source_id": "indian_river_clerk", "state": "FL", "label": "Indian River (Vero Beach)"},
|
||||
"Martin": {"subdomain": "martin", "source_id": "martin_clerk", "state": "FL", "label": "Martin (Stuart)"},
|
||||
"Citrus": {"subdomain": "citrus", "source_id": "citrus_clerk", "state": "FL", "label": "Citrus (Crystal River)"},
|
||||
"Charlotte": {"subdomain": "charlotte", "source_id": "charlotte_clerk", "state": "FL", "label": "Charlotte (Port Charlotte)"},
|
||||
"Clay": {"subdomain": "clay", "source_id": "clay_clerk", "state": "FL", "label": "Clay (Green Cove Springs)"},
|
||||
"Nassau": {"subdomain": "nassau", "source_id": "nassau_clerk", "state": "FL", "label": "Nassau (Fernandina Beach)"},
|
||||
"Putnam": {"subdomain": "putnam", "source_id": "putnam_clerk", "state": "FL", "label": "Putnam (Palatka)"},
|
||||
"Flagler": {"subdomain": "flagler", "source_id": "flagler_clerk", "state": "FL", "label": "Flagler (Palm Coast)"},
|
||||
"Walton": {"subdomain": "walton", "source_id": "walton_clerk", "state": "FL", "label": "Walton (DeFuniak Springs)"},
|
||||
"Okeechobee": {"subdomain": "okeechobee", "source_id": "okeechobee_clerk", "state": "FL", "label": "Okeechobee"},
|
||||
# ─── Small rural counties (small auction volume but still on platform) ─
|
||||
"Baker": {"subdomain": "baker", "source_id": "baker_clerk", "state": "FL", "label": "Baker (Macclenny)"},
|
||||
"Calhoun": {"subdomain": "calhoun", "source_id": "calhoun_clerk", "state": "FL", "label": "Calhoun"},
|
||||
"Gilchrist": {"subdomain": "gilchrist", "source_id": "gilchrist_clerk", "state": "FL", "label": "Gilchrist"},
|
||||
"Gulf": {"subdomain": "gulf", "source_id": "gulf_clerk", "state": "FL", "label": "Gulf (Port St Joe)"},
|
||||
"Jackson": {"subdomain": "jackson", "source_id": "jackson_clerk", "state": "FL", "label": "Jackson (Marianna)"},
|
||||
"Suwannee": {"subdomain": "suwannee", "source_id": "suwannee_clerk", "state": "FL", "label": "Suwannee (Live Oak)"},
|
||||
"Washington": {"subdomain": "washington", "source_id": "washington_clerk", "state": "FL", "label": "Washington (Chipley)"},
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# COLORADO — mismo platform (realauction.com is multi-state)
|
||||
# Discovered via https://www.realauction.com/clients
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
"Denver": {"subdomain": "denver", "source_id": "denver_clerk_co", "state": "CO", "label": "Denver County, CO"},
|
||||
"Eagle": {"subdomain": "eagle", "source_id": "eagle_clerk_co", "state": "CO", "label": "Eagle County, CO"},
|
||||
"El Paso": {"subdomain": "elpasoco", "source_id": "el_paso_clerk_co", "state": "CO", "label": "El Paso County, CO (Colorado Springs)"},
|
||||
"Larimer": {"subdomain": "larimer", "source_id": "larimer_clerk_co", "state": "CO", "label": "Larimer County, CO (Fort Collins)"},
|
||||
"Mesa": {"subdomain": "mesa", "source_id": "mesa_clerk_co", "state": "CO", "label": "Mesa County, CO (Grand Junction)"},
|
||||
"Summit": {"subdomain": "summit", "source_id": "summit_clerk_co", "state": "CO", "label": "Summit County, CO (Breckenridge)"},
|
||||
"Weld": {"subdomain": "weld", "source_id": "weld_clerk_co", "state": "CO", "label": "Weld County, CO (Greeley)"},
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# TAX DEED SALES via .realtaxdeed.com (same engine, diff domain)
|
||||
# 17 portales: 3 AZ + 14 FL. SAME HTML structure as realforeclose.com.
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# Arizona
|
||||
"Apache TD": {"subdomain": "apache", "domain": "realtaxdeed.com", "source_id": "apache_taxdeed_az", "state": "AZ", "label": "Apache County, AZ — Tax Deed"},
|
||||
"Coconino TD": {"subdomain": "coconino", "domain": "realtaxdeed.com", "source_id": "coconino_taxdeed_az", "state": "AZ", "label": "Coconino County, AZ — Tax Deed (Flagstaff)"},
|
||||
"Mohave TD": {"subdomain": "mohave", "domain": "realtaxdeed.com", "source_id": "mohave_taxdeed_az", "state": "AZ", "label": "Mohave County, AZ — Tax Deed (Kingman)"},
|
||||
# Florida tax deed (distinct from foreclosure entries above)
|
||||
"Alachua TD": {"subdomain": "alachua", "domain": "realtaxdeed.com", "source_id": "alachua_taxdeed", "state": "FL", "label": "Alachua County — Tax Deed"},
|
||||
"Baker TD": {"subdomain": "baker", "domain": "realtaxdeed.com", "source_id": "baker_taxdeed", "state": "FL", "label": "Baker County — Tax Deed"},
|
||||
"Bay TD": {"subdomain": "bay", "domain": "realtaxdeed.com", "source_id": "bay_taxdeed", "state": "FL", "label": "Bay County — Tax Deed"},
|
||||
"Citrus TD": {"subdomain": "citrus", "domain": "realtaxdeed.com", "source_id": "citrus_taxdeed", "state": "FL", "label": "Citrus County — Tax Deed"},
|
||||
"Clay TD": {"subdomain": "clay", "domain": "realtaxdeed.com", "source_id": "clay_taxdeed", "state": "FL", "label": "Clay County — Tax Deed"},
|
||||
"Duval TD": {"subdomain": "duval", "domain": "realtaxdeed.com", "source_id": "duval_taxdeed", "state": "FL", "label": "Duval County — Tax Deed (Jacksonville)"},
|
||||
"Escambia TD": {"subdomain": "escambia", "domain": "realtaxdeed.com", "source_id": "escambia_taxdeed", "state": "FL", "label": "Escambia County — Tax Deed (Pensacola)"},
|
||||
"Flagler TD": {"subdomain": "flagler", "domain": "realtaxdeed.com", "source_id": "flagler_taxdeed", "state": "FL", "label": "Flagler County — Tax Deed"},
|
||||
"Gilchrist TD": {"subdomain": "gilchrist", "domain": "realtaxdeed.com", "source_id": "gilchrist_taxdeed", "state": "FL", "label": "Gilchrist County — Tax Deed"},
|
||||
"Gulf TD": {"subdomain": "gulf", "domain": "realtaxdeed.com", "source_id": "gulf_taxdeed", "state": "FL", "label": "Gulf County — Tax Deed"},
|
||||
"Hendry TD": {"subdomain": "hendry", "domain": "realtaxdeed.com", "source_id": "hendry_taxdeed", "state": "FL", "label": "Hendry County — Tax Deed"},
|
||||
"Hernando TD": {"subdomain": "hernando", "domain": "realtaxdeed.com", "source_id": "hernando_taxdeed", "state": "FL", "label": "Hernando County — Tax Deed"},
|
||||
"Highlands TD": {"subdomain": "highlands", "domain": "realtaxdeed.com", "source_id": "highlands_taxdeed", "state": "FL", "label": "Highlands County — Tax Deed (Sebring)"},
|
||||
"Hillsborough TD": {"subdomain": "hillsborough", "domain": "realtaxdeed.com", "source_id": "hillsborough_taxdeed", "state": "FL", "label": "Hillsborough County — Tax Deed (Tampa)"},
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# COLORADO TAX DEEDS via .treasurersdeedsale.com (same engine again)
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
"Adams TD": {"subdomain": "adams", "domain": "treasurersdeedsale.com", "source_id": "adams_taxdeed_co", "state": "CO", "label": "Adams County, CO — Treasurer's Deed"},
|
||||
"Denver TD": {"subdomain": "denver", "domain": "treasurersdeedsale.com", "source_id": "denver_taxdeed_co", "state": "CO", "label": "Denver County, CO — Treasurer's Deed"},
|
||||
"Eagle TD": {"subdomain": "eagle", "domain": "treasurersdeedsale.com", "source_id": "eagle_taxdeed_co", "state": "CO", "label": "Eagle County, CO — Treasurer's Deed"},
|
||||
"El Paso TD": {"subdomain": "elpasoco", "domain": "treasurersdeedsale.com", "source_id": "el_paso_taxdeed_co", "state": "CO", "label": "El Paso County, CO — Treasurer's Deed"},
|
||||
"Larimer TD": {"subdomain": "larimer", "domain": "treasurersdeedsale.com", "source_id": "larimer_taxdeed_co", "state": "CO", "label": "Larimer County, CO — Treasurer's Deed"},
|
||||
"Mesa TD": {"subdomain": "mesa", "domain": "treasurersdeedsale.com", "source_id": "mesa_taxdeed_co", "state": "CO", "label": "Mesa County, CO — Treasurer's Deed"},
|
||||
"Pitkin TD": {"subdomain": "pitkin", "domain": "treasurersdeedsale.com", "source_id": "pitkin_taxdeed_co", "state": "CO", "label": "Pitkin County, CO — Treasurer's Deed (Aspen)"},
|
||||
"Weld TD": {"subdomain": "weld", "domain": "treasurersdeedsale.com", "source_id": "weld_taxdeed_co", "state": "CO", "label": "Weld County, CO — Treasurer's Deed"},
|
||||
|
||||
# ─── Counties NOT en realforeclose.com (parking only — usan plataformas
|
||||
# distintas): Bradford, Collier, Columbia, DeSoto, Dixie, Franklin,
|
||||
# Gadsden, Glades, Hamilton, Hardee, Hendry, Hernando, Highlands,
|
||||
# Holmes, Jefferson, Lafayette, Levy, Liberty, Madison, Monroe,
|
||||
# Okaloosa, St. Johns, Sumter, Taylor, Union, Wakulla.
|
||||
# ─── Other Realauction platforms observed:
# - .realtaxdeed.com (Apache/Coconino/Mohave AZ, FL tax deed sales) — already registered above via the 'domain' field
# - .treasurersdeedsale.com (CO tax deed sales — Adams, Pitkin, etc.) — already registered above via the 'domain' field
# - california.taxdefaultsale.com (CA Fresno) — TODO add support
|
||||
}
|
||||
|
||||
|
||||
def get_county_config(county: str) -> dict:
    """Look up the Realauction registry entry for ``county``.

    Args:
        county: registry key, e.g. "Duval" or "Duval TD".

    Returns:
        The config dict stored in REALAUCTION_COUNTIES for that key.

    Raises:
        ValueError: if the key is missing (or maps to an empty config); the
            message lists every supported key.
    """
    entry = REALAUCTION_COUNTIES.get(county)
    if entry:
        return entry

    supported = ", ".join(REALAUCTION_COUNTIES.keys())
    raise ValueError(
        f"County '{county}' no soportado por realauction_clerk. "
        f"Validos: {supported}"
    )
|
||||
|
||||
|
||||
# Real desktop-Chrome User-Agent string. Per the module docstring, the site
# answers 403 to non-standard UAs, so requests present themselves as Chrome.
_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Per-domain rate limit: at most one request every 2.5 s, to avoid hammering the site.
_REQUEST_INTERVAL_SECONDS = 2.5

# Maps the clerk's "Auction Type" value (upper-cased) → our canonical deal_type.
_AUCTION_TYPE_MAP = {
    "FORECLOSURE": "foreclosure",
    "TAXDEED": "tax_deed",
    "TAX DEED": "tax_deed",
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Parsing helpers (county-agnostic)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _parse_money(s: str) -> Optional[float]:
|
||||
"""Parse '$353,041.78' → 353041.78. Return None si invalido."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.]", "", s)
|
||||
if not cleaned or cleaned == ".":
|
||||
return None
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _parse_address(line1: str, line2: str) -> dict:
|
||||
"""Parse property address.
|
||||
|
||||
line1 = "7355 POINCIANA CT" (street)
|
||||
line2 = "MIAMI LAKES, FL- 33014" (city, state-zip)
|
||||
"""
|
||||
out = {"address": None, "city": None, "state": None, "zip": None}
|
||||
line1 = (line1 or "").strip()
|
||||
line2 = (line2 or "").strip()
|
||||
|
||||
line2_clean = re.sub(r"\bFL-\s*", "FL ", line2).strip()
|
||||
if line1 and line2_clean:
|
||||
out["address"] = f"{line1}, {line2_clean}"
|
||||
elif line1:
|
||||
out["address"] = line1
|
||||
elif line2_clean:
|
||||
out["address"] = line2_clean
|
||||
|
||||
if line2:
|
||||
m = re.match(r"^(.+?),\s*([A-Z]{2})[-\s]\s*(\d{5})(?:-\d{4})?", line2)
|
||||
if m:
|
||||
out["city"] = m.group(1).title()
|
||||
out["state"] = m.group(2)
|
||||
out["zip"] = m.group(3)
|
||||
else:
|
||||
m2 = re.search(r"\b([A-Z]{2})[-\s]\s*(\d{5})", line2)
|
||||
if m2:
|
||||
out["state"] = m2.group(1)
|
||||
out["zip"] = m2.group(2)
|
||||
out["city"] = line2.split(",")[0].strip().title() if "," in line2 else None
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _extract_case_from_table_rows(rows: list[list[str]]) -> Optional[dict]:
|
||||
"""Dado las rows de una tabla case, extrae el case dict.
|
||||
Returns None si no es una tabla de case valida.
|
||||
"""
|
||||
fields: dict[str, str] = {}
|
||||
addr_line2: Optional[str] = None
|
||||
next_row_is_addr_line2 = False
|
||||
|
||||
_ADDR_LINE2_DISALLOWED_KEYWORDS = (
|
||||
"Assessed Value", "Plaintiff Max Bid", "Auction Type", "Case #",
|
||||
"Certificate #", "Final Judgment", "Opening Bid", "Parcel ID",
|
||||
)
|
||||
|
||||
for row in rows:
|
||||
non_empty = [c for c in row if c]
|
||||
if not non_empty:
|
||||
continue
|
||||
|
||||
if next_row_is_addr_line2 and len(non_empty) >= 1:
|
||||
candidate = non_empty[0].strip()
|
||||
looks_like_addr = not any(
|
||||
kw in candidate for kw in _ADDR_LINE2_DISALLOWED_KEYWORDS
|
||||
)
|
||||
if looks_like_addr:
|
||||
addr_line2 = candidate
|
||||
next_row_is_addr_line2 = False
|
||||
if looks_like_addr:
|
||||
continue
|
||||
|
||||
if len(non_empty) >= 2:
|
||||
key = non_empty[0].rstrip(":").strip()
|
||||
value = non_empty[1].strip()
|
||||
fields[key] = value
|
||||
if key == "Property Address":
|
||||
if value:
|
||||
next_row_is_addr_line2 = False
|
||||
else:
|
||||
next_row_is_addr_line2 = True
|
||||
elif len(non_empty) == 1:
|
||||
if next_row_is_addr_line2:
|
||||
candidate = non_empty[0].strip()
|
||||
if not any(kw in candidate for kw in _ADDR_LINE2_DISALLOWED_KEYWORDS):
|
||||
addr_line2 = candidate
|
||||
next_row_is_addr_line2 = False
|
||||
|
||||
if not fields.get("Case #"):
|
||||
return None
|
||||
|
||||
# Auction Type explicit (Miami-Dade/Duval/Broward style) → source of truth.
|
||||
# Si NO esta (Orange style — solo AD_LBL/AD_DTA divs sin Auction Type field),
|
||||
# inferir desde case_number con fallback a foreclosure (mas comun):
|
||||
# - "TD" en case# o "TAXDEED" → tax_deed
|
||||
# - default → foreclosure (clerks listan mayoria foreclosure)
|
||||
auction_type_raw = (fields.get("Auction Type") or "").upper().strip()
|
||||
if auction_type_raw:
|
||||
deal_type = _AUCTION_TYPE_MAP.get(auction_type_raw)
|
||||
if not deal_type:
|
||||
return None # Explicit pero unknown — skip
|
||||
else:
|
||||
case_num_upper = (fields.get("Case #") or "").upper()
|
||||
if "TAXDEED" in case_num_upper or "-TD-" in case_num_upper:
|
||||
deal_type = "tax_deed"
|
||||
auction_type_raw = "TAXDEED (inferred)"
|
||||
else:
|
||||
deal_type = "foreclosure"
|
||||
auction_type_raw = "FORECLOSURE (inferred)"
|
||||
|
||||
case = {
|
||||
"deal_type": deal_type,
|
||||
"case_number": fields.get("Case #"),
|
||||
"auction_type_raw": auction_type_raw,
|
||||
}
|
||||
|
||||
if fields.get("Certificate #"):
|
||||
case["certificate_number"] = fields["Certificate #"]
|
||||
if fields.get("Final Judgment Amount"):
|
||||
case["final_judgment_amount"] = _parse_money(fields["Final Judgment Amount"])
|
||||
if fields.get("Opening Bid"):
|
||||
case["starting_bid"] = _parse_money(fields["Opening Bid"])
|
||||
if fields.get("Parcel ID"):
|
||||
case["parcel_id"] = fields["Parcel ID"]
|
||||
if fields.get("Assessed Value"):
|
||||
case["assessed_value"] = _parse_money(fields["Assessed Value"])
|
||||
if fields.get("Plaintiff Max Bid"):
|
||||
case["plaintiff_max_bid_raw"] = fields["Plaintiff Max Bid"]
|
||||
|
||||
addr_parts = _parse_address(fields.get("Property Address", ""), addr_line2 or "")
|
||||
case.update(addr_parts)
|
||||
|
||||
return case
|
||||
|
||||
|
||||
def _build_description(case: dict) -> str:
|
||||
"""Compact text description from case facts — useful for DealClassifier context."""
|
||||
bits = []
|
||||
status = case.get("auction_status")
|
||||
if status:
|
||||
bits.append(f"Status: {status}")
|
||||
if case.get("auction_type_raw"):
|
||||
bits.append(f"Auction Type: {case['auction_type_raw']}")
|
||||
if case.get("certificate_number"):
|
||||
bits.append(f"Tax Cert #: {case['certificate_number']}")
|
||||
if case.get("final_judgment_amount"):
|
||||
bits.append(f"Final Judgment Amount: ${case['final_judgment_amount']:,.2f}")
|
||||
if case.get("starting_bid"):
|
||||
bits.append(f"Opening Bid: ${case['starting_bid']:,.2f}")
|
||||
if case.get("assessed_value"):
|
||||
bits.append(f"Assessed Value (PA): ${case['assessed_value']:,.2f}")
|
||||
if case.get("parcel_id"):
|
||||
bits.append(f"Parcel ID: {case['parcel_id']}")
|
||||
if case.get("plaintiff_max_bid_raw"):
|
||||
bits.append(f"Plaintiff Max Bid: {case['plaintiff_max_bid_raw']}")
|
||||
return " | ".join(bits)
|
||||
|
||||
|
||||
def _county_name_from_label(label: str) -> str:
    """Reduce a registry display label to the bare county name.

    "Duval (Jacksonville)"                    → "Duval"
    "El Paso County, CO (Colorado Springs)"   → "El Paso"
    "Alachua County — Tax Deed"               → "Alachua"
    """
    name = label
    # Strip, in order: "(City)" hints, "— Tax Deed"-style suffixes, ", ST" tags.
    for separator in (" (", " — ", ", "):
        name = name.split(separator)[0]
    name = name.strip()
    if name.endswith(" County"):
        name = name[: -len(" County")]
    return name


def _build_deal_record(case: dict, auction_date_iso: str, county_config: dict) -> dict:
    """Convert a clerk case dict into a deal record for deals_db.insert_deal.

    Pricing rules (inherited from the Miami-Dade v1.1 fix):
      - tax_deed:    listing_price = starting_bid
      - foreclosure: listing_price = None (bid hidden pre-auction)
      - final_judgment_amount is stored separately (NOT confused with
        listing_price)

    Args:
        case: dict produced by the case extractor (deal_type, case_number,
            address parts, amounts, ...).
        auction_date_iso: auction date as "YYYY-MM-DD"; it is re-sliced into
            MM/DD/YYYY for the source URL.
        county_config: registry entry with "subdomain", "source_id", "state",
            "label" and optionally "domain".

    Returns:
        The deal record dict. ``county`` holds the bare county name derived
        from the label, and ``state`` falls back to the registry state when
        the address did not yield one.
    """
    deal_type = case.get("deal_type")
    starting_bid = case.get("starting_bid")
    assessed_value = case.get("assessed_value")
    final_judgment = case.get("final_judgment_amount")

    if deal_type == "tax_deed":
        listing_price = starting_bid
    elif deal_type == "foreclosure":
        listing_price = None
    else:
        listing_price = starting_bid or assessed_value

    subdomain = county_config["subdomain"]
    domain = county_config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    source_id = county_config["source_id"]
    state = county_config["state"]
    # Fix: splitting on " (" alone left labels such as "Denver County, CO" or
    # "Alachua County — Tax Deed" in the county field; normalize every label
    # shape used by the registry to the bare county name.
    county_label = _county_name_from_label(county_config["label"])

    deal = {
        "source": source_id,
        "source_url": (
            f"https://{subdomain}.{domain}/index.cfm"
            f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate="
            f"{auction_date_iso[5:7]}/{auction_date_iso[8:10]}/{auction_date_iso[0:4]}"
        ),
        "address": case.get("address"),
        "city": case.get("city"),
        "state": case.get("state") or state,
        "zip": case.get("zip"),
        "county": county_label,
        "parcel_id": case.get("parcel_id"),
        "listing_price": listing_price,
        "deal_type": deal_type,
        "starting_bid": starting_bid,
        # The assessed value is what gets stored as estimated_arv.
        "estimated_arv": assessed_value,
        "final_judgment_amount": final_judgment,
        "auction_status": case.get("auction_status") or "scheduled",
        "case_number": case.get("case_number"),
        "auction_date": auction_date_iso,
        "listing_description": _build_description(case),
    }
    return deal
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Status filtering (REDEEMED/CANCELED bug fix)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Cases con estos statuses NO van a auction → NO incluir en results.
|
||||
# Substring matching: "Canceled per Bankruptcy" → dead (contains "canceled").
|
||||
_DEAD_STATUS_SUBSTRINGS = (
|
||||
"redeemed",
|
||||
"canceled",
|
||||
"cancelled",
|
||||
"sold",
|
||||
"closed", # case closed/disposed
|
||||
"title transferred",
|
||||
"withdrawn",
|
||||
"dismissed",
|
||||
)
|
||||
|
||||
|
||||
def _is_status_dead(status: Optional[str]) -> bool:
|
||||
"""Returns True si el case esta inactivo (off-market)."""
|
||||
if not status:
|
||||
return False
|
||||
s = status.strip().lower()
|
||||
if not s:
|
||||
return False
|
||||
return any(dead in s for dead in _DEAD_STATUS_SUBSTRINGS)
|
||||
|
||||
|
||||
def _parse_cases_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[dict]:
    """Parse all auction cases from a Realforeclose page HTML using stdlib only.

    The page is walked with an ``html.parser.HTMLParser`` subclass that
    collects, for every ``<div class="AUCTION_ITEM">``, the auction status
    text plus the rows of each table (and any AD_LBL/AD_DTA div pairs) found
    inside it. Each collected item is then turned into a case dict via
    ``_extract_case_from_table_rows``.

    Args:
        html: raw page HTML.
        log_fn: optional callback that receives diagnostic messages.

    Returns:
        One case dict per live auction item, de-duplicated by ``case_number``
        within the page, each with ``auction_status`` set. Items whose status
        is dead per ``_is_status_dead`` are dropped. Returns ``[]`` if the
        HTML parser raises.
    """
    from html.parser import HTMLParser

    class AuctionItemParser(HTMLParser):
        """Event-driven collector of per-item status text and table rows."""

        def __init__(self):
            super().__init__()
            # Finished items: {"status": str, "tables": list of row lists}.
            self.items: list[dict] = []
            # Count of currently open AUCTION_ITEM divs (0 = outside any item).
            self.auction_item_depth = 0
            # div_depth value at which the current outermost item was opened.
            self.current_item_depth_start = None
            # Running count of open <div> tags.
            self.div_depth = 0
            # Not read anywhere in this parser.
            self.status_label_active = False
            # Set once an ASTAT_MSGA div reading "auction status" has closed;
            # the next ASTAT_MSGB div then supplies the status value.
            self.expecting_status_value = False
            self.in_astat_msga = False
            self.in_astat_msgb = False
            self.astat_msga_text = ""
            self.astat_msgb_text = ""
            # Count of open <table> tags inside an item.
            self.in_table = 0
            self.current_table: list[list[str]] = []
            self.current_row: list[str] = []
            # Count of open <td>/<th> tags.
            self.in_cell = 0
            self.cell_text_parts: list[str] = []
            self.current_status = ""
            self.current_tables: list[list[list[str]]] = []
            # Orange-style div-based fields: <div class="AD_LBL">Label:</div>
            # <div class="AD_DTA">Value</div>
            # These are collected as synthetic [Label, Value] rows so the
            # downstream extractor works unchanged.
            self.in_ad_lbl = False
            self.in_ad_dta = False
            self.ad_lbl_text = ""
            self.ad_dta_text = ""
            self.last_ad_lbl: Optional[str] = None
            self.current_ad_rows: list[list[str]] = []

        def handle_starttag(self, tag, attrs):
            attrs_d = dict(attrs)
            classes = (attrs_d.get("class") or "").split()

            if tag == "div":
                self.div_depth += 1
                if "AUCTION_ITEM" in classes:
                    if self.auction_item_depth == 0:
                        # Entering a new outermost item: reset per-item state.
                        self.current_item_depth_start = self.div_depth
                        self.current_status = ""
                        self.current_tables = []
                        self.current_ad_rows = []
                        self.last_ad_lbl = None
                    # NOTE(review): indentation was lost in this copy of the
                    # file; the increment is assumed to run for nested
                    # AUCTION_ITEM divs as well — confirm against the original.
                    self.auction_item_depth += 1
                elif self.auction_item_depth:
                    # Inside an item: start capturing text for the marker divs.
                    if "ASTAT_MSGA" in classes:
                        self.in_astat_msga = True
                        self.astat_msga_text = ""
                    elif "ASTAT_MSGB" in classes:
                        self.in_astat_msgb = True
                        self.astat_msgb_text = ""
                    elif "AD_LBL" in classes:
                        self.in_ad_lbl = True
                        self.ad_lbl_text = ""
                    elif "AD_DTA" in classes:
                        self.in_ad_dta = True
                        self.ad_dta_text = ""
            elif tag == "table" and self.auction_item_depth:
                self.in_table += 1
                # A nested <table> restarts the row buffer.
                self.current_table = []
            elif tag == "tr" and self.in_table:
                self.current_row = []
            elif tag in ("td", "th") and self.in_table:
                self.in_cell += 1
                self.cell_text_parts = []

        def handle_endtag(self, tag):
            if tag == "div":
                # Any closing </div> ends whichever capture flag is active
                # (first match in this chain), not necessarily the div that
                # opened it.
                if self.in_astat_msga:
                    self.in_astat_msga = False
                    if "auction status" in self.astat_msga_text.strip().lower():
                        self.expecting_status_value = True
                elif self.in_astat_msgb:
                    self.in_astat_msgb = False
                    if self.expecting_status_value and self.auction_item_depth:
                        self.current_status = self.astat_msgb_text.strip()
                        self.expecting_status_value = False
                elif self.in_ad_lbl:
                    self.in_ad_lbl = False
                    self.last_ad_lbl = self.ad_lbl_text.strip()
                elif self.in_ad_dta:
                    self.in_ad_dta = False
                    # Pair with most recent AD_LBL if any
                    if self.last_ad_lbl and self.auction_item_depth:
                        label = self.last_ad_lbl
                        value = " ".join(self.ad_dta_text.split()).strip()
                        self.current_ad_rows.append([label, value])
                    self.last_ad_lbl = None
                # This </div> balances the div that opened the current item.
                if self.auction_item_depth and self.div_depth == self.current_item_depth_start:
                    self.auction_item_depth -= 1
                    if self.auction_item_depth == 0:
                        # If the item had AD_LBL/AD_DTA pairs (Orange-style),
                        # add them as a synthetic table so the downstream
                        # extractor works.
                        tables = list(self.current_tables)
                        if self.current_ad_rows:
                            tables.append(self.current_ad_rows)
                        self.items.append({
                            "status": self.current_status,
                            "tables": tables,
                        })
                        self.current_status = ""
                        self.current_tables = []
                        self.current_ad_rows = []
                        self.current_item_depth_start = None
                elif self.auction_item_depth:
                    pass
                self.div_depth -= 1
            elif tag in ("td", "th") and self.in_cell:
                self.in_cell -= 1
                # Collapse all internal whitespace of the cell to single spaces.
                text = " ".join("".join(self.cell_text_parts).split()).strip()
                self.current_row.append(text)
                self.cell_text_parts = []
            elif tag == "tr" and self.in_table:
                if self.current_row:
                    self.current_table.append(self.current_row)
                self.current_row = []
            elif tag == "table":
                if self.in_table:
                    self.in_table -= 1
                    if self.current_table and self.auction_item_depth:
                        self.current_tables.append(self.current_table)
                    self.current_table = []

        def handle_data(self, data):
            # Route text to whichever capture is active; table cells take
            # priority over the marker divs.
            if self.in_cell:
                self.cell_text_parts.append(data)
            elif self.in_astat_msga:
                self.astat_msga_text += data
            elif self.in_astat_msgb:
                self.astat_msgb_text += data
            elif self.in_ad_lbl:
                self.ad_lbl_text += data
            elif self.in_ad_dta:
                self.ad_dta_text += data

    parser = AuctionItemParser()
    try:
        parser.feed(html)
    except Exception as e:
        # Best effort: a parser failure yields no cases rather than raising.
        if log_fn:
            log_fn(f" HTML parser error: {e}")
        return []

    cases_on_page: list[dict] = []
    skipped_dead = 0
    for item in parser.items:
        status = item["status"] or ""
        if _is_status_dead(status):
            skipped_dead += 1
            continue
        for rows in item["tables"]:
            try:
                case = _extract_case_from_table_rows(rows)
                if case and case.get("case_number"):
                    case["auction_status"] = status or "scheduled"
                    # Skip case numbers already collected from this page.
                    if not any(c.get("case_number") == case["case_number"] for c in cases_on_page):
                        cases_on_page.append(case)
                    # NOTE(review): break is assumed to sit at this level
                    # (first table yielding a case wins for the item);
                    # indentation was lost in this copy — confirm.
                    break
            except Exception as e:
                if log_fn:
                    log_fn(f" table parse error: {e}")

    if skipped_dead and log_fn:
        log_fn(f" filtered {skipped_dead} dead case(s) (Redeemed/Canceled/Sold/etc)")

    return cases_on_page
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Main scraper entry point
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def scrape_realauction_county(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape the realauction.com auction calendar for one county.

    One "preview" page is requested per calendar day in the window
    ``[today - days_back, today + days_ahead]``. The work happens in three
    passes: (1) look every day up in the file cache, (2) fetch the missing days
    with a headless Playwright Chromium session, throttled by
    ``_REQUEST_INTERVAL_SECONDS``, (3) parse every page that is available and
    turn its cases into deal records.

    A day whose page cannot be fetched (timeout, navigation error, missing or
    non-200 response, page content unavailable) is logged and skipped; it never
    aborts the run.

    Args:
        county: County name (e.g. "Miami-Dade", "Duval", "Broward"). It is
            resolved through ``get_county_config``; per the original notes it
            must be listed in REALAUCTION_COUNTIES.
        days_ahead: Number of days after today to include (default 14).
        days_back: Number of days before today to include (default 0).
        status_cb: Optional callback that receives progress messages.
        max_dates: If truthy, only the first ``max_dates`` days of the window
            are scraped (meant for testing).
        use_cache: Read from and write to the scraper file cache (default True).
        cache_ttl_seconds: TTL passed to the cache for both reads and writes.

    Returns:
        list[dict] of deal records, as produced by ``_build_deal_record``,
        ready for ``deals_db.insert_deal``.
    """
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    config = get_county_config(county)
    subdomain = config["subdomain"]
    domain = config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    cache_namespace = config["source_id"]  # e.g. "duval_clerk"
    date_fmt = "%m/%d/%Y"

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _preview_url(day) -> str:
        # Single place that builds the per-day URL. The URL doubles as the
        # cache key, so its format must stay byte-stable.
        return (
            f"https://{subdomain}.{domain}/index.cfm"
            f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate={day.strftime(date_fmt)}"
        )

    # NOTE(review): "today" is the UTC date while the browser context below is
    # configured for America/New_York; late in the US evening the window starts
    # one day ahead of local time — confirm this is intended.
    today = datetime.now(timezone.utc).date()
    dates_to_scrape = [
        today + timedelta(days=offset)
        for offset in range(-days_back, days_ahead + 1)
    ]
    if max_dates:
        dates_to_scrape = dates_to_scrape[:max_dates]

    _log(f"{config['label']} Clerk: scraping {len(dates_to_scrape)} dates (cache={'ON' if use_cache else 'OFF'})")

    # ── Pass 1: serve whatever we can from the cache ────────────────────────
    cached_pages: dict[str, str] = {}
    dates_needing_fetch: list = []
    for day in dates_to_scrape:
        url = _preview_url(day)
        if use_cache:
            cached = get_cached(cache_namespace, url, ttl_seconds=cache_ttl_seconds)
            if cached:
                cached_pages[url] = cached
                continue
        dates_needing_fetch.append(day)
    _log(f" cache hits: {len(cached_pages)}/{len(dates_to_scrape)}; need to fetch {len(dates_needing_fetch)}")

    # ── Pass 2: fetch the remaining days with Playwright ────────────────────
    fresh_pages: dict[str, str] = {}
    if dates_needing_fetch:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1280, "height": 800},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(20_000)

                last_request_at = 0.0
                for day in dates_needing_fetch:
                    # Throttle: keep the minimum interval between requests.
                    wait = _REQUEST_INTERVAL_SECONDS - (time.time() - last_request_at)
                    if wait > 0:
                        time.sleep(wait)
                    last_request_at = time.time()

                    url = _preview_url(day)
                    _log(f" Fetching {day.strftime(date_fmt)}...")

                    try:
                        response = page.goto(url, wait_until="networkidle", timeout=20_000)
                    except PlaywrightTimeout:
                        _log(" timeout — skipping")
                        continue
                    except Exception as e:
                        _log(f" error {e} — skipping")
                        continue

                    # page.goto() may legitimately return None; treat it as a
                    # failed fetch instead of tripping on `.status`.
                    if response is None:
                        _log(" no response — skipping")
                        continue
                    if response.status != 200:
                        _log(f" HTTP {response.status} — skipping")
                        continue

                    # Fixed settle delay kept from the original implementation.
                    time.sleep(1.5)

                    try:
                        html = page.content()
                    except Exception as e:
                        # One unreadable page must not discard the other days.
                        _log(f" error {e} — skipping")
                        continue

                    fresh_pages[url] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Always release the browser, even if something above raised.
                browser.close()

    # ── Pass 3: parse every available page into deal records ────────────────
    deals: list[dict] = []
    for day in dates_to_scrape:
        date_str = day.strftime(date_fmt)
        auction_date_iso = day.isoformat()
        url = _preview_url(day)
        html = cached_pages.get(url) or fresh_pages.get(url)
        if not html:
            continue

        cases_on_page = _parse_cases_from_html(html, _log)
        _log(f" {date_str}: parsed {len(cases_on_page)} case(s)")

        for case in cases_on_page:
            deal = _build_deal_record(case, auction_date_iso, config)
            # Drop records that carry neither an address nor a price.
            if not deal.get("address") and not deal.get("listing_price"):
                continue
            deals.append(deal)

    _log(f"{config['label']} Clerk: scraped {len(deals)} total deals")
    return deals
|
||||
|
||||
|
||||
def run_scraper_to_db(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
) -> dict:
    """Run the full pipeline for one county: scrape, persist, optionally classify.

    A scraper run is recorded before scraping and finished at the end with the
    collected metrics. Failures while scraping, inserting or classifying are
    collected as strings instead of being raised.

    Args:
        county: County name, resolved through ``get_county_config``.
        days_ahead: Forwarded to ``scrape_realauction_county``.
        days_back: Forwarded to ``scrape_realauction_county``.
        auto_classify: When True, newly inserted deals are classified.
        status_cb: Optional callback that receives progress messages.
        max_dates: Forwarded to ``scrape_realauction_county``.

    Returns:
        Summary dict with keys ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run

    init_db()

    source_id = get_county_config(county)["source_id"]
    run_id = record_scraper_run(source_id)
    problems: list[str] = []

    # 1) Scrape. A failure here leaves us with no deals but still closes the run.
    try:
        deals = scrape_realauction_county(
            county=county,
            days_ahead=days_ahead,
            days_back=days_back,
            status_cb=status_cb,
            max_dates=max_dates,
        )
    except Exception as exc:
        problems.append(f"scrape failed: {exc}")
        deals = []

    # 2) Persist. insert_deal reports whether the row was new or an update.
    fresh_ids: list[int] = []
    updated_total = 0
    for record in deals:
        try:
            row_id, created = insert_deal(record)
        except Exception as exc:
            problems.append(f"insert fail for {record.get('case_number')}: {exc}")
            continue
        if created:
            fresh_ids.append(row_id)
        else:
            updated_total += 1
    new_total = len(fresh_ids)

    # 3) Classify only the deals inserted during this run.
    classified_total = 0
    if auto_classify and fresh_ids:
        if status_cb:
            status_cb(f"Auto-classifying {len(fresh_ids)} new deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification

        for row_id in fresh_ids:
            try:
                stored = get_deal_by_id(row_id)
                if not stored:
                    continue
                verdict = classify_deal(stored)
                update_classification(
                    deal_id=row_id,
                    status=verdict["classification_status"],
                    score=verdict["score"],
                    reasons=verdict["reasons"],
                    strategy=verdict["strategy"],
                )
                classified_total += 1
            except Exception as exc:
                problems.append(f"classify fail for deal_id={row_id}: {exc}")

    # 4) Close the run: no errors → success; errors with deals → partial;
    #    errors without deals → failed.
    if not problems:
        run_status = "success"
    elif deals:
        run_status = "partial"
    else:
        run_status = "failed"

    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=new_total,
        deals_updated=updated_total,
        errors_count=len(problems),
        errors_summary=problems if problems else None,
        firecrawl_credits_used=0,
        status=run_status,
    )

    return {
        "source": source_id,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": new_total,
        "deals_updated": updated_total,
        "deals_classified": classified_total,
        "errors_count": len(problems),
        "errors": problems,
    }
|
||||
@@ -0,0 +1,217 @@
|
||||
"""scrapers/registry.py — Single source of truth for scraper metadata.
|
||||
|
||||
Phase 3B introduces multiple scrapers. The Search UI (B4) needs to know:
|
||||
- Which sources exist
|
||||
- Which counties each supports (Miami-Dade Clerk = solo Miami-Dade;
|
||||
HUD = cualquier estado; Zillow = cualquier county)
|
||||
- Firecrawl credit cost estimate per run
|
||||
- The callable entry point that executes the scraper end-to-end
|
||||
|
||||
Cuando se agrega un nuevo scraper (B5 Zillow, B6 Realtor, etc), se registra acá.
|
||||
|
||||
REGISTRY STRUCTURE:
|
||||
{
|
||||
"source_id": {
|
||||
"label": "Human-readable name",
|
||||
"callable_path": "scrapers.miami_dade_clerk:run_scraper_to_db",
|
||||
"scope": "county" | "state" | "national",
|
||||
"supported_counties": ["Miami-Dade"] | None (None = todos),
|
||||
"supported_states": ["FL"] | None (None = todos),
|
||||
"deal_types_produced": ["foreclosure", "tax_deed", "reo", "mls"],
|
||||
"firecrawl_credits_per_run": int (estimacion conservadora),
|
||||
"stack": "playwright" | "firecrawl" | "hybrid",
|
||||
"free": bool (True si NO consume Firecrawl),
|
||||
"description": "...",
|
||||
"parameters_schema": dict (kwargs que acepta el callable),
|
||||
}
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# REGISTRY DATA
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Helper: realauction.com county clerks share the same engine (realauction_clerk.py).
|
||||
# DRY: build the registry entries from a config table.
|
||||
def _realauction_entry(county_label: str, county_name: str, source_id: str, area_hint: str = "") -> dict:
|
||||
"""Build a registry entry for a realauction.com white-label county clerk."""
|
||||
nice_label = f"{county_label} Clerk{f' ({area_hint})' if area_hint else ''} — Foreclosure + Tax Deed"
|
||||
return {
|
||||
"label": nice_label,
|
||||
"callable_path": "scrapers.realauction_clerk:run_scraper_to_db",
|
||||
"scope": "county",
|
||||
"supported_counties": [county_name],
|
||||
"supported_states": ["FL"],
|
||||
"deal_types_produced": ["foreclosure", "tax_deed"],
|
||||
"firecrawl_credits_per_run": 0,
|
||||
"stack": "playwright",
|
||||
"free": True,
|
||||
"description": (
|
||||
f"Auctions de foreclosure (Circuit Court cases) + tax deed sales de "
|
||||
f"{county_label} County, FL, via realauction.com. Cubre ~7-15 dias "
|
||||
f"hacia adelante por corrida. Gratis (Playwright local)."
|
||||
),
|
||||
"parameters_schema": {
|
||||
# county kwarg se inyecta automaticamente desde aqui — search_engine
|
||||
# construye kwargs leyendo este schema y pasa "county" al callable
|
||||
"county": {"type": "str", "default": county_name},
|
||||
"days_ahead": {"type": "int", "default": 14, "min": 1, "max": 30},
|
||||
"days_back": {"type": "int", "default": 0, "min": 0, "max": 7},
|
||||
"auto_classify": {"type": "bool", "default": True},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# Auto-generate clerk entries from REALAUCTION_COUNTIES (single source of truth).
|
||||
# Adding a county there → auto-registers here. No duplication.
|
||||
from scrapers.realauction_clerk import REALAUCTION_COUNTIES as _RA_COUNTIES
|
||||
|
||||
|
||||
def _all_realauction_clerk_entries() -> dict:
    """Build one registry entry per county configured in REALAUCTION_COUNTIES.

    Returns:
        Dict mapping each county's ``source_id`` to the entry produced by
        ``_realauction_entry``.
    """
    def _hint_from(label: str) -> str:
        # "Duval (Jacksonville)" -> "Jacksonville"; labels without both
        # parentheses yield an empty hint.
        if "(" not in label or ")" not in label:
            return ""
        return label.partition("(")[2].rstrip(")").strip()

    entries: dict = {}
    for county_name, cfg in _RA_COUNTIES.items():
        source_id = cfg["source_id"]
        hint = _hint_from(cfg.get("label", county_name))
        entries[source_id] = _realauction_entry(
            county_name, county_name, source_id, area_hint=hint
        )
    return entries
|
||||
|
||||
|
||||
# Registry of every scraper source, keyed by source_id. The realauction clerk
# entries are unpacked first; the hand-written entries follow.
SOURCES: dict[str, dict] = {
    # All 41 FL counties on realforeclose.com platform (auto-generated)
    **_all_realauction_clerk_entries(),
    "hud_homestore": {
        "label": "HUD Homestore — Federal REO (FHA defaults)",
        "callable_path": "scrapers.hud_homestore:run_scraper_to_db",
        "scope": "state",
        "supported_counties": None,  # any county within the state
        "supported_states": None,  # any US state — but defaults to FL
        "deal_types_produced": ["reo"],
        "firecrawl_credits_per_run": 0,
        "stack": "playwright",
        "free": True,
        "description": (
            "Listings federales de propiedades HUD (REO post-foreclosure de loans "
            "FHA defaulted). Cubre el estado entero en una corrida. "
            "Listing periods: Exclusive (owner-occ primero) vs Extended (investors)."
        ),
        "parameters_schema": {
            "states": {"type": "list[str]", "default": ["FL"], "description": "USA state codes (e.g., FL, GA)"},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    "zillow": {
        "label": "Zillow MLS Listings",
        "callable_path": "scrapers.zillow:run_scraper_to_db",
        "scope": "county",
        "supported_counties": None,  # any county
        "supported_states": None,  # any state
        "deal_types_produced": ["mls", "foreclosure", "auction"],
        # Firecrawl: 1 credit per page scrape (verified). Default 1 page/county.
        "firecrawl_credits_per_run": 1,
        "stack": "firecrawl",
        "free": False,
        "description": (
            "Zillow MLS listings parametrizable por county. Cubre Single Family "
            "Homes (SFH). ~9-30 listings por page scrape. Detecta badges "
            "automaticamente (New construction, Price reduced, Foreclosure, Auction). "
            "Costo: 1 Firecrawl credit por page scrape (~$0.001 USD)."
        ),
        "parameters_schema": {
            "counties": {"type": "list[str]", "default": ["Miami-Dade"]},
            "state": {"type": "str", "default": "FL"},
            "pages_per_county": {"type": "int", "default": 1, "min": 1, "max": 5},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    # Slot for future scrapers (uncomment when shipped):
    # NOTE(review): the clerk slots below may already be covered by the
    # auto-generated realauction entries above — confirm before adding them.
    # "realtor": {...},
    # "broward_clerk": {...},
    # "palm_beach_clerk": {...},
    # "hillsborough_clerk": {...},
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def list_sources() -> list[dict]:
    """Return every registered source as its metadata dict plus an ``"id"`` key."""
    return [
        {"id": source_id, **meta}
        for source_id, meta in SOURCES.items()
    ]
|
||||
|
||||
|
||||
def get_source(source_id: str) -> Optional[dict]:
    """Return the registry entry for ``source_id``, or None if it is not registered."""
    entry = SOURCES.get(source_id)
    return entry
|
||||
|
||||
|
||||
def get_sources_for_county(county: str) -> list[dict]:
    """Return the sources that support the given county.

    A source matches when its ``supported_counties`` is None (no county
    restriction) or contains ``county`` exactly as given. Each result is the
    source metadata plus an ``"id"`` key.
    """
    def _covers(meta: dict) -> bool:
        counties = meta.get("supported_counties")
        return counties is None or county in counties

    return [
        {"id": source_id, **meta}
        for source_id, meta in SOURCES.items()
        if _covers(meta)
    ]
|
||||
|
||||
|
||||
def get_sources_for_state(state: str) -> list[dict]:
    """Return the sources that support the given state.

    A source matches when its ``supported_states`` is None (no state
    restriction) or contains ``state`` exactly as given. Each result is the
    source metadata plus an ``"id"`` key.
    """
    def _covers(meta: dict) -> bool:
        states = meta.get("supported_states")
        return states is None or state in states

    return [
        {"id": source_id, **meta}
        for source_id, meta in SOURCES.items()
        if _covers(meta)
    ]
|
||||
|
||||
|
||||
def estimate_credits(source_ids: list[str]) -> dict:
    """Estimate the Firecrawl credits a combined run of these sources would use.

    Ids that are not registered are skipped.

    Returns:
        ``{"total_credits": int, "breakdown": [...]}`` where each breakdown row
        has ``source_id``, ``label``, ``credits`` and ``free``.
    """
    rows = []
    for source_id in source_ids:
        meta = SOURCES.get(source_id)
        if not meta:
            continue
        rows.append({
            "source_id": source_id,
            "label": meta["label"],
            "credits": meta.get("firecrawl_credits_per_run", 0),
            "free": meta.get("free", False),
        })
    return {
        "total_credits": sum(row["credits"] for row in rows),
        "breakdown": rows,
    }
|
||||
|
||||
|
||||
def resolve_callable(source_id: str) -> Optional[Callable[..., Any]]:
    """Import and return the entry-point callable of a source.

    The source's ``callable_path`` has the form ``"package.module:function"``.

    Returns:
        The callable, or None when the source is not registered, the path is
        missing or malformed, or the import / attribute lookup fails.
    """
    entry = SOURCES.get(source_id)
    if not entry:
        return None

    dotted = entry.get("callable_path")
    if not dotted or ":" not in dotted:
        return None

    module_name, _, func_name = dotted.partition(":")
    try:
        return getattr(importlib.import_module(module_name), func_name)
    except (ImportError, AttributeError):
        # Defensive: don't crash the UI if a scraper module is broken.
        return None
|
||||
@@ -0,0 +1,762 @@
|
||||
"""scrapers/zillow.py — Zillow MLS scraper via Firecrawl.
|
||||
|
||||
SOURCE: https://www.zillow.com/{county-slug}-county-{state}/houses/
|
||||
STACK: Firecrawl (no Playwright — Zillow has aggressive anti-bot)
|
||||
|
||||
URL PATTERN:
|
||||
https://www.zillow.com/miami-dade-county-fl/houses/
|
||||
https://www.zillow.com/broward-county-fl/houses/
|
||||
https://www.zillow.com/palm-beach-county-fl/houses/
|
||||
|
||||
COST: ~1 Firecrawl credit per page scrape (~25-40 listings/page)
|
||||
Default: 1 page = 1 credit per county
|
||||
|
||||
MARKDOWN STRUCTURE (descubierto via exploration):
|
||||
- [$PRICE](URL_zpid)
|
||||
- **N** bds
|
||||
- **N** ba
|
||||
- **N,NNN** sqft
|
||||
[optional badges like "New construction" / "Price reduced"]
|
||||
[STREET, CITY, STATE ZIP](URL_zpid)
|
||||
[REALTOR/BROKERAGE_NAME]
|
||||
|
||||
DEAL TYPE: 'mls'
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
|
||||
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_HOURLY
|
||||
|
||||
|
||||
# Source identifier used for this scraper (e.g. when recording Firecrawl usage).
SOURCE = "zillow"
# Base URL that every Zillow URL in this module is built from.
ZILLOW_BASE = "https://www.zillow.com"

# Conservative rate limit (Zillow is sensitive)
_REQUEST_INTERVAL_SECONDS = 4.0

# Default cache TTL: 1h (MLS listings move faster than court records)
_CACHE_TTL = DEFAULT_TTL_SECONDS_HOURLY
|
||||
|
||||
|
||||
def _build_zillow_url(county: str, state: str, page: int = 1) -> str:
    """Build the canonical Zillow "houses" search URL for a county.

    Args:
        county: County name, e.g. "Miami-Dade", "Palm Beach".
        state: 2-letter state code, e.g. "FL".
        page: 1-indexed results page; pages above 1 add a ``{page}_p/`` suffix.

    Returns:
        The URL string.
    """
    # NOTE(review): only spaces are normalized; punctuation such as the dot in
    # "St. Johns" is kept as-is — confirm Zillow accepts such slugs.
    slug = county.lower().replace(" ", "-")
    page_suffix = f"{page}_p/" if page > 1 else ""
    return f"{ZILLOW_BASE}/{slug}-county-{state.lower()}/houses/{page_suffix}"
|
||||
|
||||
|
||||
# ─── Parser ────────────────────────────────────────────────────────────────

# Regex that extracts each listing block from the markdown.
# Captures: price (group 1), detail URL (group 2) and the body that follows
# (group 3). Beds/baths/sqft, address and badges are parsed out of the body.
# Markdown format:
# - [$PRICE](URL_TO_DETAIL)
# (optional text with bds/ba/sqft or badges)
# [STREET, CITY, FL ZIP](URL_TO_DETAIL)
# [BROKERAGE]
_LISTING_PATTERN = re.compile(
    r"-\s+\[\$([\d,]+)\]\((https?://[^)]+zpid[^)]*)\)"
    r"([\s\S]*?)"  # body (lazy match up to the next listing)
    r"(?=\n-\s+\[\$[\d,]+\]\(|\Z)",
    re.MULTILINE,
)

# Image URL pattern within listing body (Zillow CDN)
_IMG_PATTERN = re.compile(
    r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png))\)",
    re.IGNORECASE,
)

# "**N** bds" / "**N** ba" / "**N,NNN** sqft" — group 1 is the number text.
_BEDS_RE = re.compile(r"\*\*\s*([\d.]+)\s*\*\*\s*bds", re.IGNORECASE)
_BATHS_RE = re.compile(r"\*\*\s*([\d.]+)\s*\*\*\s*ba\b", re.IGNORECASE)
_SQFT_RE = re.compile(r"\*\*\s*([\d,]+)\s*\*\*\s*sqft", re.IGNORECASE)
# Markdown link whose text ends in ", ST 12345": group 1 = address text,
# group 2 = zpid URL.
_ADDRESS_LINK_RE = re.compile(r"\[([^\]]+,\s*[A-Z]{2}\s+\d{5})\]\(([^)]+zpid[^)]+)\)")
# ", ST 12345" — group 1 = state code, group 2 = 5-digit ZIP.
_STATE_ZIP_RE = re.compile(r",\s*([A-Z]{2})\s+(\d{5})")
|
||||
|
||||
|
||||
def _parse_money(s: str) -> Optional[float]:
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
return float(s.replace(",", "").replace("$", ""))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_listings_from_markdown(md: str) -> list[dict]:
    """Parse Zillow search-results markdown into listing dicts.

    History (bug fix 2026-05-15): the previous parser looked back 800 chars
    for every listing, so several listings could claim the SAME photo.

    Current algorithm:
      1. Index every Zillow CDN image together with its offset in the markdown.
      2. For each listing, take the CLOSEST PRECEDING photo located between
         the start of the previous listing and the start of the current one
         that has not been claimed yet.
      3. A claimed photo URL is never assigned to another listing.

    Args:
        md: Markdown of one search-results page.

    Returns:
        One dict per listing with keys ``price``, ``source_url``, ``zpid``,
        ``beds``, ``baths``, ``sqft``, ``address``, ``city``, ``state``,
        ``zip``, ``badges`` and ``photos_urls`` (zero or one URL).
    """
    badge_keywords = (
        "New construction", "Price reduced", "Price cut",
        "New listing", "Open House", "Foreclosure", "Pre-foreclosure",
        "Coming soon", "Auction",
    )

    def _first_number(pattern, text, cast):
        # Value of the pattern's first group converted with `cast`, or None
        # when the pattern does not match or the text is not a number.
        hit = pattern.search(text)
        if not hit:
            return None
        try:
            return cast(hit.group(1))
        except ValueError:
            return None

    listing_matches = list(_LISTING_PATTERN.finditer(md))
    # (offset, url) for every Zillow CDN image, in document order. Uses the
    # shared module-level pattern instead of re-declaring it inline.
    photo_index: list[tuple[int, str]] = [
        (hit.start(), hit.group(1)) for hit in _IMG_PATTERN.finditer(md)
    ]
    claimed_photos: set[str] = set()
    listings: list[dict] = []

    for i, m in enumerate(listing_matches):
        price_raw, url, body = m.group(1), m.group(2), m.group(3)
        listing_start = m.start()
        # The boundary is the previous listing's START (not its end): each
        # body lazily runs up to the next listing, so the photo of listing N+1
        # sits inside listing N's match.
        window_start = listing_matches[i - 1].start() if i > 0 else 0

        # Last unclaimed photo inside [window_start, listing_start) = the one
        # closest to this listing.
        my_photo: Optional[str] = next(
            (
                photo_url
                for pos, photo_url in reversed(photo_index)
                if window_start <= pos < listing_start
                and photo_url not in claimed_photos
            ),
            None,
        )
        if my_photo is not None:
            claimed_photos.add(my_photo)

        beds = _first_number(_BEDS_RE, body, lambda v: int(float(v)))
        baths = _first_number(_BATHS_RE, body, float)
        sqft = _first_number(_SQFT_RE, body, lambda v: int(v.replace(",", "")))

        # Address: markdown link whose text ends in ", ST 12345".
        addr_match = _ADDRESS_LINK_RE.search(body)
        address_full = addr_match.group(1) if addr_match else None
        city = state = zip_code = None
        if address_full:
            sz_match = _STATE_ZIP_RE.search(address_full)
            if sz_match:
                state, zip_code = sz_match.group(1), sz_match.group(2)
            # parts = [street, city, "STATE ZIP"]; the city is the penultimate
            # part. (The former `_STATE_ZIP_RE.match(parts[-2])` guard could
            # never be true: comma-split parts cannot start with a comma.)
            parts = [p.strip() for p in address_full.split(",")]
            if len(parts) >= 3:
                city = parts[-2]
                if not city and len(parts) >= 4:
                    city = parts[-3]

        # zpid taken from the detail URL.
        zpid_match = re.search(r"/(\d+)_zpid", url)
        zpid = zpid_match.group(1) if zpid_match else None

        # Badges: case-insensitive substring search over the listing body.
        # NOTE(review): a "Pre-foreclosure" body also yields "Foreclosure"
        # (substring match) — confirm downstream consumers expect both.
        body_lower = body.lower()
        badges = [kw for kw in badge_keywords if kw.lower() in body_lower]

        listings.append({
            "price": _parse_money(price_raw),
            "source_url": url,
            "zpid": zpid,
            "beds": beds,
            "baths": baths,
            "sqft": sqft,
            "address": address_full,
            "city": city,
            "state": state,
            "zip": zip_code,
            "badges": badges,
            "photos_urls": [my_photo] if my_photo else [],
        })

    return listings
|
||||
|
||||
|
||||
# ─── Property Detail Parser (individual property page) ────────────────────
# Bug fix 2026-05-15: search-results pages do NOT include condition/features/
# year_built. The individual page of each property DOES have them. This function
# is called on demand during pre-screening, when it is worth spending 1 Firecrawl
# credit to enrich a specific deal.

# Keywords that indicate a condition tag (Zillow exposes it explicitly)
_CONDITION_TAG_RE = re.compile(
    r"(?:Condition|Property\s*Condition)\s*[:\-]\s*([A-Za-z][A-Za-z0-9\s\-/]+?)(?:\n|$|\|)",
    re.IGNORECASE,
)

# The "Updated/Remodeled" tag also appears standalone (without a label)
_CONDITION_STANDALONE_RE = re.compile(
    r"\b(Updated/Remodeled|Remodeled|Renovated|Updated|Original|New\s+construction|Newly\s+built|Fixer[- ]upper|Needs\s+work)\b",
    re.IGNORECASE,
)

# Year built (multiple formats)
_YEAR_BUILT_RE = re.compile(
    r"(?:Year\s*built|Built\s*in|Construction\s*year)\s*[:\-]?\s*(\d{4})",
    re.IGNORECASE,
)

# Home status (active, under contract, pending, sold)
_HOME_STATUS_RE = re.compile(
    r"\b(For\s*sale|Active\s+under\s+contract|Active\s+contingent|Pending|Sold|Off\s*market|Coming\s*soon|Auction)\b",
    re.IGNORECASE,
)

# "What's special" extraction — usually H2 + bullet list right after
_WHATS_SPECIAL_RE = re.compile(
    r"(?:##|\*\*)\s*What\W*s?\s+special\W*?\s*(?:##|\*\*)?\s*([\s\S]+?)"
    r"(?=\n##\s|\n\*\*[A-Z]|\Z)",
    re.IGNORECASE,
)

# Bullet item extractor (after "What's special" matches a section)
_BULLET_ITEM_RE = re.compile(r"(?:^|\n)\s*(?:-|\*)\s+([^\n]+)")

# Zestimate
_ZESTIMATE_RE = re.compile(r"Zestimate\W*?\$?([\d,]+)", re.IGNORECASE)

# Tax assessed value
_TAX_ASSESSED_RE = re.compile(
    r"(?:Tax\s+assessed\s+value|Assessed\s+value|Assessment)\W*?\$?([\d,]+)",
    re.IGNORECASE,
)

# HOA monthly fee (multiple formats Zillow uses)
# Examples: "HOA: $350/mo", "HOA fee: $250 monthly", "Monthly HOA: $400"
# Also: "HOA: None", "No HOA", "HOA: 0"
_HOA_RE = re.compile(
    r"HOA(?:\s+fee)?[\s:]+\$?([\d,]+)\s*(?:/\s*(?:mo|month)|monthly)?",
    re.IGNORECASE,
)
# "Monthly HOA: $400" form; group 1 is the amount text.
_HOA_MONTHLY_RE = re.compile(
    r"Monthly\s+HOA[\s:]+\$?([\d,]+)",
    re.IGNORECASE,
)
# Explicit negatives: "HOA: None" / "HOA: N/A" / "HOA: 0" / "HOA: No" / "No HOA".
_NO_HOA_RE = re.compile(
    r"(?:HOA[\s:]+(?:None|N/A|0|No)\b)|(?:\bNo\s+HOA\b)",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _parse_property_detail_md(md: str) -> dict:
|
||||
"""Extract enriched fields from Zillow property detail page markdown.
|
||||
|
||||
Returns:
|
||||
{
|
||||
"condition_status": str | None,
|
||||
"year_built": int | None,
|
||||
"home_status": str | None, # "For sale" | "Active under contract" | etc
|
||||
"features_special": [str], # tags from "What's special" section
|
||||
"description": str, # best-effort full description
|
||||
"zestimate": int | None,
|
||||
"tax_assessed_value": int | None,
|
||||
"active_under_contract": bool, # convenience flag
|
||||
"renovation_keywords_found": [str], # keywords detected anywhere
|
||||
}
|
||||
"""
|
||||
out: dict = {
|
||||
"condition_status": None,
|
||||
"year_built": None,
|
||||
"home_status": None,
|
||||
"features_special": [],
|
||||
"description": "",
|
||||
"zestimate": None,
|
||||
"tax_assessed_value": None,
|
||||
"active_under_contract": False,
|
||||
"renovation_keywords_found": [],
|
||||
"hoa_monthly": None, # None=unknown, 0=confirmed no-HOA, >0=has HOA
|
||||
}
|
||||
|
||||
if not md:
|
||||
return out
|
||||
|
||||
# 1. Condition status — try explicit label first, then standalone tag
|
||||
m = _CONDITION_TAG_RE.search(md)
|
||||
if m:
|
||||
out["condition_status"] = m.group(1).strip()
|
||||
else:
|
||||
m = _CONDITION_STANDALONE_RE.search(md)
|
||||
if m:
|
||||
out["condition_status"] = m.group(1).strip()
|
||||
|
||||
# 2. Year built
|
||||
m = _YEAR_BUILT_RE.search(md)
|
||||
if m:
|
||||
try:
|
||||
yb = int(m.group(1))
|
||||
if 1800 < yb < 2100:
|
||||
out["year_built"] = yb
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 3. Home status — capture first occurrence
|
||||
m = _HOME_STATUS_RE.search(md)
|
||||
if m:
|
||||
out["home_status"] = m.group(1).strip()
|
||||
if "under contract" in out["home_status"].lower() or "contingent" in out["home_status"].lower():
|
||||
out["active_under_contract"] = True
|
||||
|
||||
# 4. What's special features
|
||||
m = _WHATS_SPECIAL_RE.search(md)
|
||||
if m:
|
||||
section = m.group(1)
|
||||
# Extract bullet items
|
||||
for bm in _BULLET_ITEM_RE.finditer(section):
|
||||
item = bm.group(1).strip().rstrip(",.")
|
||||
# Filter out very short / irrelevant items
|
||||
if 2 <= len(item) <= 100 and not item.startswith("["):
|
||||
out["features_special"].append(item)
|
||||
# Cap to first 12 items (Zillow usually has 4-8)
|
||||
out["features_special"] = out["features_special"][:12]
|
||||
|
||||
# 5. Description — heuristic: look for "Description" section header or use the
|
||||
# longest paragraph after the price block
|
||||
desc_match = re.search(
|
||||
r"(?:##|\*\*)\s*Description\W*?\s*(?:##|\*\*)?\s*([\s\S]+?)(?=\n##\s|\n\*\*[A-Z]|\Z)",
|
||||
md, re.IGNORECASE,
|
||||
)
|
||||
if desc_match:
|
||||
out["description"] = desc_match.group(1).strip()[:2000]
|
||||
else:
|
||||
# Fallback: largest paragraph in first 8KB
|
||||
paras = [p.strip() for p in md[:8000].split("\n\n") if len(p.strip()) > 200]
|
||||
if paras:
|
||||
out["description"] = max(paras, key=len)[:2000]
|
||||
|
||||
# 6. Zestimate
|
||||
m = _ZESTIMATE_RE.search(md)
|
||||
if m:
|
||||
try:
|
||||
out["zestimate"] = int(m.group(1).replace(",", ""))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 7. Tax assessed value
|
||||
m = _TAX_ASSESSED_RE.search(md)
|
||||
if m:
|
||||
try:
|
||||
out["tax_assessed_value"] = int(m.group(1).replace(",", ""))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 7b. HOA monthly fee
|
||||
# First check "No HOA" pattern (preferred — explicit negative)
|
||||
if _NO_HOA_RE.search(md):
|
||||
out["hoa_monthly"] = 0
|
||||
else:
|
||||
# Then look for explicit HOA $X
|
||||
m = _HOA_MONTHLY_RE.search(md) or _HOA_RE.search(md)
|
||||
if m:
|
||||
try:
|
||||
val = int(m.group(1).replace(",", ""))
|
||||
# Sanity: HOA between $1 and $5000/mo
|
||||
if 1 <= val <= 5000:
|
||||
out["hoa_monthly"] = val
|
||||
elif val == 0:
|
||||
out["hoa_monthly"] = 0
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 8. Renovation keywords found in description + features (for downstream consumers)
|
||||
from data_fetchers.property_value import NEW_ITEM_KEYWORDS, RENOVATED_GLOBAL_KEYWORDS
|
||||
combined = (out["description"] + " " + " ".join(out["features_special"])).lower()
|
||||
found: list[str] = []
|
||||
for kw in RENOVATED_GLOBAL_KEYWORDS:
|
||||
if kw.lower() in combined:
|
||||
found.append(kw)
|
||||
for cat, kws in NEW_ITEM_KEYWORDS.items():
|
||||
for kw in kws:
|
||||
if kw.lower() in combined:
|
||||
found.append(f"{cat}:{kw}")
|
||||
break
|
||||
out["renovation_keywords_found"] = found
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def scrape_zillow_property_detail(
    url_or_zpid: str,
    *,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
) -> tuple[dict, int]:
    """Scrape a single Zillow property page and parse its detail fields.

    A fresh fetch goes through Firecrawl (one credit per scrape call). The raw
    markdown is cached under the "zillow_detail" namespace with a 24h TTL.

    This function does not raise for the failures handled here: problems are
    appended to ``detail_dict["_errors"]`` and the partially filled dict is
    returned.

    Args:
        url_or_zpid: full Zillow URL, or just the zpid (the URL is then built
            as ``{ZILLOW_BASE}/homedetails/{zpid}_zpid/``).
        status_cb: optional logging callback.
        use_cache: when True, a cached copy is used instead of calling
            Firecrawl. A fresh fetch is written to the cache regardless.

    Returns:
        (detail_dict, credits_used)
        detail_dict: always contains ``source_url``, ``_fetched_at`` and
            ``_errors``; on success it also carries whatever
            ``_parse_property_detail_md`` returns. ``_cached`` is set to True
            when the data came from the cache.
        credits_used: 0 when served from cache or when no Firecrawl call was
            completed; 1 once the scrape call returned, even if the result
            was empty or a later step failed.
    """
    from datetime import datetime, timezone

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    # Normalize to URL
    if url_or_zpid.startswith("http"):
        url = url_or_zpid
    else:
        # Assume zpid; Zillow accepts /homedetails/{zpid}_zpid/ as canonical
        url = f"{ZILLOW_BASE}/homedetails/{url_or_zpid}_zpid/"

    out: dict = {
        "source_url": url,
        "_fetched_at": None,
        "_errors": [],
    }

    # Cache check
    if use_cache:
        cached = get_cached("zillow_detail", url, ttl_seconds=86400)
        if cached:
            _log(f" zillow detail cache HIT for {url}")
            out.update(_parse_property_detail_md(cached))
            # NOTE(review): this is the time of the cache read, not of the
            # original fetch — confirm that consumers expect that.
            out["_fetched_at"] = datetime.now(timezone.utc).isoformat()
            out["_cached"] = True
            return out, 0

    # Fresh fetch via Firecrawl
    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        out["_errors"].append("FIRECRAWL_API_KEY not configured")
        return out, 0

    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        out["_errors"].append(f"firecrawl-py not installed: {e}")
        return out, 0

    from deals_db import record_firecrawl_usage, is_firecrawl_paused

    if is_firecrawl_paused():
        out["_errors"].append("Firecrawl budget paused")
        return out, 0

    app = FirecrawlApp(api_key=api_key)
    try:
        result = app.scrape(url, formats=["markdown"])
    except Exception as e:
        out["_errors"].append(f"Firecrawl scrape failed: {type(e).__name__}: {e}")
        return out, 0

    # From here on the credit has been spent: report it whatever happens next.
    credits_used = 1
    try:
        # Record usage before inspecting the payload so that empty responses
        # are also counted against the budget.
        record_firecrawl_usage(source=SOURCE, credits=1, url=url,
                               description=f"Property detail enrichment: {url}")
        md = getattr(result, "markdown", None)
        if not md:
            out["_errors"].append("Firecrawl returned no markdown")
            return out, credits_used  # still consumed a credit even if empty
        save_cache("zillow_detail", url, md, ttl_seconds=86400)
        out.update(_parse_property_detail_md(md))
        out["_fetched_at"] = datetime.now(timezone.utc).isoformat()
    except Exception as e:
        out["_errors"].append(
            f"Zillow detail post-processing failed: {type(e).__name__}: {e}"
        )
    return out, credits_used
|
||||
|
||||
|
||||
# ─── Public API ────────────────────────────────────────────────────────────
|
||||
|
||||
def scrape_zillow_county(
    *,
    county: str,
    state: str,
    pages: int = 1,
    status_cb: Optional[Callable[[str], None]] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = _CACHE_TTL,
) -> tuple[list[dict], int]:
    """Scrape Zillow search-result pages for one county via Firecrawl.

    Every exit path returns the same ``(deals, credits_used)`` tuple, so
    callers can always unpack the result.

    Args:
        county: county name (e.g. "Miami-Dade").
        state: 2-letter state code.
        pages: number of result pages to scrape; each freshly fetched page
            is counted as one Firecrawl credit.
        status_cb: optional logging callback.
        use_cache: when True, look each page up in the cache before calling
            Firecrawl, and store freshly fetched pages in it.
        cache_ttl_seconds: TTL passed to the cache for lookups and writes.

    Returns:
        (deals, credits_used)
        deals: records built by ``_build_deal_record``; only listings with
            both an address and a listing price are kept.
        credits_used: number of Firecrawl scrape calls that completed.
    """
    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    deals: list[dict] = []
    credits_used = 0
    cache_namespace = "zillow"

    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        _log("❌ FIRECRAWL_API_KEY no configurada; abortando")
        return deals, credits_used

    # Lazy import — Firecrawl is heavy
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        _log(f"❌ firecrawl-py no instalado: {e}")
        return deals, credits_used

    from deals_db import record_firecrawl_usage, is_firecrawl_paused

    if is_firecrawl_paused():
        _log("🚨 Firecrawl budget paused (95%+ used) — aborting Zillow scrape")
        return deals, credits_used

    app = FirecrawlApp(api_key=api_key)

    last_request_at = 0.0
    for page in range(1, pages + 1):
        url = _build_zillow_url(county, state, page)
        _log(f" Zillow {county} {state} page {page}: {url}")

        # Cache check
        md: Optional[str] = None
        fetched_fresh = False
        if use_cache:
            md = get_cached(cache_namespace, url, ttl_seconds=cache_ttl_seconds)
            if md:
                _log(f" cache HIT ({len(md):,} chars)")

        if md is None:
            # Rate limit: keep at least _REQUEST_INTERVAL_SECONDS between calls
            elapsed = time.time() - last_request_at
            if elapsed < _REQUEST_INTERVAL_SECONDS:
                time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
            last_request_at = time.time()

            try:
                result = app.scrape(url, formats=["markdown"])
                md = getattr(result, "markdown", None)
                credits_used += 1  # 1 credit per scrape (confirmed via exploration)
                record_firecrawl_usage(
                    source=SOURCE,
                    credits=1,
                    url=url,
                    description=f"Zillow county scrape: {county} {state} page {page}",
                )
            except Exception as e:
                _log(f" Firecrawl error: {type(e).__name__}: {e}")
                continue
            fetched_fresh = True

        if not md:
            _log(" Firecrawl returned empty markdown")
            continue

        # Only store pages that were just fetched. Re-saving a cache hit would
        # presumably refresh its fetch timestamp and keep stale content alive
        # past the TTL — verify against save_cache.
        if use_cache and fetched_fresh:
            save_cache(cache_namespace, url, md,
                       status_code=200, ttl_seconds=cache_ttl_seconds)

        # Parse listings
        listings = _extract_listings_from_markdown(md)
        _log(f" parsed {len(listings)} listings from page {page}")

        for lst in listings:
            deal = _build_deal_record(lst, county=county, state=state)
            if deal.get("address") and deal.get("listing_price"):
                deals.append(deal)

    _log(f"Zillow {county} {state}: {len(deals)} deals, {credits_used} credits used")
    return deals, credits_used
|
||||
|
||||
|
||||
def _build_deal_record(listing: dict, *, county: str, state: str) -> dict:
    """Map one parsed Zillow listing onto the deal-record shape used by deals_db.

    Args:
        listing: parsed listing dict; the keys read here are badges, zpid,
            source_url, address, city, state, zip, price, beds, baths, sqft
            and photos_urls.
        county: county the search was run for; copied into the record as-is.
        state: fallback state code used when the listing carries none.

    Returns:
        A dict with the deal fields. ``deal_type`` is derived from the
        listing badges; fields the results card does not provide are None.
    """
    badge_list = listing.get("badges") or []
    zpid = listing.get("zpid")

    # Human-readable description assembled from what the card exposes.
    description_parts = []
    if badge_list:
        description_parts.append("Badges: " + ", ".join(badge_list))
    if zpid:
        description_parts.append(f"Zillow zpid: {zpid}")
    description_parts.append("Source: Zillow MLS")

    # Infer deal_type from the badges: "auction" wins, then any kind of
    # foreclosure (the substring also matches "pre-foreclosure"), else mls.
    badges_lower = " ".join(badge_list).lower()
    if "auction" in badges_lower:
        kind = "auction"
    elif "foreclosure" in badges_lower:
        kind = "foreclosure"
    else:
        kind = "mls"

    record = {
        "source": SOURCE,
        "source_url": listing.get("source_url"),
        "address": listing.get("address"),
        "city": listing.get("city"),
        "state": listing.get("state") or state,
        "zip": listing.get("zip"),
        "county": county,
        "listing_price": listing.get("price"),
        "deal_type": kind,
        "starting_bid": None,
        "estimated_arv": None,
        "beds": listing.get("beds"),
        "baths": listing.get("baths"),
        "sqft": listing.get("sqft"),
        "year_built": None,  # not available on the search-results card
        # The Zillow zpid is an internal Zillow ID, NOT a court case number,
        # so it goes in external_id; case_number is reserved for court cases.
        "case_number": None,
        "external_id": zpid,
        "auction_date": None,
        "listing_description": " | ".join(description_parts),
        "photos_urls": listing.get("photos_urls") or [],
    }
    return record
|
||||
|
||||
|
||||
def run_scraper_to_db(
    *,
    counties: Optional[list[str]] = None,
    state: str = "FL",
    pages_per_county: int = 1,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
) -> dict:
    """Full pipeline: scrape Zillow for the given counties, persist, classify.

    Steps: initialise the DB, open a scraper-run row, scrape each county,
    insert every deal, optionally classify the newly inserted ones, then
    close the scraper-run row with the collected metrics.

    Failures in scraping, inserting or classifying are collected as error
    strings instead of aborting the run.

    Args:
        counties: county names to scrape; defaults to ["Miami-Dade"].
        state: 2-letter state code applied to every county.
        pages_per_county: result pages to scrape per county.
        auto_classify: when True, run the classifier on newly inserted deals.
        status_cb: optional logging callback.

    Returns:
        Summary dict with source, scraper_run_id, deals_found, deals_new,
        deals_updated, deals_classified, firecrawl_credits_used,
        errors_count and errors.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run
    init_db()

    if counties is None:
        counties = ["Miami-Dade"]

    run_id = record_scraper_run(SOURCE)
    errors: list[str] = []
    total_credits = 0

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    all_deals: list[dict] = []
    for county in counties:
        try:
            deals, credits_actual = scrape_zillow_county(
                county=county,
                state=state,
                pages=pages_per_county,
                status_cb=status_cb,
            )
            all_deals.extend(deals)
            total_credits += credits_actual
        except Exception as e:
            errors.append(f"scrape failed for {county}: {e}")

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []

    for deal in all_deals:
        try:
            deal_id, is_new = insert_deal(deal)
            if is_new:
                deals_new += 1
                new_deal_ids.append(deal_id)
            else:
                deals_updated += 1
        except Exception as e:
            # Zillow records carry no case_number, so identify the failing
            # deal by its zpid (external_id) or, failing that, its address.
            label = (
                deal.get("external_id")
                or deal.get("address")
                or deal.get("case_number")
            )
            errors.append(f"insert fail for {label}: {e}")

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new Zillow deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification
        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for {did}: {e}")

    # Run status: "success" with no errors; with errors, "partial" if at
    # least one deal was scraped, otherwise "failed".
    if not errors:
        run_status = "success"
    elif all_deals:
        run_status = "partial"
    else:
        run_status = "failed"

    finish_scraper_run(
        run_id,
        deals_found=len(all_deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=total_credits,
        status=run_status,
    )

    return {
        "source": SOURCE,
        "scraper_run_id": run_id,
        "deals_found": len(all_deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "firecrawl_credits_used": total_credits,
        "errors_count": len(errors),
        "errors": errors,
    }
|
||||
Reference in New Issue
Block a user