Files
2026-07-03 12:24:58 -04:00

218 lines
9.1 KiB
Python

"""scrapers/registry.py — Single source of truth for scraper metadata.
Phase 3B introduces multiple scrapers. The Search UI (B4) needs to know:
- Which sources exist
- Which counties each supports (Miami-Dade Clerk = Miami-Dade only;
HUD = any state; Zillow = any county)
- Firecrawl credit cost estimate per run
- The callable entry point that executes the scraper end-to-end
When a new scraper is added (B5 Zillow, B6 Realtor, etc.), register it here.
REGISTRY STRUCTURE:
{
"source_id": {
"label": "Human-readable name",
"callable_path": "scrapers.miami_dade_clerk:run_scraper_to_db",
"scope": "county" | "state" | "national",
"supported_counties": ["Miami-Dade"] | None (None = all),
"supported_states": ["FL"] | None (None = all),
"deal_types_produced": ["foreclosure", "tax_deed", "reo", "mls"],
"firecrawl_credits_per_run": int (conservative estimate),
"stack": "playwright" | "firecrawl" | "hybrid",
"free": bool (True if it does NOT consume Firecrawl),
"description": "...",
"parameters_schema": dict (kwargs accepted by the callable),
}
}
"""
from __future__ import annotations

import importlib
import logging
from typing import Any, Callable, Optional
# ════════════════════════════════════════════════════════════════════════════
# REGISTRY DATA
# ════════════════════════════════════════════════════════════════════════════
# Helper: realauction.com county clerks share the same engine (realauction_clerk.py).
# DRY: build the registry entries from a config table.
def _realauction_entry(county_label: str, county_name: str, source_id: str, area_hint: str = "") -> dict:
"""Build a registry entry for a realauction.com white-label county clerk."""
nice_label = f"{county_label} Clerk{f' ({area_hint})' if area_hint else ''} — Foreclosure + Tax Deed"
return {
"label": nice_label,
"callable_path": "scrapers.realauction_clerk:run_scraper_to_db",
"scope": "county",
"supported_counties": [county_name],
"supported_states": ["FL"],
"deal_types_produced": ["foreclosure", "tax_deed"],
"firecrawl_credits_per_run": 0,
"stack": "playwright",
"free": True,
"description": (
f"Auctions de foreclosure (Circuit Court cases) + tax deed sales de "
f"{county_label} County, FL, via realauction.com. Cubre ~7-15 dias "
f"hacia adelante por corrida. Gratis (Playwright local)."
),
"parameters_schema": {
# county kwarg se inyecta automaticamente desde aqui — search_engine
# construye kwargs leyendo este schema y pasa "county" al callable
"county": {"type": "str", "default": county_name},
"days_ahead": {"type": "int", "default": 14, "min": 1, "max": 30},
"days_back": {"type": "int", "default": 0, "min": 0, "max": 7},
"auto_classify": {"type": "bool", "default": True},
},
}
# Auto-generate clerk entries from REALAUCTION_COUNTIES (single source of truth).
# Adding a county there → auto-registers here. No duplication.
from scrapers.realauction_clerk import REALAUCTION_COUNTIES as _RA_COUNTIES
def _all_realauction_clerk_entries() -> dict:
    """Build one registry entry per county listed in ``REALAUCTION_COUNTIES``.

    Returns:
        A dict mapping each county config's ``source_id`` to the entry built
        by ``_realauction_entry``. The county name (the config table's key)
        is passed as both the label and the county; the config's optional
        ``label`` is only used to derive the parenthesised area hint.
    """
    entries = {}
    for name, config in _RA_COUNTIES.items():
        sid = config["source_id"]
        display = config.get("label", name)
        hint = ""
        # e.g. "Duval (Jacksonville)" -> "Jacksonville": take the text after
        # the first "(" and drop trailing ")" characters.
        if "(" in display and ")" in display:
            _, _, after_paren = display.partition("(")
            hint = after_paren.rstrip(")").strip()
        entries[sid] = _realauction_entry(name, name, sid, area_hint=hint)
    return entries
# Registry table: source id -> metadata dict (see the module docstring for the
# key layout). The generated clerk entries are expanded first, so a literal key
# below with the same id would replace the generated entry.
SOURCES: dict[str, dict] = {
    # Florida county clerk entries, auto-generated from REALAUCTION_COUNTIES.
    # (Original note: all 41 FL counties on the realforeclose.com platform —
    # the count cannot be verified from this file.)
    **_all_realauction_clerk_entries(),
    "hud_homestore": {
        "label": "HUD Homestore — Federal REO (FHA defaults)",
        "callable_path": "scrapers.hud_homestore:run_scraper_to_db",
        "scope": "state",
        "supported_counties": None, # any county within the state
        "supported_states": None, # any US state — but the "states" parameter defaults to FL
        "deal_types_produced": ["reo"],
        "firecrawl_credits_per_run": 0,
        "stack": "playwright",
        "free": True,
        "description": (
            "Listings federales de propiedades HUD (REO post-foreclosure de loans "
            "FHA defaulted). Cubre el estado entero en una corrida. "
            "Listing periods: Exclusive (owner-occ primero) vs Extended (investors)."
        ),
        "parameters_schema": {
            "states": {"type": "list[str]", "default": ["FL"], "description": "USA state codes (e.g., FL, GA)"},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    "zillow": {
        "label": "Zillow MLS Listings",
        "callable_path": "scrapers.zillow:run_scraper_to_db",
        "scope": "county",
        "supported_counties": None, # any county
        "supported_states": None, # any state
        "deal_types_produced": ["mls", "foreclosure", "auction"],
        # Firecrawl: 1 credit per page scrape (verified, per the original
        # author). Default is 1 page per county.
        "firecrawl_credits_per_run": 1,
        "stack": "firecrawl",
        "free": False,
        "description": (
            "Zillow MLS listings parametrizable por county. Cubre Single Family "
            "Homes (SFH). ~9-30 listings por page scrape. Detecta badges "
            "automaticamente (New construction, Price reduced, Foreclosure, Auction). "
            "Costo: 1 Firecrawl credit por page scrape (~$0.001 USD)."
        ),
        "parameters_schema": {
            "counties": {"type": "list[str]", "default": ["Miami-Dade"]},
            "state": {"type": "str", "default": "FL"},
            "pages_per_county": {"type": "int", "default": 1, "min": 1, "max": 5},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    # Slot for future scrapers (uncomment when shipped):
    # "realtor": {...},
    # "broward_clerk": {...},
    # "palm_beach_clerk": {...},
    # "hillsborough_clerk": {...},
}
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def list_sources() -> list[dict]:
    """Return every registered source with its metadata.

    Returns:
        One new dict per ``SOURCES`` entry, in registry order, holding the
        registry key under ``"id"`` followed by the entry's metadata keys.
    """
    listing = []
    for source_id, meta in SOURCES.items():
        listing.append({"id": source_id, **meta})
    return listing
def get_source(source_id: str) -> Optional[dict]:
    """Return the registry entry stored under ``source_id``.

    Returns:
        The metadata dict from ``SOURCES`` (the stored object, not a copy),
        or ``None`` when no source is registered under that id.
    """
    try:
        return SOURCES[source_id]
    except KeyError:
        return None
def get_sources_for_county(county: str) -> list[dict]:
    """Return the sources that support the given county.

    A source matches when its ``supported_counties`` is ``None`` (or absent),
    meaning no county restriction, or when ``county`` is contained in it.

    Args:
        county: County name to look up, e.g. ``"Miami-Dade"``.

    Returns:
        One new dict per matching source, in registry order, holding the
        registry key under ``"id"`` followed by the entry's metadata keys.
    """
    matches = []
    for source_id, meta in SOURCES.items():
        allowed = meta.get("supported_counties")
        # Skip only sources that declare a county list that excludes `county`.
        if allowed is not None and county not in allowed:
            continue
        matches.append({"id": source_id, **meta})
    return matches
def get_sources_for_state(state: str) -> list[dict]:
    """Return the sources that support the given state.

    A source matches when its ``supported_states`` is ``None`` (or absent),
    meaning no state restriction, or when ``state`` is contained in it.

    Args:
        state: State code to look up, e.g. ``"FL"``.

    Returns:
        One new dict per matching source, in registry order, holding the
        registry key under ``"id"`` followed by the entry's metadata keys.
    """

    def _covers(meta: dict) -> bool:
        # None means the source is not restricted to particular states.
        allowed = meta.get("supported_states")
        return allowed is None or state in allowed

    return [
        {"id": source_id, **meta}
        for source_id, meta in SOURCES.items()
        if _covers(meta)
    ]
def estimate_credits(source_ids: list[str]) -> dict:
    """Compute the combined Firecrawl credit estimate for the given sources.

    Ids that are not registered in ``SOURCES`` are skipped. An id listed more
    than once is counted once per occurrence.

    Args:
        source_ids: Registry ids of the sources to be run.

    Returns:
        A dict with ``"total_credits"`` (sum of each source's
        ``firecrawl_credits_per_run``, defaulting to 0) and ``"breakdown"``
        (one dict per known source with ``source_id``, ``label``, ``credits``
        and ``free``).
    """
    breakdown = []
    for source_id in source_ids:
        meta = SOURCES.get(source_id)
        if meta:
            breakdown.append({
                "source_id": source_id,
                "label": meta["label"],
                "credits": meta.get("firecrawl_credits_per_run", 0),
                "free": meta.get("free", False),
            })
    total = sum(item["credits"] for item in breakdown)
    return {"total_credits": total, "breakdown": breakdown}
def resolve_callable(source_id: str) -> Optional[Callable[..., Any]]:
    """Import and return the entry-point callable registered for a source.

    The registry stores entry points as ``"package.module:function"`` strings
    under ``callable_path``. This splits that string, imports the module and
    looks the function up by name.

    Args:
        source_id: Key of the source in ``SOURCES``.

    Returns:
        The resolved callable, or ``None`` when the source is unknown, its
        ``callable_path`` is missing or malformed, the module cannot be
        imported, or the named attribute is missing or not callable.
        Import and lookup failures are logged as warnings instead of being
        raised, so a broken scraper module does not crash the UI.
    """
    src = SOURCES.get(source_id)
    if not src:
        return None
    path = src.get("callable_path")
    if not path or ":" not in path:
        return None
    module_name, _, func_name = path.partition(":")
    # Both halves must be non-empty: importlib raises ValueError (which is not
    # an ImportError) for an empty module name such as in ":func".
    if not module_name or not func_name:
        return None
    log = logging.getLogger(__name__)
    try:
        target = getattr(importlib.import_module(module_name), func_name)
    except (ImportError, AttributeError):
        # Defensive: don't crash the UI if a scraper module is broken, but
        # leave a trace so the failure can be diagnosed.
        log.warning(
            "Could not resolve callable %r for source %r",
            path, source_id, exc_info=True,
        )
        return None
    if not callable(target):
        # Honour the Optional[Callable] contract for misconfigured paths that
        # point at a non-callable attribute.
        log.warning(
            "Entry point %r for source %r is not callable", path, source_id,
        )
        return None
    return target