"""scrapers/registry.py — Single source of truth for scraper metadata. Phase 3B introduces multiple scrapers. The Search UI (B4) needs to know: - Which sources exist - Which counties each supports (Miami-Dade Clerk = solo Miami-Dade; HUD = cualquier estado; Zillow = cualquier county) - Firecrawl credit cost estimate per run - The callable entry point that executes the scraper end-to-end Cuando se agrega un nuevo scraper (B5 Zillow, B6 Realtor, etc), se registra acá. REGISTRY STRUCTURE: { "source_id": { "label": "Human-readable name", "callable_path": "scrapers.miami_dade_clerk:run_scraper_to_db", "scope": "county" | "state" | "national", "supported_counties": ["Miami-Dade"] | None (None = todos), "supported_states": ["FL"] | None (None = todos), "deal_types_produced": ["foreclosure", "tax_deed", "reo", "mls"], "firecrawl_credits_per_run": int (estimacion conservadora), "stack": "playwright" | "firecrawl" | "hybrid", "free": bool (True si NO consume Firecrawl), "description": "...", "parameters_schema": dict (kwargs que acepta el callable), } } """ from __future__ import annotations import importlib from typing import Any, Callable, Optional # ════════════════════════════════════════════════════════════════════════════ # REGISTRY DATA # ════════════════════════════════════════════════════════════════════════════ # Helper: realauction.com county clerks share the same engine (realauction_clerk.py). # DRY: build the registry entries from a config table. def _realauction_entry(county_label: str, county_name: str, source_id: str, area_hint: str = "") -> dict: """Build a registry entry for a realauction.com white-label county clerk.""" nice_label = f"{county_label} Clerk{f' ({area_hint})' if area_hint else ''} — Foreclosure + Tax Deed" return { "label": nice_label, "callable_path": "scrapers.realauction_clerk:run_scraper_to_db", "scope": "county", "supported_counties": [county_name], "supported_states": ["FL"], "deal_types_produced": ["foreclosure", "tax_deed"], "firecrawl_credits_per_run": 0, "stack": "playwright", "free": True, "description": ( f"Auctions de foreclosure (Circuit Court cases) + tax deed sales de " f"{county_label} County, FL, via realauction.com. Cubre ~7-15 dias " f"hacia adelante por corrida. Gratis (Playwright local)." ), "parameters_schema": { # county kwarg se inyecta automaticamente desde aqui — search_engine # construye kwargs leyendo este schema y pasa "county" al callable "county": {"type": "str", "default": county_name}, "days_ahead": {"type": "int", "default": 14, "min": 1, "max": 30}, "days_back": {"type": "int", "default": 0, "min": 0, "max": 7}, "auto_classify": {"type": "bool", "default": True}, }, } # Auto-generate clerk entries from REALAUCTION_COUNTIES (single source of truth). # Adding a county there → auto-registers here. No duplication. from scrapers.realauction_clerk import REALAUCTION_COUNTIES as _RA_COUNTIES def _all_realauction_clerk_entries() -> dict: """Generate registry entries for ALL counties in REALAUCTION_COUNTIES.""" out = {} for county_name, cfg in _RA_COUNTIES.items(): source_id = cfg["source_id"] label = cfg.get("label", county_name) # Extract area_hint from label (e.g. "Duval (Jacksonville)" → "Jacksonville") area_hint = "" if "(" in label and ")" in label: area_hint = label.split("(", 1)[1].rstrip(")").strip() out[source_id] = _realauction_entry(county_name, county_name, source_id, area_hint=area_hint) return out SOURCES: dict[str, dict] = { # All 41 FL counties on realforeclose.com platform (auto-generated) **_all_realauction_clerk_entries(), "hud_homestore": { "label": "HUD Homestore — Federal REO (FHA defaults)", "callable_path": "scrapers.hud_homestore:run_scraper_to_db", "scope": "state", "supported_counties": None, # cualquier county dentro del state "supported_states": None, # cualquier USA state — pero default FL "deal_types_produced": ["reo"], "firecrawl_credits_per_run": 0, "stack": "playwright", "free": True, "description": ( "Listings federales de propiedades HUD (REO post-foreclosure de loans " "FHA defaulted). Cubre el estado entero en una corrida. " "Listing periods: Exclusive (owner-occ primero) vs Extended (investors)." ), "parameters_schema": { "states": {"type": "list[str]", "default": ["FL"], "description": "USA state codes (e.g., FL, GA)"}, "auto_classify": {"type": "bool", "default": True}, }, }, "zillow": { "label": "Zillow MLS Listings", "callable_path": "scrapers.zillow:run_scraper_to_db", "scope": "county", "supported_counties": None, # cualquier county "supported_states": None, # cualquier state "deal_types_produced": ["mls", "foreclosure", "auction"], # Firecrawl: 1 credit por page scrape (verified). Default 1 page/county. "firecrawl_credits_per_run": 1, "stack": "firecrawl", "free": False, "description": ( "Zillow MLS listings parametrizable por county. Cubre Single Family " "Homes (SFH). ~9-30 listings por page scrape. Detecta badges " "automaticamente (New construction, Price reduced, Foreclosure, Auction). " "Costo: 1 Firecrawl credit por page scrape (~$0.001 USD)." ), "parameters_schema": { "counties": {"type": "list[str]", "default": ["Miami-Dade"]}, "state": {"type": "str", "default": "FL"}, "pages_per_county": {"type": "int", "default": 1, "min": 1, "max": 5}, "auto_classify": {"type": "bool", "default": True}, }, }, # Slot for future scrapers (uncomment when shipped): # "realtor": {...}, # "broward_clerk": {...}, # "palm_beach_clerk": {...}, # "hillsborough_clerk": {...}, } # ════════════════════════════════════════════════════════════════════════════ # Public API # ════════════════════════════════════════════════════════════════════════════ def list_sources() -> list[dict]: """Returns list of all registered sources with their metadata.""" return [{"id": k, **v} for k, v in SOURCES.items()] def get_source(source_id: str) -> Optional[dict]: return SOURCES.get(source_id) def get_sources_for_county(county: str) -> list[dict]: """Returns list of sources que soportan el condado dado. "Miami-Dade" → [miami_dade_clerk, hud_homestore (national)] "Broward" → [hud_homestore] (until broward_clerk added) """ result = [] for src_id, src in SOURCES.items(): sup = src.get("supported_counties") if sup is None or county in sup: result.append({"id": src_id, **src}) return result def get_sources_for_state(state: str) -> list[dict]: """Returns sources que soportan el state dado.""" result = [] for src_id, src in SOURCES.items(): sup_states = src.get("supported_states") if sup_states is None or state in sup_states: result.append({"id": src_id, **src}) return result def estimate_credits(source_ids: list[str]) -> dict: """Calcula total Firecrawl credits que consumirian estos sources combinados.""" total = 0 breakdown = [] for src_id in source_ids: src = SOURCES.get(src_id) if not src: continue cr = src.get("firecrawl_credits_per_run", 0) total += cr breakdown.append({ "source_id": src_id, "label": src["label"], "credits": cr, "free": src.get("free", False), }) return { "total_credits": total, "breakdown": breakdown, } def resolve_callable(source_id: str) -> Optional[Callable[..., Any]]: """Importa el callable real de un source. Returns None si fallo.""" src = SOURCES.get(source_id) if not src: return None path = src.get("callable_path") if not path or ":" not in path: return None module_name, func_name = path.split(":", 1) try: mod = importlib.import_module(module_name) return getattr(mod, func_name) except (ImportError, AttributeError) as e: # Defensive: don't crash UI if scraper module is broken return None