# AR-House/scrapers/registry.py

"""scrapers/registry.py — Single source of truth for scraper metadata.

Phase 3B introduces multiple scrapers. The Search UI (B4) needs to know:
- Which sources exist
- Which counties each supports (Miami-Dade Clerk = Miami-Dade only;
  HUD = any state; Zillow = any county)
- Firecrawl credit cost estimate per run
- The callable entry point that executes the scraper end-to-end

When a new scraper is added (B5 Zillow, B6 Realtor, etc.), register it here.

REGISTRY STRUCTURE:
{
  "source_id": {
    "label": "Human-readable name",
    "callable_path": "scrapers.miami_dade_clerk:run_scraper_to_db",
    "scope": "county" | "state" | "national",
    "supported_counties": ["Miami-Dade"] | None (None = all),
    "supported_states": ["FL"] | None (None = all),
    "deal_types_produced": ["foreclosure", "tax_deed", "reo", "mls", "auction"],
    "firecrawl_credits_per_run": int (conservative estimate),
    "stack": "playwright" | "firecrawl" | "hybrid",
    "free": bool (True if it does NOT consume Firecrawl credits),
    "description": "...",
    "parameters_schema": dict (kwargs accepted by the callable),
  }
}
"""
from __future__ import annotations

import importlib
import logging
from typing import Any, Callable, Optional


# ════════════════════════════════════════════════════════════════════════════
# REGISTRY DATA
# ════════════════════════════════════════════════════════════════════════════

# Helper: realauction.com county clerks share the same engine (realauction_clerk.py).
# DRY: build the registry entries from a config table.
def _realauction_entry(county_label: str, county_name: str, source_id: str, area_hint: str = "") -> dict:
    """Build a registry entry for a realauction.com white-label county clerk."""
    nice_label = f"{county_label} Clerk{f' ({area_hint})' if area_hint else ''} — Foreclosure + Tax Deed"
    return {
        "label": nice_label,
        "callable_path": "scrapers.realauction_clerk:run_scraper_to_db",
        "scope": "county",
        "supported_counties": [county_name],
        "supported_states": ["FL"],
        "deal_types_produced": ["foreclosure", "tax_deed"],
        "firecrawl_credits_per_run": 0,
        "stack": "playwright",
        "free": True,
        "description": (
            f"Auctions de foreclosure (Circuit Court cases) + tax deed sales de "
            f"{county_label} County, FL, via realauction.com. Cubre ~7-15 dias "
            f"hacia adelante por corrida. Gratis (Playwright local)."
        ),
        "parameters_schema": {
            # county kwarg se inyecta automaticamente desde aqui — search_engine
            # construye kwargs leyendo este schema y pasa "county" al callable
            "county": {"type": "str", "default": county_name},
            "days_ahead": {"type": "int", "default": 14, "min": 1, "max": 30},
            "days_back": {"type": "int", "default": 0, "min": 0, "max": 7},
            "auto_classify": {"type": "bool", "default": True},
        },
    }


# Auto-generate clerk entries from REALAUCTION_COUNTIES (single source of truth).
# Adding a county there → auto-registers here. No duplication.
from scrapers.realauction_clerk import REALAUCTION_COUNTIES as _RA_COUNTIES


def _all_realauction_clerk_entries() -> dict:
    """Generate registry entries for every county in REALAUCTION_COUNTIES.

    For each ``county_name -> cfg`` pair the entry is stored under
    ``cfg["source_id"]``. The county name (the mapping key) is used both as
    the display label and as the county value of the entry; the optional
    ``cfg["label"]`` is only consulted to extract a parenthesised area hint.

    Returns:
        A dict mapping source id to the record built by ``_realauction_entry``.

    Raises:
        KeyError: If a county config has no ``"source_id"`` key.
    """
    entries = {}
    for county_name, cfg in _RA_COUNTIES.items():
        source_id = cfg["source_id"]
        label = cfg.get("label", county_name)
        # Extract area_hint from label (e.g. "Duval (Jacksonville)" -> "Jacksonville").
        area_hint = ""
        if "(" in label and ")" in label:
            # Keep only the text between the first "(" and the next ")".
            # Stripping trailing ")" characters alone would leak anything that
            # follows the closing parenthesis into the hint
            # (e.g. "X (Y) Z" would yield "Y) Z").
            after_open = label.split("(", 1)[1]
            area_hint = after_open.partition(")")[0].strip()
        entries[source_id] = _realauction_entry(
            county_name, county_name, source_id, area_hint=area_hint
        )
    return entries


# Registry of every scraper source, keyed by source id. The auto-generated
# realauction clerk entries are expanded first, then the hand-written entries
# follow; on a key collision the later entry wins (dict display semantics).
SOURCES: dict[str, dict] = {
    # Clerk entries for every county in REALAUCTION_COUNTIES (auto-generated;
    # per the original note: all 41 FL counties on the realforeclose.com platform)
    **_all_realauction_clerk_entries(),
    "hud_homestore": {
        "label": "HUD Homestore — Federal REO (FHA defaults)",
        "callable_path": "scrapers.hud_homestore:run_scraper_to_db",
        "scope": "state",
        "supported_counties": None,  # any county within the state
        "supported_states": None,    # any US state — the "states" parameter below defaults to ["FL"]
        "deal_types_produced": ["reo"],
        "firecrawl_credits_per_run": 0,
        "stack": "playwright",
        "free": True,
        "description": (
            "Listings federales de propiedades HUD (REO post-foreclosure de loans "
            "FHA defaulted). Cubre el estado entero en una corrida. "
            "Listing periods: Exclusive (owner-occ primero) vs Extended (investors)."
        ),
        "parameters_schema": {
            "states": {"type": "list[str]", "default": ["FL"], "description": "USA state codes (e.g., FL, GA)"},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    "zillow": {
        "label": "Zillow MLS Listings",
        "callable_path": "scrapers.zillow:run_scraper_to_db",
        "scope": "county",
        "supported_counties": None,  # any county
        "supported_states": None,    # any state
        "deal_types_produced": ["mls", "foreclosure", "auction"],
        # Firecrawl: 1 credit per page scrape (marked "verified" by the original
        # author). Default is 1 page per county.
        "firecrawl_credits_per_run": 1,
        "stack": "firecrawl",
        "free": False,
        "description": (
            "Zillow MLS listings parametrizable por county. Cubre Single Family "
            "Homes (SFH). ~9-30 listings por page scrape. Detecta badges "
            "automaticamente (New construction, Price reduced, Foreclosure, Auction). "
            "Costo: 1 Firecrawl credit por page scrape (~$0.001 USD)."
        ),
        "parameters_schema": {
            "counties": {"type": "list[str]", "default": ["Miami-Dade"]},
            "state": {"type": "str", "default": "FL"},
            "pages_per_county": {"type": "int", "default": 1, "min": 1, "max": 5},
            "auto_classify": {"type": "bool", "default": True},
        },
    },
    # Slot for future scrapers (uncomment when shipped):
    # "realtor": {...},
    # "broward_clerk": {...},
    # "palm_beach_clerk": {...},
    # "hillsborough_clerk": {...},
}


# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════

def list_sources() -> list[dict]:
    """Return every registered source as a flat metadata dict.

    Each item is a new dict holding the registry key under ``"id"`` followed
    by the source's metadata fields, in registry order. The copy is shallow:
    nested values (lists, schemas) are the same objects the registry holds.
    """
    flattened = []
    for source_id, metadata in SOURCES.items():
        record = {"id": source_id}
        record.update(metadata)
        flattened.append(record)
    return flattened


def get_source(source_id: str) -> Optional[dict]:
    """Look up one source's metadata by its registry key.

    Returns the registry's own dict (not a copy), or ``None`` when the id is
    not registered.
    """
    if source_id in SOURCES:
        return SOURCES[source_id]
    return None


def get_sources_for_county(county: str) -> list[dict]:
    """Return the sources that support the given county.

    A source matches when its ``supported_counties`` is ``None`` or missing
    (no county restriction) or when it contains ``county``. The comparison is
    an exact membership test, so spelling and case must match the registry.

    Each result is a new dict with the registry key under ``"id"`` followed
    by the source's metadata, in registry order.
    """

    def _covers(metadata: dict) -> bool:
        # None means "every county"; otherwise the county must be listed.
        allowed = metadata.get("supported_counties")
        return allowed is None or county in allowed

    return [
        {"id": source_id, **metadata}
        for source_id, metadata in SOURCES.items()
        if _covers(metadata)
    ]


def get_sources_for_state(state: str) -> list[dict]:
    """Return the sources that support the given state.

    A source matches when its ``supported_states`` is ``None`` or missing
    (no state restriction) or when it contains ``state`` (exact match).

    Each result is a new dict with the registry key under ``"id"`` followed
    by the source's metadata, in registry order.
    """
    matches = []
    for source_id, metadata in SOURCES.items():
        allowed_states = metadata.get("supported_states")
        # Skip only sources that declare a state list which excludes this one.
        if allowed_states is not None and state not in allowed_states:
            continue
        matches.append({"id": source_id, **metadata})
    return matches


def estimate_credits(source_ids: list[str]) -> dict:
    """Sum the Firecrawl credits the given sources would consume together.

    Ids that are not registered are skipped. An id listed more than once is
    counted once per occurrence.

    Returns:
        A dict with ``"total_credits"`` (sum of each source's
        ``firecrawl_credits_per_run``, defaulting to 0 when absent) and
        ``"breakdown"``, a list with one row per counted source holding its
        ``source_id``, ``label``, ``credits`` and ``free`` flag (``False``
        when the source does not declare it).
    """
    running_total = 0
    rows = []
    for source_id in source_ids:
        metadata = SOURCES.get(source_id)
        if metadata:
            cost = metadata.get("firecrawl_credits_per_run", 0)
            running_total += cost
            row = {
                "source_id": source_id,
                "label": metadata["label"],
                "credits": cost,
                "free": metadata.get("free", False),
            }
            rows.append(row)
    return {"total_credits": running_total, "breakdown": rows}


def resolve_callable(source_id: str) -> Optional[Callable[..., Any]]:
    """Import and return the entry point registered for a source.

    The source's ``callable_path`` has the form ``"package.module:attribute"``.
    The module is imported on demand and the named attribute is returned.

    Args:
        source_id: Registry key of the source to resolve.

    Returns:
        The attribute named by ``callable_path``, or ``None`` when the source
        is unknown, the path is missing or malformed, the module cannot be
        imported, or the attribute does not exist. Import and attribute
        failures are logged as warnings instead of being silently dropped.
    """
    src = SOURCES.get(source_id)
    if not src:
        return None
    path = src.get("callable_path")
    if not path or ":" not in path:
        return None
    module_name, func_name = path.split(":", 1)
    if not module_name or not func_name:
        # An empty module name makes import_module raise ValueError, which the
        # handler below does not catch; treat it as a malformed path instead.
        return None
    try:
        mod = importlib.import_module(module_name)
        return getattr(mod, func_name)
    except (ImportError, AttributeError):
        # Defensive: don't crash the UI if a scraper module is broken, but
        # leave a trace so the failure can be diagnosed.
        logging.getLogger(__name__).warning(
            "Could not resolve callable %r for source %r",
            path,
            source_id,
            exc_info=True,
        )
        return None