218 lines
9.1 KiB
Python
218 lines
9.1 KiB
Python
"""scrapers/registry.py — Single source of truth for scraper metadata.
|
|
|
|
Phase 3B introduces multiple scrapers. The Search UI (B4) needs to know:
|
|
- Which sources exist
|
|
- Which counties each supports (Miami-Dade Clerk = Miami-Dade only;
  HUD = any state; Zillow = any county)
|
|
- Firecrawl credit cost estimate per run
|
|
- The callable entry point that executes the scraper end-to-end
|
|
|
|
When a new scraper is added (B5 Zillow, B6 Realtor, etc.), register it here.
|
|
|
|
REGISTRY STRUCTURE:
|
|
{
|
|
"source_id": {
|
|
"label": "Human-readable name",
|
|
"callable_path": "scrapers.miami_dade_clerk:run_scraper_to_db",
|
|
"scope": "county" | "state" | "national",
|
|
"supported_counties": ["Miami-Dade"] | None (None = all counties),
|
|
"supported_states": ["FL"] | None (None = all states),
|
|
"deal_types_produced": ["foreclosure", "tax_deed", "reo", "mls"],
|
|
"firecrawl_credits_per_run": int (conservative estimate),
|
|
"stack": "playwright" | "firecrawl" | "hybrid",
|
|
"free": bool (True if it does NOT consume Firecrawl credits),
|
|
"description": "...",
|
|
"parameters_schema": dict (kwargs accepted by the callable),
|
|
}
|
|
}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
from typing import Any, Callable, Optional
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# REGISTRY DATA
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Helper: realauction.com county clerks share the same engine (realauction_clerk.py).
|
|
# DRY: build the registry entries from a config table.
|
|
def _realauction_entry(county_label: str, county_name: str, source_id: str, area_hint: str = "") -> dict:
|
|
"""Build a registry entry for a realauction.com white-label county clerk."""
|
|
nice_label = f"{county_label} Clerk{f' ({area_hint})' if area_hint else ''} — Foreclosure + Tax Deed"
|
|
return {
|
|
"label": nice_label,
|
|
"callable_path": "scrapers.realauction_clerk:run_scraper_to_db",
|
|
"scope": "county",
|
|
"supported_counties": [county_name],
|
|
"supported_states": ["FL"],
|
|
"deal_types_produced": ["foreclosure", "tax_deed"],
|
|
"firecrawl_credits_per_run": 0,
|
|
"stack": "playwright",
|
|
"free": True,
|
|
"description": (
|
|
f"Auctions de foreclosure (Circuit Court cases) + tax deed sales de "
|
|
f"{county_label} County, FL, via realauction.com. Cubre ~7-15 dias "
|
|
f"hacia adelante por corrida. Gratis (Playwright local)."
|
|
),
|
|
"parameters_schema": {
|
|
# county kwarg se inyecta automaticamente desde aqui — search_engine
|
|
# construye kwargs leyendo este schema y pasa "county" al callable
|
|
"county": {"type": "str", "default": county_name},
|
|
"days_ahead": {"type": "int", "default": 14, "min": 1, "max": 30},
|
|
"days_back": {"type": "int", "default": 0, "min": 0, "max": 7},
|
|
"auto_classify": {"type": "bool", "default": True},
|
|
},
|
|
}
|
|
|
|
|
|
# Auto-generate clerk entries from REALAUCTION_COUNTIES (single source of truth).
|
|
# Adding a county there → auto-registers here. No duplication.
|
|
from scrapers.realauction_clerk import REALAUCTION_COUNTIES as _RA_COUNTIES
|
|
|
|
|
|
def _all_realauction_clerk_entries() -> dict:
    """Build one registry entry per county listed in REALAUCTION_COUNTIES.

    Returns:
        Dict mapping each county's ``source_id`` to the record produced by
        ``_realauction_entry``. The county name (the mapping key) is passed as
        both the label and the name; the config ``label`` is only used to
        derive the parenthesised area hint.
    """
    entries = {}
    for name, county_cfg in _RA_COUNTIES.items():
        key = county_cfg["source_id"]
        display = county_cfg.get("label", name)
        # Take the text after the first "(" as the hint,
        # e.g. "Duval (Jacksonville)" → "Jacksonville".
        has_hint = "(" in display and ")" in display
        hint = display.split("(", 1)[1].rstrip(")").strip() if has_hint else ""
        entries[key] = _realauction_entry(name, name, key, area_hint=hint)
    return entries
|
|
|
|
|
|
# HUD Homestore: federal REO listings, scraped per state.
_HUD_HOMESTORE_ENTRY: dict = {
    "label": "HUD Homestore — Federal REO (FHA defaults)",
    "callable_path": "scrapers.hud_homestore:run_scraper_to_db",
    "scope": "state",
    "supported_counties": None,  # any county within the state
    "supported_states": None,  # any US state — the "states" parameter defaults to FL
    "deal_types_produced": ["reo"],
    "firecrawl_credits_per_run": 0,
    "stack": "playwright",
    "free": True,
    "description": (
        "Listings federales de propiedades HUD (REO post-foreclosure de loans "
        "FHA defaulted). Cubre el estado entero en una corrida. "
        "Listing periods: Exclusive (owner-occ primero) vs Extended (investors)."
    ),
    "parameters_schema": {
        "states": {"type": "list[str]", "default": ["FL"], "description": "USA state codes (e.g., FL, GA)"},
        "auto_classify": {"type": "bool", "default": True},
    },
}

# Zillow: MLS listings fetched through Firecrawl, parameterized by county.
_ZILLOW_ENTRY: dict = {
    "label": "Zillow MLS Listings",
    "callable_path": "scrapers.zillow:run_scraper_to_db",
    "scope": "county",
    "supported_counties": None,  # any county
    "supported_states": None,  # any state
    "deal_types_produced": ["mls", "foreclosure", "auction"],
    # Firecrawl: 1 credit per page scrape (verified). Default is 1 page per county.
    "firecrawl_credits_per_run": 1,
    "stack": "firecrawl",
    "free": False,
    "description": (
        "Zillow MLS listings parametrizable por county. Cubre Single Family "
        "Homes (SFH). ~9-30 listings por page scrape. Detecta badges "
        "automaticamente (New construction, Price reduced, Foreclosure, Auction). "
        "Costo: 1 Firecrawl credit por page scrape (~$0.001 USD)."
    ),
    "parameters_schema": {
        "counties": {"type": "list[str]", "default": ["Miami-Dade"]},
        "state": {"type": "str", "default": "FL"},
        "pages_per_county": {"type": "int", "default": 1, "min": 1, "max": 5},
        "auto_classify": {"type": "bool", "default": True},
    },
}

# Registry of every scraper source, keyed by source id. The county clerk
# entries are auto-generated from REALAUCTION_COUNTIES and come first; the
# hand-written sources follow. Future scrapers (e.g. "realtor") are added
# here once they ship.
SOURCES: dict[str, dict] = {
    **_all_realauction_clerk_entries(),
    "hud_homestore": _HUD_HOMESTORE_ENTRY,
    "zillow": _ZILLOW_ENTRY,
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Public API
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def list_sources() -> list[dict]:
    """Return every registered source as a new dict that also carries its ``id``.

    Each item is a shallow copy of the registry entry with the registry key
    added under ``"id"`` as the first key.
    """
    listing = []
    for source_id, meta in SOURCES.items():
        record = {"id": source_id}
        record.update(meta)
        listing.append(record)
    return listing
|
|
|
|
|
|
def get_source(source_id: str) -> Optional[dict]:
    """Look up one source's metadata by id.

    Returns:
        The registry entry itself (not a copy), or ``None`` when ``source_id``
        is not registered.
    """
    try:
        return SOURCES[source_id]
    except KeyError:
        return None
|
|
|
|
|
|
def get_sources_for_county(county: str) -> list[dict]:
    """Return the sources whose metadata covers the given county.

    A source matches when its ``supported_counties`` is ``None`` or absent
    (no county restriction), or when ``county`` is contained in it. Each
    result is a shallow copy of the registry entry with its ``"id"`` added,
    in registry order.
    """

    def _covers(meta: dict) -> bool:
        allowed = meta.get("supported_counties")
        return allowed is None or county in allowed

    return [{"id": sid, **meta} for sid, meta in SOURCES.items() if _covers(meta)]
|
|
|
|
|
|
def get_sources_for_state(state: str) -> list[dict]:
    """Return the sources whose metadata covers the given state.

    A source matches when its ``supported_states`` is ``None`` or absent
    (no state restriction), or when ``state`` is contained in it. Each result
    is a shallow copy of the registry entry with its ``"id"`` added, in
    registry order.
    """

    def _covers(meta: dict) -> bool:
        allowed = meta.get("supported_states")
        return allowed is None or state in allowed

    return [{"id": sid, **meta} for sid, meta in SOURCES.items() if _covers(meta)]
|
|
|
|
|
|
def estimate_credits(source_ids: list[str]) -> dict:
    """Sum the Firecrawl credits the given sources would consume together.

    Ids that are not registered are skipped. An id listed more than once is
    counted once per occurrence.

    Returns:
        ``{"total_credits": <sum>, "breakdown": [...]}`` where each breakdown
        row holds ``source_id``, ``label``, ``credits`` and ``free``.
    """
    rows = []
    running_total = 0
    for sid in source_ids:
        meta = SOURCES.get(sid)
        if meta:
            cost = meta.get("firecrawl_credits_per_run", 0)
            running_total += cost
            rows.append(
                dict(
                    source_id=sid,
                    label=meta["label"],
                    credits=cost,
                    free=meta.get("free", False),
                )
            )
    return dict(total_credits=running_total, breakdown=rows)
|
|
|
|
|
|
def resolve_callable(source_id: str) -> Optional[Callable[..., Any]]:
    """Import and return the entry-point callable registered for a source.

    The source's ``callable_path`` has the form ``"package.module:function"``.

    Args:
        source_id: Registry key of the source to resolve.

    Returns:
        The callable named by ``callable_path``, or ``None`` when the source is
        unknown, the path is missing or malformed, the module cannot be
        imported, the attribute does not exist, or the attribute is not
        callable. Load failures are logged rather than raised.
    """
    # Local import keeps this block self-contained; logging is stdlib.
    import logging

    log = logging.getLogger(__name__)

    src = SOURCES.get(source_id)
    if not src:
        return None

    path = src.get("callable_path")
    if not path or not isinstance(path, str):
        return None

    module_name, sep, func_name = path.partition(":")
    # Both halves must be present: importlib.import_module("") raises
    # ValueError, which would otherwise escape to the caller.
    if not (sep and module_name and func_name):
        log.warning("Malformed callable_path %r for source %r", path, source_id)
        return None

    try:
        target = getattr(importlib.import_module(module_name), func_name)
    except Exception:
        # Deliberately broad: importing a broken scraper module can raise
        # anything (SyntaxError, NameError, ...), and the UI must not crash.
        # The traceback is logged so the failure is not silent.
        log.exception("Could not load scraper callable %r for source %r", path, source_id)
        return None

    if not callable(target):
        log.warning("callable_path %r for source %r is not callable", path, source_id)
        return None
    return target
|