# AR-House/search_engine.py

"""search_engine.py — High-level orchestrator para Search UI on-demand.

Invocado por la UI cuando el usuario clickea "Buscar deals":
1. Verifica Firecrawl budget pre-flight (alert/pause si excede)
2. Por cada source seleccionado, llama su `run_scraper_to_db()`
3. Aplica filtros opcionales (price/beds/deal_type)
4. Retorna deals filtrados desde deals.db
5. Tracking de Firecrawl credits consumidos

API:
    preflight_check(source_ids: list[str]) -> dict
        — verifica budget + retorna info para confirmation modal
    run_search(source_ids: list[str], counties: list[str], filters: dict,
               status_cb: Callable, **kwargs) -> dict
        — ejecuta scrapers y retorna deals matched
"""
from __future__ import annotations

import time
from typing import Callable, Optional

from scrapers.registry import (
    SOURCES, get_source, estimate_credits, resolve_callable,
)
from deals_db import (
    init_db, list_deals, firecrawl_budget_status, is_firecrawl_paused,
)


def preflight_check(source_ids: list[str]) -> dict:
    """Estimate the credit cost and budget state BEFORE running a search.

    Args:
        source_ids: ids of the sources the user selected.

    Returns:
        A dict meant for the confirmation modal:
            sources_info: per-source breakdown returned by ``estimate_credits``
            total_credits_estimated: total credits returned by ``estimate_credits``
            budget_snapshot: the dict returned by ``firecrawl_budget_status()``
            post_run_pct: float — projected % of the budget consumed after the
                run, rounded to one decimal (0 when the budget is zero/unset)
            warnings: list[str] of things the user should be aware of
            is_paused: bool — the budget's current ``is_paused`` flag
            ok_to_run: bool — False only when the budget is paused AND the
                selection is estimated to cost credits
    """
    init_db()
    estimation = estimate_credits(source_ids)
    budget = firecrawl_budget_status()

    def _threshold(key: str, fallback: int):
        # Read a threshold from the budget snapshot; fall back to the
        # historical hard-coded value when the key is missing or None.
        try:
            value = budget[key]
        except (KeyError, IndexError):
            return fallback
        return fallback if value is None else value

    # Compare against the same thresholds the warning messages quote, so the
    # check and the text cannot disagree when the thresholds are reconfigured.
    pause_pct = _threshold("pause_threshold_pct", 95)
    alert_pct = _threshold("alert_threshold_pct", 80)

    total_after = budget["credits_used"] + estimation["total_credits"]
    credits_budget = budget["credits_budget"]
    # Guard against division by zero when no budget is configured.
    post_run_pct = (total_after / credits_budget * 100) if credits_budget else 0

    warnings = []
    if budget["is_paused"]:
        warnings.append(
            "🚨 BUDGET FIRECRAWL EXCEDIDO (95%+). Solo sources gratuitos disponibles."
        )
    elif post_run_pct >= pause_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% — superara el "
            f"pause threshold ({pause_pct}%). Considere reducir sources."
        )
    elif post_run_pct >= alert_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% (alert "
            f"threshold {alert_pct}%)."
        )

    # Flag selected sources that are not present in the registry.
    invalid_sources = [sid for sid in source_ids if not get_source(sid)]
    if invalid_sources:
        warnings.append(f"Sources desconocidos (ignorados): {invalid_sources}")

    return {
        "sources_info": estimation["breakdown"],
        "total_credits_estimated": estimation["total_credits"],
        "budget_snapshot": budget,
        "post_run_pct": round(post_run_pct, 1),
        "warnings": warnings,
        "is_paused": budget["is_paused"],
        # A paused budget still allows runs that cost zero credits.
        "ok_to_run": not budget["is_paused"] or estimation["total_credits"] == 0,
    }


def _build_source_kwargs(
    src_id: str,
    src: dict,
    counties: Optional[list],
    overrides: dict,
    status_cb: Optional[Callable[[str], None]],
) -> dict:
    """Assemble the keyword arguments for one source's scraper callable.

    Starts from the defaults declared in the source's ``parameters_schema``,
    applies the caller's per-source override on top, then injects the
    user-selected counties (only when the schema declares a plural
    ``counties`` parameter and the caller did not override it). ``status_cb``
    is always injected.
    """
    schema_params = src.get("parameters_schema") or {}
    kwargs = {
        name: schema["default"]
        for name, schema in schema_params.items()
        if "default" in schema
    }
    # `or {}` tolerates an explicit None override instead of crashing in update().
    caller_override = overrides.get(src_id) or {}
    kwargs.update(caller_override)

    # Auto-inject user-selected counties into scrapers that accept them.
    # Bug fix: Zillow's registry default was ["Miami-Dade"], so selecting
    # Duval in the UI did nothing — Zillow always scraped Miami-Dade.
    # Scrapers that take a singular `county` are NOT overridden here: the
    # registry already hard-codes the right county per source_id.
    if counties and "counties" in schema_params and "counties" not in caller_override:
        kwargs["counties"] = list(counties)

    kwargs["status_cb"] = status_cb
    return kwargs


def _passes_search_filters(deal: dict, filters: dict, wanted_counties: Optional[set]) -> bool:
    """Return True when ``deal`` satisfies every active post-scrape filter.

    ``wanted_counties`` is the pre-normalized set of selected counties, or
    None when no county filter is active.
    """
    if wanted_counties is not None:
        # Normalize: "Miami-Dade" vs "Miami-Dade County"
        deal_county = (deal.get("county") or "").strip()
        if deal_county.replace(" County", "").strip().lower() not in wanted_counties:
            return False

    # A deal with no price fails any active price bound.
    price = deal.get("listing_price")
    min_price = filters.get("min_price")
    if min_price is not None and (price is None or price < min_price):
        return False
    max_price = filters.get("max_price")
    if max_price is not None and (price is None or price > max_price):
        return False

    beds_min = filters.get("beds_min")
    if beds_min is not None:
        beds = deal.get("beds")
        if beds is None or beds < beds_min:
            return False

    if filters.get("deal_types") and deal.get("deal_type") not in filters["deal_types"]:
        return False
    if (filters.get("classifications")
            and deal.get("classification_status") not in filters["classifications"]):
        return False
    if filters.get("cities"):
        # City comparison is exact (case-sensitive) after trimming whitespace.
        if (deal.get("city") or "").strip() not in filters["cities"]:
            return False

    # REDEEMED bug fix — filter out dead auction statuses by default
    if not filters.get("include_dead_auctions") and _is_auction_dead(deal.get("auction_status")):
        return False
    return True


def run_search(
    *,
    source_ids: list[str],
    counties: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    scraper_kwargs_override: Optional[dict] = None,
    cancel_check: Optional[Callable[[], bool]] = None,
) -> dict:
    """Run an on-demand multi-source search.

    Args:
        source_ids: source ids to run (e.g. ["miami_dade_clerk", "hud_homestore"])
        counties: selected counties; injected into scrapers whose schema has a
            ``counties`` parameter and used to filter the returned deals
        filters: post-scrape filters. Keys applied here: min_price, max_price,
            beds_min, deal_types, classifications, cities,
            include_dead_auctions.
            NOTE(review): a ``min_score`` key is NOT applied by this function.
        status_cb: progress callback; also passed to every scraper
        scraper_kwargs_override: optional per-source dict overriding defaults
            e.g. {"hud_homestore": {"states": ["FL", "GA"]}, "miami_dade_clerk": {"days_ahead": 7}}
        cancel_check: optional callable returning True once the user clicked
            cancel. It is checked between source iterations only; it does NOT
            interrupt a scraper that is already running.

    Returns:
        dict with:
            runs: list[dict] — one run summary per scraper that completed
            new_deals_count: int — total new deals across all sources
            total_credits_used: int — Firecrawl credits consumed
            elapsed_seconds: float
            matching_deals: list[dict] — deals passing the filters (capped at 200)
            matching_total: int — number of matching deals before the cap
            filters_applied: the filters dict that was used
            counties_searched: the selected counties (empty list if none)
            errors: list[str]
            cancelled: bool — True when cancelled mid-batch
    """
    init_db()
    filters = filters or {}
    scraper_kwargs_override = scraper_kwargs_override or {}

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _was_cancelled() -> bool:
        return bool(cancel_check and cancel_check())

    t0 = time.perf_counter()
    runs: list[dict] = []
    total_new = 0
    total_credits = 0
    errors: list[str] = []
    cancelled = False

    for position, src_id in enumerate(source_ids):
        # Cooperative cancellation between sources
        if _was_cancelled():
            cancelled = True
            # Current source plus everything after it; positional so that
            # duplicate ids in source_ids do not skew the count.
            pending = len(source_ids) - position
            _log(f"Cancelacion solicitada — saltando sources restantes ({pending} pendientes)")
            errors.append("Cancelled by user")
            break

        src = get_source(src_id)
        if not src:
            errors.append(f"Source '{src_id}' not in registry")
            continue

        # Check budget mid-loop in case prior iteration consumed credits
        if is_firecrawl_paused() and not src.get("free"):
            errors.append(f"Source '{src_id}' skipped — Firecrawl budget paused")
            continue

        callable_fn = resolve_callable(src_id)
        if not callable_fn:
            errors.append(f"Source '{src_id}' callable could not be resolved")
            continue

        kwargs = _build_source_kwargs(
            src_id, src, counties, scraper_kwargs_override, status_cb,
        )

        _log(f"▶️ Running source '{src['label']}' with kwargs={list(kwargs.keys())}...")
        try:
            run_summary = callable_fn(**kwargs)
            run_summary["source_id"] = src_id
            run_summary["source_label"] = src["label"]
            # Read counters before recording the run so a missing/None counter
            # cannot leave a half-accounted entry behind.
            new_deals = run_summary.get("deals_new") or 0
            credits_used = run_summary.get("firecrawl_credits_used") or 0
            runs.append(run_summary)
            total_new += new_deals
            total_credits += credits_used
            _log(f"  ✓ {src_id}: {new_deals} new deals")
        except Exception as e:
            # Batch boundary: one failing scraper must not abort the others.
            err_msg = f"Source '{src_id}' raised: {type(e).__name__}: {e}"
            errors.append(err_msg)
            _log(f"  ✗ {err_msg}")

    # ─── Post-scrape: query matching deals from deals.db ───────────────────
    # Normalize the selected counties once instead of once per deal.
    wanted_counties = (
        {c.replace(" County", "").strip().lower() for c in counties}
        if counties else None
    )

    # Filter and dedup by deal_hash in a single pass. Deals without a
    # deal_hash are dropped. Each distinct source id is queried once.
    matching: list[dict] = []
    seen_hashes = set()
    for src_id in dict.fromkeys(source_ids):
        for deal in list_deals(source=src_id, limit=500):
            if not _passes_search_filters(deal, filters, wanted_counties):
                continue
            deal_hash = deal.get("deal_hash")
            if deal_hash and deal_hash not in seen_hashes:
                seen_hashes.add(deal_hash)
                matching.append(deal)

    # Sort by classification_score desc (best first), newest scrape as tiebreak
    matching.sort(key=lambda d: (d.get("classification_score") or 0,
                                  d.get("scraped_at") or ""), reverse=True)

    elapsed = time.perf_counter() - t0
    summary_msg = f"Cancelado tras {elapsed:.0f}s" if cancelled else f"Busqueda completa: {len(matching)} deals tras filtros, {elapsed:.0f}s"
    _log(summary_msg)

    return {
        "runs": runs,
        "new_deals_count": total_new,
        "total_credits_used": total_credits,
        "elapsed_seconds": round(elapsed, 1),
        "matching_deals": matching[:200],  # cap for UI rendering
        "matching_total": len(matching),
        "filters_applied": filters,
        "counties_searched": counties or [],
        "errors": errors,
        "cancelled": cancelled,
    }


# REDEEMED bug fix — the single source of truth is realauction_clerk._DEAD_STATUS_SUBSTRINGS.
# We import it here to avoid drift between the parser's filter and the search filter.
# Substring matching catches variants ("Canceled per Bankruptcy", "Canceled per Order", etc.)
from scrapers.realauction_clerk import (
    _DEAD_STATUS_SUBSTRINGS as _DEAD_AUCTION_SUBSTRINGS,
    _is_status_dead as _is_auction_dead,
)


def search_existing_only(
    *,
    counties: Optional[list[str]] = None,
    source_ids: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    include_dead_auctions: bool = False,
) -> list[dict]:
    """Browse deals already stored in deals.db WITHOUT running any scraper.

    Useful when the user only wants to see previously scraped deals without
    spending time/credits re-scraping.

    Args:
        counties: when given, keep only deals whose county matches one of
            these (case-insensitive, ignoring a " County" suffix).
        source_ids: when given (non-empty), pull up to 500 deals per source;
            otherwise pull up to 2000 deals across all sources.
        filters: optional dict. Keys applied: min_price, max_price, beds_min,
            deal_types, classifications, cities, include_dead_auctions.
        include_dead_auctions: when False (default), deals whose
            auction_status is considered dead (Redeemed, Canceled, Sold, ...)
            are filtered out. Dead auctions are also kept when
            ``filters["include_dead_auctions"]`` is truthy, matching how
            ``run_search`` reads that option.

    Returns:
        Up to 500 deals, deduplicated by ``deal_hash`` (deals without a hash
        are dropped) and sorted by classification_score then scraped_at,
        descending.
    """
    init_db()
    filters = filters or {}
    # Honor the option from either place so the same filters dict behaves the
    # same way here and in run_search.
    include_dead = include_dead_auctions or bool(filters.get("include_dead_auctions"))

    # Pull from each requested source, or from everything when none is given.
    if source_ids:
        pool: list[dict] = []
        for source in source_ids:
            pool.extend(list_deals(source=source, limit=500))
    else:
        pool = list_deals(limit=2000)

    # Normalize the selected counties once instead of once per deal.
    wanted_counties = (
        {c.replace(" County", "").strip().lower() for c in counties}
        if counties else None
    )
    min_price = filters.get("min_price")
    max_price = filters.get("max_price")
    beds_min = filters.get("beds_min")

    out: list[dict] = []
    seen = set()
    for deal in pool:
        if wanted_counties is not None:
            deal_county = (deal.get("county") or "").replace(" County", "").strip().lower()
            if deal_county not in wanted_counties:
                continue
        # A deal with no price fails any active price bound.
        price = deal.get("listing_price")
        if min_price is not None and (price is None or price < min_price):
            continue
        if max_price is not None and (price is None or price > max_price):
            continue
        # Missing beds are treated as 0 here.
        if beds_min is not None and (deal.get("beds") or 0) < beds_min:
            continue
        if filters.get("deal_types") and deal.get("deal_type") not in filters["deal_types"]:
            continue
        if (filters.get("classifications")
                and deal.get("classification_status") not in filters["classifications"]):
            continue
        if filters.get("cities"):
            if (deal.get("city") or "").strip() not in filters["cities"]:
                continue
        # REDEEMED bug fix — filter out dead auction statuses by default
        if not include_dead and _is_auction_dead(deal.get("auction_status")):
            continue
        # Dedup by deal_hash; first occurrence wins.
        deal_hash = deal.get("deal_hash")
        if deal_hash and deal_hash not in seen:
            seen.add(deal_hash)
            out.append(deal)

    out.sort(key=lambda d: (d.get("classification_score") or 0,
                              d.get("scraped_at") or ""), reverse=True)
    return out[:500]