"""search_engine.py — High-level orchestrator para Search UI on-demand. Invocado por la UI cuando el usuario clickea "Buscar deals": 1. Verifica Firecrawl budget pre-flight (alert/pause si excede) 2. Por cada source seleccionado, llama su `run_scraper_to_db()` 3. Aplica filtros opcionales (price/beds/deal_type) 4. Retorna deals filtrados desde deals.db 5. Tracking de Firecrawl credits consumidos API: preflight_check(source_ids: list[str]) -> dict — verifica budget + retorna info para confirmation modal run_search(source_ids: list[str], counties: list[str], filters: dict, status_cb: Callable, **kwargs) -> dict — ejecuta scrapers y retorna deals matched """ from __future__ import annotations import time from typing import Callable, Optional from scrapers.registry import ( SOURCES, get_source, estimate_credits, resolve_callable, ) from deals_db import ( init_db, list_deals, firecrawl_budget_status, is_firecrawl_paused, ) def preflight_check(source_ids: list[str]) -> dict: """Calcula credits estimados + estado del budget ANTES de ejecutar. Returns dict para mostrar en confirmation modal: sources_info: list with each source's metadata + cost total_credits_estimated: int budget_snapshot: {used, total, remaining, alert_level, is_paused} post_run_pct: float — % del budget que quedara consumido despues warnings: list[str] de cosas a tener en cuenta is_paused: bool — True si NO se debe correr (budget excedido) """ init_db() estimation = estimate_credits(source_ids) budget = firecrawl_budget_status() total_after = budget["credits_used"] + estimation["total_credits"] post_run_pct = (total_after / budget["credits_budget"] * 100) if budget["credits_budget"] else 0 warnings = [] if budget["is_paused"]: warnings.append( f"🚨 BUDGET FIRECRAWL EXCEDIDO (95%+). Solo sources gratuitos disponibles." ) elif post_run_pct >= 95: warnings.append( f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% — superara el " f"pause threshold ({budget['pause_threshold_pct']}%). 
Considere reducir sources." ) elif post_run_pct >= 80: warnings.append( f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% (alert " f"threshold {budget['alert_threshold_pct']}%)." ) # Validate that selected sources actually exist invalid_sources = [sid for sid in source_ids if not get_source(sid)] if invalid_sources: warnings.append(f"Sources desconocidos (ignorados): {invalid_sources}") return { "sources_info": estimation["breakdown"], "total_credits_estimated": estimation["total_credits"], "budget_snapshot": budget, "post_run_pct": round(post_run_pct, 1), "warnings": warnings, "is_paused": budget["is_paused"], "ok_to_run": not budget["is_paused"] or estimation["total_credits"] == 0, } def run_search( *, source_ids: list[str], counties: Optional[list[str]] = None, filters: Optional[dict] = None, status_cb: Optional[Callable[[str], None]] = None, scraper_kwargs_override: Optional[dict] = None, cancel_check: Optional[Callable[[], bool]] = None, ) -> dict: """Ejecuta multi-source search on-demand. Args: source_ids: lista de source ids a ejecutar (e.g. ["miami_dade_clerk", "hud_homestore"]) counties: lista de condados a focusear (informacional para reporte) filters: dict con filtros post-scrape (min_price, max_price, beds_min, deal_types) status_cb: callback de progreso scraper_kwargs_override: dict opcional per-source para override defaults e.g. {"hud_homestore": {"states": ["FL", "GA"]}, "miami_dade_clerk": {"days_ahead": 7}} cancel_check: callable opcional → True si el usuario clickeo cancelar. Se chequea entre source iterations. NO interrumpe mid-source (los scrapers son blocking); para eso usar Streamlit Stop button. 
Returns dict con: runs: list[dict] — un run summary por scraper new_deals_count: int — total nuevos deals across all sources total_credits_used: int — Firecrawl credits consumidos elapsed_seconds: float matching_deals: list[dict] — deals que pasan los filtros (limite 200) errors: list[str] cancelled: bool — True si fue cancelado mid-batch """ init_db() filters = filters or {} scraper_kwargs_override = scraper_kwargs_override or {} def _log(msg: str) -> None: if status_cb: status_cb(msg) def _was_cancelled() -> bool: return bool(cancel_check and cancel_check()) t0 = time.perf_counter() runs: list[dict] = [] total_new = 0 total_credits = 0 errors: list[str] = [] cancelled = False for src_id in source_ids: # Cooperative cancellation between sources if _was_cancelled(): cancelled = True _log(f"Cancelacion solicitada — saltando sources restantes ({len([s for s in source_ids if s == src_id or source_ids.index(s) > source_ids.index(src_id)])} pendientes)") errors.append("Cancelled by user") break src = get_source(src_id) if not src: errors.append(f"Source '{src_id}' not in registry") continue # Check budget mid-loop in case prior iteration consumed credits if is_firecrawl_paused() and not src.get("free"): errors.append(f"Source '{src_id}' skipped — Firecrawl budget paused") continue callable_fn = resolve_callable(src_id) if not callable_fn: errors.append(f"Source '{src_id}' callable could not be resolved") continue # Build kwargs: defaults from registry + override from caller kwargs = {} schema_params = src.get("parameters_schema") or {} for param_name, schema in schema_params.items(): if "default" in schema: kwargs[param_name] = schema["default"] if src_id in scraper_kwargs_override: kwargs.update(scraper_kwargs_override[src_id]) # Auto-inject user-selected counties into scrapers that accept them. # Bug fix: Zillow's registry default was ["Miami-Dade"], so selecting # Duval in the UI did nothing — Zillow always scraped Miami-Dade. 
# If the scraper has a `counties` (plural) param AND the user selected # counties, inject them — unless caller explicitly overrode. if counties: caller_override = scraper_kwargs_override.get(src_id, {}) if "counties" in schema_params and "counties" not in caller_override: kwargs["counties"] = list(counties) # Some scrapers take singular `county` (e.g., realauction county-specific # variants). For those, the registry already hard-codes the right county # per source_id (duval_clerk → "Duval"), so we do NOT override here. # Always inject status_cb kwargs["status_cb"] = status_cb _log(f"▶️ Running source '{src['label']}' with kwargs={list(kwargs.keys())}...") try: run_summary = callable_fn(**kwargs) run_summary["source_id"] = src_id run_summary["source_label"] = src["label"] runs.append(run_summary) total_new += run_summary.get("deals_new", 0) total_credits += run_summary.get("firecrawl_credits_used", 0) _log(f" ✓ {src_id}: {run_summary.get('deals_new', 0)} new deals") except Exception as e: err_msg = f"Source '{src_id}' raised: {type(e).__name__}: {e}" errors.append(err_msg) _log(f" ✗ {err_msg}") # ─── Post-scrape: query matching deals from deals.db ─────────────────── # Apply filters list_deals_kwargs = {"limit": 500} if filters.get("min_score"): list_deals_kwargs["min_score"] = filters["min_score"] # Note: list_deals soporta filter por source/county/classification/status # We'll do filtering manually for richer filters matching: list[dict] = [] # Collect from all sources we just ran for src_id in source_ids: deals_from_src = list_deals(source=src_id, limit=500) for d in deals_from_src: # Filter by counties (if specified) if counties: deal_county = (d.get("county") or "").strip() # Normalize: "Miami-Dade" vs "Miami-Dade County" deal_county_norm = deal_county.replace(" County", "").strip().lower() wanted = any(deal_county_norm == c.replace(" County", "").strip().lower() for c in counties) if not wanted: continue # Filter by min_price / max_price price = 
d.get("listing_price") if filters.get("min_price") is not None: if price is None or price < filters["min_price"]: continue if filters.get("max_price") is not None: if price is None or price > filters["max_price"]: continue # Filter by beds_min if filters.get("beds_min") is not None: beds = d.get("beds") if beds is None or beds < filters["beds_min"]: continue # Filter by deal_types if filters.get("deal_types"): if d.get("deal_type") not in filters["deal_types"]: continue # Filter by classification_status if filters.get("classifications"): if d.get("classification_status") not in filters["classifications"]: continue # Filter by cities if filters.get("cities"): deal_city = (d.get("city") or "").strip() if deal_city not in filters["cities"]: continue # REDEEMED bug fix — filter out dead auction statuses by default if not filters.get("include_dead_auctions") and _is_auction_dead(d.get("auction_status")): continue matching.append(d) # Dedup by deal_hash (in case same deal appears from multiple sources — rare but possible) seen_hashes = set() deduped = [] for d in matching: h = d.get("deal_hash") if h and h not in seen_hashes: seen_hashes.add(h) deduped.append(d) matching = deduped # Sort by classification_score desc (best first) matching.sort(key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""), reverse=True) elapsed = time.perf_counter() - t0 summary_msg = f"Cancelado tras {elapsed:.0f}s" if cancelled else f"Busqueda completa: {len(matching)} deals tras filtros, {elapsed:.0f}s" _log(summary_msg) return { "runs": runs, "new_deals_count": total_new, "total_credits_used": total_credits, "elapsed_seconds": round(elapsed, 1), "matching_deals": matching[:200], # cap for UI rendering "matching_total": len(matching), "filters_applied": filters, "counties_searched": counties or [], "errors": errors, "cancelled": cancelled, } # REDEEMED bug fix — single source of truth en realauction_clerk._DEAD_STATUS_SUBSTRINGS. 
# Imported from the parser module to avoid drift between the parser's filter
# and the search filter.
# Substring matching captures variants ("Canceled per Bankruptcy",
# "Canceled per Order", etc.).
from scrapers.realauction_clerk import (
    _DEAD_STATUS_SUBSTRINGS as _DEAD_AUCTION_SUBSTRINGS,
    _is_status_dead as _is_auction_dead,
)


def search_existing_only(
    *,
    counties: Optional[list[str]] = None,
    source_ids: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    include_dead_auctions: bool = False,
) -> list[dict]:
    """Browse the deals already stored in deals.db WITHOUT running scrapers.

    Useful when the user only wants to see previously scraped deals without
    spending time/credits on re-scraping.

    Args:
        counties: keep only deals whose county matches one of these (compared
            case-insensitively, ignoring a " County" suffix).
        source_ids: restrict the pool to these sources (up to 500 deals each);
            when empty/None the pool is the first 2000 deals of any source.
        filters: optional filters. Keys read here: min_price, max_price,
            beds_min, deal_types, classifications, cities,
            include_dead_auctions.
        include_dead_auctions: when False (default), deals whose
            ``auction_status`` is considered dead (Redeemed, Canceled, Sold,
            etc.) are dropped — those no longer go to auction. The same flag
            is also honoured when given as ``filters["include_dead_auctions"]``,
            matching how run_search reads it.

    Returns:
        Up to 500 de-duplicated deals, best ``classification_score`` first.
    """
    init_db()
    filters = filters or {}
    # Accept the flag from either place so a filters dict built for run_search
    # behaves the same way here.
    keep_dead = include_dead_auctions or bool(filters.get("include_dead_auctions"))

    # Pull from each requested source, or from everything when none is given.
    pool: list[dict] = []
    if source_ids:
        for s in source_ids:
            pool.extend(list_deals(source=s, limit=500))
    else:
        pool = list_deals(limit=2000)

    # Normalise the wanted counties once instead of once per deal.
    wanted_counties = (
        {c.replace(" County", "").strip().lower() for c in counties} if counties else set()
    )

    matching: list[dict] = []
    for d in pool:
        # County filter
        if wanted_counties:
            deal_county_norm = (d.get("county") or "").replace(" County", "").strip().lower()
            if deal_county_norm not in wanted_counties:
                continue

        # Price filter — a deal without a price fails any explicit bound.
        price = d.get("listing_price")
        if filters.get("min_price") is not None:
            if price is None or price < filters["min_price"]:
                continue
        if filters.get("max_price") is not None:
            if price is None or price > filters["max_price"]:
                continue

        # Beds — a missing value is treated as 0 here.
        # NOTE(review): run_search rejects deals with missing beds outright;
        # confirm which behaviour is intended.
        if filters.get("beds_min") is not None:
            if (d.get("beds") or 0) < filters["beds_min"]:
                continue

        # Deal type
        if filters.get("deal_types") and d.get("deal_type") not in filters["deal_types"]:
            continue

        # Classification
        if (
            filters.get("classifications")
            and d.get("classification_status") not in filters["classifications"]
        ):
            continue

        # Cities
        if filters.get("cities"):
            deal_city = (d.get("city") or "").strip()
            if deal_city not in filters["cities"]:
                continue

        # REDEEMED bug fix — filter out dead auction statuses by default.
        if not keep_dead and _is_auction_dead(d.get("auction_status")):
            continue

        matching.append(d)

    # Dedup by deal_hash (deals without one are dropped), then sort.
    seen = set()
    out = []
    for d in matching:
        h = d.get("deal_hash")
        if h and h not in seen:
            seen.add(h)
            out.append(d)

    out.sort(
        key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""),
        reverse=True,
    )
    return out[:500]