359 lines
14 KiB
Python
359 lines
14 KiB
Python
"""search_engine.py — High-level orchestrator para Search UI on-demand.
|
|
|
|
Invocado por la UI cuando el usuario clickea "Buscar deals":
|
|
1. Verifica Firecrawl budget pre-flight (alert/pause si excede)
|
|
2. Por cada source seleccionado, llama su `run_scraper_to_db()`
|
|
3. Aplica filtros opcionales (price/beds/deal_type)
|
|
4. Retorna deals filtrados desde deals.db
|
|
5. Tracking de Firecrawl credits consumidos
|
|
|
|
API:
|
|
preflight_check(source_ids: list[str]) -> dict
|
|
— verifica budget + retorna info para confirmation modal
|
|
run_search(source_ids: list[str], counties: list[str], filters: dict,
|
|
status_cb: Callable, **kwargs) -> dict
|
|
— ejecuta scrapers y retorna deals matched
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from typing import Callable, Optional
|
|
|
|
from scrapers.registry import (
|
|
SOURCES, get_source, estimate_credits, resolve_callable,
|
|
)
|
|
from deals_db import (
|
|
init_db, list_deals, firecrawl_budget_status, is_firecrawl_paused,
|
|
)
|
|
|
|
|
|
def preflight_check(source_ids: list[str]) -> dict:
    """Estimate the credit cost of a run and report the budget state BEFORE running.

    Args:
        source_ids: ids of the sources the user selected.

    Returns:
        dict meant for the confirmation modal, with keys:
            sources_info: the ``breakdown`` returned by ``estimate_credits``
            total_credits_estimated: the ``total_credits`` of that estimation
            budget_snapshot: the dict returned by ``firecrawl_budget_status``
            post_run_pct: float — % of the budget that would be consumed
                after the run (0 when the budget is 0/unset), rounded to 1 decimal
            warnings: list[str] of things the user should be aware of
            is_paused: bool — the budget's ``is_paused`` flag
            ok_to_run: bool — True when the budget is not paused, or when the
                selected sources are estimated to cost 0 credits
    """
    init_db()
    estimation = estimate_credits(source_ids)
    budget = firecrawl_budget_status()

    credits_budget = budget["credits_budget"]
    total_after = budget["credits_used"] + estimation["total_credits"]
    # Guard against a zero/unset budget to avoid dividing by zero.
    post_run_pct = (total_after / credits_budget * 100) if credits_budget else 0

    # Compare against the configured thresholds (the same values the messages
    # print) instead of hard-coded literals; fall back to the historical 95/80
    # when the snapshot does not carry them.
    pause_pct = budget.get("pause_threshold_pct") or 95
    alert_pct = budget.get("alert_threshold_pct") or 80

    warnings: list[str] = []
    if budget["is_paused"]:
        warnings.append(
            f"🚨 BUDGET FIRECRAWL EXCEDIDO ({pause_pct}%+). Solo sources gratuitos disponibles."
        )
    elif post_run_pct >= pause_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% — superara el "
            f"pause threshold ({pause_pct}%). Considere reducir sources."
        )
    elif post_run_pct >= alert_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% (alert "
            f"threshold {alert_pct}%)."
        )

    # Validate that the selected sources actually exist in the registry.
    invalid_sources = [sid for sid in source_ids if not get_source(sid)]
    if invalid_sources:
        warnings.append(f"Sources desconocidos (ignorados): {invalid_sources}")

    return {
        "sources_info": estimation["breakdown"],
        "total_credits_estimated": estimation["total_credits"],
        "budget_snapshot": budget,
        "post_run_pct": round(post_run_pct, 1),
        "warnings": warnings,
        "is_paused": budget["is_paused"],
        # A paused budget still allows runs that cost nothing.
        "ok_to_run": not budget["is_paused"] or estimation["total_credits"] == 0,
    }
|
|
|
|
|
|
def run_search(
    *,
    source_ids: list[str],
    counties: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    scraper_kwargs_override: Optional[dict] = None,
    cancel_check: Optional[Callable[[], bool]] = None,
) -> dict:
    """Run an on-demand multi-source search and return the matching deals.

    Args:
        source_ids: source ids to run (e.g. ["miami_dade_clerk", "hud_homestore"]).
        counties: counties to focus on. Injected into scrapers whose schema
            declares a ``counties`` parameter and used to filter the results.
        filters: post-scrape filters. Keys read here: min_price, max_price,
            beds_min, deal_types, classifications, cities,
            include_dead_auctions.
            NOTE(review): ``min_score`` used to be copied into an unused local
            and has never been applied to the results — confirm the intended
            score field before wiring it in.
        status_cb: progress callback; also forwarded to every scraper.
        scraper_kwargs_override: optional per-source kwargs overriding the
            registry defaults, e.g.
            {"hud_homestore": {"states": ["FL", "GA"]},
             "miami_dade_clerk": {"days_ahead": 7}}
        cancel_check: optional callable returning True once the user asked to
            cancel. It is only polled between sources; a running scraper is
            not interrupted.

    Returns:
        dict with:
            runs: list[dict] — one run summary per scraper that completed
            new_deals_count: int — sum of ``deals_new`` across the runs
            total_credits_used: int — sum of ``firecrawl_credits_used``
            elapsed_seconds: float
            matching_deals: list[dict] — deals passing the filters (capped at 200)
            matching_total: int — number of matching deals before the cap
            filters_applied: the filters dict that was used
            counties_searched: the counties list (empty list when none)
            errors: list[str]
            cancelled: bool — True when cancelled mid-batch
    """
    init_db()
    filters = filters or {}
    scraper_kwargs_override = scraper_kwargs_override or {}

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _was_cancelled() -> bool:
        return bool(cancel_check and cancel_check())

    t0 = time.perf_counter()
    runs: list[dict] = []
    total_new = 0
    total_credits = 0
    errors: list[str] = []
    cancelled = False

    for position, src_id in enumerate(source_ids):
        # Cooperative cancellation between sources.
        if _was_cancelled():
            cancelled = True
            # The current source and everything after it are still pending.
            pending = len(source_ids) - position
            _log(f"Cancelacion solicitada — saltando sources restantes ({pending} pendientes)")
            errors.append("Cancelled by user")
            break

        src = get_source(src_id)
        if not src:
            errors.append(f"Source '{src_id}' not in registry")
            continue

        # Re-check the budget on every iteration: a prior source may have
        # consumed enough credits to pause it.
        if is_firecrawl_paused() and not src.get("free"):
            errors.append(f"Source '{src_id}' skipped — Firecrawl budget paused")
            continue

        callable_fn = resolve_callable(src_id)
        if not callable_fn:
            errors.append(f"Source '{src_id}' callable could not be resolved")
            continue

        # Build kwargs: registry defaults first, then the caller's overrides.
        schema_params = src.get("parameters_schema") or {}
        kwargs = {
            param_name: schema["default"]
            for param_name, schema in schema_params.items()
            if "default" in schema
        }
        # ``or {}`` tolerates an explicit None override for this source.
        caller_override = scraper_kwargs_override.get(src_id) or {}
        kwargs.update(caller_override)

        # Inject the user-selected counties into scrapers that declare a
        # plural `counties` parameter, unless the caller overrode it
        # explicitly. Otherwise the registry default would always win
        # regardless of the UI selection. A singular `county` parameter is
        # deliberately left alone here.
        if counties and "counties" in schema_params and "counties" not in caller_override:
            kwargs["counties"] = list(counties)

        # Always forward the progress callback.
        kwargs["status_cb"] = status_cb

        _log(f"▶️ Running source '{src['label']}' with kwargs={list(kwargs)}...")
        try:
            run_summary = callable_fn(**kwargs)
            run_summary["source_id"] = src_id
            run_summary["source_label"] = src["label"]
            runs.append(run_summary)
            # ``or 0`` so a None counter does not fail after the summary was
            # already recorded.
            new_count = run_summary.get("deals_new") or 0
            total_new += new_count
            total_credits += run_summary.get("firecrawl_credits_used") or 0
            _log(f" ✓ {src_id}: {new_count} new deals")
        except Exception as e:
            # Boundary handler: one failing source must not abort the batch.
            err_msg = f"Source '{src_id}' raised: {type(e).__name__}: {e}"
            errors.append(err_msg)
            _log(f" ✗ {err_msg}")

    # ─── Post-scrape: query matching deals from deals.db ───────────────────
    # Normalize the wanted counties once ("Miami-Dade" vs "Miami-Dade County").
    wanted_counties = (
        {c.replace(" County", "").strip().lower() for c in counties}
        if counties
        else set()
    )
    min_price = filters.get("min_price")
    max_price = filters.get("max_price")
    beds_min = filters.get("beds_min")
    deal_types = filters.get("deal_types")
    classifications = filters.get("classifications")
    cities = filters.get("cities")
    include_dead = filters.get("include_dead_auctions")

    matching: list[dict] = []
    seen_hashes = set()
    # dict.fromkeys keeps first-seen order while avoiding a second query for
    # a source id that was passed more than once.
    for src_id in dict.fromkeys(source_ids):
        for d in list_deals(source=src_id, limit=500):
            if wanted_counties:
                deal_county = (d.get("county") or "").strip()
                deal_county_norm = deal_county.replace(" County", "").strip().lower()
                if deal_county_norm not in wanted_counties:
                    continue

            # Deals without a price never pass an active price bound.
            price = d.get("listing_price")
            if min_price is not None and (price is None or price < min_price):
                continue
            if max_price is not None and (price is None or price > max_price):
                continue

            # Deals without a bed count never pass an active beds bound.
            if beds_min is not None:
                beds = d.get("beds")
                if beds is None or beds < beds_min:
                    continue

            if deal_types and d.get("deal_type") not in deal_types:
                continue
            if classifications and d.get("classification_status") not in classifications:
                continue
            if cities and (d.get("city") or "").strip() not in cities:
                continue

            # REDEEMED bug fix — drop dead auction statuses by default.
            if not include_dead and _is_auction_dead(d.get("auction_status")):
                continue

            # Dedup by deal_hash (the same deal may come from several
            # sources); deals without a hash are dropped.
            deal_hash = d.get("deal_hash")
            if not deal_hash or deal_hash in seen_hashes:
                continue
            seen_hashes.add(deal_hash)
            matching.append(d)

    # Best classification score first, then most recently scraped.
    matching.sort(
        key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""),
        reverse=True,
    )

    elapsed = time.perf_counter() - t0
    if cancelled:
        summary_msg = f"Cancelado tras {elapsed:.0f}s"
    else:
        summary_msg = f"Busqueda completa: {len(matching)} deals tras filtros, {elapsed:.0f}s"
    _log(summary_msg)

    return {
        "runs": runs,
        "new_deals_count": total_new,
        "total_credits_used": total_credits,
        "elapsed_seconds": round(elapsed, 1),
        "matching_deals": matching[:200],  # cap for UI rendering
        "matching_total": len(matching),
        "filters_applied": filters,
        "counties_searched": counties or [],
        "errors": errors,
        "cancelled": cancelled,
    }
|
|
|
|
|
|
# REDEEMED bug fix — the single source of truth is realauction_clerk._DEAD_STATUS_SUBSTRINGS.
# It is imported here to avoid drift between the parser's filter and the search filter.
# Substring matching catches variants ("Canceled per Bankruptcy", "Canceled per Order", etc.)
|
|
from scrapers.realauction_clerk import (
|
|
_DEAD_STATUS_SUBSTRINGS as _DEAD_AUCTION_SUBSTRINGS,
|
|
_is_status_dead as _is_auction_dead,
|
|
)
|
|
|
|
|
|
def search_existing_only(
    *,
    counties: Optional[list[str]] = None,
    source_ids: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    include_dead_auctions: bool = False,
) -> list[dict]:
    """Browse the deals already stored in deals.db WITHOUT running any scraper.

    Useful when the user only wants to see previously scraped deals without
    spending time/credits on re-scraping.

    Args:
        counties: when given, keep only deals whose county matches one of
            them (a trailing " County" and letter case are ignored).
        source_ids: when given, pull up to 500 deals per source; otherwise
            pull up to 2000 deals overall.
        filters: keys read here: min_price, max_price, beds_min, deal_types,
            classifications, cities, include_dead_auctions.
        include_dead_auctions: when False (default), deals whose
            ``auction_status`` is reported dead by ``_is_auction_dead`` are
            dropped. A truthy ``filters["include_dead_auctions"]`` is honoured
            as well, so the same filters dict behaves as it does in
            ``run_search``.

    Returns:
        Up to 500 deduplicated deals, best classification score first.
    """
    init_db()
    filters = filters or {}

    # Pull the candidate pool from each requested source, or from everything.
    pool: list[dict] = []
    if source_ids:
        for s in source_ids:
            pool.extend(list_deals(source=s, limit=500))
    else:
        pool = list_deals(limit=2000)

    # Normalize the wanted counties once instead of once per deal.
    wanted_counties = (
        {c.replace(" County", "").strip().lower() for c in counties}
        if counties
        else set()
    )
    min_price = filters.get("min_price")
    max_price = filters.get("max_price")
    beds_min = filters.get("beds_min")
    deal_types = filters.get("deal_types")
    classifications = filters.get("classifications")
    cities = filters.get("cities")
    # Accept the flag from either the keyword argument or the filters dict.
    include_dead = include_dead_auctions or bool(filters.get("include_dead_auctions"))

    out: list[dict] = []
    seen = set()
    for d in pool:
        if wanted_counties:
            deal_county_norm = (d.get("county") or "").replace(" County", "").strip().lower()
            if deal_county_norm not in wanted_counties:
                continue

        # Deals without a price never pass an active price bound.
        price = d.get("listing_price")
        if min_price is not None and (price is None or price < min_price):
            continue
        if max_price is not None and (price is None or price > max_price):
            continue

        # A missing bed count is treated as 0 here.
        # NOTE(review): run_search excludes deals with no bed count whenever
        # beds_min is set — confirm which behaviour is intended.
        if beds_min is not None and (d.get("beds") or 0) < beds_min:
            continue

        if deal_types and d.get("deal_type") not in deal_types:
            continue
        if classifications and d.get("classification_status") not in classifications:
            continue
        if cities and (d.get("city") or "").strip() not in cities:
            continue

        # REDEEMED bug fix — drop dead auction statuses by default.
        if not include_dead and _is_auction_dead(d.get("auction_status")):
            continue

        # Dedup by deal_hash; deals without a hash are dropped.
        deal_hash = d.get("deal_hash")
        if not deal_hash or deal_hash in seen:
            continue
        seen.add(deal_hash)
        out.append(d)

    # Best classification score first, then most recently scraped.
    out.sort(
        key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""),
        reverse=True,
    )
    return out[:500]
|