Files
AR-House/search_engine.py
T
2026-07-03 12:24:58 -04:00

359 lines
14 KiB
Python

"""search_engine.py — High-level orchestrator para Search UI on-demand.
Invoked by the UI when the user clicks "Buscar deals":
1. Runs a Firecrawl budget pre-flight check (alert/pause if exceeded)
2. For each selected source, calls its `run_scraper_to_db()`
3. Applies the optional filters (price/beds/deal_type)
4. Returns the filtered deals from deals.db
5. Tracks the Firecrawl credits consumed
API:
preflight_check(source_ids: list[str]) -> dict
— checks the budget and returns the info shown in the confirmation modal
run_search(*, source_ids: list[str], counties: list[str], filters: dict,
status_cb: Callable, scraper_kwargs_override: dict, cancel_check: Callable) -> dict
— runs the scrapers and returns the matching deals
search_existing_only(*, counties: list[str], source_ids: list[str], filters: dict,
include_dead_auctions: bool) -> list[dict]
— browses deals already stored in deals.db without running any scraper
"""
from __future__ import annotations
import time
from typing import Callable, Optional
from scrapers.registry import (
SOURCES, get_source, estimate_credits, resolve_callable,
)
from deals_db import (
init_db, list_deals, firecrawl_budget_status, is_firecrawl_paused,
)
def preflight_check(source_ids: list[str]) -> dict:
    """Estimate the credits a run would use and report the budget state.

    Nothing is scraped here; the result feeds the confirmation modal that is
    shown BEFORE a search is executed.

    Args:
        source_ids: ids of the sources the user selected.

    Returns:
        dict with:
            sources_info: the ``breakdown`` returned by ``estimate_credits``.
            total_credits_estimated: the ``total_credits`` of that estimation.
            budget_snapshot: the value returned by ``firecrawl_budget_status``.
            post_run_pct: percentage of the budget that would be consumed
                after the run, rounded to one decimal (0 when the budget
                is 0/empty).
            warnings: list[str] of messages to surface to the user.
            is_paused: ``budget["is_paused"]``.
            ok_to_run: True unless the budget is paused and the run would
                need credits.
    """
    init_db()
    estimation = estimate_credits(source_ids)
    budget = firecrawl_budget_status()

    def _threshold(key: str, fallback: float) -> float:
        # Use the threshold carried by the budget snapshot so the comparison
        # always agrees with the number printed in the warning; fall back to
        # the historical constant when the snapshot has no usable value.
        try:
            value = budget[key]
        except (KeyError, IndexError):
            return fallback
        return fallback if value is None else value

    estimated_credits = estimation["total_credits"]
    credits_budget = budget["credits_budget"]
    total_after = budget["credits_used"] + estimated_credits
    # Guard against a zero/empty budget to avoid ZeroDivisionError.
    post_run_pct = (total_after / credits_budget * 100) if credits_budget else 0

    pause_pct = _threshold("pause_threshold_pct", 95)
    alert_pct = _threshold("alert_threshold_pct", 80)

    warnings = []
    if budget["is_paused"]:
        warnings.append(
            "🚨 BUDGET FIRECRAWL EXCEDIDO (95%+). Solo sources gratuitos disponibles."
        )
    elif post_run_pct >= pause_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% — superara el "
            f"pause threshold ({pause_pct}%). Considere reducir sources."
        )
    elif post_run_pct >= alert_pct:
        warnings.append(
            f"⚠️ Esta corrida llevaria el budget a {post_run_pct:.1f}% (alert "
            f"threshold {alert_pct}%)."
        )

    # Report ids that are not present in the registry.
    invalid_sources = [sid for sid in source_ids if not get_source(sid)]
    if invalid_sources:
        warnings.append(f"Sources desconocidos (ignorados): {invalid_sources}")

    return {
        "sources_info": estimation["breakdown"],
        "total_credits_estimated": estimated_credits,
        "budget_snapshot": budget,
        "post_run_pct": round(post_run_pct, 1),
        "warnings": warnings,
        "is_paused": budget["is_paused"],
        # A paused budget still allows runs that cost no credits.
        "ok_to_run": not budget["is_paused"] or estimated_credits == 0,
    }
def run_search(
    *,
    source_ids: list[str],
    counties: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    status_cb: Optional[Callable[[str], None]] = None,
    scraper_kwargs_override: Optional[dict] = None,
    cancel_check: Optional[Callable[[], bool]] = None,
) -> dict:
    """Run an on-demand multi-source search and return the matching deals.

    Args:
        source_ids: source ids to run
            (e.g. ["miami_dade_clerk", "hud_homestore"]).
        counties: counties selected by the user. They are injected into
            scrapers whose parameter schema has a ``counties`` entry and
            are also used as a post-scrape filter on each deal's ``county``.
        filters: post-scrape filters. Keys read here: ``min_price``,
            ``max_price``, ``beds_min``, ``deal_types``, ``classifications``,
            ``cities`` and ``include_dead_auctions``.
        status_cb: progress callback; it is also forwarded to every scraper.
        scraper_kwargs_override: optional per-source kwargs that override the
            registry defaults, e.g.
            {"hud_homestore": {"states": ["FL", "GA"]},
             "miami_dade_clerk": {"days_ahead": 7}}.
        cancel_check: optional callable returning True once the user asked to
            cancel. It is only checked between sources; a running scraper is
            not interrupted (scrapers are blocking — use the Streamlit Stop
            button for that).

    Returns:
        dict with:
            runs: list[dict] — one run summary per scraper that returned.
            new_deals_count: int — sum of ``deals_new`` over all runs.
            total_credits_used: int — sum of ``firecrawl_credits_used``.
            elapsed_seconds: float, rounded to one decimal.
            matching_deals: list[dict] — deals passing the filters (max 200).
            matching_total: int — number of matches before the 200 cap.
            filters_applied: the filters dict that was used.
            counties_searched: the counties list (``[]`` when none).
            errors: list[str].
            cancelled: bool — True if the batch was cancelled part-way.
    """
    init_db()
    filters = filters or {}
    scraper_kwargs_override = scraper_kwargs_override or {}

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _was_cancelled() -> bool:
        return bool(cancel_check and cancel_check())

    t0 = time.perf_counter()
    runs: list[dict] = []
    total_new = 0
    total_credits = 0
    errors: list[str] = []
    cancelled = False

    for position, src_id in enumerate(source_ids):
        # Cooperative cancellation between sources.
        if _was_cancelled():
            cancelled = True
            # Current source plus everything after it. Derived from the loop
            # position: list.index() would be quadratic and would miscount
            # when the same id appears more than once.
            pending = len(source_ids) - position
            _log(f"Cancelacion solicitada — saltando sources restantes ({pending} pendientes)")
            errors.append("Cancelled by user")
            break

        src = get_source(src_id)
        if not src:
            errors.append(f"Source '{src_id}' not in registry")
            continue

        # Re-check the budget on every iteration in case a prior source
        # consumed credits.
        if is_firecrawl_paused() and not src.get("free"):
            errors.append(f"Source '{src_id}' skipped — Firecrawl budget paused")
            continue

        callable_fn = resolve_callable(src_id)
        if not callable_fn:
            errors.append(f"Source '{src_id}' callable could not be resolved")
            continue

        # Build kwargs: defaults from the registry schema, then caller overrides.
        kwargs = {}
        schema_params = src.get("parameters_schema") or {}
        for param_name, schema in schema_params.items():
            if "default" in schema:
                kwargs[param_name] = schema["default"]
        if src_id in scraper_kwargs_override:
            kwargs.update(scraper_kwargs_override[src_id])

        # Auto-inject user-selected counties into scrapers that accept them.
        # Bug fix: Zillow's registry default was ["Miami-Dade"], so selecting
        # Duval in the UI did nothing — Zillow always scraped Miami-Dade.
        # If the scraper has a `counties` (plural) param AND the user selected
        # counties, inject them — unless the caller explicitly overrode.
        if counties:
            caller_override = scraper_kwargs_override.get(src_id, {})
            if "counties" in schema_params and "counties" not in caller_override:
                kwargs["counties"] = list(counties)
        # Some scrapers take a singular `county` (e.g. realauction
        # county-specific variants). For those, the registry already
        # hard-codes the right county per source_id (duval_clerk → "Duval"),
        # so it is NOT overridden here.

        # Always inject status_cb.
        kwargs["status_cb"] = status_cb
        _log(f"▶️ Running source '{src['label']}' with kwargs={list(kwargs.keys())}...")
        try:
            run_summary = callable_fn(**kwargs)
            run_summary["source_id"] = src_id
            run_summary["source_label"] = src["label"]
            runs.append(run_summary)
            total_new += run_summary.get("deals_new", 0)
            total_credits += run_summary.get("firecrawl_credits_used", 0)
            _log(f"  ✓ {src_id}: {run_summary.get('deals_new', 0)} new deals")
        except Exception as e:
            # One failing source must not abort the remaining ones.
            err_msg = f"Source '{src_id}' raised: {type(e).__name__}: {e}"
            errors.append(err_msg)
            _log(f"  ✗ {err_msg}")

    # ─── Post-scrape: query matching deals from deals.db ───────────────────
    # NOTE(review): filters["min_score"] used to be copied into a kwargs dict
    # that was never passed to list_deals, so it has never been applied. It
    # is still not applied here — confirm whether list_deals supports a
    # min_score argument before wiring it in.

    # Normalise the wanted counties once ("Miami-Dade" vs "Miami-Dade County")
    # instead of once per deal.
    wanted_counties = {
        c.replace(" County", "").strip().lower() for c in (counties or [])
    }
    min_price = filters.get("min_price")
    max_price = filters.get("max_price")
    beds_min = filters.get("beds_min")

    matching: list[dict] = []
    # Collect from every requested source, including ones that failed above.
    for src_id in source_ids:
        for d in list_deals(source=src_id, limit=500):
            # Filter by counties (if specified).
            if wanted_counties:
                deal_county = (d.get("county") or "").strip()
                deal_county_norm = deal_county.replace(" County", "").strip().lower()
                if deal_county_norm not in wanted_counties:
                    continue

            # Filter by min_price / max_price; deals without a price are
            # dropped whenever a price bound is set.
            price = d.get("listing_price")
            if min_price is not None and (price is None or price < min_price):
                continue
            if max_price is not None and (price is None or price > max_price):
                continue

            # Filter by beds_min; deals without a bed count are dropped.
            if beds_min is not None:
                beds = d.get("beds")
                if beds is None or beds < beds_min:
                    continue

            # Filter by deal_types.
            if filters.get("deal_types") and d.get("deal_type") not in filters["deal_types"]:
                continue

            # Filter by classification_status.
            if (
                filters.get("classifications")
                and d.get("classification_status") not in filters["classifications"]
            ):
                continue

            # Filter by cities (exact match after stripping whitespace).
            if filters.get("cities"):
                deal_city = (d.get("city") or "").strip()
                if deal_city not in filters["cities"]:
                    continue

            # REDEEMED bug fix — filter out dead auction statuses by default.
            if not filters.get("include_dead_auctions") and _is_auction_dead(d.get("auction_status")):
                continue

            matching.append(d)

    # Dedup by deal_hash (the same deal can appear from multiple sources —
    # rare but possible). Deals without a deal_hash are dropped.
    seen_hashes = set()
    deduped = []
    for d in matching:
        h = d.get("deal_hash")
        if h and h not in seen_hashes:
            seen_hashes.add(h)
            deduped.append(d)
    matching = deduped

    # Sort by classification_score desc (best first), then scraped_at desc.
    matching.sort(
        key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""),
        reverse=True,
    )

    elapsed = time.perf_counter() - t0
    if cancelled:
        summary_msg = f"Cancelado tras {elapsed:.0f}s"
    else:
        summary_msg = f"Busqueda completa: {len(matching)} deals tras filtros, {elapsed:.0f}s"
    _log(summary_msg)

    return {
        "runs": runs,
        "new_deals_count": total_new,
        "total_credits_used": total_credits,
        "elapsed_seconds": round(elapsed, 1),
        "matching_deals": matching[:200],  # cap for UI rendering
        "matching_total": len(matching),
        "filters_applied": filters,
        "counties_searched": counties or [],
        "errors": errors,
        "cancelled": cancelled,
    }
# REDEEMED bug fix — the single source of truth is realauction_clerk._DEAD_STATUS_SUBSTRINGS.
# It is imported here to avoid drift between the parser's filter and the search filter.
# Substring matching catches variants ("Canceled per Bankruptcy", "Canceled per Order", etc.)
from scrapers.realauction_clerk import (
_DEAD_STATUS_SUBSTRINGS as _DEAD_AUCTION_SUBSTRINGS,
_is_status_dead as _is_auction_dead,
)
def search_existing_only(
    *,
    counties: Optional[list[str]] = None,
    source_ids: Optional[list[str]] = None,
    filters: Optional[dict] = None,
    include_dead_auctions: bool = False,
) -> list[dict]:
    """Browse deals already stored in deals.db WITHOUT running any scraper.

    Useful when the user only wants to see previously scraped deals without
    spending time/credits on re-scraping.

    Args:
        counties: if given, keep only deals whose ``county`` matches one of
            them (case-insensitive, ignoring a " County" suffix).
        source_ids: if given, read up to 500 deals per source; otherwise read
            up to 2000 deals across all sources.
        filters: optional filters. Keys read here: ``min_price``,
            ``max_price``, ``beds_min``, ``deal_types``, ``classifications``
            and ``cities``.
        include_dead_auctions: when False (default), drop deals whose
            ``auction_status`` is considered dead by ``_is_auction_dead``
            (Redeemed, Canceled, Sold, etc. — those no longer go to auction).
            NOTE(review): unlike run_search, this is a keyword argument and
            ``filters["include_dead_auctions"]`` is not consulted here.

    Returns:
        Up to 500 deals, de-duplicated by ``deal_hash`` and sorted by
        ``classification_score`` then ``scraped_at``, both descending.
    """
    init_db()
    filters = filters or {}

    # Pull from each requested source, or from everything when none is given.
    pool: list[dict] = []
    if source_ids:
        for s in source_ids:
            pool.extend(list_deals(source=s, limit=500))
    else:
        pool = list_deals(limit=2000)

    # Normalise the wanted counties once instead of once per deal.
    wanted_counties = {
        c.replace(" County", "").strip().lower() for c in (counties or [])
    }
    min_price = filters.get("min_price")
    max_price = filters.get("max_price")
    beds_min = filters.get("beds_min")

    matching: list[dict] = []
    for d in pool:
        # County filter.
        if wanted_counties:
            deal_county_norm = (d.get("county") or "").replace(" County", "").strip().lower()
            if deal_county_norm not in wanted_counties:
                continue

        # Price filter; deals without a price are dropped whenever a price
        # bound is set.
        price = d.get("listing_price")
        if min_price is not None and (price is None or price < min_price):
            continue
        if max_price is not None and (price is None or price > max_price):
            continue

        # Beds: a missing bed count is treated as 0.
        if beds_min is not None and (d.get("beds") or 0) < beds_min:
            continue

        # Deal type.
        if filters.get("deal_types") and d.get("deal_type") not in filters["deal_types"]:
            continue

        # Classification.
        if (
            filters.get("classifications")
            and d.get("classification_status") not in filters["classifications"]
        ):
            continue

        # Cities (exact match after stripping whitespace).
        if filters.get("cities"):
            deal_city = (d.get("city") or "").strip()
            if deal_city not in filters["cities"]:
                continue

        # REDEEMED bug fix — filter out dead auction statuses by default.
        if not include_dead_auctions and _is_auction_dead(d.get("auction_status")):
            continue

        matching.append(d)

    # Dedup by deal_hash (deals without one are dropped), then sort.
    seen = set()
    out = []
    for d in matching:
        h = d.get("deal_hash")
        if h and h not in seen:
            seen.add(h)
            out.append(d)
    out.sort(
        key=lambda d: (d.get("classification_score") or 0, d.get("scraped_at") or ""),
        reverse=True,
    )
    return out[:500]