454 lines
17 KiB
Python
454 lines
17 KiB
Python
"""deal_classifier.py — Lightweight clasificador de deals (Phase 3A).
|
|
|
|
PURPOSE:
DealFinder scrapes many deals per day. Before spending cycles on deep analysis
(8 Ollama agents, 18-22 min/deal), every new deal is classified with a
lightweight model that decides in ~10 seconds whether it is worth analysing.
|
|
|
|
Classifier OUTPUT:
- classification_status: potential_winner | maybe | pass | red_flag
- score: 0-100
- reasons: list[str]
- strategy: buy_hold | brrrr | wholesale | section8 | auction | needs_analysis
|
|
|
|
FLOW:
1. precompute_heuristics(deal_data) — Python computes $/sqft, cap_rate_rough, etc.
2. build_classifier_prompt(deal_data, heuristics) — embeds the inputs in the prompt
3. ollama.chat(model="DealClassifier", prompt) — the LLM classifies
4. parse_classifier_output(response) — extracts strict JSON and validates fields
5. deals_db.update_classification(deal_id, ...) — persists the result
|
|
|
|
API:
classify_deal(deal_data: dict) -> dict — classifies a single deal
classify_all_unclassified(limit: int = 50) -> dict — batch-processes pending deals
|
|
|
|
FAIL-SAFE: if the LLM returns invalid JSON or is down, the result is
'maybe + needs_analysis' with score 50 (safe default). On a classifier timeout
the result is 'maybe + needs_analysis' with score 0. The pipeline is never blocked.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import concurrent.futures
|
|
import json
|
|
import re
|
|
import time
|
|
from typing import Optional
|
|
|
|
import ollama
|
|
|
|
|
|
# Per-deal hard timeout, in seconds. llama3.1:8b-class models typically finish
# in under 10s; if a call takes more than 30s it is probably looping or stuck
# on an unsatisfied format=json constraint, so the deal is skipped and marked
# as an error.
CLASSIFIER_TIMEOUT_SECONDS: int = 30

# Hard ceiling on generated tokens. The JSON answer should need fewer than 300
# tokens; anything longer means the model is rambling, so generation is cut.
CLASSIFIER_NUM_PREDICT: int = 400
|
|
|
|
from config import (
|
|
DEAL_CLASSIFIER_MODEL,
|
|
RESOURCE_MODE,
|
|
KEEP_ALIVE_BY_MODE,
|
|
CLASSIFICATION_VALUES,
|
|
CLASS_MAYBE,
|
|
CLASS_POTENTIAL_WINNER,
|
|
CLASS_PASS,
|
|
CLASS_RED_FLAG,
|
|
)
|
|
|
|
|
|
# Strategy labels accepted from the model (meant to match the modelfile).
VALID_STRATEGIES = {
    "buy_hold",
    "brrrr",
    "wholesale",
    "section8",
    "auction",
    "needs_analysis",
}
|
|
|
|
# Typical $/sqft benchmarks per Florida county, passed to the model as part of
# the heuristics. Each entry maps a class letter (A-D) to a (low, high) pair.
# The original author's guidance is to take the middle of the range as the
# conservative reference; the agent may recalibrate.
# NOTE(review): A-D presumably denote neighborhood/asset class — confirm.
# Lookup keys are lowercase county names without the " county" suffix;
# "_default" is the fallback for any other Florida county.
COUNTY_PSQFT_BENCHMARKS_FL = {
    # Miami-Dade
    "miami-dade": {
        "A": (300, 500),
        "B": (200, 300),
        "C": (140, 200),
        "D": (80, 140),
    },
    # Broward
    "broward": {
        "A": (280, 450),
        "B": (180, 280),
        "C": (130, 180),
        "D": (75, 130),
    },
    # Palm Beach
    "palm beach": {
        "A": (350, 600),
        "B": (220, 350),
        "C": (150, 220),
        "D": (90, 150),
    },
    # Duval (Jacksonville)
    "duval": {
        "A": (200, 350),
        "B": (140, 200),
        "C": (100, 140),
        "D": (60, 100),
    },
    # Hillsborough (Tampa)
    "hillsborough": {
        "A": (240, 400),
        "B": (170, 240),
        "C": (120, 170),
        "D": (70, 120),
    },
    # Orange (Orlando)
    "orange": {
        "A": (220, 380),
        "B": (160, 220),
        "C": (110, 160),
        "D": (70, 110),
    },
    # Default (other FL counties)
    "_default": {
        "A": (220, 380),
        "B": (160, 220),
        "C": (110, 160),
        "D": (70, 110),
    },
}
|
|
|
|
|
|
def precompute_heuristics(deal_data: dict) -> dict:
    """Compute cheap, deterministic heuristics in Python before the LLM runs.

    The returned numbers are embedded in the prompt as closed inputs (the
    prompt tells the model not to recompute them).

    Reference price selection:
      * foreclosure without a ``listing_price`` (pre-auction): ``estimated_arv``
        (the assessed value) is used as a market-value proxy.
        ``final_judgment_amount`` is never used as a price: it is the debt,
        not what the buyer bids.
      * every other case: ``listing_price``, falling back to ``starting_bid``.

    Args:
        deal_data: Deal record. Keys read: ``listing_price``, ``starting_bid``,
            ``estimated_arv``, ``sqft``, ``deal_type``, ``county``,
            ``final_judgment_amount``, ``photos_urls`` (list or JSON string)
            and ``listing_description``. Missing keys are treated as absent.
            NOTE(review): numeric fields are assumed to already be numbers,
            not strings — confirm against the scraper/DB layer.

    Returns:
        Dict with ``listing_price_semantics``, ``reference_price_used``,
        ``price_per_sqft``, ``estimated_rent_1pct_rule``,
        ``estimated_cap_rate_rough_pct``, ``is_deal_type_distressed``,
        ``county_psqft_benchmarks``, ``arv_upside_dollars``,
        ``arv_upside_pct``, ``arv_upside_note``, ``photos_count`` and
        ``description_length``. Values that cannot be computed are ``None``.
    """
    listing_price = deal_data.get("listing_price")
    starting_bid = deal_data.get("starting_bid")
    assessed_arv = deal_data.get("estimated_arv")
    sqft = deal_data.get("sqft")
    final_judgment = deal_data.get("final_judgment_amount")
    deal_type = (deal_data.get("deal_type") or "mls").lower()
    county = (deal_data.get("county") or "").lower().replace(" county", "").strip()

    # Pre-auction foreclosure: the real bid is unknown, so the assessed value
    # stands in for market value (never the final judgment amount).
    foreclosure_price_hidden = deal_type == "foreclosure" and not listing_price
    if foreclosure_price_hidden:
        reference_price = assessed_arv or 0
    else:
        reference_price = listing_price or starting_bid or 0

    # The label must describe the price that was actually used above. The
    # "hidden / assessed_value proxy" wording only applies when the foreclosure
    # really has no listing_price; previously it was emitted for every
    # foreclosure, contradicting reference_price_used.
    if deal_type == "tax_deed":
        price_semantics = "tax_deed: listing_price IS the Opening Bid (what buyer pays)"
    elif foreclosure_price_hidden:
        price_semantics = (
            "foreclosure pre-auction: listing_price hidden; using assessed_value as proxy"
        )
    elif deal_type == "foreclosure":
        price_semantics = "foreclosure: listing_price provided; used as reference price"
    else:
        price_semantics = "standard: listing_price is the asking price"

    h: dict = {
        "listing_price_semantics": price_semantics,
        "reference_price_used": reference_price,
    }

    # $/sqft
    if reference_price and sqft and sqft > 0:
        h["price_per_sqft"] = round(reference_price / sqft, 1)
    else:
        h["price_per_sqft"] = None

    # Estimated monthly rent via the 1% rule (basic).
    h["estimated_rent_1pct_rule"] = round(reference_price * 0.01, 0) if reference_price else None

    # Rough cap rate via the 50% rule.
    # NOTE(review): because rent is itself derived as 1% of the same price,
    # this evaluates to ~6.0 for every deal (12 * 1% * 50%), apart from the
    # rounding of the rent. It carries no deal-specific signal unless a real
    # rent estimate is substituted.
    if reference_price and h.get("estimated_rent_1pct_rule"):
        rent_annual = h["estimated_rent_1pct_rule"] * 12
        h["estimated_cap_rate_rough_pct"] = round(rent_annual * 0.5 / reference_price * 100, 2)
    else:
        h["estimated_cap_rate_rough_pct"] = None

    # Deal type categorization
    h["is_deal_type_distressed"] = deal_type in (
        "auction", "foreclosure", "tax_deed", "reo"
    )

    # County benchmark (falls back to the generic Florida row).
    bench = COUNTY_PSQFT_BENCHMARKS_FL.get(county) or COUNTY_PSQFT_BENCHMARKS_FL["_default"]
    h["county_psqft_benchmarks"] = bench

    # ARV upside
    #   tax_deed: ARV vs starting_bid (what the buyer really pays) = real upside
    #   foreclosure: ARV vs final_judgment would be misleading, so it stays None
    if deal_type == "tax_deed" and assessed_arv and starting_bid and starting_bid > 0:
        h["arv_upside_dollars"] = round(assessed_arv - starting_bid, 0)
        h["arv_upside_pct"] = round((assessed_arv - starting_bid) / starting_bid * 100, 1)
        h["arv_upside_note"] = "tax_deed: real upside (starting_bid is what buyer pays)"
    elif deal_type == "foreclosure":
        h["arv_upside_dollars"] = None
        h["arv_upside_pct"] = None
        h["arv_upside_note"] = (
            "foreclosure pre-auction: cannot compute real upside (bid hidden). "
            f"Reference data: assessed_value=${assessed_arv}, final_judgment=${final_judgment}"
        )
    elif assessed_arv and reference_price > 0:
        h["arv_upside_dollars"] = round(assessed_arv - reference_price, 0)
        h["arv_upside_pct"] = round((assessed_arv - reference_price) / reference_price * 100, 1)
        h["arv_upside_note"] = "standard: arv - listing_price"
    else:
        h["arv_upside_dollars"] = None
        h["arv_upside_pct"] = None
        h["arv_upside_note"] = "insufficient data"

    # Photos count: photos_urls may arrive as a list or as a JSON-encoded string.
    photos = deal_data.get("photos_urls")
    if isinstance(photos, str):
        try:
            photos = json.loads(photos)
        except Exception:  # best-effort: an unreadable value just counts as 0 photos
            photos = None
    h["photos_count"] = len(photos) if isinstance(photos, (list, tuple)) else 0

    # Description length
    desc = deal_data.get("listing_description") or ""
    h["description_length"] = len(desc)

    return h
|
|
|
|
|
|
def build_classifier_prompt(deal_data: dict, heuristics: dict) -> str:
    """Build the user prompt sent to DealClassifier.

    Only a fixed subset of ``deal_data`` fields is forwarded, plus the first
    500 characters of ``listing_description``; fields whose value is ``None``
    or the empty string are dropped. The deal subset and ``heuristics`` are
    each rendered as indented JSON inside the prompt.

    Args:
        deal_data: Deal record to summarise.
        heuristics: Pre-computed heuristics to embed verbatim as JSON.

    Returns:
        The complete prompt text.
    """
    forwarded_fields = (
        "source",
        "deal_type",
        "address",
        "city",
        "county",
        "state",
        "zip",
        "listing_price",
        "starting_bid",
        "estimated_arv",
        "beds",
        "baths",
        "sqft",
        "year_built",
        "lot_sqft",
        "auction_date",
        "case_number",
    )
    candidate = {field: deal_data.get(field) for field in forwarded_fields}
    description = deal_data.get("listing_description") or ""
    candidate["listing_description_excerpt"] = description[:500]

    # Drop empty fields so the prompt stays compact.
    compact = {}
    for field, value in candidate.items():
        if value is not None and value != "":
            compact[field] = value

    deal_json = json.dumps(compact, indent=2, ensure_ascii=False)
    heuristics_json = json.dumps(heuristics, indent=2, ensure_ascii=False)

    return f"""Clasifica este deal de real estate USA Florida.

═══ DEAL DATA ═══
{deal_json}

═══ HEURISTIC PRE-CALCULATIONS (Python — NO recalcules) ═══
{heuristics_json}

═══ TU TAREA ═══
Devuelve SOLAMENTE un objeto JSON valido con classification_status, score, reasons, strategy.
No prose. No markdown. No bloques de codigo. Solo JSON.
"""
|
|
|
|
|
|
def parse_classifier_output(raw: str) -> dict:
    """Extract and validate the classifier's JSON answer.

    Tolerates markdown code fences and prose around the JSON object. This
    function never raises on malformed model output: every field is coerced
    or replaced by a safe default.

    Args:
        raw: Raw text returned by the model (may be empty).

    Returns:
        Dict with ``classification_status``, ``score`` (int clamped to 0-100),
        ``reasons`` (up to 6 strings of at most 200 chars), ``strategy`` and
        ``_parse_error``. When no JSON object can be decoded, the safe
        fallback is returned (maybe / score 50 / needs_analysis,
        ``_parse_error=True``), with ``_parse_error_detail`` when a cause is
        known.
    """
    fallback = {
        "classification_status": CLASS_MAYBE,
        "score": 50,
        "reasons": ["LLM output no parseable — fallback safe"],
        "strategy": "needs_analysis",
        "_parse_error": True,
    }

    if not raw or not isinstance(raw, str):
        return fallback

    # Strip markdown code fences (opening and closing) if present.
    cleaned = re.sub(r"```(?:json)?\s*", "", raw).strip()

    # Locate the JSON object.
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start < 0 or end <= start:
        return {**fallback, "_parse_error_detail": f"no JSON braces found in: {raw[:200]}"}

    # raw_decode parses the first complete JSON document and ignores whatever
    # follows, so trailing prose that happens to contain "}" no longer breaks
    # parsing the way a first-"{"/last-"}" slice does.
    try:
        data, _ = json.JSONDecoder().raw_decode(cleaned[start:])
    except json.JSONDecodeError as e:
        json_str = cleaned[start:end + 1]
        return {**fallback, "_parse_error_detail": f"JSON decode error: {e}: {json_str[:200]}"}
    if not isinstance(data, dict):
        return {**fallback, "_parse_error_detail": f"JSON root is not an object: {raw[:200]}"}

    # --- classification_status -------------------------------------------
    # A null or non-string value must not crash (.lower() on None).
    raw_status = data.get("classification_status")
    status = raw_status.lower().strip() if isinstance(raw_status, str) else ""
    if status not in CLASSIFICATION_VALUES:
        snake = re.sub(r"[\s\-]+", "_", status)
        if snake in CLASSIFICATION_VALUES:
            # e.g. "red flag" / "red-flag" -> "red_flag"
            status = snake
        else:
            # Some LLMs return slightly different labels.
            aliases = {
                "winner": CLASS_POTENTIAL_WINNER,
                "potential winner": CLASS_POTENTIAL_WINNER,
                "potentialwinner": CLASS_POTENTIAL_WINNER,
                "good": CLASS_POTENTIAL_WINNER,
                "redflag": CLASS_RED_FLAG,
                "red flag": CLASS_RED_FLAG,
                "warning": CLASS_RED_FLAG,
                "bad": CLASS_PASS,
            }
            spaced = re.sub(r"[\s_\-]+", " ", status)
            status = aliases.get(spaced, CLASS_MAYBE)

    # --- score --------------------------------------------------------------
    # Going through float() accepts "85.5"; OverflowError covers Infinity.
    try:
        score = max(0, min(100, int(float(data.get("score", 50)))))
    except (TypeError, ValueError, OverflowError):
        score = 50

    # --- reasons ------------------------------------------------------------
    reasons = data.get("reasons")
    if reasons is None:
        reasons = []  # JSON null must not turn into the literal reason "None"
    elif not isinstance(reasons, list):
        reasons = [str(reasons)]
    reasons = [str(r)[:200] for r in reasons[:6] if r]

    # --- strategy -----------------------------------------------------------
    strategy = data.get("strategy")
    if isinstance(strategy, str):
        strategy = re.sub(r"[\s\-]+", "_", strategy.lower().strip().replace("&", "and"))
        # "buy & hold" normalises to "buy_and_hold"; map it onto the real label.
        strategy = {"buy_and_hold": "buy_hold", "section_8": "section8"}.get(strategy, strategy)
    # The isinstance guard also protects the set lookup from unhashable values.
    if not isinstance(strategy, str) or strategy not in VALID_STRATEGIES:
        strategy = "needs_analysis"

    return {
        "classification_status": status,
        "score": score,
        "reasons": reasons or ["sin razones provistas"],
        "strategy": strategy,
        "_parse_error": False,
    }
|
|
|
|
|
|
def _call_ollama_chat(prompt: str, keep_alive) -> dict:
    """Send ``prompt`` to the classifier model and return Ollama's response.

    Plain ``ollama.chat`` call with no timeout of its own; ``classify_deal``
    runs it in a worker thread.

    Args:
        prompt: User message content.
        keep_alive: Forwarded unchanged as Ollama's ``keep_alive`` argument.

    Returns:
        Whatever ``ollama.chat`` returns.
    """
    generation_options = {
        "num_predict": CLASSIFIER_NUM_PREDICT,  # cap output length
        "num_ctx": 4096,  # smaller context = faster+safer (original author's note)
        "temperature": 0.2,
    }
    user_message = {"role": "user", "content": prompt}
    return ollama.chat(
        model=DEAL_CLASSIFIER_MODEL,
        messages=[user_message],
        keep_alive=keep_alive,
        format="json",  # forces JSON output
        options=generation_options,
    )
|
|
|
|
|
|
def classify_deal(deal_data: dict, timeout_seconds: int = CLASSIFIER_TIMEOUT_SECONDS) -> dict:
    """Classify a single deal. Main entry point.

    Computes the heuristics, builds the prompt, calls the model in a worker
    thread with a hard timeout, and parses the answer. Ollama failures and
    timeouts do not raise: they produce a safe default classification.

    Timeout handling: the executor is deliberately NOT used as a context
    manager. Leaving a ``with ThreadPoolExecutor(...)`` block calls
    ``shutdown(wait=True)``, which blocks until the stuck call finishes and
    would silently defeat the timeout. The worker thread cannot be
    interrupted, so after a timeout it keeps running in the background until
    Ollama returns.

    Args:
        deal_data: Deal record to classify.
        timeout_seconds: Maximum time to wait for the model's answer.

    Returns:
        Dict with ``classification_status``, ``score``, ``reasons``,
        ``strategy`` and ``_meta``. ``_meta`` contains ``duration_seconds``,
        ``model``, ``tokens``, ``ollama_error``, ``timed_out``,
        ``parse_error``, ``parse_error_detail`` and ``heuristics_used``.
    """
    started = time.perf_counter()
    keep_alive = KEEP_ALIVE_BY_MODE[RESOURCE_MODE]

    heuristics = precompute_heuristics(deal_data)
    prompt = build_classifier_prompt(deal_data, heuristics)

    raw = ""
    tokens = 0
    error = None
    timed_out = False

    # The Ollama Python client exposes no per-request timeout on .chat(), so
    # the call runs in a single worker thread and we wait on the future.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(_call_ollama_chat, prompt, keep_alive)
        response = future.result(timeout=timeout_seconds)
        raw = response["message"]["content"]
        tokens = response.get("eval_count", 0) or 0
    except concurrent.futures.TimeoutError:
        timed_out = True
        error = f"timeout after {timeout_seconds}s — model stuck or runaway generation"
    except Exception as e:
        error = f"{type(e).__name__}: {e}"
    finally:
        # wait=False: return immediately instead of joining a stuck worker.
        executor.shutdown(wait=False)

    parsed = parse_classifier_output(raw)
    duration = time.perf_counter() - started

    # On timeout, force the safe default ("maybe" + needs_analysis) with score
    # 0 and a reason asking for manual review. Persisting any status other
    # than "new" is what keeps the batch job from picking the deal up again.
    if timed_out:
        parsed["classification_status"] = CLASS_MAYBE  # safe default
        parsed["score"] = 0
        parsed["reasons"] = [f"Classifier timeout {timeout_seconds}s — needs manual review"]
        parsed["strategy"] = "needs_analysis"

    parsed["_meta"] = {
        "duration_seconds": round(duration, 2),
        "model": DEAL_CLASSIFIER_MODEL,
        "tokens": tokens,
        "ollama_error": error,
        "timed_out": timed_out,
        "parse_error": parsed.pop("_parse_error", False),
        "parse_error_detail": parsed.pop("_parse_error_detail", None),
        "heuristics_used": heuristics,
    }
    return parsed
|
|
|
|
|
|
def classify_all_unclassified(
    limit: int = 50,
    status_cb=None,
    timeout_seconds: int = CLASSIFIER_TIMEOUT_SECONDS,
) -> dict:
    """Batch-classify the deals whose status is 'new'.

    Each deal is classified with ``classify_deal`` and the result is persisted
    with ``deals_db.update_classification``. A failure on one deal is logged
    and counted, and the loop moves on to the next deal.

    Args:
        limit: Maximum number of pending deals to fetch.
        status_cb: Optional callable that receives progress messages (str).
        timeout_seconds: Per-deal timeout forwarded to ``classify_deal``.

    Returns:
        Summary dict: ``processed``, one counter per classification status,
        ``errors``, ``timeouts``, ``total_pending`` and ``total_seconds``.
    """
    from deals_db import list_deals, update_classification, init_db
    init_db()

    pending = list_deals(status="new", limit=limit)
    total = len(pending)
    summary = {
        "processed": 0,
        "potential_winner": 0,
        "maybe": 0,
        "pass": 0,
        "red_flag": 0,
        "errors": 0,
        "timeouts": 0,
        "total_pending": total,
        "total_seconds": 0.0,
    }

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    batch_start = time.perf_counter()
    for i, deal in enumerate(pending, 1):
        addr_preview = (deal.get("address") or f"id={deal['id']}")[:50]
        _log(f"[{i}/{total}] Classifying deal {deal['id']} — {addr_preview}")

        deal_start = time.perf_counter()
        try:
            result = classify_deal(deal, timeout_seconds=timeout_seconds)
            meta = result.get("_meta", {})

            update_classification(
                deal_id=deal["id"],
                status=result["classification_status"],
                score=result["score"],
                reasons=result["reasons"],
                strategy=result["strategy"],
            )
            summary["processed"] += 1
            cls = result["classification_status"]
            summary[cls] = summary.get(cls, 0) + 1

            if meta.get("timed_out"):
                summary["timeouts"] += 1
                _log(f" [{i}/{total}] TIMEOUT after {timeout_seconds}s — marked maybe+needs_analysis, continuing")
            elif meta.get("parse_error"):
                summary["errors"] += 1
                # parse_error_detail is present but None when Ollama failed or
                # returned nothing, so .get(key, '') would hand back None and
                # slicing it would raise (and double-count the error below).
                detail = meta.get("parse_error_detail") or meta.get("ollama_error") or ""
                _log(f" [{i}/{total}] parse_error: {str(detail)[:80]}")
            else:
                deal_elapsed = time.perf_counter() - deal_start
                _log(f" [{i}/{total}] OK in {deal_elapsed:.1f}s — {cls} (score {result['score']})")

        except Exception as e:
            # Per-deal boundary: log, count, and keep the batch going.
            summary["errors"] += 1
            _log(f" [{i}/{total}] ERROR {type(e).__name__}: {str(e)[:120]}")

    summary["total_seconds"] = round(time.perf_counter() - batch_start, 1)
    _log(f"Batch complete: {summary['processed']}/{total} processed, "
         f"{summary['timeouts']} timeouts, {summary['errors']} errors, "
         f"{summary['total_seconds']:.0f}s total")
    return summary
|