466 lines
22 KiB
Python
466 lines
22 KiB
Python
"""price_validator.py - detecta discrepancia listing vs market estimates.
|
|
|
|
PROBLEMA QUE RESUELVE:
|
|
Deal con listing $70K mientras Zillow Zestimate dice $280K. Cap rate sale 18%.
|
|
Sistema procede a calcular como deal normal sin detectar el RED FLAG mas obvio:
|
|
ese precio bajo casi siempre indica problema oculto heredable (liens, foreclosure,
|
|
damage, code violations, title issues, etc.).
|
|
|
|
OBJETIVO:
|
|
Detectar discrepancia >30% entre listing price y market value estimates.
|
|
Alertar al usuario MASIVAMENTE (CRITICAL_RED_FLAG) con lista de razones probables
|
|
y due diligence obligatoria.
|
|
|
|
LOGICA:
|
|
- discrepancy < 10% → NORMAL (listing dentro de rango razonable)
|
|
- 10-30% → WARNING (listing fuera de rango pero no escandalo)
|
|
- ≥30% → CRITICAL_RED_FLAG (algo huele mal — investigar antes de seguir)
|
|
|
|
FUENTES de market value:
|
|
1. Zillow Zestimate (Firecrawl ~3 credits) — opt-in con ENABLE_FIRECRAWL_PRICE_CHECK
|
|
2. Redfin estimate (Firecrawl ~3 credits) — opt-in
|
|
3. Tax Assessed Value (county scraper, gratis cuando funcione)
|
|
4. Existing comps via property_value.py si ya estan computados
|
|
|
|
FAIL-SOFT: si no hay ninguna fuente disponible, retorna status='UNKNOWN' con warning.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
|
|
|
|
|
# Discrepancy thresholds, in percent of the market estimate.
# validate_price() reports NORMAL below NORMAL_THRESHOLD_PCT, WARNING from
# there up to (but excluding) WARNING_THRESHOLD_PCT, and CRITICAL_RED_FLAG at
# or above WARNING_THRESHOLD_PCT.
NORMAL_THRESHOLD_PCT = 10.0
WARNING_THRESHOLD_PCT = 30.0

# Typical tax-assessed -> market value ratio in FL.
# validate_price() divides the assessed value by this ratio to imply a market value.
TAX_TO_MARKET_RATIO = 0.85  # tax assessed value is usually ~85% of market value
|
|
|
|
|
|
def _firecrawl_price_check_enabled() -> bool:
    """Tell whether Firecrawl-based price lookups are allowed to run.

    This flag is separate from the comps flag and defaults to OFF so that
    credits are not spent unintentionally.

    Returns:
        True only when ``ENABLE_FIRECRAWL_PRICE_CHECK`` equals ``"true"``
        (case-insensitive, surrounding whitespace ignored) AND
        ``FIRECRAWL_API_KEY`` is set to a non-blank value.
    """
    # Strip the flag the same way the API key is stripped: a stray space or
    # newline in the environment value must not silently disable the feature.
    flag_value = os.getenv("ENABLE_FIRECRAWL_PRICE_CHECK", "false").strip().lower()
    has_key = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
    return flag_value == "true" and has_key
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Fetchers de market value
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
def fetch_zillow_zestimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Zillow Zestimate for *address* by scraping the page via Firecrawl.

    Expected failures (feature disabled, SDK not installed, missing API key,
    scrape error, value not found) are reported through the returned error
    list instead of being raised.

    Args:
        address: Property address used to build the Zillow search URL.

    Returns:
        ``(zestimate, errors)``. ``zestimate`` is the first dollar amount
        following the word "Zestimate" that falls within 20,000-50,000,000,
        or ``None`` when nothing usable was found. ``errors`` holds
        human-readable explanations.

    Note:
        Per the module notes, each call consumes roughly 3 Firecrawl credits.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append(
            "Firecrawl price check deshabilitado. "
            "Setear ENABLE_FIRECRAWL_PRICE_CHECK=true en .env para activar."
        )
        return None, errors

    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente en .env")
        return None, errors

    # Zillow address search URL.
    # Format: https://www.zillow.com/homes/{address-with-dashes}_rb/
    # Drop punctuation, then join on ANY whitespace run so tabs/newlines and
    # repeated spaces cannot leak into the URL or produce "--".
    addr_slug = "-".join(re.sub(r"[^\w\s]", "", address or "").split())
    if not addr_slug:
        # Do not spend credits scraping a URL that cannot identify a property.
        errors.append("Zillow: address vacio, no se puede construir la URL")
        return None, errors
    url = f"https://www.zillow.com/homes/{addr_slug}_rb/"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        if not result or not hasattr(result, "markdown"):
            errors.append("Firecrawl Zillow: respuesta vacia")
            return None, errors
        md = result.markdown or ""
    except Exception as e:  # fail-soft boundary around the third-party SDK
        errors.append(f"Firecrawl Zillow error: {e}")
        return None, errors

    # Parser: look for "Zestimate" followed by the nearest dollar amount.
    # The amount is either comma-grouped ("280,000", "12,500,000") or a plain
    # run of 4+ digits. The lookahead refuses to stop in the middle of a
    # longer number, so "$10,000,000" can no longer be truncated into a
    # plausible-looking but wrong value.
    candidates = [
        int(match.group(1).replace(",", ""))
        for match in re.finditer(
            r"zestimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,})(?!,?\d)",
            md,
            re.IGNORECASE,
        )
    ]
    if not candidates:
        errors.append("Zillow markdown OK pero patron 'Zestimate $XXX' no encontrado")
        return None, errors

    # Return the first plausible value: an earlier out-of-range hit (e.g. a
    # monthly "Rent Zestimate") must not hide a valid Zestimate further down.
    for value in candidates:
        if 20_000 <= value <= 50_000_000:
            return value, errors

    errors.append(f"Zestimate fuera de rango razonable: ${candidates[0]}")
    return None, errors
|
|
|
|
|
|
def fetch_redfin_estimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Redfin Estimate for *address* by scraping Redfin via Firecrawl.

    Mirrors the Zillow fetcher: expected failures are reported through the
    returned error list instead of being raised.

    Args:
        address: Property address used to build the Redfin search URL.

    Returns:
        ``(estimate, errors)``. ``estimate`` is the first dollar amount
        following "Redfin Estimate" that falls within 20,000-50,000,000, or
        ``None`` when nothing usable was found. ``errors`` holds
        human-readable explanations.

    Note:
        Per the module notes, each call consumes roughly 3 Firecrawl credits.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append("Firecrawl price check deshabilitado (ENABLE_FIRECRAWL_PRICE_CHECK=true)")
        return None, errors

    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente")
        return None, errors

    # Redfin search by address. Drop punctuation, then join on ANY whitespace
    # run so tabs/newlines and repeated spaces cannot leak into the URL.
    addr_slug = "-".join(re.sub(r"[^\w\s]", "", address or "").split())
    if not addr_slug:
        # Do not spend credits scraping a URL that cannot identify a property.
        errors.append("Redfin: address vacio, no se puede construir la URL")
        return None, errors
    url = f"https://www.redfin.com/?location={addr_slug}"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        # `markdown` may be missing OR present-but-None; either way fall back
        # to "" so the regex below never receives None (that raised TypeError
        # outside this try block and defeated the fail-soft contract).
        md = (getattr(result, "markdown", None) if result else None) or ""
    except Exception as e:  # fail-soft boundary around the third-party SDK
        errors.append(f"Firecrawl Redfin error: {e}")
        return None, errors

    # The amount is either comma-grouped or a plain run of 4+ digits; the
    # lookahead refuses to stop in the middle of a longer number so values of
    # $10M+ are not truncated into a plausible-looking but wrong figure.
    candidates = [
        int(match.group(1).replace(",", ""))
        for match in re.finditer(
            r"redfin estimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,})(?!,?\d)",
            md,
            re.IGNORECASE,
        )
    ]
    if not candidates:
        errors.append("Redfin: patron 'Redfin Estimate' no encontrado")
        return None, errors

    # First plausible value wins.
    for value in candidates:
        if 20_000 <= value <= 50_000_000:
            return value, errors

    # Previously this path returned None silently; report it like the Zillow
    # fetcher does so the caller can see why no estimate was produced.
    errors.append(f"Redfin Estimate fuera de rango razonable: ${candidates[0]}")
    return None, errors
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Posibles razones de discrepancia (educacion al usuario)
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
# Bug 6: distressed-sale hypotheses, ordered by likelihood in Florida.
# When the listing is a <$150K SFR and the status is UNKNOWN, these are the
# PRIMARY CAUSES the system must surface BEFORE the 12 general reasons.
# Frequency (per the original author's note, derived from public data): ~60%
# of <$150K SFR listings in Florida are foreclosure-related (auction, REO,
# pre-foreclosure short sale) or tax deed.
# These are user-facing strings; validate_price() prepends them to
# POSSIBLE_RED_FLAG_REASONS on the UNKNOWN + low-listing path.
DISTRESSED_HYPOTHESIS_REASONS = [
    "🥇 FORECLOSURE AUCTION — listing puede ser el opening bid en la subasta judicial. "
    "Lookup obligatorio: lis pendens en CCIS del condado (clerk online).",
    "🥈 REO (Real Estate Owned) — el banco recupero la propiedad post-foreclosure y la "
    "lista as-is cash-quick-close. Comun en bancos chicos / credit unions.",
    "🥉 TAX DEED — el condado vendio el certificado por tax delinquency severa. "
    "1-year redemption period donde el ex-owner puede recomprar.",
    "Pre-foreclosure short sale — owner intenta vender antes de la subasta. "
    "Requiere aprobacion del lender (puede llevar 3-6 meses).",
    "Wholesale assignment — el wholesaler tiene el deal bajo contrato y vende el contrato. "
    "Puede haber issues con marketable title.",
    "Probate / estate sale — heirs liquidando rapido. Requiere certificado del juez.",
]
|
|
|
|
# General (non-ranked) explanations for a listing priced far below market.
# User-facing strings: validate_price() returns them as `possible_reasons`
# when the listing is >= WARNING_THRESHOLD_PCT below the market estimate, and
# appends them after the distressed hypotheses on the UNKNOWN + low-listing path.
POSSIBLE_RED_FLAG_REASONS = [
    "Tax delinquency severa (property tax + interes acumulado puede ser >20% del valor)",
    "IRS lien sobre el owner (federal tax lien, 120-day right of redemption)",
    "Code enforcement violations grandes (municipalidad puede tener liens de $50K+)",
    "Foreclosure en curso (lis pendens publico) — el seller intenta vender antes de subasta",
    "Damage severo no fotografiado (fire, water, structural) que requiere $50K-$200K rehab",
    "Title issues (clouds en el chain — heirs no identificados, divorce sin completar, fraud)",
    "Bankruptcy quick-sale (trustee debe liquidar rapido, precio bajo para cerrar)",
    "Wholesaler problem deal (el wholesaler le bajo el precio porque tuvo issues con buyers anteriores)",
    "HOA litigation pendiente — lender no presta hasta resolver",
    "Open insurance claims que el buyer hereda",
    "Polybutylene plumbing + electrical Federal Pacific (re-pipe + repanel costoso)",
    "Inhabitable / no certificate of occupancy (puede ser ilegal alquilar tal como esta)",
]
|
|
|
|
# Due-diligence checklist shown to the user alongside the red-flag reasons.
# User-facing strings: validate_price() returns them as
# `mandatory_investigation` on the below-market CRITICAL_RED_FLAG path and on
# the UNKNOWN + low-listing path.
MANDATORY_INVESTIGATION_LIST = [
    "Court records search (county clerk: lis pendens, foreclosure docket, civil suits)",
    "Tax collector / appraiser: verificar pagos al dia + assessed value",
    "Code enforcement check con la municipalidad: violations + liens",
    "Property records: chain of title del county recorder",
    "Title search profesional ($300-$500) ANTES de hacer oferta",
    "Drive-by inspection (sin entrar): nivel de mantenimiento exterior, signos de damage",
    "PACER bankruptcy search (federal): auto-stay del owner puede invalidar transferencia",
    "Permits buscador: openings sin cerrar pueden tener implicaciones legales",
    "Open insurance claims: pedir disclosure al seller",
]
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# API publica
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
def _comps_sources_are_heuristic_only(sources: Optional[list]) -> bool:
    """Return True when *sources* only name the deductions/heuristic fallback.

    A source list counts as heuristic-only when its labels mention
    "deduction"/"heuristica" and none of them mention real data
    ("comp", "tax", "zillow", "redfin"). An empty or missing list is not
    treated as heuristic-only.
    """
    if not sources:
        return False
    joined = " | ".join(str(s) for s in sources).lower()
    mentions_heuristic = any(
        marker in joined for marker in ("deduction", "heurística", "heuristica")
    )
    mentions_real_data = any(
        marker in joined for marker in ("comp", "tax", "zillow", "redfin")
    )
    return mentions_heuristic and not mentions_real_data


def _distressed_recommendation(listing_price: float, class_code: str) -> str:
    """Build the recommendation text for an unvalidated listing under $150K.

    *class_code* is the upper-cased neighborhood class ("A".."D"), or an
    empty/other string when the class is unknown.
    """
    if class_code in ("C", "D"):
        class_note = (
            f"Vecindario Class {class_code} (income bajo) — listing en este rango "
            "puede ser market-rate. Pero foreclosure tampoco esta descartado: en Class D "
            "FL, el porcentaje de foreclosures es ~3x el promedio nacional."
        )
    elif class_code in ("A", "B"):
        class_note = (
            f"Vecindario Class {class_code} (income medio/alto) — listing tan bajo "
            "es PROBABLEMENTE deal distressed. Investigar lis pendens en CCIS antes de proceder."
        )
    else:
        # Neighborhood class unknown.
        class_note = (
            "Neighborhood class no disponible — no se puede inferir si el listing es "
            "market-rate-para-la-zona o distressed."
        )
    return (
        f"⚠️ Listing ${listing_price:,.0f} es estadisticamente raro para SFR en Florida "
        f"(<$150K). {class_note} "
        "HIPOTESIS PRIMARIA: deal distressed (foreclosure, tax_deed, REO, short sale, "
        "pre-foreclosure). Re-verificar deal_type del usuario, hacer court records lookup "
        "(lis pendens en CCIS del condado), y tratar este analisis como PRELIMINAR hasta "
        "confirmar el status real."
    )


def validate_price(
    *,
    address: str,
    listing_price: float,
    tax_assessed_value: Optional[float] = None,
    existing_comps_estimate: Optional[float] = None,
    existing_comps_confidence: Optional[str] = None,  # Bug 4: "high"|"medium"|"low"|None
    existing_comps_sources: Optional[list] = None,  # Bug 4: list of source labels
    neighborhood_class: Optional[str] = None,  # Bug 6: A|B|C|D|None
    use_firecrawl: Optional[bool] = None,
) -> dict:
    """Validate ``listing_price`` against the available market-value estimates.

    Args:
        address: Full property address.
        listing_price: Listed price.
        tax_assessed_value: Optional tax-assessed value; when > 1000 it is
            divided by ``TAX_TO_MARKET_RATIO`` to imply a market value.
        existing_comps_estimate: Optional mid estimate from property_value.py.
        existing_comps_confidence: Confidence of that estimate
            ("high"/"medium"/"low", compared case-insensitively). When "low"
            the estimate is NOT used as a baseline (Bug 4 fix).
        existing_comps_sources: Source labels behind the comps estimate. When
            they only name the deductions/heuristic fallback, the estimate is
            NOT used as a baseline (Bug 4).
        neighborhood_class: A/B/C/D (case-insensitive) or None. Only used to
            word the recommendation when no estimate is available and the
            listing is below $150K (Bug 6).
        use_firecrawl: True → attempt Zillow + Redfin lookups (consumes
            credits); False → skip them; None → follow the
            ENABLE_FIRECRAWL_PRICE_CHECK environment flag. Note the fetchers
            re-check that flag themselves and report an error when it is off.

    Returns:
        A dict that always contains ``status`` (NORMAL | WARNING |
        CRITICAL_RED_FLAG | UNKNOWN), ``listing_price``, ``market_estimates``,
        ``max_discrepancy_pct``, ``possible_reasons``,
        ``mandatory_investigation``, ``recommendation``, ``sources_used``,
        ``rejected_sources``, ``errors`` and ``fetched_at``.
        UNKNOWN results add ``min_discrepancy_pct`` and
        ``suspicious_low_listing``; all other results add
        ``discrepancies_pct``, ``signed_max_discrepancy_pct`` and
        ``direction``. The reason/investigation lists are fresh copies, safe
        for the caller to mutate.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    errors: list[str] = []
    sources_used: list[str] = []
    rejected_sources: list[str] = []  # Bug 4: tracks discarded sources
    estimates: dict[str, Optional[int]] = {
        "zillow_zestimate": None,
        "redfin_estimate": None,
        "tax_implied_market": None,
        "comps_mid": None,
    }

    # 1. Tax assessed -> implied market value (FL ratio ~85%).
    if tax_assessed_value and tax_assessed_value > 1000:
        estimates["tax_implied_market"] = int(tax_assessed_value / TAX_TO_MARKET_RATIO)
        sources_used.append(
            f"Tax assessed → market implied (${tax_assessed_value:,.0f} / {TAX_TO_MARKET_RATIO})"
        )

    # 2. Existing comps estimate (from property_value.py), WITH QUALITY CHECK (Bug 4).
    if existing_comps_estimate and existing_comps_estimate > 1000:
        # A "low" confidence means property_value.py had no real data and fell
        # back to the deductions heuristic; using it as a baseline produced an
        # INVERTED direction (seen in the Jacksonville test). Normalise the
        # label so "LOW"/"Low " cannot slip past the guard.
        confidence = str(existing_comps_confidence or "").strip().lower()
        if confidence == "low" or _comps_sources_are_heuristic_only(existing_comps_sources):
            rejected_sources.append(
                f"property_value comps_mid descartado: confidence={existing_comps_confidence}, "
                f"sources={existing_comps_sources} — fallback heuristico no es baseline valido"
            )
            errors.append(
                "property_value estimate descartado por baja calidad (heuristic-only). "
                "Para validacion confiable: activar ENABLE_FIRECRAWL_COMPS o esperar tax_assessed scraper."
            )
        else:
            estimates["comps_mid"] = int(existing_comps_estimate)
            sources_used.append(
                f"Comps mid (confidence={existing_comps_confidence or 'unknown'}, ${existing_comps_estimate:,.0f})"
            )

    # 3. Firecrawl lookups (Zillow Zestimate + Redfin Estimate).
    do_firecrawl = _firecrawl_price_check_enabled() if use_firecrawl is None else use_firecrawl
    if do_firecrawl:
        z, z_errors = fetch_zillow_zestimate(address)
        if z:
            estimates["zillow_zestimate"] = z
            sources_used.append(f"Zillow Zestimate (${z:,.0f})")
        errors.extend(z_errors)

        r, r_errors = fetch_redfin_estimate(address)
        if r:
            estimates["redfin_estimate"] = r
            sources_used.append(f"Redfin Estimate (${r:,.0f})")
        errors.extend(r_errors)

    # 4. Discrepancy in % against each estimate
    #    (negative = listing below market, positive = listing above market).
    discrepancies = {
        src: round((listing_price - val) / val * 100, 1)
        for src, val in estimates.items()
        if val
    }

    if not discrepancies:
        # Bug 6: UNKNOWN but suspiciously low listing → surface the distressed
        # hypotheses (foreclosure / tax deed / REO / pre-foreclosure short sale).
        suspicious_low_listing = listing_price < 150_000
        possible: list[str] = []
        investigation: list[str] = []
        recommendation_text = (
            "No se pudo validar el precio contra fuentes de mercado confiables. "
            "Activar ENABLE_FIRECRAWL_PRICE_CHECK + ENABLE_FIRECRAWL_COMPS en .env "
            "o esperar el tax_assessed scraper para validacion automatica. "
            "Considera lookup manual en Zillow/Redfin antes de proceder."
        )

        if suspicious_low_listing:
            # Distressed hypotheses come first, then the general reasons.
            possible = DISTRESSED_HYPOTHESIS_REASONS + POSSIBLE_RED_FLAG_REASONS
            investigation = list(MANDATORY_INVESTIGATION_LIST)
            # Normalise once so "a"/"b" are recognised just like "c"/"d".
            class_code = str(neighborhood_class or "").strip().upper()
            recommendation_text = _distressed_recommendation(listing_price, class_code)

        return {
            "status": "UNKNOWN",
            "listing_price": int(listing_price),
            "market_estimates": estimates,
            "max_discrepancy_pct": None,
            "min_discrepancy_pct": None,
            "possible_reasons": possible,
            "mandatory_investigation": investigation,
            "recommendation": recommendation_text,
            "sources_used": sources_used,
            "rejected_sources": rejected_sources,
            "suspicious_low_listing": suspicious_low_listing,
            "errors": errors,
            "fetched_at": fetched_at,
        }

    # The largest absolute discrepancy is the most alarming one, whether the
    # listing is above or below market; keep its sign to report the direction.
    signed_max = max(discrepancies.values(), key=abs)
    max_abs_disc = abs(signed_max)

    possible_reasons: list[str] = []
    investigation = []

    if max_abs_disc < NORMAL_THRESHOLD_PCT:
        status = "NORMAL"
        recommendation = (
            f"Listing dentro de ±{NORMAL_THRESHOLD_PCT}% de market estimates. "
            "Procede con analisis financiero estandar."
        )
    elif max_abs_disc < WARNING_THRESHOLD_PCT:
        status = "WARNING"
        direction = "sobre" if signed_max > 0 else "bajo"
        recommendation = (
            f"Listing {abs(signed_max):.0f}% {direction} el market estimate. "
            "Verifica condiciones del deal antes de proceder. "
            "Si listing > market: probable inflación del seller. "
            "Si listing < market: investigar razon (motivacion legitima vs problema oculto)."
        )
    else:
        status = "CRITICAL_RED_FLAG"
        if signed_max < 0:
            # Listing below market: the dangerous hidden-problem case.
            recommendation = (
                f"🚨 LISTING ${listing_price:,.0f} esta {abs(signed_max):.0f}% BAJO el market estimate. "
                "Esto NO es un 'gran deal' por default — es una RED FLAG masiva. "
                "El precio bajo casi siempre indica problema oculto heredable. "
                "NO procedas con analisis financiero estandar hasta entender el POR QUE del precio bajo. "
                "Cap rate alto en este contexto puede ser ilusion — los costos heredables pueden destruir el deal."
            )
            # Copies, so a caller mutating the result cannot corrupt the
            # module-level lists shared by every later call.
            possible_reasons = list(POSSIBLE_RED_FLAG_REASONS)
            investigation = list(MANDATORY_INVESTIGATION_LIST)
        else:
            # Listing above market: classic inflated ask, not dangerous.
            recommendation = (
                f"Listing ${listing_price:,.0f} esta {abs(signed_max):.0f}% SOBRE el market estimate. "
                "Probable inflacion del seller. Oferta agresiva justificada. "
                "Si declinan, walk away — hay deals mejores."
            )

    if signed_max < 0:
        direction_label = "listing_BELOW_market"
    elif signed_max > 0:
        direction_label = "listing_ABOVE_market"
    else:
        direction_label = "match"

    return {
        "status": status,
        "listing_price": int(listing_price),
        "market_estimates": estimates,
        "discrepancies_pct": discrepancies,
        "max_discrepancy_pct": round(max_abs_disc, 1),
        "signed_max_discrepancy_pct": round(signed_max, 1),
        "direction": direction_label,
        "possible_reasons": possible_reasons,
        "mandatory_investigation": investigation,
        "recommendation": recommendation,
        "sources_used": sources_used,
        # Previously collected but dropped on this path; report it so callers
        # can see that a comps estimate was discarded.
        "rejected_sources": rejected_sources,
        "errors": errors,
        "fetched_at": fetched_at,
    }
|