# Files
# AR-House/data_fetchers/price_validator.py
# T
# 2026-07-03 12:24:58 -04:00
#
# 466 lines
# 22 KiB
# Python

"""price_validator.py - detecta discrepancia listing vs market estimates.
PROBLEMA QUE RESUELVE:
Deal con listing $70K mientras Zillow Zestimate dice $280K. Cap rate sale 18%.
Sistema procede a calcular como deal normal sin detectar el RED FLAG mas obvio:
ese precio bajo casi siempre indica problema oculto heredable (liens, foreclosure,
damage, code violations, title issues, etc.).
OBJETIVO:
Detectar discrepancia ≥30% entre listing price y market value estimates.
Alertar al usuario MASIVAMENTE (CRITICAL_RED_FLAG) con lista de razones probables
y due diligence obligatoria.
LOGICA:
- discrepancy < 10% → NORMAL (listing dentro de rango razonable)
- 10-30% → WARNING (listing fuera de rango pero no escandalo)
- ≥30% → CRITICAL_RED_FLAG (algo huele mal — investigar antes de seguir)
FUENTES de market value:
1. Zillow Zestimate (Firecrawl ~3 credits) — opt-in con ENABLE_FIRECRAWL_PRICE_CHECK
2. Redfin estimate (Firecrawl ~3 credits) — opt-in
3. Tax Assessed Value (county scraper, gratis cuando funcione)
4. Existing comps via property_value.py si ya estan computados
FAIL-SOFT: si no hay ninguna fuente disponible, retorna status='UNKNOWN' con warning.
"""
from __future__ import annotations
import os
import re
from datetime import datetime, timezone
from typing import Optional
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# Discrepancy thresholds, in percent of the market estimate.
# validate_price classifies an absolute discrepancy below NORMAL_THRESHOLD_PCT
# as NORMAL, below WARNING_THRESHOLD_PCT as WARNING, and anything at or above
# WARNING_THRESHOLD_PCT as CRITICAL_RED_FLAG.
NORMAL_THRESHOLD_PCT = 10.0
WARNING_THRESHOLD_PCT = 30.0
# Typical tax-assessed → market value ratio in FL
TAX_TO_MARKET_RATIO = 0.85 # tax assessed value is usually ~85% of market value
def _firecrawl_price_check_enabled() -> bool:
"""Flag separado de comps. Default OFF para no quemar credits."""
flag = os.getenv("ENABLE_FIRECRAWL_PRICE_CHECK", "false").lower() == "true"
has_key = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
return flag and has_key
# ═══════════════════════════════════════════════════════════════════════════
# Fetchers de market value
# ═══════════════════════════════════════════════════════════════════════════
def fetch_zillow_zestimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Zillow Zestimate for *address* by scraping Zillow via Firecrawl.

    Each call that reaches the scrape consumes Firecrawl credits (about 3,
    per the original author's note), so every cheap precondition is checked
    before the request is made.

    Args:
        address: Full street address of the property.

    Returns:
        ``(zestimate, errors)``. ``zestimate`` is the parsed dollar amount, or
        ``None`` when the lookup is disabled, fails, or yields no plausible
        value. ``errors`` lists human-readable reasons; it is empty on success.
        The function never raises for lookup failures (fail-soft).
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append(
            "Firecrawl price check deshabilitado. "
            "Setear ENABLE_FIRECRAWL_PRICE_CHECK=true en .env para activar."
        )
        return None, errors
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente en .env")
        return None, errors

    # Zillow address search URL.
    # Format: https://www.zillow.com/homes/{address-with-dashes}_rb/
    # Drop punctuation, then collapse every whitespace run (double spaces,
    # tabs, newlines) into a single dash so the slug never contains "--" or
    # raw whitespace.
    addr_slug = re.sub(r"\s+", "-", re.sub(r"[^\w\s]", "", address or "").strip())
    if not addr_slug:
        # Do not spend credits scraping a URL that cannot identify a property.
        errors.append("Direccion vacia: no se puede construir la URL de Zillow")
        return None, errors
    url = f"https://www.zillow.com/homes/{addr_slug}_rb/"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        if not result or not hasattr(result, "markdown"):
            errors.append("Firecrawl Zillow: respuesta vacia")
            return None, errors
        md = result.markdown or ""
    except Exception as e:
        # Deliberate best-effort boundary: any SDK/network failure is reported
        # through `errors` instead of propagating to the caller.
        errors.append(f"Firecrawl Zillow error: {e}")
        return None, errors

    # Parser: the word "Zestimate" followed by the first dollar amount.
    # The amount is either comma-grouped (285,000 / 12,345,678) or 4-8 bare
    # digits. The trailing (?!\d) rejects a partial match inside a longer
    # number, so an oversized value is never silently truncated into range.
    m = re.search(
        r"zestimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,8})(?!\d)",
        md,
        re.IGNORECASE,
    )
    if not m:
        errors.append("Zillow markdown OK pero patron 'Zestimate $XXX' no encontrado")
        return None, errors

    # The capture group holds only digits and well-formed comma groups, so the
    # conversion cannot fail.
    zestimate = int(m.group(1).replace(",", ""))
    if 20_000 <= zestimate <= 50_000_000:
        return zestimate, errors
    errors.append(f"Zestimate fuera de rango razonable: ${zestimate}")
    return None, errors
def fetch_redfin_estimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Redfin Estimate for *address* by scraping Redfin via Firecrawl.

    Counterpart of the Zillow fetcher. Each call that reaches the scrape
    consumes Firecrawl credits (about 3, per the original author's note).

    Args:
        address: Full street address of the property.

    Returns:
        ``(estimate, errors)``. ``estimate`` is the parsed dollar amount, or
        ``None`` when the lookup is disabled, fails, or yields no plausible
        value. Every ``None`` result has at least one explanatory entry in
        ``errors``. The function never raises for lookup failures (fail-soft).
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append("Firecrawl price check deshabilitado (ENABLE_FIRECRAWL_PRICE_CHECK=true)")
        return None, errors
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente")
        return None, errors

    # Redfin search by address. Drop punctuation, then collapse every
    # whitespace run into a single dash so the slug never contains "--" or
    # raw whitespace.
    addr_slug = re.sub(r"\s+", "-", re.sub(r"[^\w\s]", "", address or "").strip())
    if not addr_slug:
        # Do not spend credits scraping a URL that cannot identify a property.
        errors.append("Direccion vacia: no se puede construir la URL de Redfin")
        return None, errors
    url = f"https://www.redfin.com/?location={addr_slug}"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        # An empty/odd response degrades to "" and is reported below as
        # "pattern not found".
        md = result.markdown if result and hasattr(result, "markdown") else ""
    except Exception as e:
        # Deliberate best-effort boundary: any SDK/network failure is reported
        # through `errors` instead of propagating to the caller.
        errors.append(f"Firecrawl Redfin error: {e}")
        return None, errors

    # The label "Redfin Estimate" followed by the first dollar amount.
    # The amount is either comma-grouped (285,000 / 12,345,678) or 4-8 bare
    # digits. The trailing (?!\d) rejects a partial match inside a longer
    # number, so an oversized value is never silently truncated into range.
    m = re.search(
        r"redfin estimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,8})(?!\d)",
        md or "",
        re.IGNORECASE,
    )
    if not m:
        errors.append("Redfin: patron 'Redfin Estimate' no encontrado")
        return None, errors

    # The capture group holds only digits and well-formed comma groups, so the
    # conversion cannot fail.
    est = int(m.group(1).replace(",", ""))
    if 20_000 <= est <= 50_000_000:
        return est, errors
    # Report the rejection instead of returning None silently, matching the
    # Zillow fetcher's behaviour.
    errors.append(f"Redfin Estimate fuera de rango razonable: ${est}")
    return None, errors
# ═══════════════════════════════════════════════════════════════════════════
# Posibles razones de discrepancia (educacion al usuario)
# ═══════════════════════════════════════════════════════════════════════════
# Bug 6: distressed-deal hypotheses, ordered by likelihood in Florida.
# When the listing is a <$150K SFR and status=UNKNOWN, these are the PRIMARY CAUSES
# the system must surface BEFORE the 12 general reasons.
# Frequency derived from public data: ~60% of <$150K SFR listings in Florida
# are foreclosure-related (auction, REO, pre-foreclosure short sale) or tax deed.
# NOTE: the entries are user-facing runtime strings; adjacent literals on
# consecutive lines are concatenated into a single list item.
DISTRESSED_HYPOTHESIS_REASONS = [
"🥇 FORECLOSURE AUCTION — listing puede ser el opening bid en la subasta judicial. "
"Lookup obligatorio: lis pendens en CCIS del condado (clerk online).",
"🥈 REO (Real Estate Owned) — el banco recupero la propiedad post-foreclosure y la "
"lista as-is cash-quick-close. Comun en bancos chicos / credit unions.",
"🥉 TAX DEED — el condado vendio el certificado por tax delinquency severa. "
"1-year redemption period donde el ex-owner puede recomprar.",
"Pre-foreclosure short sale — owner intenta vender antes de la subasta. "
"Requiere aprobacion del lender (puede llevar 3-6 meses).",
"Wholesale assignment — el wholesaler tiene el deal bajo contrato y vende el contrato. "
"Puede haber issues con marketable title.",
"Probate / estate sale — heirs liquidando rapido. Requiere certificado del juez.",
]
# General explanations for a listing priced far below market (user-facing
# runtime strings). validate_price returns this list as `possible_reasons` for
# a below-market CRITICAL_RED_FLAG, and appends it after the distressed
# hypotheses for a suspiciously low listing whose status is UNKNOWN.
POSSIBLE_RED_FLAG_REASONS = [
"Tax delinquency severa (property tax + interes acumulado puede ser >20% del valor)",
"IRS lien sobre el owner (federal tax lien, 120-day right of redemption)",
"Code enforcement violations grandes (municipalidad puede tener liens de $50K+)",
"Foreclosure en curso (lis pendens publico) — el seller intenta vender antes de subasta",
"Damage severo no fotografiado (fire, water, structural) que requiere $50K-$200K rehab",
"Title issues (clouds en el chain — heirs no identificados, divorce sin completar, fraud)",
"Bankruptcy quick-sale (trustee debe liquidar rapido, precio bajo para cerrar)",
"Wholesaler problem deal (el wholesaler le bajo el precio porque tuvo issues con buyers anteriores)",
"HOA litigation pendiente — lender no presta hasta resolver",
"Open insurance claims que el buyer hereda",
"Polybutylene plumbing + electrical Federal Pacific (re-pipe + repanel costoso)",
"Inhabitable / no certificate of occupancy (puede ser ilegal alquilar tal como esta)",
]
# Due-diligence checklist (user-facing runtime strings). validate_price returns
# it as `mandatory_investigation` for a below-market CRITICAL_RED_FLAG and for
# a suspiciously low listing whose status is UNKNOWN.
MANDATORY_INVESTIGATION_LIST = [
"Court records search (county clerk: lis pendens, foreclosure docket, civil suits)",
"Tax collector / appraiser: verificar pagos al dia + assessed value",
"Code enforcement check con la municipalidad: violations + liens",
"Property records: chain of title del county recorder",
"Title search profesional ($300-$500) ANTES de hacer oferta",
"Drive-by inspection (sin entrar): nivel de mantenimiento exterior, signos de damage",
"PACER bankruptcy search (federal): auto-stay del owner puede invalidar transferencia",
"Permits buscador: openings sin cerrar pueden tener implicaciones legales",
"Open insurance claims: pedir disclosure al seller",
]
# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════
def validate_price(
    *,
    address: str,
    listing_price: float,
    tax_assessed_value: Optional[float] = None,
    existing_comps_estimate: Optional[float] = None,
    existing_comps_confidence: Optional[str] = None,  # Bug 4: "high"|"medium"|"low"|None
    existing_comps_sources: Optional[list] = None,  # Bug 4: list of source labels
    neighborhood_class: Optional[str] = None,  # Bug 6: A|B|C|D|None
    use_firecrawl: Optional[bool] = None,
) -> dict:
    """Validate ``listing_price`` against the available market-value estimates.

    Args:
        address: Full address of the property.
        listing_price: Listed price.
        tax_assessed_value: Optional tax-assessed value; converted to an
            implied market value by dividing by ``TAX_TO_MARKET_RATIO``.
        existing_comps_estimate: Optional mid estimate already computed by
            property_value.py.
        existing_comps_confidence: Confidence of that estimate
            ("high"/"medium"/"low"). If "low", the estimate is NOT used as a
            baseline (Bug 4 fix).
        existing_comps_sources: Source labels behind the comps estimate. If
            they mention only deductions/heuristics and no comp, tax, Zillow
            or Redfin source, the estimate is NOT used as a baseline (Bug 4).
        neighborhood_class: A/B/C/D (case-insensitive). Only used to word the
            recommendation when no estimate is available and the listing is
            below $150K (Bug 6).
        use_firecrawl: True → attempt the Zillow + Redfin lookups (consumes
            credits); False → skip them; None → follow the
            ENABLE_FIRECRAWL_PRICE_CHECK flag.

    Returns:
        A dict that always carries the same keys, whatever the status:
            status: NORMAL | WARNING | CRITICAL_RED_FLAG | UNKNOWN
            listing_price: listing price truncated to int
            market_estimates: {zillow_zestimate, redfin_estimate,
                tax_implied_market, comps_mid} (None where unavailable)
            discrepancies_pct: per-source signed % (negative = listing below
                market); empty when status is UNKNOWN
            max_discrepancy_pct / min_discrepancy_pct: largest / smallest
                absolute discrepancy (None when UNKNOWN)
            signed_max_discrepancy_pct: the largest discrepancy with its sign
                (None when UNKNOWN)
            direction: listing_BELOW_market | listing_ABOVE_market | match
                (None when UNKNOWN)
            possible_reasons, mandatory_investigation: lists of str (fresh
                copies, safe for the caller to mutate)
            recommendation: short advisory text
            sources_used, rejected_sources, errors, fetched_at
        ``suspicious_low_listing`` is additionally present when status is
        UNKNOWN.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    errors: list[str] = []
    sources_used: list[str] = []
    estimates: dict[str, Optional[int]] = {
        "zillow_zestimate": None,
        "redfin_estimate": None,
        "tax_implied_market": None,
        "comps_mid": None,
    }
    rejected_sources: list[str] = []  # Bug 4: tracks discarded sources

    # 1. Tax assessed → implied market value (FL ratio ~85%)
    if tax_assessed_value and tax_assessed_value > 1000:
        estimates["tax_implied_market"] = int(tax_assessed_value / TAX_TO_MARKET_RATIO)
        sources_used.append(f"Tax assessed → market implied (${tax_assessed_value:,.0f} / {TAX_TO_MARKET_RATIO})")

    # 2. Existing comps estimate (from property_value.py) — WITH QUALITY VALIDATION (Bug 4)
    if existing_comps_estimate and existing_comps_estimate > 1000:
        # Reject when confidence is "low": property_value.py had no real data
        # and fell back to the deductions heuristic. Using that as a baseline
        # produces an INVERTED direction (seen in the Jacksonville test).
        is_heuristic_only = False
        if existing_comps_sources:
            srcs_str = " | ".join(str(s) for s in existing_comps_sources).lower()
            # Heuristic-only: a deduction/heuristic label is present and no
            # label mentions a real data source.
            mentions_heuristic = any(
                token in srcs_str for token in ("deduction", "heurística", "heuristica")
            )
            mentions_real_source = any(
                token in srcs_str for token in ("comp", "tax", "zillow", "redfin")
            )
            is_heuristic_only = mentions_heuristic and not mentions_real_source
        if existing_comps_confidence == "low" or is_heuristic_only:
            rejected_sources.append(
                f"property_value comps_mid descartado: confidence={existing_comps_confidence}, "
                f"sources={existing_comps_sources} — fallback heuristico no es baseline valido"
            )
            errors.append(
                "property_value estimate descartado por baja calidad (heuristic-only). "
                "Para validacion confiable: activar ENABLE_FIRECRAWL_COMPS o esperar tax_assessed scraper."
            )
        else:
            estimates["comps_mid"] = int(existing_comps_estimate)
            sources_used.append(f"Comps mid (confidence={existing_comps_confidence or 'unknown'}, ${existing_comps_estimate:,.0f})")

    # 3. Firecrawl lookups (Zillow Zestimate + Redfin Estimate)
    if use_firecrawl is None:
        do_firecrawl = _firecrawl_price_check_enabled()
    else:
        do_firecrawl = use_firecrawl
    if do_firecrawl:
        z, z_errors = fetch_zillow_zestimate(address)
        if z:
            estimates["zillow_zestimate"] = z
            sources_used.append(f"Zillow Zestimate (${z:,.0f})")
        errors.extend(z_errors)
        r, r_errors = fetch_redfin_estimate(address)
        if r:
            estimates["redfin_estimate"] = r
            sources_used.append(f"Redfin Estimate (${r:,.0f})")
        errors.extend(r_errors)

    # 4. No usable estimate → UNKNOWN
    if not any(estimates.values()):
        # Bug 6: UNKNOWN but the listing is suspiciously low → surface the
        # distressed hypotheses (foreclosure / tax deed / REO / short sale).
        suspicious_low_listing = listing_price < 150_000
        # Normalise once so the A/B and C/D checks treat case and stray
        # whitespace identically.
        nclass = str(neighborhood_class or "").strip().upper()
        possible: list[str] = []
        investigation: list[str] = []
        recommendation_text = (
            "No se pudo validar el precio contra fuentes de mercado confiables. "
            "Activar ENABLE_FIRECRAWL_PRICE_CHECK + ENABLE_FIRECRAWL_COMPS en .env "
            "o esperar el tax_assessed scraper para validacion automatica. "
            "Considera lookup manual en Zillow/Redfin antes de proceder."
        )
        if suspicious_low_listing:
            # The distressed hypotheses come FIRST, followed by the general
            # reasons. Both results are new lists, so callers can mutate them
            # without touching the module-level constants.
            possible = DISTRESSED_HYPOTHESIS_REASONS + POSSIBLE_RED_FLAG_REASONS
            investigation = list(MANDATORY_INVESTIGATION_LIST)
            if nclass in ("C", "D"):
                class_note = (
                    f"Vecindario Class {nclass} (income bajo) — listing en este rango "
                    "puede ser market-rate. Pero foreclosure tampoco esta descartado: en Class D "
                    "FL, el porcentaje de foreclosures es ~3x el promedio nacional."
                )
            elif nclass in ("A", "B"):
                class_note = (
                    f"Vecindario Class {nclass} (income medio/alto) — listing tan bajo "
                    "es PROBABLEMENTE deal distressed. Investigar lis pendens en CCIS antes de proceder."
                )
            else:
                # neighborhood_class unknown
                class_note = (
                    "Neighborhood class no disponible — no se puede inferir si el listing es "
                    "market-rate-para-la-zona o distressed."
                )
            recommendation_text = (
                f"⚠️ Listing ${listing_price:,.0f} es estadisticamente raro para SFR en Florida "
                f"(<$150K). {class_note} "
                "HIPOTESIS PRIMARIA: deal distressed (foreclosure, tax_deed, REO, short sale, "
                "pre-foreclosure). Re-verificar deal_type del usuario, hacer court records lookup "
                "(lis pendens en CCIS del condado), y tratar este analisis como PRELIMINAR hasta "
                "confirmar el status real."
            )
        return {
            "status": "UNKNOWN",
            "listing_price": int(listing_price),
            "market_estimates": estimates,
            # Same keys as the validated result, so callers can index the
            # dict without branching on status first.
            "discrepancies_pct": {},
            "max_discrepancy_pct": None,
            "min_discrepancy_pct": None,
            "signed_max_discrepancy_pct": None,
            "direction": None,
            "possible_reasons": possible,
            "mandatory_investigation": investigation,
            "recommendation": recommendation_text,
            "sources_used": sources_used,
            "rejected_sources": rejected_sources,
            "suspicious_low_listing": suspicious_low_listing,
            "errors": errors,
            "fetched_at": fetched_at,
        }

    # Discrepancy % against each estimate
    # (negative = listing < market, positive = listing > market)
    discrepancies = {
        src: round((listing_price - val) / val * 100, 1)
        for src, val in estimates.items()
        if val
    }
    # The largest absolute discrepancy is the most alarming one, above or
    # below market; keep its sign to report the direction.
    signed_max = max(discrepancies.values(), key=abs)
    max_abs_disc = abs(signed_max)
    min_abs_disc = min(abs(d) for d in discrepancies.values())

    possible_reasons: list[str] = []
    investigation = []
    if max_abs_disc < NORMAL_THRESHOLD_PCT:
        status = "NORMAL"
        recommendation = (
            f"Listing dentro de ±{NORMAL_THRESHOLD_PCT}% de market estimates. "
            "Procede con analisis financiero estandar."
        )
    elif max_abs_disc < WARNING_THRESHOLD_PCT:
        status = "WARNING"
        direction = "sobre" if signed_max > 0 else "bajo"
        recommendation = (
            f"Listing {abs(signed_max):.0f}% {direction} el market estimate. "
            "Verifica condiciones del deal antes de proceder. "
            "Si listing > market: probable inflación del seller. "
            "Si listing < market: investigar razon (motivacion legitima vs problema oculto)."
        )
    else:
        status = "CRITICAL_RED_FLAG"
        if signed_max < 0:
            # Listing < market — the dangerous hidden-problem case.
            recommendation = (
                f"🚨 LISTING ${listing_price:,.0f} esta {abs(signed_max):.0f}% BAJO el market estimate. "
                "Esto NO es un 'gran deal' por default — es una RED FLAG masiva. "
                "El precio bajo casi siempre indica problema oculto heredable. "
                "NO procedas con analisis financiero estandar hasta entender el POR QUE del precio bajo. "
                "Cap rate alto en este contexto puede ser ilusion — los costos heredables pueden destruir el deal."
            )
            # Copies: the caller must not be able to mutate the module-level lists.
            possible_reasons = list(POSSIBLE_RED_FLAG_REASONS)
            investigation = list(MANDATORY_INVESTIGATION_LIST)
        else:
            # Listing > market — classic inflated seller price, not dangerous.
            recommendation = (
                f"Listing ${listing_price:,.0f} esta {abs(signed_max):.0f}% SOBRE el market estimate. "
                "Probable inflacion del seller. Oferta agresiva justificada. "
                "Si declinan, walk away — hay deals mejores."
            )

    if signed_max < 0:
        direction_label = "listing_BELOW_market"
    elif signed_max > 0:
        direction_label = "listing_ABOVE_market"
    else:
        direction_label = "match"

    return {
        "status": status,
        "listing_price": int(listing_price),
        "market_estimates": estimates,
        "discrepancies_pct": discrepancies,
        "max_discrepancy_pct": round(max_abs_disc, 1),
        "min_discrepancy_pct": round(min_abs_disc, 1),
        "signed_max_discrepancy_pct": round(signed_max, 1),
        "direction": direction_label,
        "possible_reasons": possible_reasons,
        "mandatory_investigation": investigation,
        "recommendation": recommendation,
        "sources_used": sources_used,
        # Previously tracked but dropped from this return path.
        "rejected_sources": rejected_sources,
        "errors": errors,
        "fetched_at": fetched_at,
    }