"""Neighborhood classifier (A/B/C/D) based on objective indicators.

CRITICAL - LEGAL COMPLIANCE:
The classification relies ONLY on economic indicators and public data:
income, owner-occupancy, education attainment, vacancy, crime, days on market.

It NEVER uses racial or ethnic demographics. The Fair Housing Act (federal)
prohibits redlining. This is an ECONOMIC classification, not a demographic one.

Indicators and weights (max 100):
  - median_household_income (Census ACS)            25%
  - owner_occupied_pct (Census ACS)                 20%
  - education_attainment_pct_bachelor_plus (ACS)    20%
  - crime_vs_national (FBI UCR)                     15%
  - vacancy_rate (Census ACS)                       10%
  - days_on_market_median (Firecrawl, opt-in)       10%

Graceful degradation: if an indicator is unavailable (API key missing,
fetcher failed), its weight is redistributed among the available ones.

confidence_level (based on the NUMBER of available indicators):
  - "high":         5-6 indicators
  - "medium":       3-4 indicators
  - "low":          1-2 indicators
  - "unclassified": 0 indicators
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
import requests
|
|
from dotenv import load_dotenv
|
|
|
|
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
|
|
|
|
|
# ─── Pesos del algoritmo de clasificacion ───────────────────────────────────
|
|
# Maximum points each indicator can contribute to the score; the six
# weights add up to 100.
WEIGHTS = {
    "income": 25,           # median household income
    "owner_occupied": 20,   # owner-occupied share of occupied units
    "education": 20,        # share of adults 25+ with a bachelor's degree or higher
    "crime": 15,            # crime ratio versus the national rate
    "vacancy": 10,          # vacancy rate
    "dom": 10,              # median days on market (opt-in)
}
|
|
|
|
# ─── Census ACS variable codes ──────────────────────────────────────────────
|
|
# Short internal name -> Census ACS variable code requested from the API.
ACS_VARS = {
    # Income
    "income": "B19013_001E",          # Median household income (last 12 months)
    # Tenure
    "oo_count": "B25003_002E",        # Owner-occupied housing units count
    "occupied_total": "B25003_001E",  # Total occupied housing units
    # Occupancy status
    "vacant_count": "B25002_003E",    # Vacant housing units count
    "housing_total": "B25002_001E",   # Total housing units (occupied + vacant)
    # Home value
    "home_value": "B25077_001E",      # Median home value
    # Educational attainment
    "edu_total": "B15003_001E",       # Total population 25+
    "edu_bachelor": "B15003_022E",    # Bachelor's degree
    "edu_master": "B15003_023E",      # Master's degree
    "edu_prof": "B15003_024E",        # Professional school degree
    "edu_doctorate": "B15003_025E",   # Doctorate degree
}
|
|
|
|
# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
|
|
# Usado como denominador para crime_vs_national. Actualizar anualmente.
|
|
# Denominators used to turn a local per-100K crime rate into a ratio
# against the national rate.
NATIONAL_VIOLENT_CRIME_PER_100K = 380.7    # violent crimes per 100K population
NATIONAL_PROPERTY_CRIME_PER_100K = 1954.4  # property crimes per 100K population
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Fetchers individuales (fail-soft)
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch Census ACS 5-year indicators for the tract identified by ``geocode``.

    Fail-soft: a missing API key, an incomplete geocode, HTTP failures and
    malformed payloads are reported through the returned error list.

    Args:
        geocode: mapping that must provide ``state_fips``,
            ``county_code_only`` and ``tract_code``.

    Returns:
        ``(indicators, errors)``. ``indicators`` may contain
        ``median_household_income``, ``owner_occupied_pct``, ``vacancy_rate``,
        ``median_home_value`` and ``education_attainment_pct_bachelor_plus``.
        An indicator whose inputs are missing or suppressed is omitted.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("CENSUS_API_KEY", "").strip()
    if not api_key:
        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
        return out, errors

    state_fips = geocode.get("state_fips")
    county_code = geocode.get("county_code_only")
    tract_code = geocode.get("tract_code")
    if not state_fips or not county_code or not tract_code:
        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
        return out, errors

    url = "https://api.census.gov/data/2022/acs/acs5"
    # All variables are requested in a single call.
    params = {
        "get": ",".join(["NAME", *ACS_VARS.values()]),
        "for": f"tract:{tract_code}",
        "in": f"state:{state_fips} county:{county_code}",
        "key": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        # requests error messages can embed the request URL, which carries
        # the API key as a query parameter; mask it before the message is
        # handed back to the caller.
        safe_msg = str(e).replace(api_key, "***")
        errors.append(f"Census ACS HTTP: {safe_msg}")
        return out, errors

    # Decoded separately so that a bad body is reported as a JSON problem
    # rather than as an HTTP one.
    try:
        data = r.json()
    except ValueError as e:
        errors.append(f"Census ACS JSON: {e}")
        return out, errors

    # Expected payload is [header_row, data_row]; anything else (empty body,
    # JSON object, scalar) is treated as "no data for this tract".
    if (
        not isinstance(data, list)
        or len(data) < 2
        or not isinstance(data[0], list)
        or not isinstance(data[1], list)
    ):
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors

    header = data[0]
    row = data[1]
    idx = {col: i for i, col in enumerate(header)}

    def _f(col: str) -> Optional[float]:
        """Return column ``col`` of the data row as a non-negative float, else None."""
        try:
            raw = row[idx[col]]
        except (KeyError, IndexError):
            return None
        if raw is None or raw == "" or raw == "null":
            return None
        try:
            number = float(raw)
        except (ValueError, TypeError):
            return None
        # Census uses negative values for "no data" / "suppressed".
        return number if number >= 0 else None

    # 1) Median household income
    income = _f(ACS_VARS["income"])
    if income is not None:
        out["median_household_income"] = round(income, 0)

    # 2) Owner-occupied percentage (share of occupied units)
    oo = _f(ACS_VARS["oo_count"])
    total = _f(ACS_VARS["occupied_total"])
    if oo is not None and total and total > 0:
        out["owner_occupied_pct"] = round(oo / total * 100, 1)

    # 3) Vacancy rate (share of all housing units)
    vacant = _f(ACS_VARS["vacant_count"])
    housing = _f(ACS_VARS["housing_total"])
    if vacant is not None and housing and housing > 0:
        out["vacancy_rate"] = round(vacant / housing * 100, 1)

    # 4) Median home value (reported, not part of the score weights)
    home_value = _f(ACS_VARS["home_value"])
    if home_value is not None:
        out["median_home_value"] = round(home_value, 0)

    # 5) Education attainment (% bachelor's or higher, age 25+).
    # Only computed when every degree count is present: treating a missing
    # or suppressed count as zero would under-report the percentage.
    edu_total = _f(ACS_VARS["edu_total"])
    degree_counts = [
        _f(ACS_VARS[name])
        for name in ("edu_bachelor", "edu_master", "edu_prof", "edu_doctorate")
    ]
    if edu_total and edu_total > 0 and all(c is not None for c in degree_counts):
        pct = sum(degree_counts) / edu_total * 100
        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)

    return out, errors
|
|
|
|
|
|
def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch a crime-versus-national ratio from the FBI Crime Data Explorer.

    The request targets the state-level estimate endpoint for the year 2022,
    so the resulting ratio describes the whole state, not the neighborhood.
    Fail-soft: a missing API key, a missing state, HTTP failures and
    unexpected payloads are reported through the returned error list.

    Args:
        geocode: mapping that must provide ``state`` (e.g. ``"FL"``).

    Returns:
        ``(indicators, errors)``. On success ``indicators`` holds
        ``crime_vs_national`` plus the auxiliary ``_crime_state_level_note``;
        otherwise it is empty.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
    if not api_key:
        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
        return out, errors

    state_abbr = geocode.get("state")  # e.g. "FL"
    if not state_abbr:
        errors.append("state abbreviation faltante en geocode")
        return out, errors

    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
    params = {
        "from": "2022",
        "to": "2022",
        "API_KEY": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        # requests error messages can embed the request URL, which carries
        # the API key as a query parameter; mask it before the message is
        # handed back to the caller.
        safe_msg = str(e).replace(api_key, "***")
        errors.append(f"FBI UCR HTTP: {safe_msg}")
        return out, errors

    # Decoded separately so that a bad body is reported as a JSON problem
    # rather than as an HTTP one.
    try:
        data = r.json()
    except ValueError as e:
        errors.append(f"FBI UCR JSON: {e}")
        return out, errors

    # The payload shape is not guaranteed: accept an object wrapping the
    # records under "estimates" or "data", or a bare list of records.
    # The type is checked first so a list payload never hits ``.get``.
    if isinstance(data, dict):
        estimates = data.get("estimates") or data.get("data")
    elif isinstance(data, list):
        estimates = data
    else:
        estimates = None
    if not estimates:
        errors.append("FBI UCR sin estimates en respuesta")
        return out, errors

    rec = estimates[0] if isinstance(estimates, list) else estimates
    if not isinstance(rec, dict):
        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
        return out, errors

    population = rec.get("population")
    violent = rec.get("violent_crime")
    # Optional: when absent or falsy the ratio falls back to violent crime only.
    property_c = rec.get("property_crime")

    if not population or not violent:
        errors.append("FBI UCR sin population o violent_crime en estimate")
        return out, errors

    try:
        pop = float(population)
        violent_per_100k = float(violent) / pop * 100000
        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K

        if property_c:
            property_per_100k = float(property_c) / pop * 100000
            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
            # Weighted average: violent crime counts 2/3, property crime 1/3.
            combined = (ratio_violent * 2 + ratio_property) / 3
        else:
            combined = ratio_violent

        out["crime_vs_national"] = round(combined, 2)
        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
    except (TypeError, ValueError, ZeroDivisionError) as e:
        # ZeroDivisionError covers a truthy-but-zero population such as "0".
        errors.append(f"FBI UCR calc error: {e}")

    return out, errors
|
|
|
|
|
|
def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
    """Placeholder for the median days-on-market lookup (Firecrawl / Zillow).

    Not implemented yet: ``geocode`` is ignored and the function always
    returns an empty indicator dict together with a single "pending" error.
    Intended to be opt-in because the real lookup will consume Firecrawl
    credits.
    """
    pending_notice = "DOM Firecrawl: implementacion pendiente Phase 3B paso 6"
    return {}, [pending_notice]
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Algoritmo de clasificacion
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
def _score_income(income: float) -> int:
    """Return income points (3-25): higher median household income scores more."""
    # (inclusive lower bound, points), checked from the highest tier down.
    tiers = ((100000, 25), (60000, 18), (35000, 10))
    for floor, pts in tiers:
        if income >= floor:
            return pts
    return 3
|
|
|
|
|
|
def _score_owner_occupied(pct: float) -> int:
    """Return owner-occupancy points (3-20): a higher percentage scores more."""
    # (inclusive lower bound, points), checked from the highest tier down.
    tiers = ((80, 20), (60, 15), (40, 8))
    for floor, pts in tiers:
        if pct >= floor:
            return pts
    return 3
|
|
|
|
|
|
def _score_education(pct_bach_plus: float) -> int:
    """Return education points (2-20): a higher bachelor-or-above share scores more."""
    # (inclusive lower bound, points), checked from the highest tier down.
    tiers = ((50, 20), (30, 14), (15, 7))
    for floor, pts in tiers:
        if pct_bach_plus >= floor:
            return pts
    return 2
|
|
|
|
|
|
def _score_crime(ratio_vs_national: float) -> int:
    """Return crime points (2-15): a lower ratio versus national scores more."""
    # (exclusive upper bound, points), checked from the best tier down.
    tiers = ((0.7, 15), (1.0, 12), (1.5, 7))
    for ceiling, pts in tiers:
        if ratio_vs_national < ceiling:
            return pts
    return 2
|
|
|
|
|
|
def _score_vacancy(pct: float) -> int:
    """Return vacancy points (1-10): a lower vacancy rate scores more."""
    # (exclusive upper bound, points), checked from the best tier down.
    tiers = ((3, 10), (6, 7), (10, 4))
    for ceiling, pts in tiers:
        if pct < ceiling:
            return pts
    return 1
|
|
|
|
|
|
def _score_dom(days: float) -> int:
    """Return days-on-market points (1-10): fewer days scores more."""
    # (exclusive upper bound, points), checked from the best tier down.
    tiers = ((30, 10), (60, 7), (90, 4))
    for ceiling, pts in tiers:
        if days < ceiling:
            return pts
    return 1
|
|
|
|
|
|
def _classify(indicators: dict) -> dict:
    """Score the available indicators and map the result to a class letter.

    Graceful degradation: only indicators whose value is not ``None`` are
    scored, and the points earned are rescaled against the combined weight
    of those indicators, so a missing indicator does not drag the score down.

    Args:
        indicators: mapping of indicator name to value.

    Returns:
        dict with ``neighborhood_class`` ("A"-"D" or "unclassified"),
        ``class_score`` (0-100), ``confidence_level``,
        ``indicators_available``, ``weight_coverage_pct`` and ``raw_points``.
        Every key is present in both the classified and unclassified cases.
    """
    score_funcs = {
        "income": (_score_income, "median_household_income"),
        "owner_occupied": (_score_owner_occupied, "owner_occupied_pct"),
        "education": (_score_education, "education_attainment_pct_bachelor_plus"),
        "crime": (_score_crime, "crime_vs_national"),
        "vacancy": (_score_vacancy, "vacancy_rate"),
        "dom": (_score_dom, "days_on_market_median"),
    }

    points: dict = {}
    for key, (func, indicator_name) in score_funcs.items():
        val = indicators.get(indicator_name)
        if val is not None:
            points[key] = func(val)

    indicators_available = list(points)
    n_available = len(indicators_available)

    if n_available == 0:
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators_available": [],
            "weight_coverage_pct": 0,
            # Present here as well so the result shape matches the
            # classified case.
            "raw_points": {},
        }

    # Confidence depends only on how many indicators were available.
    if n_available <= 2:
        confidence = "low"
    elif n_available <= 4:
        confidence = "medium"
    else:
        confidence = "high"

    # Rescale the earned points against the weight of the available indicators.
    total_weight_available = sum(WEIGHTS[k] for k in indicators_available)
    total_points = sum(points.values())
    scaled_0_to_100 = (total_points / total_weight_available) * 100

    if scaled_0_to_100 >= 85:
        letter = "A"
    elif scaled_0_to_100 >= 65:
        letter = "B"
    elif scaled_0_to_100 >= 40:
        letter = "C"
    else:
        letter = "D"

    return {
        "neighborhood_class": letter,
        "class_score": round(scaled_0_to_100, 1),
        "confidence_level": confidence,
        "indicators_available": indicators_available,
        "weight_coverage_pct": total_weight_available,
        "raw_points": points,
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Investment implications por clase
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
# Investor-facing notes for each class letter. Every entry carries the same
# five keys; "unclassified" is the entry used when no class could be assigned.
INVESTMENT_IMPLICATIONS = {
    "A": {
        "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
        "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
        "appreciation_potential": "Alta - tipicamente supera inflacion",
        "tenant_quality_expected": "Profesional, familias, muy estable",
        "typical_strategies": [
            "Buy & Hold",
            "Apreciacion play",
            "Short-term rental premium",
        ],
    },
    "B": {
        "buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
        "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
        "appreciation_potential": "Media-alta",
        "tenant_quality_expected": "Profesional, familias, estable",
        "typical_strategies": [
            "Buy & Hold",
            "Light BRRRR",
            "Section 8 si la matematica cierra",
        ],
    },
    "C": {
        "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
        "section_8_viability": "Alta - market rents cerca o por debajo de FMR",
        "appreciation_potential": "Baja-media",
        "tenant_quality_expected": "Working class, estabilidad mixta",
        "typical_strategies": [
            "Section 8",
            "BRRRR",
            "Buy & Hold con management activo",
        ],
    },
    "D": {
        "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
        "section_8_viability": "Muy alta - Section 8 puede superar market rent",
        "appreciation_potential": "Baja - depende de trayectoria del vecindario",
        "tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
        "typical_strategies": [
            "Section 8 (cash flow)",
            "BRRRR agresivo solo con exit a comprador de calidad",
        ],
    },
    "unclassified": {
        "buy_hold_viability": "No determinado - sin datos suficientes",
        "section_8_viability": "No determinado",
        "appreciation_potential": "No determinado",
        "tenant_quality_expected": "No determinado",
        "typical_strategies": [],
    },
}
|
|
|
|
|
|
def _build_reasoning(indicators: dict, classification: dict) -> str:
    """Build the short human-readable justification for the class letter.

    Args:
        indicators: indicator values; entries that are missing or ``None``
            are left out of the text.
        classification: must provide ``neighborhood_class`` and, unless that
            is "unclassified", ``class_score``, ``confidence_level`` and
            ``weight_coverage_pct``.

    Returns:
        A fixed message for the unclassified case, otherwise a summary of the
        score followed by the formatted indicators.
    """
    letter = classification["neighborhood_class"]
    if letter == "unclassified":
        return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."

    # (indicator name, format template) in the order they appear in the text.
    templates = (
        ("median_household_income", "median income ${:,.0f}"),
        ("owner_occupied_pct", "owner-occupied {:.0f}%"),
        ("education_attainment_pct_bachelor_plus", "bachelor+ {:.0f}%"),
        ("crime_vs_national", "crime {:.2f}x national"),
        ("vacancy_rate", "vacancy {:.1f}%"),
        ("days_on_market_median", "DOM {} dias"),
    )
    fragments = []
    for name, template in templates:
        value = indicators.get(name)
        if value is None:
            continue
        fragments.append(template.format(value))
    indicator_str = ", ".join(fragments)

    score = classification["class_score"]
    conf = classification["confidence_level"]
    coverage = classification["weight_coverage_pct"]
    summary = f"Clase {letter} (score {score}/100, confianza {conf}, cobertura {coverage}% del peso)."
    return f"{summary} Indicadores: {indicator_str}."
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# API publica
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
    """Classify a neighborhood as A/B/C/D from objective indicators.

    Args:
        geocode: geocoding result; must provide ``state_fips`` (plus
            ``county_code_only`` and ``tract_code`` for the Census lookup and
            ``state`` for the crime lookup).
        include_dom: when True, also run the Days-on-Market lookup via
            Firecrawl (spends credits). Defaults to False.

    Returns:
        dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators``, ``indicators_available``,
        ``weight_coverage_pct``, ``class_reasoning``,
        ``investment_implications``, ``warnings``, ``data_sources``,
        ``tract_geoid``, ``tract_name``, ``fetched_at`` and ``errors``.
        The same keys are returned when the geocode is unusable.
    """
    # load_dotenv() is intentionally not called here: the environment is
    # expected to be loaded already by the package on first import, which
    # avoids surprises when the CWD differs from the project directory.

    fetched_at = datetime.now(timezone.utc).isoformat()
    all_errors: list[str] = []
    data_sources: list[str] = []

    if not geocode or not geocode.get("state_fips"):
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators": {},
            "indicators_available": [],
            "weight_coverage_pct": 0,
            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
            "warnings": ["Geocode invalido o incompleto"],
            "data_sources": [],
            "tract_geoid": None,
            # Included so the failure result has the same keys as a
            # successful one.
            "tract_name": None,
            "fetched_at": fetched_at,
            "errors": ["geocode_failed"],
        }

    # --- Census ACS ---------------------------------------------------------
    indicators: dict = {}
    census_data, errs = _fetch_census_acs(geocode)
    indicators.update(census_data)
    all_errors.extend(errs)
    if census_data:
        data_sources.append("US Census ACS 2022 5-Year")

    # --- FBI crime data (1 indicator) ---------------------------------------
    crime_data, errs = _fetch_fbi_crime(geocode)
    # Auxiliary keys prefixed with "_" are not indicators; keep them out.
    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
    all_errors.extend(errs)
    if crime_data:
        data_sources.append("FBI Crime Data Explorer (state-level)")

    # --- Firecrawl days-on-market (1 indicator, opt-in) ---------------------
    if include_dom:
        dom_data, errs = _fetch_firecrawl_dom(geocode)
        indicators.update(dom_data)
        all_errors.extend(errs)
        if dom_data:
            data_sources.append("Firecrawl (Zillow DOM)")

    # --- Classify -----------------------------------------------------------
    classification = _classify(indicators)
    reasoning = _build_reasoning(indicators, classification)
    letter = classification["neighborhood_class"]
    confidence = classification["confidence_level"]
    available = classification["indicators_available"]

    # --- Warnings -----------------------------------------------------------
    warnings: list[str] = []
    if confidence in ("low", "unclassified"):
        warnings.append(
            f"Confianza {confidence}: "
            f"solo {len(available)} indicadores disponibles."
        )
    if "_crime_state_level_note" in crime_data:
        warnings.append(crime_data["_crime_state_level_note"])

    return {
        "neighborhood_class": letter,
        "class_score": classification["class_score"],
        "confidence_level": confidence,
        "indicators": indicators,
        "indicators_available": available,
        "weight_coverage_pct": classification["weight_coverage_pct"],
        "class_reasoning": reasoning,
        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
        "warnings": warnings,
        "data_sources": data_sources,
        "tract_geoid": geocode.get("tract_geoid"),
        "tract_name": geocode.get("tract_name"),
        "fetched_at": fetched_at,
        "errors": all_errors,
    }
|