Files
AR-House/data_fetchers/neighborhood_class.py
2026-07-03 12:24:58 -04:00

550 lines
22 KiB
Python

"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos.
CRITICO - COMPLIANCE LEGAL:
La clasificacion se basa SOLO en indicadores economicos y datos publicos:
income, owner-occupancy, education attainment, vacancy, crime, days on market.
NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining.
Esta es una clasificacion ECONOMICA, no demografica.
Indicadores y pesos (max 100):
- median_household_income (Census ACS) 25%
- owner_occupied_pct (Census ACS) 20%
- education_attainment_pct_bachelor_plus (ACS) 20%
- crime_vs_national (FBI UCR) 15%
- vacancy_rate (Census ACS) 10%
- days_on_market_median (Firecrawl, opt-in) 10%
Graceful degradation: si un indicador no esta disponible (API key missing,
fetcher fallo), se redistribuye su peso entre los disponibles.
confidence_level (basado en CANTIDAD de indicadores disponibles):
- "high": 5-6 indicadores
- "medium": 3-4 indicadores
- "low": 1-2 indicadores
- "unclassified": 0 indicadores
"""
from __future__ import annotations
import os
import time
from datetime import datetime, timezone
from typing import Optional
import requests
from dotenv import load_dotenv
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# ─── Pesos del algoritmo de clasificacion ───────────────────────────────────
# Maximum points each indicator can contribute to the class score.
# The six weights add up to 100.
WEIGHTS = dict(
    income=25,
    owner_occupied=20,
    education=20,
    crime=15,
    vacancy=10,
    dom=10,
)
# ─── Census ACS variable codes ──────────────────────────────────────────────
# Short internal alias -> Census ACS variable code requested from the API.
ACS_VARS = {
    # Income and home value
    "income": "B19013_001E",          # Median household income (last 12 months)
    # Tenure (table B25003)
    "oo_count": "B25003_002E",        # Owner-occupied housing units count
    "occupied_total": "B25003_001E",  # Total occupied housing units
    # Occupancy status (table B25002)
    "vacant_count": "B25002_003E",    # Vacant housing units count
    "housing_total": "B25002_001E",   # Total housing units (occupied + vacant)
    "home_value": "B25077_001E",      # Median home value
    # Educational attainment, population 25+ (table B15003)
    "edu_total": "B15003_001E",       # Total population 25+
    "edu_bachelor": "B15003_022E",    # Bachelor's degree
    "edu_master": "B15003_023E",      # Master's degree
    "edu_prof": "B15003_024E",        # Professional school degree
    "edu_doctorate": "B15003_025E",   # Doctorate degree
}
# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
# Usado como denominador para crime_vs_national. Actualizar anualmente.
# National baselines (offenses per 100K population) used as the denominators
# when a local crime rate is expressed as a ratio versus the national rate.
NATIONAL_VIOLENT_CRIME_PER_100K, NATIONAL_PROPERTY_CRIME_PER_100K = 380.7, 1954.4
# ═══════════════════════════════════════════════════════════════════════════
# Fetchers individuales (fail-soft)
# ═══════════════════════════════════════════════════════════════════════════
def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch Census ACS 5-Year (2022) indicators for the tract in ``geocode``.

    Fail-soft: a missing API key, an incomplete geocode, an HTTP/JSON failure
    or an unexpected payload shape is reported through the returned error
    list instead of being raised.

    Args:
        geocode: mapping that must provide ``state_fips``,
            ``county_code_only`` and ``tract_code``.

    Returns:
        ``(indicators, errors)``. ``indicators`` may contain any subset of
        ``median_household_income``, ``owner_occupied_pct``, ``vacancy_rate``,
        ``median_home_value`` and ``education_attainment_pct_bachelor_plus``.
        An indicator is left out when the response has no usable value for
        every variable it depends on.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("CENSUS_API_KEY", "").strip()
    if not api_key:
        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
        return out, errors

    state_fips = geocode.get("state_fips")
    county_code = geocode.get("county_code_only")
    tract_code = geocode.get("tract_code")
    if not state_fips or not county_code or not tract_code:
        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
        return out, errors

    url = "https://api.census.gov/data/2022/acs/acs5"
    # Request every variable in a single call.
    var_keys = ["NAME"] + list(ACS_VARS.values())
    params = {
        "get": ",".join(var_keys),
        "for": f"tract:{tract_code}",
        "in": f"state:{state_fips} county:{county_code}",
        "key": api_key,
    }
    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"Census ACS HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"Census ACS JSON: {e}")
        return out, errors

    # The code below indexes the payload as [header_row, data_row, ...].
    # Anything else (e.g. a JSON object) must become an error entry, not an
    # uncaught KeyError/TypeError.
    if data and not isinstance(data, list):
        errors.append(f"Census ACS formato inesperado: {type(data).__name__}")
        return out, errors
    if not data or len(data) < 2:
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors
    header, row = data[0], data[1]
    if not isinstance(header, list) or not isinstance(row, list):
        errors.append("Census ACS formato inesperado: filas no son listas")
        return out, errors

    idx = {col: i for i, col in enumerate(header)}

    def _f(col: str) -> Optional[float]:
        """Return column ``col`` of the data row as a non-negative float, else None."""
        try:
            v = row[idx[col]]
        except (KeyError, IndexError):
            return None
        if v is None or v == "" or v == "null":
            return None
        try:
            f = float(v)
        except (ValueError, TypeError):
            return None
        # Negative values are treated as "no data" / "suppressed" markers.
        return f if f >= 0 else None

    # 1) Median household income
    income = _f(ACS_VARS["income"])
    if income is not None:
        out["median_household_income"] = round(income, 0)

    # 2) Owner-occupied percentage (a zero/missing denominator skips it)
    oo = _f(ACS_VARS["oo_count"])
    total = _f(ACS_VARS["occupied_total"])
    if oo is not None and total:
        out["owner_occupied_pct"] = round(oo / total * 100, 1)

    # 3) Vacancy rate
    vacant = _f(ACS_VARS["vacant_count"])
    housing = _f(ACS_VARS["housing_total"])
    if vacant is not None and housing:
        out["vacancy_rate"] = round(vacant / housing * 100, 1)

    # 4) Median home value
    home_value = _f(ACS_VARS["home_value"])
    if home_value is not None:
        out["median_home_value"] = round(home_value, 0)

    # 5) Education attainment (% bachelor's or higher, age 25+).
    # A missing degree count is NOT treated as zero: that would publish an
    # artificially low percentage, so the indicator is skipped instead.
    edu_total = _f(ACS_VARS["edu_total"])
    degree_counts = [
        _f(ACS_VARS[name])
        for name in ("edu_bachelor", "edu_master", "edu_prof", "edu_doctorate")
    ]
    if edu_total and all(count is not None for count in degree_counts):
        pct = sum(degree_counts) / edu_total * 100
        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)

    return out, errors
def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch a crime-vs-national ratio from the FBI Crime Data Explorer API.

    Best-effort and fail-soft: the request targets the *state-level* estimate
    endpoint for ``geocode["state"]`` (year 2022), so the ratio is only a
    coarse proxy for the neighborhood. Any problem (missing key, missing
    state, HTTP/JSON failure, unexpected payload) is reported through the
    returned error list and the indicator is simply absent.

    Args:
        geocode: mapping that must provide ``state`` (e.g. ``"FL"``).

    Returns:
        ``(indicators, errors)``. On success ``indicators`` holds
        ``crime_vs_national`` (local rate / national rate, rounded to 2
        decimals) and the auxiliary ``_crime_state_level_note`` string.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
    if not api_key:
        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
        return out, errors

    state_abbr = geocode.get("state")  # e.g. "FL"
    if not state_abbr:
        errors.append("state abbreviation faltante en geocode")
        return out, errors

    # State-level estimate endpoint: coarse, but better than no crime signal.
    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
    params = {
        "from": "2022",
        "to": "2022",
        "API_KEY": api_key,
    }
    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"FBI UCR HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"FBI UCR JSON: {e}")
        return out, errors

    # Accept several payload shapes. The type is checked BEFORE calling
    # .get(), so a bare JSON list is handled rather than raising
    # AttributeError.
    if isinstance(data, dict):
        estimates = data.get("estimates") or data.get("data")
    elif isinstance(data, list):
        estimates = data
    else:
        estimates = None
    if not estimates:
        errors.append("FBI UCR sin estimates en respuesta")
        return out, errors

    rec = estimates[0] if isinstance(estimates, list) else estimates
    if not isinstance(rec, dict):
        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
        return out, errors

    population = rec.get("population")
    violent = rec.get("violent_crime")
    # Optional: when absent, the ratio is based on violent crime alone.
    property_c = rec.get("property_crime")
    if not population or not violent:
        errors.append("FBI UCR sin population o violent_crime en estimate")
        return out, errors

    try:
        pop = float(population)
        violent_per_100k = float(violent) / pop * 100000
        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K
        if property_c:
            property_per_100k = float(property_c) / pop * 100000
            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
            # Weighted average: violent crime counts 2/3, property crime 1/3.
            combined = (ratio_violent * 2 + ratio_property) / 3
        else:
            combined = ratio_violent
        out["crime_vs_national"] = round(combined, 2)
        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
    except (TypeError, ValueError, ZeroDivisionError) as e:
        # ZeroDivisionError covers a population such as "0", which passes the
        # truthiness check above but converts to 0.0.
        errors.append(f"FBI UCR calc error: {e}")

    return out, errors
def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
    """Placeholder for the opt-in median days-on-market lookup.

    The real lookup is not implemented yet, so ``geocode`` is ignored: the
    function always returns an empty indicator dict together with a single
    error entry saying the implementation is pending.

    Returns:
        ``({}, [pending_message])``.
    """
    pending_notice = "DOM Firecrawl: implementacion pendiente Phase 3B paso 6"
    return {}, [pending_notice]
# ═══════════════════════════════════════════════════════════════════════════
# Algoritmo de clasificacion
# ═══════════════════════════════════════════════════════════════════════════
def _score_income(income: float) -> int:
    """Return the income points (3, 10, 18 or 25) for a median household income.

    The first floor that ``income`` reaches, scanning from the highest, wins;
    anything below the lowest floor scores 3.
    """
    for floor, pts in ((100000, 25), (60000, 18), (35000, 10)):
        if income >= floor:
            return pts
    return 3
def _score_owner_occupied(pct: float) -> int:
    """Return the owner-occupancy points (3, 8, 15 or 20) for a percentage.

    Higher owner-occupancy scores higher; below 40 the result is 3.
    """
    bands = ((80, 20), (60, 15), (40, 8))
    return next((pts for floor, pts in bands if pct >= floor), 3)
def _score_education(pct_bach_plus: float) -> int:
    """Return the education points (2, 7, 14 or 20) for a bachelor's-or-higher percentage."""
    return (
        20 if pct_bach_plus >= 50
        else 14 if pct_bach_plus >= 30
        else 7 if pct_bach_plus >= 15
        else 2
    )
def _score_crime(ratio_vs_national: float) -> int:
    """Return the crime points (2, 7, 12 or 15); a lower ratio scores higher.

    ``ratio_vs_national`` is compared against strict upper bounds, so a ratio
    exactly equal to a bound falls into the next (worse) band.
    """
    for ceiling, pts in ((0.7, 15), (1.0, 12), (1.5, 7)):
        if ratio_vs_national < ceiling:
            return pts
    return 2
def _score_vacancy(pct: float) -> int:
    """Return the vacancy points (1, 4, 7 or 10); a lower vacancy rate scores higher."""
    ceilings = ((3, 10), (6, 7), (10, 4))
    return next((pts for ceiling, pts in ceilings if pct < ceiling), 1)
def _score_dom(days: float) -> int:
    """Return the days-on-market points (1, 4, 7 or 10); fewer days scores higher."""
    if days < 30:
        points = 10
    elif days < 60:
        points = 7
    elif days < 90:
        points = 4
    else:
        points = 1
    return points
def _classify(indicators: dict) -> dict:
    """Score the available indicators and map the result to a class letter.

    Each indicator present in ``indicators`` (value not ``None``) is scored
    by its ``_score_*`` function. The points are then rescaled against the
    summed ``WEIGHTS`` of only the indicators that were present, so missing
    indicators do not drag the score down.

    Returns:
        dict with ``neighborhood_class`` ("A"-"D" or "unclassified"),
        ``class_score`` (0-100, one decimal), ``confidence_level``,
        ``indicators_available`` and ``weight_coverage_pct``; ``raw_points``
        is included only when at least one indicator was scored.
    """
    # (weight key, indicator name in `indicators`, scoring function)
    scoring_table = (
        ("income", "median_household_income", _score_income),
        ("owner_occupied", "owner_occupied_pct", _score_owner_occupied),
        ("education", "education_attainment_pct_bachelor_plus", _score_education),
        ("crime", "crime_vs_national", _score_crime),
        ("vacancy", "vacancy_rate", _score_vacancy),
        ("dom", "days_on_market_median", _score_dom),
    )

    raw_points = {}
    for weight_key, indicator_name, scorer in scoring_table:
        value = indicators.get(indicator_name)
        if value is None:
            continue
        raw_points[weight_key] = scorer(value)
    available = list(raw_points)

    # Nothing to score: report the neutral "unclassified" result.
    if not available:
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators_available": [],
            "weight_coverage_pct": 0,
        }

    # Confidence depends only on HOW MANY indicators were available.
    count = len(available)
    if count <= 2:
        confidence = "low"
    elif count <= 4:
        confidence = "medium"
    else:
        confidence = "high"

    # Rescale earned points against the weight that was actually covered.
    covered_weight = sum(WEIGHTS[key] for key in available)
    earned = sum(raw_points.values())
    score = (earned / covered_weight) * 100

    letter = "D"
    for cutoff, grade in ((85, "A"), (65, "B"), (40, "C")):
        if score >= cutoff:
            letter = grade
            break

    return {
        "neighborhood_class": letter,
        "class_score": round(score, 1),
        "confidence_level": confidence,
        "indicators_available": available,
        "weight_coverage_pct": covered_weight,
        "raw_points": raw_points,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Investment implications por clase
# ═══════════════════════════════════════════════════════════════════════════
# Static investment guidance keyed by class letter. Every entry carries the
# same five fields; "unclassified" is the fallback entry for when no
# classification could be produced.
INVESTMENT_IMPLICATIONS = {
    "A": {
        "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
        "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
        "appreciation_potential": "Alta - tipicamente supera inflacion",
        "tenant_quality_expected": "Profesional, familias, muy estable",
        "typical_strategies": [
            "Buy & Hold",
            "Apreciacion play",
            "Short-term rental premium",
        ],
    },
    "B": {
        "buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
        "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
        "appreciation_potential": "Media-alta",
        "tenant_quality_expected": "Profesional, familias, estable",
        "typical_strategies": [
            "Buy & Hold",
            "Light BRRRR",
            "Section 8 si la matematica cierra",
        ],
    },
    "C": {
        "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
        "section_8_viability": "Alta - market rents cerca o por debajo de FMR",
        "appreciation_potential": "Baja-media",
        "tenant_quality_expected": "Working class, estabilidad mixta",
        "typical_strategies": [
            "Section 8",
            "BRRRR",
            "Buy & Hold con management activo",
        ],
    },
    "D": {
        "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
        "section_8_viability": "Muy alta - Section 8 puede superar market rent",
        "appreciation_potential": "Baja - depende de trayectoria del vecindario",
        "tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
        "typical_strategies": [
            "Section 8 (cash flow)",
            "BRRRR agresivo solo con exit a comprador de calidad",
        ],
    },
    "unclassified": {
        "buy_hold_viability": "No determinado - sin datos suficientes",
        "section_8_viability": "No determinado",
        "appreciation_potential": "No determinado",
        "tenant_quality_expected": "No determinado",
        "typical_strategies": [],
    },
}
def _build_reasoning(indicators: dict, classification: dict) -> str:
    """Build the one-sentence justification shown next to the class letter.

    Args:
        indicators: indicator name -> value; entries that are missing or
            ``None`` are left out of the sentence.
        classification: must provide ``neighborhood_class`` and, unless the
            class is "unclassified", also ``class_score``,
            ``confidence_level`` and ``weight_coverage_pct``.

    Returns:
        A fixed "not enough data" message for the "unclassified" class;
        otherwise a summary with the letter, score, confidence, weight
        coverage and the formatted indicators in a fixed order.
    """
    letter = classification["neighborhood_class"]
    if letter == "unclassified":
        return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."

    # (indicator name, format template), in display order.
    templates = (
        ("median_household_income", "median income ${:,.0f}"),
        ("owner_occupied_pct", "owner-occupied {:.0f}%"),
        ("education_attainment_pct_bachelor_plus", "bachelor+ {:.0f}%"),
        ("crime_vs_national", "crime {:.2f}x national"),
        ("vacancy_rate", "vacancy {:.1f}%"),
        ("days_on_market_median", "DOM {} dias"),
    )
    fragments = []
    for name, template in templates:
        value = indicators.get(name)
        if value is not None:
            fragments.append(template.format(value))

    headline = (
        f"Clase {letter} (score {classification['class_score']}/100, "
        f"confianza {classification['confidence_level']}, "
        f"cobertura {classification['weight_coverage_pct']}% del peso)."
    )
    return f"{headline} Indicadores: {', '.join(fragments)}."
# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════
def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
    """Classify a neighborhood as A/B/C/D from objective indicators.

    Runs the individual fail-soft fetchers, merges their indicators, scores
    them and assembles the public result. Fetcher problems are collected in
    the ``errors`` list of the result rather than raised.

    Args:
        geocode: geocoder output; must provide ``state_fips`` (plus the
            county/tract codes the Census fetcher needs). ``tract_geoid`` and
            ``tract_name`` are copied into the result when present.
        include_dom: when True, also run the days-on-market lookup.
            Defaults to False.

    Returns:
        dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators``, ``indicators_available``,
        ``weight_coverage_pct``, ``class_reasoning``,
        ``investment_implications``, ``warnings``, ``data_sources``,
        ``tract_geoid``, ``tract_name``, ``fetched_at`` (UTC ISO-8601) and
        ``errors``. The same keys are returned when the geocode is unusable.
    """
    # load_dotenv() is deliberately not called here; per the original note,
    # the package __init__ is expected to have loaded .env already.
    fetched_at = datetime.now(timezone.utc).isoformat()
    all_errors: list[str] = []
    data_sources: list[str] = []

    if not geocode or not geocode.get("state_fips"):
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators": {},
            "indicators_available": [],
            "weight_coverage_pct": 0,
            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
            "warnings": ["Geocode invalido o incompleto"],
            "data_sources": [],
            "tract_geoid": None,
            # Present (as None) so both return paths expose the same keys.
            "tract_name": None,
            "fetched_at": fetched_at,
            "errors": ["geocode_failed"],
        }

    # --- Census ACS -------------------------------------------------------
    indicators: dict = {}
    census_data, errs = _fetch_census_acs(geocode)
    indicators.update(census_data)
    all_errors.extend(errs)
    if census_data:
        data_sources.append("US Census ACS 2022 5-Year")

    # --- FBI crime data ---------------------------------------------------
    crime_data, errs = _fetch_fbi_crime(geocode)
    # Keys starting with "_" are auxiliary notes, not indicators.
    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
    all_errors.extend(errs)
    if crime_data:
        data_sources.append("FBI Crime Data Explorer (state-level)")

    # --- Days on market (opt-in) ------------------------------------------
    if include_dom:
        dom_data, errs = _fetch_firecrawl_dom(geocode)
        indicators.update(dom_data)
        all_errors.extend(errs)
        if dom_data:
            data_sources.append("Firecrawl (Zillow DOM)")

    # --- Classify -----------------------------------------------------------
    classification = _classify(indicators)
    reasoning = _build_reasoning(indicators, classification)
    letter = classification["neighborhood_class"]

    # --- Warnings -----------------------------------------------------------
    warnings: list[str] = []
    if classification["confidence_level"] in ("low", "unclassified"):
        warnings.append(
            f"Confianza {classification['confidence_level']}: "
            f"solo {len(classification['indicators_available'])} indicadores disponibles."
        )
    if "_crime_state_level_note" in crime_data:
        warnings.append(crime_data["_crime_state_level_note"])

    return {
        "neighborhood_class": letter,
        "class_score": classification["class_score"],
        "confidence_level": classification["confidence_level"],
        "indicators": indicators,
        "indicators_available": classification["indicators_available"],
        "weight_coverage_pct": classification["weight_coverage_pct"],
        "class_reasoning": reasoning,
        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
        "warnings": warnings,
        "data_sources": data_sources,
        "tract_geoid": geocode.get("tract_geoid"),
        "tract_name": geocode.get("tract_name"),
        "fetched_at": fetched_at,
        "errors": all_errors,
    }