feat: AR-House initial commit

2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
@@ -0,0 +1,549 @@
+"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos.
+
+CRITICO - COMPLIANCE LEGAL:
+La clasificacion se basa SOLO en indicadores economicos y datos publicos:
+income, owner-occupancy, education attainment, vacancy, crime, days on market.
+
+NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining.
+Esta es una clasificacion ECONOMICA, no demografica.
+
+Indicadores y pesos (max 100):
+- median_household_income (Census ACS)             25%
+- owner_occupied_pct (Census ACS)                  20%
+- education_attainment_pct_bachelor_plus (ACS)     20%
+- crime_vs_national (FBI UCR, state-level)         15%
+- vacancy_rate (Census ACS)                        10%
+- days_on_market_median (Firecrawl, opt-in)        10%
+
+Graceful degradation: si un indicador no esta disponible (API key missing,
+fetcher fallo), se redistribuye su peso entre los disponibles.
+
+confidence_level (basado en CANTIDAD de indicadores disponibles):
+- "high":         5-6 indicadores
+- "medium":       3-4 indicadores
+- "low":          1-2 indicadores
+- "unclassified": 0 indicadores
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from datetime import datetime, timezone
+from typing import Optional
+
+import requests
+from dotenv import load_dotenv
+
+from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
+
+
+# ─── Classification algorithm weights ───────────────────────────────────────
+# Maximum points each indicator can contribute; the six values sum to 100.
+# Keys are the short indicator names that _classify() uses to look up weights.
+WEIGHTS = {
+    "income":         25,
+    "owner_occupied": 20,
+    "education":      20,
+    "crime":          15,
+    "vacancy":        10,
+    "dom":            10,
+}
+
+# ─── Census ACS variable codes ──────────────────────────────────────────────
+# Maps short local names to the ACS variable IDs requested in a single call by
+# _fetch_census_acs(). Counts are listed next to their totals so percentages
+# can be derived (owner-occupied / occupied, vacant / housing, degrees / 25+).
+ACS_VARS = {
+    "income":         "B19013_001E",  # Median household income (last 12 months)
+    "oo_count":       "B25003_002E",  # Owner-occupied housing units count
+    "occupied_total": "B25003_001E",  # Total occupied housing units
+    "vacant_count":   "B25002_003E",  # Vacant housing units count
+    "housing_total":  "B25002_001E",  # Total housing units (occupied + vacant)
+    "home_value":     "B25077_001E",  # Median home value
+    "edu_total":      "B15003_001E",  # Total population 25+
+    "edu_bachelor":   "B15003_022E",  # Bachelor's degree
+    "edu_master":     "B15003_023E",  # Master's degree
+    "edu_prof":       "B15003_024E",  # Professional school degree
+    "edu_doctorate":  "B15003_025E",  # Doctorate degree
+}
+
+# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
+# Used as the denominator for crime_vs_national. Update annually.
+NATIONAL_VIOLENT_CRIME_PER_100K = 380.7
+NATIONAL_PROPERTY_CRIME_PER_100K = 1954.4
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# Fetchers individuales (fail-soft)
+# ═══════════════════════════════════════════════════════════════════════════
+
+def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
+    """Fetch the Census ACS 5-year indicators for the geocode's tract.
+
+    Fail-soft: missing configuration, network failures and malformed
+    responses are reported through the returned error list, not raised.
+
+    Args:
+        geocode: dict providing ``state_fips``, ``county_code_only`` and
+            ``tract_code``.
+
+    Returns:
+        ``(indicators, errors)``. ``indicators`` may contain
+        ``median_household_income``, ``owner_occupied_pct``, ``vacancy_rate``,
+        ``median_home_value`` and ``education_attainment_pct_bachelor_plus``.
+        A key is present only when every ACS value it depends on was usable.
+    """
+    errors: list[str] = []
+    out: dict = {}
+
+    api_key = os.getenv("CENSUS_API_KEY", "").strip()
+    if not api_key:
+        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
+        return out, errors
+
+    state_fips = geocode.get("state_fips")
+    county_code = geocode.get("county_code_only")
+    tract_code = geocode.get("tract_code")
+    if not state_fips or not county_code or not tract_code:
+        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
+        return out, errors
+
+    url = "https://api.census.gov/data/2022/acs/acs5"
+    # Request every variable in one call (the API accepts up to 50).
+    var_keys = ["NAME"] + list(ACS_VARS.values())
+    params = {
+        "get": ",".join(var_keys),
+        "for": f"tract:{tract_code}",
+        "in": f"state:{state_fips} county:{county_code}",
+        "key": api_key,
+    }
+
+    try:
+        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
+        r.raise_for_status()
+        data = r.json()
+    except requests.RequestException as e:
+        # requests puts the full request URL (including the ``key`` query
+        # parameter) in its exception text; redact it because this error
+        # list is returned to callers.
+        detail = str(e).replace(api_key, "***")
+        errors.append(f"Census ACS HTTP: {detail}")
+        return out, errors
+    except ValueError as e:
+        errors.append(f"Census ACS JSON: {e}")
+        return out, errors
+
+    # Expected shape: [header_row, data_row, ...]. Anything else (for example
+    # a JSON object describing an error) must not be indexed positionally.
+    if not isinstance(data, list):
+        errors.append(f"Census ACS formato de respuesta inesperado: {type(data).__name__}")
+        return out, errors
+    if len(data) < 2:
+        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
+        return out, errors
+
+    header = data[0]
+    row = data[1]
+    if not isinstance(header, list) or not isinstance(row, list):
+        errors.append("Census ACS formato de respuesta inesperado: filas no son listas")
+        return out, errors
+    idx = {col: i for i, col in enumerate(header)}
+
+    def _f(col: str) -> Optional[float]:
+        """Return column ``col`` of the data row as a float, or None if unusable."""
+        try:
+            v = row[idx[col]]
+        except (KeyError, IndexError):
+            return None
+        if v is None or v == "" or v == "null":
+            return None
+        try:
+            f = float(v)
+            # Census uses negative sentinel values for "no data" / "suppressed".
+            return f if f >= 0 else None
+        except (ValueError, TypeError):
+            return None
+
+    # 1) Median household income
+    income = _f(ACS_VARS["income"])
+    if income is not None:
+        out["median_household_income"] = round(income, 0)
+
+    # 2) Owner-occupied percentage
+    oo = _f(ACS_VARS["oo_count"])
+    total = _f(ACS_VARS["occupied_total"])
+    if oo is not None and total and total > 0:
+        out["owner_occupied_pct"] = round(oo / total * 100, 1)
+
+    # 3) Vacancy rate
+    vacant = _f(ACS_VARS["vacant_count"])
+    housing = _f(ACS_VARS["housing_total"])
+    if vacant is not None and housing and housing > 0:
+        out["vacancy_rate"] = round(vacant / housing * 100, 1)
+
+    # 4) Median home value
+    home_value = _f(ACS_VARS["home_value"])
+    if home_value is not None:
+        out["median_home_value"] = round(home_value, 0)
+
+    # 5) Education attainment (% bachelor's or higher, age 25+).
+    # A missing/suppressed degree count must not be treated as zero: that
+    # would report an artificially low percentage instead of "no data".
+    edu_total = _f(ACS_VARS["edu_total"])
+    degree_counts = [
+        _f(ACS_VARS[name])
+        for name in ("edu_bachelor", "edu_master", "edu_prof", "edu_doctorate")
+    ]
+    if edu_total and edu_total > 0 and all(c is not None for c in degree_counts):
+        pct = sum(degree_counts) / edu_total * 100
+        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)
+
+    return out, errors
+
+
+def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
+    """Fetch a state-level crime ratio from the FBI Crime Data Explorer.
+
+    Best-effort and fail-soft: when the key is missing, the request fails or
+    the payload has an unexpected shape, the indicator is simply absent and
+    the reason is appended to the returned error list.
+
+    Args:
+        geocode: dict providing ``state`` (two-letter abbreviation, e.g. "FL").
+
+    Returns:
+        ``(indicators, errors)``. On success ``indicators`` holds
+        ``crime_vs_national`` (state rate divided by the national rate; violent
+        crime weighted 2/3 and property crime 1/3 when both are available) and
+        the auxiliary ``_crime_state_level_note`` string.
+    """
+    errors: list[str] = []
+    out: dict = {}
+
+    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
+    if not api_key:
+        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
+        return out, errors
+
+    state_abbr = geocode.get("state")  # e.g. "FL"
+    if not state_abbr:
+        errors.append("state abbreviation faltante en geocode")
+        return out, errors
+
+    # State-level estimate endpoint: coarse, but better than having nothing
+    # (county-level data is complex to aggregate).
+    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
+    params = {
+        "from": "2022",
+        "to": "2022",
+        "API_KEY": api_key,
+    }
+
+    try:
+        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
+        r.raise_for_status()
+        data = r.json()
+    except requests.RequestException as e:
+        # The exception text contains the request URL with the API_KEY query
+        # parameter; redact it because this error list is returned to callers.
+        detail = str(e).replace(api_key, "***")
+        errors.append(f"FBI UCR HTTP: {detail}")
+        return out, errors
+    except ValueError as e:
+        errors.append(f"FBI UCR JSON: {e}")
+        return out, errors
+
+    # The payload may be an object wrapping the estimates or a bare list.
+    # Check the type first: calling .get() on a list would raise.
+    if isinstance(data, dict):
+        estimates = data.get("estimates") or data.get("data") or []
+    elif isinstance(data, list):
+        estimates = data
+    else:
+        estimates = []
+    if not estimates:
+        errors.append("FBI UCR sin estimates en respuesta")
+        return out, errors
+
+    rec = estimates[0] if isinstance(estimates, list) else estimates
+    if not isinstance(rec, dict):
+        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
+        return out, errors
+
+    population = rec.get("population")
+    violent = rec.get("violent_crime")
+    # Optional: when absent or zero the ratio falls back to violent crime only.
+    property_c = rec.get("property_crime")
+
+    if not population or not violent:
+        errors.append("FBI UCR sin population o violent_crime en estimate")
+        return out, errors
+
+    try:
+        violent_per_100k = float(violent) / float(population) * 100000
+        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K
+
+        if property_c:
+            property_per_100k = float(property_c) / float(population) * 100000
+            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
+            # Weighted average: violent counts more (2/3) than property (1/3).
+            combined = (ratio_violent * 2 + ratio_property) / 3
+        else:
+            combined = ratio_violent
+
+        out["crime_vs_national"] = round(combined, 2)
+        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
+    except (TypeError, ValueError, ZeroDivisionError) as e:
+        # ZeroDivisionError covers a truthy-but-zero population such as "0".
+        errors.append(f"FBI UCR calc error: {e}")
+
+    return out, errors
+
+
+def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
+    """Placeholder for the median days-on-market lookup (Firecrawl / Zillow).
+
+    Opt-in only: the real lookup is expected to consume Firecrawl credits, so
+    it should be invoked only when the caller explicitly requests DOM data.
+
+    Returns:
+        An empty indicators dict and a one-item error list stating that the
+        implementation is still pending.
+    """
+    # Not implemented yet: report that through the fail-soft error channel
+    # instead of raising, so classification proceeds without this indicator.
+    pending_notice = "DOM Firecrawl: implementacion pendiente Phase 3B paso 6"
+    return {}, [pending_notice]
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# Algoritmo de clasificacion
+# ═══════════════════════════════════════════════════════════════════════════
+
+def _score_income(income: float) -> int:
+    """Return the income points (3-25) for a median household income."""
+    # (inclusive lower bound, points), checked from the highest bracket down.
+    for floor, pts in ((100000, 25), (60000, 18), (35000, 10)):
+        if income >= floor:
+            return pts
+    return 3
+
+
+def _score_owner_occupied(pct: float) -> int:
+    """Return the owner-occupancy points (3-20) for a percentage value."""
+    brackets = ((80, 20), (60, 15), (40, 8))
+    # First bracket whose inclusive floor is met wins; below all of them -> 3.
+    return next((pts for floor, pts in brackets if pct >= floor), 3)
+
+
+def _score_education(pct_bach_plus: float) -> int:
+    """Return the education points (2-20) for the bachelor's-or-higher share."""
+    if pct_bach_plus >= 50:
+        points = 20
+    elif pct_bach_plus >= 30:
+        points = 14
+    elif pct_bach_plus >= 15:
+        points = 7
+    else:
+        points = 2
+    return points
+
+
+def _score_crime(ratio_vs_national: float) -> int:
+    """Return the crime points (2-15); a lower ratio vs. national scores higher."""
+    # (exclusive upper bound, points), checked from the best bracket up.
+    for ceiling, pts in ((0.7, 15), (1.0, 12), (1.5, 7)):
+        if ratio_vs_national < ceiling:
+            return pts
+    return 2
+
+
+def _score_vacancy(pct: float) -> int:
+    """Return the vacancy points (1-10); a lower vacancy rate scores higher."""
+    ceilings = ((3, 10), (6, 7), (10, 4))
+    # First bracket whose exclusive ceiling is above pct wins; otherwise 1.
+    return next((pts for ceiling, pts in ceilings if pct < ceiling), 1)
+
+
+def _score_dom(days: float) -> int:
+    """Return the days-on-market points (1-10); fewer days scores higher."""
+    if days < 30:
+        points = 10
+    elif days < 60:
+        points = 7
+    elif days < 90:
+        points = 4
+    else:
+        points = 1
+    return points
+
+
+def _classify(indicators: dict) -> dict:
+    """Score the available indicators and derive the A/B/C/D class.
+
+    Graceful degradation: indicators whose value is missing (None or absent)
+    are skipped, and the points earned are rescaled against the combined
+    weight of the indicators that are present.
+
+    Args:
+        indicators: dict keyed by full indicator name (for example
+            ``median_household_income``); values are numeric or None.
+
+    Returns:
+        dict with ``neighborhood_class``, ``class_score`` (0-100),
+        ``confidence_level``, ``indicators_available``,
+        ``weight_coverage_pct`` and ``raw_points``. The same keys are returned
+        on every path, including the "unclassified" one.
+    """
+    score_funcs = {
+        "income":         (_score_income,         "median_household_income"),
+        "owner_occupied": (_score_owner_occupied, "owner_occupied_pct"),
+        "education":      (_score_education,      "education_attainment_pct_bachelor_plus"),
+        "crime":          (_score_crime,          "crime_vs_national"),
+        "vacancy":        (_score_vacancy,        "vacancy_rate"),
+        "dom":            (_score_dom,            "days_on_market_median"),
+    }
+
+    points = {}
+    for key, (func, indicator_name) in score_funcs.items():
+        val = indicators.get(indicator_name)
+        if val is not None:
+            points[key] = func(val)
+
+    indicators_available = list(points)
+    n_available = len(indicators_available)
+
+    if n_available == 0:
+        # Same key set as the classified result so consumers can index
+        # raw_points without special-casing this path.
+        return {
+            "neighborhood_class": "unclassified",
+            "class_score": 0.0,
+            "confidence_level": "unclassified",
+            "indicators_available": [],
+            "weight_coverage_pct": 0,
+            "raw_points": {},
+        }
+
+    # Confidence depends only on how many indicators were available.
+    if n_available <= 2:
+        confidence = "low"
+    elif n_available <= 4:
+        confidence = "medium"
+    else:
+        confidence = "high"
+
+    # Graceful degradation: scale the points against the available weights.
+    total_weight_available = sum(WEIGHTS[k] for k in indicators_available)
+    total_points = sum(points.values())
+    scaled_0_to_100 = (total_points / total_weight_available) * 100
+
+    if scaled_0_to_100 >= 85:
+        letter = "A"
+    elif scaled_0_to_100 >= 65:
+        letter = "B"
+    elif scaled_0_to_100 >= 40:
+        letter = "C"
+    else:
+        letter = "D"
+
+    return {
+        "neighborhood_class": letter,
+        "class_score": round(scaled_0_to_100, 1),
+        "confidence_level": confidence,
+        "indicators_available": indicators_available,
+        "weight_coverage_pct": total_weight_available,
+        "raw_points": points,
+    }
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# Investment implications por clase
+# ═══════════════════════════════════════════════════════════════════════════
+
+# Qualitative investment guidance keyed by neighborhood class ("A"-"D", plus
+# "unclassified"). fetch_neighborhood() attaches the entry matching the
+# computed class to its result under "investment_implications".
+INVESTMENT_IMPLICATIONS = {
+    "A": {
+        "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
+        "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
+        "appreciation_potential": "Alta - tipicamente supera inflacion",
+        "tenant_quality_expected": "Profesional, familias, muy estable",
+        "typical_strategies": ["Buy & Hold", "Apreciacion play", "Short-term rental premium"],
+    },
+    "B": {
+        "buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
+        "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
+        "appreciation_potential": "Media-alta",
+        "tenant_quality_expected": "Profesional, familias, estable",
+        "typical_strategies": ["Buy & Hold", "Light BRRRR", "Section 8 si la matematica cierra"],
+    },
+    "C": {
+        "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
+        "section_8_viability": "Alta - market rents cerca o por debajo de FMR",
+        "appreciation_potential": "Baja-media",
+        "tenant_quality_expected": "Working class, estabilidad mixta",
+        "typical_strategies": ["Section 8", "BRRRR", "Buy & Hold con management activo"],
+    },
+    "D": {
+        "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
+        "section_8_viability": "Muy alta - Section 8 puede superar market rent",
+        "appreciation_potential": "Baja - depende de trayectoria del vecindario",
+        "tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
+        "typical_strategies": ["Section 8 (cash flow)", "BRRRR agresivo solo con exit a comprador de calidad"],
+    },
+    # Fallback entry used when no indicator could be obtained.
+    "unclassified": {
+        "buy_hold_viability": "No determinado - sin datos suficientes",
+        "section_8_viability": "No determinado",
+        "appreciation_potential": "No determinado",
+        "tenant_quality_expected": "No determinado",
+        "typical_strategies": [],
+    },
+}
+
+
+def _build_reasoning(indicators: dict, classification: dict) -> str:
+    """Build the short human-readable justification for the assigned class.
+
+    Args:
+        indicators: indicator values keyed by full indicator name.
+        classification: result dict with ``neighborhood_class`` and, unless
+            unclassified, ``class_score``, ``confidence_level`` and
+            ``weight_coverage_pct``.
+
+    Returns:
+        One sentence with class, score, confidence and weight coverage,
+        followed by the indicators that were present; a fixed message when the
+        class is "unclassified".
+    """
+    grade = classification["neighborhood_class"]
+    if grade == "unclassified":
+        return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."
+
+    # (indicator key, renderer) pairs in the order they appear in the text.
+    renderers = (
+        ("median_household_income", lambda x: f"median income ${x:,.0f}"),
+        ("owner_occupied_pct", lambda x: f"owner-occupied {x:.0f}%"),
+        ("education_attainment_pct_bachelor_plus", lambda x: f"bachelor+ {x:.0f}%"),
+        ("crime_vs_national", lambda x: f"crime {x:.2f}x national"),
+        ("vacancy_rate", lambda x: f"vacancy {x:.1f}%"),
+        ("days_on_market_median", lambda x: f"DOM {x} dias"),
+    )
+    fragments = []
+    for name, render in renderers:
+        value = indicators.get(name)
+        if value is None:
+            continue
+        fragments.append(render(value))
+    listed = ", ".join(fragments)
+
+    headline = (
+        f"Clase {grade} (score {classification['class_score']}/100, "
+        f"confianza {classification['confidence_level']}, "
+        f"cobertura {classification['weight_coverage_pct']}% del peso)."
+    )
+    return f"{headline} Indicadores: {listed}."
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# API publica
+# ═══════════════════════════════════════════════════════════════════════════
+
+def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
+    """Classify a neighborhood as A/B/C/D from objective indicators.
+
+    Runs each fetcher fail-soft, merges whatever indicators were obtained,
+    scores them and attaches the reasoning text, warnings and investment
+    implications for the resulting class.
+
+    Args:
+        geocode: output of census_geocode.fetch_geocode (must provide
+            state_fips, county_code_only and tract_code).
+        include_dom: when True, also attempts the Days-on-Market lookup via
+            Firecrawl (spends credits). Defaults to False.
+
+    Returns:
+        dict with neighborhood_class, class_score, confidence_level,
+        indicators, indicators_available, weight_coverage_pct,
+        class_reasoning, investment_implications, warnings, data_sources,
+        tract_geoid, tract_name, fetched_at and errors. The same keys are
+        returned when the geocode is unusable (class "unclassified").
+    """
+    # .env was already loaded by data_fetchers/__init__.py on first import of
+    # the package. load_dotenv() is not called here to avoid conflicts when
+    # the CWD differs from the project directory.
+
+    fetched_at = datetime.now(timezone.utc).isoformat()
+    all_errors: list[str] = []
+    data_sources: list[str] = []
+
+    if not geocode or not geocode.get("state_fips"):
+        return {
+            "neighborhood_class": "unclassified",
+            "class_score": 0.0,
+            "confidence_level": "unclassified",
+            "indicators": {},
+            "indicators_available": [],
+            "weight_coverage_pct": 0,
+            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
+            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
+            "warnings": ["Geocode invalido o incompleto"],
+            "data_sources": [],
+            "tract_geoid": None,
+            # Present on the normal path too; keep the result shape uniform.
+            "tract_name": None,
+            "fetched_at": fetched_at,
+            "errors": ["geocode_failed"],
+        }
+
+    # ─── Census ACS (up to 5 indicators) ────────────────────────────────────
+    indicators: dict = {}
+    census_data, errs = _fetch_census_acs(geocode)
+    indicators.update(census_data)
+    all_errors.extend(errs)
+    if census_data:
+        data_sources.append("US Census ACS 2022 5-Year")
+
+    # ─── FBI UCR (1 indicator) ──────────────────────────────────────────────
+    crime_data, errs = _fetch_fbi_crime(geocode)
+    # Auxiliary keys prefixed with "_" are notes, not indicators.
+    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
+    all_errors.extend(errs)
+    if crime_data:
+        data_sources.append("FBI Crime Data Explorer (state-level)")
+
+    # ─── Firecrawl DOM (1 indicator, opt-in) ────────────────────────────────
+    if include_dom:
+        dom_data, errs = _fetch_firecrawl_dom(geocode)
+        indicators.update(dom_data)
+        all_errors.extend(errs)
+        if dom_data:
+            data_sources.append("Firecrawl (Zillow DOM)")
+
+    # ─── Classify ───────────────────────────────────────────────────────────
+    classification = _classify(indicators)
+    reasoning = _build_reasoning(indicators, classification)
+    letter = classification["neighborhood_class"]
+
+    # ─── Warnings ───────────────────────────────────────────────────────────
+    warnings: list[str] = []
+    if classification["confidence_level"] in ("low", "unclassified"):
+        warnings.append(
+            f"Confianza {classification['confidence_level']}: "
+            f"solo {len(classification['indicators_available'])} indicadores disponibles."
+        )
+    if "_crime_state_level_note" in crime_data:
+        warnings.append(crime_data["_crime_state_level_note"])
+
+    return {
+        "neighborhood_class": letter,
+        "class_score": classification["class_score"],
+        "confidence_level": classification["confidence_level"],
+        "indicators": indicators,
+        "indicators_available": classification["indicators_available"],
+        "weight_coverage_pct": classification["weight_coverage_pct"],
+        "class_reasoning": reasoning,
+        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
+        "warnings": warnings,
+        "data_sources": data_sources,
+        "tract_geoid": geocode.get("tract_geoid"),
+        "tract_name": geocode.get("tract_name"),
+        "fetched_at": fetched_at,
+        "errors": all_errors,
+    }