# AR-House/data_fetchers/neighborhood_class.py

"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos.

CRITICO - COMPLIANCE LEGAL:
La clasificacion se basa SOLO en indicadores economicos y datos publicos:
income, owner-occupancy, education attainment, vacancy, crime, days on market.

NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining.
Esta es una clasificacion ECONOMICA, no demografica.

Indicadores y pesos (max 100):
- median_household_income (Census ACS)             25%
- owner_occupied_pct (Census ACS)                  20%
- education_attainment_pct_bachelor_plus (ACS)     20%
- crime_vs_national (FBI UCR, state-level)         15%
- vacancy_rate (Census ACS)                        10%
- days_on_market_median (Firecrawl, opt-in)        10%

Graceful degradation: si un indicador no esta disponible (API key missing,
fetcher fallo), se redistribuye su peso entre los disponibles.

confidence_level (basado en CANTIDAD de indicadores disponibles):
- "high":         5-6 indicadores
- "medium":       3-4 indicadores
- "low":          1-2 indicadores
- "unclassified": 0 indicadores
"""

from __future__ import annotations

import os
import time
from datetime import datetime, timezone
from typing import Optional

import requests
from dotenv import load_dotenv

from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT


# ─── Classification algorithm weights ───────────────────────────────────────
# Maximum points each indicator can contribute; the six values add up to 100.
WEIGHTS = dict(
    income=25,
    owner_occupied=20,
    education=20,
    crime=15,
    vacancy=10,
    dom=10,
)

# ─── Census ACS variable codes ──────────────────────────────────────────────
# Short local alias -> ACS variable code requested from the Census API.
ACS_VARS = dict(
    income="B19013_001E",          # median household income (last 12 months)
    oo_count="B25003_002E",        # owner-occupied housing units (count)
    occupied_total="B25003_001E",  # total occupied housing units
    vacant_count="B25002_003E",    # vacant housing units (count)
    housing_total="B25002_001E",   # total housing units (occupied + vacant)
    home_value="B25077_001E",      # median home value
    edu_total="B15003_001E",       # total population aged 25+
    edu_bachelor="B15003_022E",    # bachelor's degree
    edu_master="B15003_023E",      # master's degree
    edu_prof="B15003_024E",        # professional school degree
    edu_doctorate="B15003_025E",   # doctorate degree
)

# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
# Used as the denominators when computing crime_vs_national. Update annually.
NATIONAL_VIOLENT_CRIME_PER_100K: float = 380.7
NATIONAL_PROPERTY_CRIME_PER_100K: float = 1954.4


# ═══════════════════════════════════════════════════════════════════════════
# Fetchers individuales (fail-soft)
# ═══════════════════════════════════════════════════════════════════════════

def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch 4 indicadores Census ACS para el tract del geocode.

    Returns (indicators_dict, errors_list).
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("CENSUS_API_KEY", "").strip()
    if not api_key:
        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
        return out, errors

    state_fips = geocode.get("state_fips")
    county_code = geocode.get("county_code_only")
    tract_code = geocode.get("tract_code")
    if not state_fips or not county_code or not tract_code:
        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
        return out, errors

    url = "https://api.census.gov/data/2022/acs/acs5"
    # Pedir todas las vars en una sola llamada (la API acepta hasta 50)
    var_keys = ["NAME"] + list(ACS_VARS.values())
    params = {
        "get": ",".join(var_keys),
        "for": f"tract:{tract_code}",
        "in": f"state:{state_fips} county:{county_code}",
        "key": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"Census ACS HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"Census ACS JSON: {e}")
        return out, errors

    if not data or len(data) < 2:
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors

    header = data[0]
    row = data[1]
    idx = {col: i for i, col in enumerate(header)}

    def _f(col: str) -> Optional[float]:
        try:
            v = row[idx[col]]
        except (KeyError, IndexError):
            return None
        if v is None or v == "" or v == "null":
            return None
        try:
            f = float(v)
            # Census usa valores negativos para "no data" / "suppressed"
            return f if f >= 0 else None
        except (ValueError, TypeError):
            return None

    # 1) Median household income
    income = _f(ACS_VARS["income"])
    if income is not None:
        out["median_household_income"] = round(income, 0)

    # 2) Owner-occupied percentage
    oo = _f(ACS_VARS["oo_count"])
    total = _f(ACS_VARS["occupied_total"])
    if oo is not None and total and total > 0:
        out["owner_occupied_pct"] = round(oo / total * 100, 1)

    # 3) Vacancy rate
    vacant = _f(ACS_VARS["vacant_count"])
    housing = _f(ACS_VARS["housing_total"])
    if vacant is not None and housing and housing > 0:
        out["vacancy_rate"] = round(vacant / housing * 100, 1)

    # 4) Median home value
    home_value = _f(ACS_VARS["home_value"])
    if home_value is not None:
        out["median_home_value"] = round(home_value, 0)

    # 5) Education attainment (% bachelor's or higher, age 25+)
    edu_total = _f(ACS_VARS["edu_total"])
    edu_b = _f(ACS_VARS["edu_bachelor"]) or 0
    edu_m = _f(ACS_VARS["edu_master"]) or 0
    edu_p = _f(ACS_VARS["edu_prof"]) or 0
    edu_d = _f(ACS_VARS["edu_doctorate"]) or 0
    if edu_total and edu_total > 0:
        pct = (edu_b + edu_m + edu_p + edu_d) / edu_total * 100
        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)

    return out, errors


def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch crime data via FBI Crime Data Explorer (api.data.gov key).

    NOTA: la API publica gratis de FBI es county-level via summarized endpoint.
    Implementacion best-effort: si la API responde, devolvemos crime_vs_national.
    Si no, fail-soft (errors list, indicator ausente).
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
    if not api_key:
        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
        return out, errors

    state_abbr = geocode.get("state")  # e.g. "FL"
    if not state_abbr:
        errors.append("state abbreviation faltante en geocode")
        return out, errors

    # Endpoint: FBI Crime Data Explorer state-level estimate
    # Mejor que tener nada (county-level es complejo de agregar).
    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
    params = {
        "from": "2022",
        "to": "2022",
        "API_KEY": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"FBI UCR HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"FBI UCR JSON: {e}")
        return out, errors

    # Estructura tipica del endpoint: lista de estimates por ano con keys
    # como 'violent_crime', 'property_crime', 'population', etc.
    # Defensivo: probar varias formas.
    estimates = data.get("estimates") or data.get("data") or (data if isinstance(data, list) else [])
    if not estimates:
        errors.append("FBI UCR sin estimates en respuesta")
        return out, errors

    rec = estimates[0] if isinstance(estimates, list) else estimates
    if not isinstance(rec, dict):
        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
        return out, errors

    population = rec.get("population")
    violent = rec.get("violent_crime")
    property_c = rec.get("property_crime") or rec.get("homicide", 0) * 0  # fallback - se ignora luego

    if not population or not violent:
        errors.append("FBI UCR sin population o violent_crime en estimate")
        return out, errors

    try:
        violent_per_100k = float(violent) / float(population) * 100000
        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K

        if property_c:
            property_per_100k = float(property_c) / float(population) * 100000
            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
            # Promedio ponderado: violent pesa mas (2/3) que property (1/3)
            combined = (ratio_violent * 2 + ratio_property) / 3
        else:
            combined = ratio_violent

        out["crime_vs_national"] = round(combined, 2)
        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
    except (TypeError, ValueError) as e:
        errors.append(f"FBI UCR calc error: {e}")

    return out, errors


def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch median days-on-market via Firecrawl scrape de Zillow.

    OPT-IN ONLY: consume creditos Firecrawl (~3-5 por lookup).
    Llamar solo si include_dom=True en classify_neighborhood().
    """
    errors: list[str] = []
    # Placeholder: implementacion requiere Firecrawl integration (Phase 3B paso 6).
    # Por ahora, devolver vacio. Se completara cuando Firecrawl este integrado.
    errors.append("DOM Firecrawl: implementacion pendiente Phase 3B paso 6")
    return {}, errors


# ═══════════════════════════════════════════════════════════════════════════
# Algoritmo de clasificacion
# ═══════════════════════════════════════════════════════════════════════════

def _score_income(income: float) -> int:
    if income >= 100000: return 25
    if income >= 60000:  return 18
    if income >= 35000:  return 10
    return 3


def _score_owner_occupied(pct: float) -> int:
    if pct >= 80: return 20
    if pct >= 60: return 15
    if pct >= 40: return 8
    return 3


def _score_education(pct_bach_plus: float) -> int:
    if pct_bach_plus >= 50: return 20
    if pct_bach_plus >= 30: return 14
    if pct_bach_plus >= 15: return 7
    return 2


def _score_crime(ratio_vs_national: float) -> int:
    """Lower ratio = better (less crime than national)."""
    if ratio_vs_national < 0.7: return 15
    if ratio_vs_national < 1.0: return 12
    if ratio_vs_national < 1.5: return 7
    return 2


def _score_vacancy(pct: float) -> int:
    """Lower vacancy = better."""
    if pct < 3:  return 10
    if pct < 6:  return 7
    if pct < 10: return 4
    return 1


def _score_dom(days: float) -> int:
    """Lower days-on-market = hotter neighborhood = better."""
    if days < 30: return 10
    if days < 60: return 7
    if days < 90: return 4
    return 1


def _classify(indicators: dict) -> dict:
    """Score the available indicators and map the result to a class letter.

    Every indicator present in ``indicators`` with a non-``None`` value is
    scored by its ``_score_*`` function. The points are summed and divided by
    the combined ``WEIGHTS`` of only those indicators, so a missing indicator
    hands its weight to the ones that are present. The resulting 0-100 score
    maps to A (>= 85), B (>= 65), C (>= 40) or D.

    Confidence depends on how many indicators were scored: 1-2 is "low",
    3-4 is "medium", 5 or more is "high"; with none the result is
    "unclassified".

    Returns:
        Dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators_available`` and
        ``weight_coverage_pct``; ``raw_points`` is included only when at
        least one indicator was scored.
    """
    # (weight key, scoring function, field name inside ``indicators``)
    scoring_table = (
        ("income", _score_income, "median_household_income"),
        ("owner_occupied", _score_owner_occupied, "owner_occupied_pct"),
        ("education", _score_education, "education_attainment_pct_bachelor_plus"),
        ("crime", _score_crime, "crime_vs_national"),
        ("vacancy", _score_vacancy, "vacancy_rate"),
        ("dom", _score_dom, "days_on_market_median"),
    )

    raw = {}
    for key, scorer, field in scoring_table:
        value = indicators.get(field)
        if value is None:
            continue
        raw[key] = scorer(value)
    present = list(raw)

    # Nothing to score: report the unclassified shape and stop here.
    if not present:
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators_available": [],
            "weight_coverage_pct": 0,
        }

    count = len(present)
    if count <= 2:
        confidence = "low"
    elif count <= 4:
        confidence = "medium"
    else:
        confidence = "high"

    # Normalize against the weight actually in play, not the full 100.
    weight_in_play = sum(WEIGHTS[key] for key in present)
    normalized = (sum(raw.values()) / weight_in_play) * 100

    for floor, grade in ((85, "A"), (65, "B"), (40, "C")):
        if normalized >= floor:
            letter = grade
            break
    else:
        letter = "D"

    return {
        "neighborhood_class": letter,
        "class_score": round(normalized, 1),
        "confidence_level": confidence,
        "indicators_available": present,
        "weight_coverage_pct": weight_in_play,
        "raw_points": raw,
    }


# ═══════════════════════════════════════════════════════════════════════════
# Investment implications por clase
# ═══════════════════════════════════════════════════════════════════════════

# Static investment guidance keyed by class letter ("A".."D") plus
# "unclassified". Every entry carries the same five fields:
# buy_hold_viability, section_8_viability, appreciation_potential,
# tenant_quality_expected and typical_strategies (a list of strings).
# The Spanish text is user-facing output and is emitted as-is.
INVESTMENT_IMPLICATIONS = {
    "A": {
        "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
        "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
        "appreciation_potential": "Alta - tipicamente supera inflacion",
        "tenant_quality_expected": "Profesional, familias, muy estable",
        "typical_strategies": ["Buy & Hold", "Apreciacion play", "Short-term rental premium"],
    },
    "B": {
        "buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
        "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
        "appreciation_potential": "Media-alta",
        "tenant_quality_expected": "Profesional, familias, estable",
        "typical_strategies": ["Buy & Hold", "Light BRRRR", "Section 8 si la matematica cierra"],
    },
    "C": {
        "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
        "section_8_viability": "Alta - market rents cerca o por debajo de FMR",
        "appreciation_potential": "Baja-media",
        "tenant_quality_expected": "Working class, estabilidad mixta",
        "typical_strategies": ["Section 8", "BRRRR", "Buy & Hold con management activo"],
    },
    "D": {
        "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
        "section_8_viability": "Muy alta - Section 8 puede superar market rent",
        "appreciation_potential": "Baja - depende de trayectoria del vecindario",
        "tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
        "typical_strategies": ["Section 8 (cash flow)", "BRRRR agresivo solo con exit a comprador de calidad"],
    },
    # Used when no indicator could be scored or the geocode was unusable.
    "unclassified": {
        "buy_hold_viability": "No determinado - sin datos suficientes",
        "section_8_viability": "No determinado",
        "appreciation_potential": "No determinado",
        "tenant_quality_expected": "No determinado",
        "typical_strategies": [],
    },
}


def _build_reasoning(indicators: dict, classification: dict) -> str:
    """Genera 1-2 lineas de justificacion del class letter."""
    letter = classification["neighborhood_class"]
    if letter == "unclassified":
        return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."

    parts = []
    if (v := indicators.get("median_household_income")) is not None:
        parts.append(f"median income ${v:,.0f}")
    if (v := indicators.get("owner_occupied_pct")) is not None:
        parts.append(f"owner-occupied {v:.0f}%")
    if (v := indicators.get("education_attainment_pct_bachelor_plus")) is not None:
        parts.append(f"bachelor+ {v:.0f}%")
    if (v := indicators.get("crime_vs_national")) is not None:
        parts.append(f"crime {v:.2f}x national")
    if (v := indicators.get("vacancy_rate")) is not None:
        parts.append(f"vacancy {v:.1f}%")
    if (v := indicators.get("days_on_market_median")) is not None:
        parts.append(f"DOM {v} dias")

    indicator_str = ", ".join(parts)
    score = classification["class_score"]
    conf = classification["confidence_level"]
    coverage = classification["weight_coverage_pct"]
    return (
        f"Clase {letter} (score {score}/100, confianza {conf}, "
        f"cobertura {coverage}% del peso). Indicadores: {indicator_str}."
    )


# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════

def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
    """Classify a neighborhood as A/B/C/D from objective indicators.

    Collects indicators from the Census ACS fetcher, the FBI crime fetcher
    and, only when requested, the days-on-market fetcher; scores them with
    ``_classify`` and attaches the reasoning text, the investment guidance
    for the resulting class, warnings and the accumulated fetcher errors.

    Args:
        geocode: Output of ``census_geocode.fetch_geocode``; must provide
            ``state_fips`` (the Census fetcher additionally needs
            ``county_code_only`` and ``tract_code``).
        include_dom: When True, also run the days-on-market lookup, which
            per the original note spends Firecrawl credits. Defaults to False.

    Returns:
        Dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators``, ``indicators_available``,
        ``weight_coverage_pct``, ``class_reasoning``,
        ``investment_implications``, ``warnings``, ``data_sources``,
        ``tract_geoid``, ``tract_name``, ``fetched_at`` (UTC ISO-8601) and
        ``errors``. The same keys are present when the geocode is unusable,
        in which case the class is "unclassified".
    """
    # Per the original note, .env is expected to have been loaded by the
    # package's __init__ on first import; load_dotenv() is deliberately not
    # called here to avoid conflicts when the CWD differs from the project.

    fetched_at = datetime.now(timezone.utc).isoformat()
    all_errors: list[str] = []
    data_sources: list[str] = []

    if not geocode or not geocode.get("state_fips"):
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators": {},
            "indicators_available": [],
            "weight_coverage_pct": 0,
            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
            "warnings": ["Geocode invalido o incompleto"],
            "data_sources": [],
            "tract_geoid": None,
            # Present so both return paths expose the same set of keys.
            "tract_name": None,
            "fetched_at": fetched_at,
            "errors": ["geocode_failed"],
        }

    # ─── Census ACS ─────────────────────────────────────────────────────────
    indicators: dict = {}
    census_data, errs = _fetch_census_acs(geocode)
    indicators.update(census_data)
    all_errors.extend(errs)
    if census_data:
        data_sources.append("US Census ACS 2022 5-Year")

    # ─── FBI crime data (1 indicator) ───────────────────────────────────────
    crime_data, errs = _fetch_fbi_crime(geocode)
    # Keys prefixed with "_" are auxiliary notes, not indicators.
    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
    all_errors.extend(errs)
    if crime_data:
        data_sources.append("FBI Crime Data Explorer (state-level)")

    # ─── Firecrawl DOM (1 indicator, opt-in) ────────────────────────────────
    if include_dom:
        dom_data, errs = _fetch_firecrawl_dom(geocode)
        indicators.update(dom_data)
        all_errors.extend(errs)
        if dom_data:
            data_sources.append("Firecrawl (Zillow DOM)")

    # ─── Classify ───────────────────────────────────────────────────────────
    classification = _classify(indicators)
    reasoning = _build_reasoning(indicators, classification)
    letter = classification["neighborhood_class"]

    # ─── Warnings ───────────────────────────────────────────────────────────
    warnings: list[str] = []
    if classification["confidence_level"] in ("low", "unclassified"):
        warnings.append(
            f"Confianza {classification['confidence_level']}: "
            f"solo {len(classification['indicators_available'])} indicadores disponibles."
        )
    if "_crime_state_level_note" in crime_data:
        warnings.append(crime_data["_crime_state_level_note"])

    return {
        "neighborhood_class": letter,
        "class_score": classification["class_score"],
        "confidence_level": classification["confidence_level"],
        "indicators": indicators,
        "indicators_available": classification["indicators_available"],
        "weight_coverage_pct": classification["weight_coverage_pct"],
        "class_reasoning": reasoning,
        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
        "warnings": warnings,
        "data_sources": data_sources,
        "tract_geoid": geocode.get("tract_geoid"),
        "tract_name": geocode.get("tract_name"),
        "fetched_at": fetched_at,
        "errors": all_errors,
    }