"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos. CRITICO - COMPLIANCE LEGAL: La clasificacion se basa SOLO en indicadores economicos y datos publicos: income, owner-occupancy, education attainment, vacancy, crime, days on market. NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining. Esta es una clasificacion ECONOMICA, no demografica. Indicadores y pesos (max 100): - median_household_income (Census ACS) 25% - owner_occupied_pct (Census ACS) 20% - education_attainment_pct_bachelor_plus (ACS) 20% - crime_vs_national (FBI UCR) 15% - vacancy_rate (Census ACS) 10% - days_on_market_median (Firecrawl, opt-in) 10% Graceful degradation: si un indicador no esta disponible (API key missing, fetcher fallo), se redistribuye su peso entre los disponibles. confidence_level (basado en CANTIDAD de indicadores disponibles): - "high": 5-6 indicadores - "medium": 3-4 indicadores - "low": 1-2 indicadores - "unclassified": 0 indicadores """ from __future__ import annotations import os import time from datetime import datetime, timezone from typing import Optional import requests from dotenv import load_dotenv from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT # ─── Pesos del algoritmo de clasificacion ─────────────────────────────────── WEIGHTS = { "income": 25, "owner_occupied": 20, "education": 20, "crime": 15, "vacancy": 10, "dom": 10, } # ─── Census ACS variable codes ────────────────────────────────────────────── ACS_VARS = { "income": "B19013_001E", # Median household income (last 12 months) "oo_count": "B25003_002E", # Owner-occupied housing units count "occupied_total": "B25003_001E", # Total occupied housing units "vacant_count": "B25002_003E", # Vacant housing units count "housing_total": "B25002_001E", # Total housing units (occupied + vacant) "home_value": "B25077_001E", # Median home value "edu_total": "B15003_001E", # Total population 25+ "edu_bachelor": "B15003_022E", # Bachelor's degree "edu_master": "B15003_023E", # Master's degree "edu_prof": "B15003_024E", # Professional school degree "edu_doctorate": "B15003_025E", # Doctorate degree } # ─── National crime rates (FBI UCR 2022, per 100K population) ────────────── # Usado como denominador para crime_vs_national. Actualizar anualmente. NATIONAL_VIOLENT_CRIME_PER_100K = 380.7 NATIONAL_PROPERTY_CRIME_PER_100K = 1954.4 # ═══════════════════════════════════════════════════════════════════════════ # Fetchers individuales (fail-soft) # ═══════════════════════════════════════════════════════════════════════════ def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]: """Fetch 4 indicadores Census ACS para el tract del geocode. Returns (indicators_dict, errors_list). """ errors: list[str] = [] out: dict = {} api_key = os.getenv("CENSUS_API_KEY", "").strip() if not api_key: errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)") return out, errors state_fips = geocode.get("state_fips") county_code = geocode.get("county_code_only") tract_code = geocode.get("tract_code") if not state_fips or not county_code or not tract_code: errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})") return out, errors url = "https://api.census.gov/data/2022/acs/acs5" # Pedir todas las vars en una sola llamada (la API acepta hasta 50) var_keys = ["NAME"] + list(ACS_VARS.values()) params = { "get": ",".join(var_keys), "for": f"tract:{tract_code}", "in": f"state:{state_fips} county:{county_code}", "key": api_key, } try: r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT) r.raise_for_status() data = r.json() except requests.RequestException as e: errors.append(f"Census ACS HTTP: {e}") return out, errors except ValueError as e: errors.append(f"Census ACS JSON: {e}") return out, errors if not data or len(data) < 2: errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)") return out, errors header = data[0] row = data[1] idx = {col: i for i, col in enumerate(header)} def _f(col: str) -> Optional[float]: try: v = row[idx[col]] except (KeyError, IndexError): return None if v is None or v == "" or v == "null": return None try: f = float(v) # Census usa valores negativos para "no data" / "suppressed" return f if f >= 0 else None except (ValueError, TypeError): return None # 1) Median household income income = _f(ACS_VARS["income"]) if income is not None: out["median_household_income"] = round(income, 0) # 2) Owner-occupied percentage oo = _f(ACS_VARS["oo_count"]) total = _f(ACS_VARS["occupied_total"]) if oo is not None and total and total > 0: out["owner_occupied_pct"] = round(oo / total * 100, 1) # 3) Vacancy rate vacant = _f(ACS_VARS["vacant_count"]) housing = _f(ACS_VARS["housing_total"]) if vacant is not None and housing and housing > 0: out["vacancy_rate"] = round(vacant / housing * 100, 1) # 4) Median home value home_value = _f(ACS_VARS["home_value"]) if home_value is not None: out["median_home_value"] = round(home_value, 0) # 5) Education attainment (% bachelor's or higher, age 25+) edu_total = _f(ACS_VARS["edu_total"]) edu_b = _f(ACS_VARS["edu_bachelor"]) or 0 edu_m = _f(ACS_VARS["edu_master"]) or 0 edu_p = _f(ACS_VARS["edu_prof"]) or 0 edu_d = _f(ACS_VARS["edu_doctorate"]) or 0 if edu_total and edu_total > 0: pct = (edu_b + edu_m + edu_p + edu_d) / edu_total * 100 out["education_attainment_pct_bachelor_plus"] = round(pct, 1) return out, errors def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]: """Fetch crime data via FBI Crime Data Explorer (api.data.gov key). NOTA: la API publica gratis de FBI es county-level via summarized endpoint. Implementacion best-effort: si la API responde, devolvemos crime_vs_national. Si no, fail-soft (errors list, indicator ausente). """ errors: list[str] = [] out: dict = {} api_key = os.getenv("API_DATA_GOV_KEY", "").strip() if not api_key: errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)") return out, errors state_abbr = geocode.get("state") # e.g. "FL" if not state_abbr: errors.append("state abbreviation faltante en geocode") return out, errors # Endpoint: FBI Crime Data Explorer state-level estimate # Mejor que tener nada (county-level es complejo de agregar). url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}" params = { "from": "2022", "to": "2022", "API_KEY": api_key, } try: r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT) r.raise_for_status() data = r.json() except requests.RequestException as e: errors.append(f"FBI UCR HTTP: {e}") return out, errors except ValueError as e: errors.append(f"FBI UCR JSON: {e}") return out, errors # Estructura tipica del endpoint: lista de estimates por ano con keys # como 'violent_crime', 'property_crime', 'population', etc. # Defensivo: probar varias formas. estimates = data.get("estimates") or data.get("data") or (data if isinstance(data, list) else []) if not estimates: errors.append("FBI UCR sin estimates en respuesta") return out, errors rec = estimates[0] if isinstance(estimates, list) else estimates if not isinstance(rec, dict): errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}") return out, errors population = rec.get("population") violent = rec.get("violent_crime") property_c = rec.get("property_crime") or rec.get("homicide", 0) * 0 # fallback - se ignora luego if not population or not violent: errors.append("FBI UCR sin population o violent_crime en estimate") return out, errors try: violent_per_100k = float(violent) / float(population) * 100000 ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K if property_c: property_per_100k = float(property_c) / float(population) * 100000 ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K # Promedio ponderado: violent pesa mas (2/3) que property (1/3) combined = (ratio_violent * 2 + ratio_property) / 3 else: combined = ratio_violent out["crime_vs_national"] = round(combined, 2) out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa." except (TypeError, ValueError) as e: errors.append(f"FBI UCR calc error: {e}") return out, errors def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]: """Fetch median days-on-market via Firecrawl scrape de Zillow. OPT-IN ONLY: consume creditos Firecrawl (~3-5 por lookup). Llamar solo si include_dom=True en classify_neighborhood(). """ errors: list[str] = [] # Placeholder: implementacion requiere Firecrawl integration (Phase 3B paso 6). # Por ahora, devolver vacio. Se completara cuando Firecrawl este integrado. errors.append("DOM Firecrawl: implementacion pendiente Phase 3B paso 6") return {}, errors # ═══════════════════════════════════════════════════════════════════════════ # Algoritmo de clasificacion # ═══════════════════════════════════════════════════════════════════════════ def _score_income(income: float) -> int: if income >= 100000: return 25 if income >= 60000: return 18 if income >= 35000: return 10 return 3 def _score_owner_occupied(pct: float) -> int: if pct >= 80: return 20 if pct >= 60: return 15 if pct >= 40: return 8 return 3 def _score_education(pct_bach_plus: float) -> int: if pct_bach_plus >= 50: return 20 if pct_bach_plus >= 30: return 14 if pct_bach_plus >= 15: return 7 return 2 def _score_crime(ratio_vs_national: float) -> int: """Lower ratio = better (less crime than national).""" if ratio_vs_national < 0.7: return 15 if ratio_vs_national < 1.0: return 12 if ratio_vs_national < 1.5: return 7 return 2 def _score_vacancy(pct: float) -> int: """Lower vacancy = better.""" if pct < 3: return 10 if pct < 6: return 7 if pct < 10: return 4 return 1 def _score_dom(days: float) -> int: """Lower days-on-market = hotter neighborhood = better.""" if days < 30: return 10 if days < 60: return 7 if days < 90: return 4 return 1 def _classify(indicators: dict) -> dict: """Aplica el algoritmo de scoring con graceful degradation. Returns dict con neighborhood_class, class_score, confidence_level, etc. """ score_funcs = { "income": (_score_income, "median_household_income"), "owner_occupied": (_score_owner_occupied, "owner_occupied_pct"), "education": (_score_education, "education_attainment_pct_bachelor_plus"), "crime": (_score_crime, "crime_vs_national"), "vacancy": (_score_vacancy, "vacancy_rate"), "dom": (_score_dom, "days_on_market_median"), } points = {} indicators_available = [] for key, (func, indicator_name) in score_funcs.items(): val = indicators.get(indicator_name) if val is not None: points[key] = func(val) indicators_available.append(key) n_available = len(indicators_available) # Confidence level por cantidad de indicadores if n_available == 0: confidence = "unclassified" elif n_available <= 2: confidence = "low" elif n_available <= 4: confidence = "medium" else: confidence = "high" if n_available == 0: return { "neighborhood_class": "unclassified", "class_score": 0.0, "confidence_level": "unclassified", "indicators_available": [], "weight_coverage_pct": 0, } # Graceful degradation: scale points contra weights disponibles total_weight_available = sum(WEIGHTS[k] for k in indicators_available) total_points = sum(points.values()) scaled_0_to_100 = (total_points / total_weight_available) * 100 if scaled_0_to_100 >= 85: letter = "A" elif scaled_0_to_100 >= 65: letter = "B" elif scaled_0_to_100 >= 40: letter = "C" else: letter = "D" return { "neighborhood_class": letter, "class_score": round(scaled_0_to_100, 1), "confidence_level": confidence, "indicators_available": indicators_available, "weight_coverage_pct": total_weight_available, "raw_points": points, } # ═══════════════════════════════════════════════════════════════════════════ # Investment implications por clase # ═══════════════════════════════════════════════════════════════════════════ INVESTMENT_IMPLICATIONS = { "A": { "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos", "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR", "appreciation_potential": "Alta - tipicamente supera inflacion", "tenant_quality_expected": "Profesional, familias, muy estable", "typical_strategies": ["Buy & Hold", "Apreciacion play", "Short-term rental premium"], }, "B": { "buy_hold_viability": "Alta - balance entre cash flow y apreciacion", "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho", "appreciation_potential": "Media-alta", "tenant_quality_expected": "Profesional, familias, estable", "typical_strategies": ["Buy & Hold", "Light BRRRR", "Section 8 si la matematica cierra"], }, "C": { "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management", "section_8_viability": "Alta - market rents cerca o por debajo de FMR", "appreciation_potential": "Baja-media", "tenant_quality_expected": "Working class, estabilidad mixta", "typical_strategies": ["Section 8", "BRRRR", "Buy & Hold con management activo"], }, "D": { "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo", "section_8_viability": "Muy alta - Section 8 puede superar market rent", "appreciation_potential": "Baja - depende de trayectoria del vecindario", "tenant_quality_expected": "Bajos ingresos, screening diligente requerido", "typical_strategies": ["Section 8 (cash flow)", "BRRRR agresivo solo con exit a comprador de calidad"], }, "unclassified": { "buy_hold_viability": "No determinado - sin datos suficientes", "section_8_viability": "No determinado", "appreciation_potential": "No determinado", "tenant_quality_expected": "No determinado", "typical_strategies": [], }, } def _build_reasoning(indicators: dict, classification: dict) -> str: """Genera 1-2 lineas de justificacion del class letter.""" letter = classification["neighborhood_class"] if letter == "unclassified": return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)." parts = [] if (v := indicators.get("median_household_income")) is not None: parts.append(f"median income ${v:,.0f}") if (v := indicators.get("owner_occupied_pct")) is not None: parts.append(f"owner-occupied {v:.0f}%") if (v := indicators.get("education_attainment_pct_bachelor_plus")) is not None: parts.append(f"bachelor+ {v:.0f}%") if (v := indicators.get("crime_vs_national")) is not None: parts.append(f"crime {v:.2f}x national") if (v := indicators.get("vacancy_rate")) is not None: parts.append(f"vacancy {v:.1f}%") if (v := indicators.get("days_on_market_median")) is not None: parts.append(f"DOM {v} dias") indicator_str = ", ".join(parts) score = classification["class_score"] conf = classification["confidence_level"] coverage = classification["weight_coverage_pct"] return ( f"Clase {letter} (score {score}/100, confianza {conf}, " f"cobertura {coverage}% del peso). Indicadores: {indicator_str}." ) # ═══════════════════════════════════════════════════════════════════════════ # API publica # ═══════════════════════════════════════════════════════════════════════════ def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict: """Clasifica un vecindario A/B/C/D basado en indicadores objetivos. Args: geocode: output de census_geocode.fetch_geocode (debe tener state_fips, county_code_only, tract_code). include_dom: si True, hace lookup de Days-on-Market via Firecrawl (gasta credits). Default False. Returns: dict con neighborhood_class, class_score, confidence_level, indicators, investment_implications, etc. """ # .env ya fue cargado por data_fetchers/__init__.py al primer import # del paquete. No llamamos load_dotenv() aca para evitar conflictos con # CWD distinto del proyecto. fetched_at = datetime.now(timezone.utc).isoformat() all_errors: list[str] = [] data_sources: list[str] = [] if not geocode or not geocode.get("state_fips"): return { "neighborhood_class": "unclassified", "class_score": 0.0, "confidence_level": "unclassified", "indicators": {}, "indicators_available": [], "weight_coverage_pct": 0, "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.", "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"], "warnings": ["Geocode invalido o incompleto"], "data_sources": [], "tract_geoid": None, "fetched_at": fetched_at, "errors": ["geocode_failed"], } # ─── Census ACS (4 indicadores) ───────────────────────────────────────── indicators: dict = {} census_data, errs = _fetch_census_acs(geocode) indicators.update(census_data) all_errors.extend(errs) if census_data: data_sources.append("US Census ACS 2022 5-Year") # ─── FBI UCR (1 indicador) ────────────────────────────────────────────── crime_data, errs = _fetch_fbi_crime(geocode) # Excluir keys auxiliares con prefijo "_" indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")}) all_errors.extend(errs) if crime_data: data_sources.append("FBI Crime Data Explorer (state-level)") # ─── Firecrawl DOM (1 indicador, opt-in) ──────────────────────────────── if include_dom: dom_data, errs = _fetch_firecrawl_dom(geocode) indicators.update(dom_data) all_errors.extend(errs) if dom_data: data_sources.append("Firecrawl (Zillow DOM)") # ─── Clasificar ───────────────────────────────────────────────────────── classification = _classify(indicators) reasoning = _build_reasoning(indicators, classification) letter = classification["neighborhood_class"] # ─── Warnings ─────────────────────────────────────────────────────────── warnings: list[str] = [] if classification["confidence_level"] in ("low", "unclassified"): warnings.append( f"Confianza {classification['confidence_level']}: " f"solo {len(classification['indicators_available'])} indicadores disponibles." ) if "_crime_state_level_note" in crime_data: warnings.append(crime_data["_crime_state_level_note"]) return { "neighborhood_class": letter, "class_score": classification["class_score"], "confidence_level": classification["confidence_level"], "indicators": indicators, "indicators_available": classification["indicators_available"], "weight_coverage_pct": classification["weight_coverage_pct"], "class_reasoning": reasoning, "investment_implications": INVESTMENT_IMPLICATIONS[letter], "warnings": warnings, "data_sources": data_sources, "tract_geoid": geocode.get("tract_geoid"), "tract_name": geocode.get("tract_name"), "fetched_at": fetched_at, "errors": all_errors, }