"""property_type_inference.py — Heuristica para inferir tipo de propiedad. Cada source (HUD, Miami-Dade clerk, Zillow, etc.) trae signals distintos. Esta modulo centraliza la logica de inferencia para que: 1. Scrapers nuevos pueden pasar deals sin property_type y se infiere. 2. Backfill de deals legacy. 3. Una sola fuente de verdad para mapeo "deal info → property_type". VALORES VALIDOS (alineados con orchestrator.DealInputs.property_type): sfr - Single Family Residence (default residential) condo - Condominium townhome - Townhouse multi_family - Duplex/triplex/quad/apartment land - Vacant land / lot mobile_home - Manufactured / mobile home commercial - Commercial property (warehouse, retail, office) unknown - Insufficient signal — UI shows neutral badge CONVENCION: si no podemos inferir con confianza media+, retornar "unknown", NUNCA adivinar "sfr" porque sesga downstream pipelines. """ from __future__ import annotations from typing import Optional VALID_TYPES = { "sfr", "condo", "townhome", "multi_family", "land", "mobile_home", "commercial", "unknown", } # Spanish labels para UI display TYPE_LABELS_ES = { "sfr": "Casa", "condo": "Condo", "townhome": "Townhouse", "multi_family": "Multi-familiar", "land": "Terreno", "mobile_home": "Mobile home", "commercial": "Comercial", "unknown": "—", } # Emoji badges para UI cards TYPE_EMOJI = { "sfr": "🏠", "condo": "🏢", "townhome": "🏘️", "multi_family": "🏬", "land": "🌳", "mobile_home": "🚐", "commercial": "🏪", "unknown": "❔", } # Keywords ordenados por especificidad (mas especifico primero gana) _DESCRIPTION_KEYWORDS = [ # (keyword_lower, property_type) # Land — debe chequear ANTES que "sfr" porque "vacant land" no tiene beds ("vacant land", "land"), ("vacant lot", "land"), ("raw land", "land"), ("res vacant", "land"), # PA codigos Miami-Dade ("residential lot", "land"), ("acreage", "land"), ("buildable lot", "land"), # Mobile home — chequear antes que SFR ("mobile home", "mobile_home"), ("manufactured home", "mobile_home"), ("manufactured hous", "mobile_home"), ("mfd home", "mobile_home"), # Condo ("condominium", "condo"), (" condo ", "condo"), # space-padded para evitar match en "condominium" twice ("condo unit", "condo"), # Townhouse ("townhouse", "townhome"), ("townhome", "townhome"), ("town home", "townhome"), # Multi-family ("duplex", "multi_family"), ("triplex", "multi_family"), ("quadplex", "multi_family"), ("quad plex", "multi_family"), ("fourplex", "multi_family"), ("multi-family", "multi_family"), ("multi family", "multi_family"), ("multifamily", "multi_family"), ("apartment building","multi_family"), # Commercial ("warehouse", "commercial"), ("retail space", "commercial"), ("office building", "commercial"), ("commercial", "commercial"), # SFR keywords (less specific, last) ("single family", "sfr"), ("single-family", "sfr"), ("sfr", "sfr"), ("sfh", "sfr"), ] # Address-level hints (e.g., "UNIT 4B", "APT 12") _ADDRESS_CONDO_HINTS = ["unit ", "apt ", "#", "suite "] def _scan_keywords(text: str) -> Optional[str]: """Returns property_type if any keyword matches, None otherwise.""" t = f" {text.lower()} " # pad for space-bounded matches for kw, ptype in _DESCRIPTION_KEYWORDS: if kw in t: return ptype return None def infer_property_type(deal: dict) -> str: """Infiere property_type desde data del deal. Args: deal: dict con campos como source, listing_description, address, beds, baths, sqft, deal_type. No requiere todos. Returns: Uno de VALID_TYPES. "unknown" si no hay suficiente signal. HEURISTICA (en orden de prioridad): 1. Si ya viene property_type valido → respetarlo (passthrough) 2. Keywords en listing_description (mas especifico) 3. Address hints (UNIT/APT → condo) 4. beds == 0 + no signal contrario → land (auction lots con 0 beds) 5. Source-based default: - hud_homestore: tiene beds populated → sfr (HUD es ~95% SFR) - zillow: tiene beds + sqft populated → sfr - miami_dade_clerk: sin beds → unknown (taxdeed/foreclosure data limitada) 6. Fallback → unknown """ # 1) Passthrough si ya viene set existing = (deal.get("property_type") or "").strip().lower() if existing in VALID_TYPES and existing != "unknown": return existing desc = (deal.get("listing_description") or "").strip() address = (deal.get("address") or "").strip() beds = deal.get("beds") sqft = deal.get("sqft") source = (deal.get("source") or "").strip().lower() # 2) Description keywords (most reliable signal) if desc: match = _scan_keywords(desc) if match: return match # 3) Address hints — "UNIT 4B" / "APT 12" → likely condo if address: addr_lower = address.lower() if any(hint in addr_lower for hint in _ADDRESS_CONDO_HINTS): return "condo" # Also check address for land/multi keywords match = _scan_keywords(address) if match: return match # 4) beds == 0 → land (vacant lot) if beds == 0: return "land" # 5) Source-based default (only if we have beds populated) if source == "hud_homestore" and beds and beds > 0: return "sfr" # HUD listings son mayormente SFR if source == "zillow" and beds and beds > 0 and sqft and sqft > 0: return "sfr" # Zillow houses listings # 6) Fallback — insufficient signal return "unknown" def ensure_property_type(deal: dict) -> dict: """Devuelve el deal con property_type seteado (infiere si falta). Mutates-and-returns el dict para usar como `insert_deal(ensure_property_type(deal))`. """ pt = infer_property_type(deal) deal["property_type"] = pt return deal def label_for(property_type: Optional[str]) -> str: """Returns Spanish label + emoji para UI display, e.g. '🏠 Casa'.""" pt = (property_type or "unknown").strip().lower() if pt not in VALID_TYPES: pt = "unknown" return f"{TYPE_EMOJI[pt]} {TYPE_LABELS_ES[pt]}"