194 lines
6.6 KiB
Python
194 lines
6.6 KiB
Python
"""property_type_inference.py — Heuristica para inferir tipo de propiedad.
|
|
|
|
Cada source (HUD, Miami-Dade clerk, Zillow, etc.) trae signals distintos. Este
modulo centraliza la logica de inferencia para que:

  1. Scrapers nuevos pueden pasar deals sin property_type y se infiere.
  2. Backfill de deals legacy.
  3. Una sola fuente de verdad para mapeo "deal info → property_type".

VALORES VALIDOS (alineados con orchestrator.DealInputs.property_type):
  sfr           - Single Family Residence (default residential)
  condo         - Condominium
  townhome      - Townhouse
  multi_family  - Duplex/triplex/quad/apartment
  land          - Vacant land / lot
  mobile_home   - Manufactured / mobile home
  commercial    - Commercial property (warehouse, retail, office)
  unknown       - Insufficient signal — UI shows neutral badge

CONVENCION: si no podemos inferir con confianza media+, retornar "unknown",
NUNCA adivinar "sfr" porque sesga downstream pipelines.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Optional
|
|
|
|
# Closed set of property_type values that this module accepts and emits.
VALID_TYPES = {
    "sfr",           # single family residence
    "condo",         # condominium
    "townhome",      # townhouse
    "multi_family",  # duplex / triplex / quad / apartment
    "land",          # vacant land / lot
    "mobile_home",   # manufactured / mobile home
    "commercial",    # warehouse, retail, office
    "unknown",       # insufficient signal
}
|
|
|
|
# Spanish display labels for the UI, keyed by property_type.
TYPE_LABELS_ES = dict(
    sfr="Casa",
    condo="Condo",
    townhome="Townhouse",
    multi_family="Multi-familiar",
    land="Terreno",
    mobile_home="Mobile home",
    commercial="Comercial",
    unknown="—",
)
|
|
|
|
# Emoji badges shown on UI cards, keyed by property_type.
TYPE_EMOJI = dict(
    sfr="🏠",
    condo="🏢",
    townhome="🏘️",
    multi_family="🏬",
    land="🌳",
    mobile_home="🚐",
    commercial="🏪",
    unknown="❔",
)
|
|
|
|
|
|
# Keyword table for free-text scanning, grouped by the property type each
# phrase maps to. Groups (and phrases inside a group) are listed from most to
# least specific: the scanner walks the flattened list top to bottom and the
# first substring hit wins.
_KEYWORDS_BY_TYPE = (
    # Land goes first: it must be checked before SFR because vacant land has
    # no beds (original author's note).
    ("land", (
        "vacant land",
        "vacant lot",
        "raw land",
        "res vacant",  # Miami-Dade PA code wording (original author's note)
        "residential lot",
        "acreage",
        "buildable lot",
    )),
    # Mobile homes are also checked before SFR.
    ("mobile_home", (
        "mobile home",
        "manufactured home",
        "manufactured hous",  # stem: substring-matches "house" and "housing"
        "mfd home",
    )),
    ("condo", (
        "condominium",
        " condo ",  # space-padded: only matches as a space-delimited word
        "condo unit",
    )),
    ("townhome", (
        "townhouse",
        "townhome",
        "town home",
    )),
    ("multi_family", (
        "duplex",
        "triplex",
        "quadplex",
        "quad plex",
        "fourplex",
        "multi-family",
        "multi family",
        "multifamily",
        "apartment building",
    )),
    ("commercial", (
        "warehouse",
        "retail space",
        "office building",
        "commercial",
    )),
    # SFR phrases are the least specific, so they come last.
    ("sfr", (
        "single family",
        "single-family",
        "sfr",
        "sfh",
    )),
)

# Flattened (keyword_lower, property_type) pairs in priority order.
_DESCRIPTION_KEYWORDS = [
    (keyword, ptype)
    for ptype, keywords in _KEYWORDS_BY_TYPE
    for keyword in keywords
]
|
|
|
|
# Lower-case substrings that, when found in an address, suggest a unit inside
# a larger building (e.g. "UNIT 4B", "APT 12").
_ADDRESS_CONDO_HINTS = [
    "unit ",
    "apt ",
    "#",
    "suite ",
]
|
|
|
|
|
|
def _scan_keywords(text: str) -> Optional[str]:
    """Return the property type of the first keyword found in *text*.

    The text is lower-cased and normalised before matching: every character
    that is neither alphanumeric nor a hyphen becomes a space, runs of
    whitespace collapse to a single space, and the result is padded with one
    space on each side. This lets space-delimited keywords such as " condo "
    match next to punctuation ("condo,", "(condo)") and lets multi-word
    keywords match across tabs or repeated spaces. Hyphens are preserved
    because keywords like "multi-family" contain them.

    Args:
        text: Free text to scan (listing description, address, ...).

    Returns:
        The property type paired with the first entry of
        ``_DESCRIPTION_KEYWORDS`` that occurs as a substring of the
        normalised text, or ``None`` when nothing matches or *text* is empty.
    """
    if not text:
        return None

    # Punctuation -> space, keeping "-" so hyphenated keywords still match.
    cleaned = "".join(
        ch if ch.isalnum() or ch == "-" else " "
        for ch in text.lower()
    )
    # split()/join collapses whitespace runs; the padding gives the first and
    # last word a space boundary for space-delimited keywords.
    haystack = " " + " ".join(cleaned.split()) + " "

    for keyword, ptype in _DESCRIPTION_KEYWORDS:
        if keyword in haystack:
            return ptype
    return None
|
|
|
|
|
|
def _coerce_number(value):
    """Best-effort conversion of a scraped numeric field to a number.

    Real ints/floats are returned unchanged. Anything else is parsed from its
    string form (surrounding whitespace and thousands separators removed).
    Returns ``None`` when the value is missing or cannot be parsed.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return value
    try:
        return float(str(value).strip().replace(",", ""))
    except ValueError:
        return None


def infer_property_type(deal: dict) -> str:
    """Infer the property type of a deal from whatever fields it carries.

    Args:
        deal: dict that may contain ``property_type``, ``listing_description``,
            ``address``, ``beds``, ``sqft`` and ``source``. None of them is
            required. ``beds`` and ``sqft`` may be numbers or numeric strings
            (e.g. ``"3"``, ``"1,200"``); unparseable values are treated as
            missing instead of raising.

    Returns:
        One of VALID_TYPES; ``"unknown"`` when no rule below fires.

    Rules, evaluated in order (first match wins):
        1. An existing ``property_type`` that is valid and not "unknown" is
           returned as-is (after strip + lower-casing).
        2. Keywords found in ``listing_description``.
        3. Address hints (UNIT / APT / # / SUITE -> condo), then keywords
           found in the address itself.
        4. ``beds == 0`` -> land (original note: auction lots come with
           0 beds).
        5. Source-based defaults, only when beds is populated and positive:
             - ``hud_homestore`` -> sfr (original note: HUD is ~95% SFR)
             - ``zillow`` with positive sqft as well -> sfr
           Any other source (e.g. ``miami_dade_clerk``) falls through.
        6. Fallback -> "unknown".
    """
    # 1) Passthrough when a usable value is already set.
    existing = (deal.get("property_type") or "").strip().lower()
    if existing in VALID_TYPES and existing != "unknown":
        return existing

    desc = (deal.get("listing_description") or "").strip()
    address = (deal.get("address") or "").strip()
    source = (deal.get("source") or "").strip().lower()
    # Scrapers may deliver numeric fields as strings; normalise them once so
    # the comparisons below can neither raise TypeError nor miss "0".
    beds = _coerce_number(deal.get("beds"))
    sqft = _coerce_number(deal.get("sqft"))

    # 2) Description keywords (most reliable signal).
    if desc:
        match = _scan_keywords(desc)
        if match:
            return match

    # 3) Address hints: "UNIT 4B" / "APT 12" -> likely condo.
    if address:
        addr_lower = address.lower()
        if any(hint in addr_lower for hint in _ADDRESS_CONDO_HINTS):
            return "condo"
        # The address may also carry land / multi-family keywords.
        match = _scan_keywords(address)
        if match:
            return match

    # 4) Zero bedrooms -> land (vacant lot).
    if beds == 0:
        return "land"

    # 5) Source-based defaults, only when beds is populated and positive.
    has_beds = beds is not None and beds > 0
    has_sqft = sqft is not None and sqft > 0
    if source == "hud_homestore" and has_beds:
        return "sfr"
    if source == "zillow" and has_beds and has_sqft:
        return "sfr"

    # 6) Insufficient signal: never guess.
    return "unknown"
|
|
|
|
|
|
def ensure_property_type(deal: dict) -> dict:
    """Set ``deal["property_type"]`` to the inferred type and return the deal.

    The dict is modified in place and the same object is returned, so the call
    can be chained, e.g. ``insert_deal(ensure_property_type(deal))``. The key
    is always (re)written with the result of ``infer_property_type``.
    """
    deal["property_type"] = infer_property_type(deal)
    return deal
|
|
|
|
|
|
def label_for(property_type: Optional[str]) -> str:
    """Return the UI display string for a property type, e.g. '🏠 Casa'.

    The result is the emoji badge, a space, then the Spanish label. The input
    is stripped and lower-cased first; ``None``, an empty string, or any value
    outside VALID_TYPES is rendered with the "unknown" entry.
    """
    key = (property_type or "unknown").strip().lower()
    key = key if key in VALID_TYPES else "unknown"
    return "{} {}".format(TYPE_EMOJI[key], TYPE_LABELS_ES[key])
|