feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
"""property_type_inference.py — Heuristica para inferir tipo de propiedad.
|
||||
|
||||
Cada source (HUD, Miami-Dade clerk, Zillow, etc.) trae signals distintos. Este
|
||||
modulo centraliza la logica de inferencia para que:
|
||||
|
||||
1. Scrapers nuevos pueden pasar deals sin property_type y se infiere.
|
||||
2. Backfill de deals legacy.
|
||||
3. Una sola fuente de verdad para mapeo "deal info → property_type".
|
||||
|
||||
VALORES VALIDOS (alineados con orchestrator.DealInputs.property_type):
|
||||
sfr - Single Family Residence (default residential)
|
||||
condo - Condominium
|
||||
townhome - Townhouse
|
||||
multi_family - Duplex/triplex/quad/apartment
|
||||
land - Vacant land / lot
|
||||
mobile_home - Manufactured / mobile home
|
||||
commercial - Commercial property (warehouse, retail, office)
|
||||
unknown - Insufficient signal — UI shows neutral badge
|
||||
|
||||
CONVENCION: si no podemos inferir con confianza media+, retornar "unknown",
|
||||
NUNCA adivinar "sfr" porque sesga downstream pipelines.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
# Every value that ``property_type`` is allowed to take in this module.
VALID_TYPES = {
    "sfr",           # single family residence
    "condo",         # condominium
    "townhome",      # townhouse
    "multi_family",  # duplex / triplex / quad / apartment
    "land",          # vacant land / lot
    "mobile_home",   # manufactured / mobile home
    "commercial",    # warehouse, retail, office
    "unknown",       # insufficient signal
}
|
||||
|
||||
# Spanish display labels for the UI, keyed by property type.
TYPE_LABELS_ES = {
    "sfr": "Casa",                     # house
    "condo": "Condo",
    "townhome": "Townhouse",
    "multi_family": "Multi-familiar",  # multi-family
    "land": "Terreno",                 # land
    "mobile_home": "Mobile home",
    "commercial": "Comercial",         # commercial
    "unknown": "—",                    # neutral placeholder
}
|
||||
|
||||
# Emoji badge shown on UI cards, keyed by property type.
TYPE_EMOJI = {
    "sfr": "🏠",
    "condo": "🏢",
    "townhome": "🏘️",
    "multi_family": "🏬",
    "land": "🌳",
    "mobile_home": "🚐",
    "commercial": "🏪",
    "unknown": "❔",
}
|
||||
|
||||
|
||||
# Ordered (keyword_lower, property_type) pairs. Order is by specificity: the
# list is scanned front to back and the first keyword that matches wins, so
# the more specific groups come first.
_DESCRIPTION_KEYWORDS = [
    (keyword, property_type)
    for property_type, keywords in (
        # Land must be checked BEFORE "sfr" because "vacant land" has no beds.
        ("land", (
            "vacant land",
            "vacant lot",
            "raw land",
            "res vacant",  # Miami-Dade PA code wording
            "residential lot",
            "acreage",
            "buildable lot",
        )),
        # Mobile home: checked before SFR.
        ("mobile_home", (
            "mobile home",
            "manufactured home",
            "manufactured hous",
            "mfd home",
        )),
        # Condo. " condo " is space-padded so it only matches as a
        # standalone word and not inside a longer one.
        ("condo", (
            "condominium",
            " condo ",
            "condo unit",
        )),
        # Townhouse.
        ("townhome", (
            "townhouse",
            "townhome",
            "town home",
        )),
        # Multi-family.
        ("multi_family", (
            "duplex",
            "triplex",
            "quadplex",
            "quad plex",
            "fourplex",
            "multi-family",
            "multi family",
            "multifamily",
            "apartment building",
        )),
        # Commercial.
        ("commercial", (
            "warehouse",
            "retail space",
            "office building",
            "commercial",
        )),
        # SFR keywords are the least specific, so they go last.
        ("sfr", (
            "single family",
            "single-family",
            "sfr",
            "sfh",
        )),
    )
    for keyword in keywords
]
|
||||
|
||||
# Lowercase substrings that, when present in an address, suggest a unit
# inside a larger building (e.g. "UNIT 4B", "APT 12").
_ADDRESS_CONDO_HINTS = [
    "unit ",
    "apt ",
    "#",
    "suite ",
]
|
||||
|
||||
|
||||
def _scan_keywords(text: str) -> Optional[str]:
    """Return the property type of the first keyword found in ``text``.

    Keywords are tried in ``_DESCRIPTION_KEYWORDS`` order and the first one
    contained in the text wins, regardless of where it appears in the text.

    Matching is done on a normalised form of the text: lowercased, every
    non-alphanumeric character replaced by a space, whitespace runs collapsed
    and the result padded with one space on each side. This lets
    space-bounded keywords such as ``" condo "`` match next to punctuation or
    line breaks ("condo,", "(condo)", "condo\\n"), which a plain substring
    test on the raw text would miss.

    Args:
        text: Free text to scan (listing description or address).

    Returns:
        The matching property type, or ``None`` when nothing matches or the
        text is empty.
    """
    if not text:
        return None

    spaced = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    haystack = f" {' '.join(spaced.split())} "

    for keyword, ptype in _DESCRIPTION_KEYWORDS:
        # Apply the same punctuation -> space mapping to the keyword so that
        # hyphenated entries ("multi-family") still match the normalised
        # text. Leading/trailing spaces are kept on purpose: they are what
        # makes a keyword word-bounded.
        needle = "".join(ch if ch.isalnum() else " " for ch in keyword.lower())
        if needle in haystack:
            return ptype
    return None
|
||||
|
||||
|
||||
def _as_text(value: object) -> str:
    """Return ``value`` as a stripped string; falsy values become ``""``."""
    return str(value).strip() if value else ""


def _as_number(value: object) -> Optional[float]:
    """Convert a scraped numeric field to ``float``, or ``None`` if unusable.

    Numeric strings are accepted, including thousands separators
    (``"1,200"``). Values that cannot be parsed are treated as missing rather
    than raising, so a single malformed field does not abort inference.
    """
    if value is None:
        return None
    if isinstance(value, str):
        value = value.replace(",", "").strip()
        if not value:
            return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def infer_property_type(deal: dict) -> str:
    """Infer the property type from the data available on a deal.

    Args:
        deal: Mapping that may contain ``property_type``, ``source``,
            ``listing_description``, ``address``, ``beds`` and ``sqft``.
            None of them is required. ``beds`` and ``sqft`` may be numbers
            or numeric strings.

    Returns:
        One of ``VALID_TYPES``; ``"unknown"`` when there is not enough
        signal.

    Heuristic, in priority order:
        1. A valid, non-"unknown" ``property_type`` already on the deal is
           returned as is (passthrough).
        2. Keywords in ``listing_description``.
        3. Address hints (UNIT / APT / # / SUITE -> condo), then keywords in
           the address.
        4. ``beds == 0`` -> land.
        5. Source-based default:
           - ``hud_homestore`` with beds > 0 -> sfr
           - ``zillow`` with beds > 0 and sqft > 0 -> sfr
        6. Otherwise -> unknown.
    """
    # 1) Passthrough when a usable type is already set. "unknown" is not
    #    passed through so that it can still be upgraded by the steps below.
    existing = _as_text(deal.get("property_type")).lower()
    if existing and existing != "unknown" and existing in VALID_TYPES:
        return existing

    desc = _as_text(deal.get("listing_description"))
    address = _as_text(deal.get("address"))
    source = _as_text(deal.get("source")).lower()
    # Scrapers may hand over numbers as strings; comparing those directly
    # with ints would raise TypeError, so normalise them first.
    beds = _as_number(deal.get("beds"))
    sqft = _as_number(deal.get("sqft"))

    # 2) Description keywords.
    if desc:
        match = _scan_keywords(desc)
        if match:
            return match

    # 3) Address hints such as "UNIT 4B" / "APT 12" -> likely condo.
    if address:
        addr_lower = address.lower()
        if any(hint in addr_lower for hint in _ADDRESS_CONDO_HINTS):
            return "condo"
        # The address can also carry land / multi-family keywords.
        match = _scan_keywords(address)
        if match:
            return match

    # 4) Zero bedrooms and no contrary signal so far -> land.
    if beds == 0:
        return "land"

    # 5) Source-based defaults, only when the numeric fields are populated.
    has_beds = beds is not None and beds > 0
    has_sqft = sqft is not None and sqft > 0
    if source == "hud_homestore" and has_beds:
        return "sfr"  # original author's note: HUD listings are mostly SFR
    if source == "zillow" and has_beds and has_sqft:
        return "sfr"

    # 6) Fallback: insufficient signal.
    return "unknown"
|
||||
|
||||
|
||||
def ensure_property_type(deal: dict) -> dict:
    """Set ``deal["property_type"]`` from ``infer_property_type`` and return the deal.

    The dict is modified in place and the same object is returned, so the
    call can be nested, e.g. ``insert_deal(ensure_property_type(deal))``.
    """
    deal["property_type"] = infer_property_type(deal)
    return deal
|
||||
|
||||
|
||||
def label_for(property_type: Optional[str]) -> str:
    """Build the UI label "<emoji> <Spanish label>", e.g. '🏠 Casa'.

    The value is stripped and lowercased first. ``None``, an empty string,
    or anything not in ``VALID_TYPES`` is rendered with the "unknown" label.
    """
    normalized = (property_type or "unknown").strip().lower()
    key = normalized if normalized in VALID_TYPES else "unknown"
    return " ".join((TYPE_EMOJI[key], TYPE_LABELS_ES[key]))
|
||||
Reference in New Issue
Block a user