feat: AR-House initial commit

This commit is contained in:
2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
+193
View File
@@ -0,0 +1,193 @@
"""property_type_inference.py — Heuristica para inferir tipo de propiedad.
Cada source (HUD, Miami-Dade clerk, Zillow, etc.) trae signals distintos. Esta
modulo centraliza la logica de inferencia para que:
1. Scrapers nuevos pueden pasar deals sin property_type y se infiere.
2. Backfill de deals legacy.
3. Una sola fuente de verdad para mapeo "deal info → property_type".
VALORES VALIDOS (alineados con orchestrator.DealInputs.property_type):
sfr - Single Family Residence (default residential)
condo - Condominium
townhome - Townhouse
multi_family - Duplex/triplex/quad/apartment
land - Vacant land / lot
mobile_home - Manufactured / mobile home
commercial - Commercial property (warehouse, retail, office)
unknown - Insufficient signal — UI shows neutral badge
CONVENCION: si no podemos inferir con confianza media+, retornar "unknown",
NUNCA adivinar "sfr" porque sesga downstream pipelines.
"""
from __future__ import annotations
from typing import Optional
# Closed vocabulary of property types accepted and produced by this module.
VALID_TYPES = {
    "sfr",           # single family residence
    "condo",         # condominium
    "townhome",      # townhouse
    "multi_family",  # duplex / triplex / quad / apartment building
    "land",          # vacant land or lot
    "mobile_home",   # manufactured / mobile home
    "commercial",    # warehouse, retail, office
    "unknown",       # not enough signal to decide
}
# Spanish display label for each property type, shown in the UI.
# "unknown" deliberately maps to an empty label.
TYPE_LABELS_ES = dict(
    sfr="Casa",
    condo="Condo",
    townhome="Townhouse",
    multi_family="Multi-familiar",
    land="Terreno",
    mobile_home="Mobile home",
    commercial="Comercial",
    unknown="",
)
# Emoji badge for each property type, shown on UI cards.
# "unknown" deliberately maps to an empty badge.
TYPE_EMOJI = dict(
    sfr="🏠",
    condo="🏢",
    townhome="🏘️",
    multi_family="🏬",
    land="🌳",
    mobile_home="🚐",
    commercial="🏪",
    unknown="",
)
# Keywords ordenados por especificidad (mas especifico primero gana)
_DESCRIPTION_KEYWORDS = [
# (keyword_lower, property_type)
# Land — debe chequear ANTES que "sfr" porque "vacant land" no tiene beds
("vacant land", "land"),
("vacant lot", "land"),
("raw land", "land"),
("res vacant", "land"), # PA codigos Miami-Dade
("residential lot", "land"),
("acreage", "land"),
("buildable lot", "land"),
# Mobile home — chequear antes que SFR
("mobile home", "mobile_home"),
("manufactured home", "mobile_home"),
("manufactured hous", "mobile_home"),
("mfd home", "mobile_home"),
# Condo
("condominium", "condo"),
(" condo ", "condo"), # space-padded para evitar match en "condominium" twice
("condo unit", "condo"),
# Townhouse
("townhouse", "townhome"),
("townhome", "townhome"),
("town home", "townhome"),
# Multi-family
("duplex", "multi_family"),
("triplex", "multi_family"),
("quadplex", "multi_family"),
("quad plex", "multi_family"),
("fourplex", "multi_family"),
("multi-family", "multi_family"),
("multi family", "multi_family"),
("multifamily", "multi_family"),
("apartment building","multi_family"),
# Commercial
("warehouse", "commercial"),
("retail space", "commercial"),
("office building", "commercial"),
("commercial", "commercial"),
# SFR keywords (less specific, last)
("single family", "sfr"),
("single-family", "sfr"),
("sfr", "sfr"),
("sfh", "sfr"),
]
# Address-level hints (e.g., "UNIT 4B", "APT 12")
_ADDRESS_CONDO_HINTS = ["unit ", "apt ", "#", "suite "]
def _scan_keywords(text: str) -> Optional[str]:
"""Returns property_type if any keyword matches, None otherwise."""
t = f" {text.lower()} " # pad for space-bounded matches
for kw, ptype in _DESCRIPTION_KEYWORDS:
if kw in t:
return ptype
return None
def _coerce_number(value):
    """Return ``value`` as a float, or None when it is missing or not numeric.

    Strings are accepted (with optional thousands separators, e.g. "1,200")
    so that numeric fields delivered as text do not break comparisons.
    """
    if value is None:
        return None
    if isinstance(value, str):
        value = value.replace(",", "").strip()
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def infer_property_type(deal: dict) -> str:
    """Infer the property type of a deal from whatever data it carries.

    Args:
        deal: Mapping describing the deal. The keys read here are
            ``property_type``, ``listing_description``, ``address``, ``beds``,
            ``sqft`` and ``source``; all are optional. ``beds`` and ``sqft``
            may be numbers or numeric strings; anything non-numeric is treated
            as missing.

    Returns:
        One of VALID_TYPES; "unknown" when there is not enough signal.

    Heuristic, in priority order:
        1. An existing valid ``property_type`` (other than "unknown") is
           passed through, normalized to stripped lowercase.
        2. Keywords in ``listing_description``.
        3. Address hints (UNIT / APT / # / SUITE → condo), then keywords in
           the address itself.
        4. ``beds == 0`` with no earlier signal → land (auction lots list 0 beds).
        5. Source-based defaults:
           - hud_homestore with beds > 0 → sfr (HUD is mostly SFR)
           - zillow with beds > 0 and sqft > 0 → sfr
           - any other source (e.g. miami_dade_clerk) → no default
        6. Fallback → "unknown".
    """
    # 1) Passthrough when a concrete type is already set.
    existing = str(deal.get("property_type") or "").strip().lower()
    if existing in VALID_TYPES and existing != "unknown":
        return existing

    # str() guards against non-string values before .strip()/.lower().
    desc = str(deal.get("listing_description") or "").strip()
    address = str(deal.get("address") or "").strip()
    source = str(deal.get("source") or "").strip().lower()
    # Numeric fields may arrive as text; coerce once so the comparisons below
    # cannot raise TypeError (str vs int) and "0" is recognized as zero.
    beds = _coerce_number(deal.get("beds"))
    sqft = _coerce_number(deal.get("sqft"))

    # 2) Description keywords (most reliable signal).
    if desc:
        match = _scan_keywords(desc)
        if match:
            return match

    # 3) Address hints — "UNIT 4B" / "APT 12" → likely condo.
    if address:
        addr_lower = address.lower()
        if any(hint in addr_lower for hint in _ADDRESS_CONDO_HINTS):
            return "condo"
        # The address may also carry land / multi-family keywords.
        match = _scan_keywords(address)
        if match:
            return match

    # 4) Zero beds and nothing more specific matched → land (vacant lot).
    if beds == 0:
        return "land"

    # 5) Source-based defaults, only when the listing has beds populated.
    has_beds = beds is not None and beds > 0
    has_sqft = sqft is not None and sqft > 0
    if source == "hud_homestore" and has_beds:
        return "sfr"  # HUD listings are mostly SFR
    if source == "zillow" and has_beds and has_sqft:
        return "sfr"  # Zillow house listings

    # 6) Fallback — insufficient signal.
    return "unknown"
def ensure_property_type(deal: dict) -> dict:
    """Set ``deal["property_type"]`` to the inferred type and return the deal.

    The dict is mutated in place and the same object is returned, so the call
    can be chained: ``insert_deal(ensure_property_type(deal))``.
    """
    deal["property_type"] = infer_property_type(deal)
    return deal
def label_for(property_type: Optional[str]) -> str:
    """Return the UI label "<emoji> <Spanish label>", e.g. '🏠 Casa'.

    The input is stripped and lowercased; None, empty or unrecognized values
    are treated as "unknown". Because both the emoji and the label for
    "unknown" are empty strings, the result in that case is a single space.
    """
    key = (property_type or "unknown").strip().lower()
    key = key if key in VALID_TYPES else "unknown"
    return " ".join((TYPE_EMOJI[key], TYPE_LABELS_ES[key]))