feat: AR-House initial commit

This commit is contained in:
2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
+37
View File
@@ -0,0 +1,37 @@
"""Data fetchers para AR-House.
Obtiene datos reales de fuentes oficiales ANTES de pasar el deal a los agentes Ollama.
Asi los agentes razonan sobre datos verificados (FEMA, HUD, NOAA, Census ACS) en vez de inventar.
Uso principal:
from data_fetchers.runner import fetch_all
data = fetch_all(deal, status_cb=...)
# data = {"geocode": {...}, "flood": {...}, "fmr": {...},
# "hurricanes": [...], "neighborhood": {...}, "fetch_errors": [...]}
Fail-soft: si algun fetcher falla, devuelve dict vacio en su campo y agrega a fetch_errors.
El pipeline NO se aborta.
Compliance: la clasificacion de vecindarios usa SOLO indicadores economicos objetivos
(income, owner-occupancy, education, vacancy, crime, days-on-market). NUNCA demografia
racial. Esto cumple con Fair Housing Act federal.
"""
# Cargar .env ANTES de los imports — buscando desde este archivo upwards.
# Asi los fetchers (Census, HUD, FBI) encuentran las API keys aunque el caller
# este corriendo desde otro CWD.
import os
from pathlib import Path
from dotenv import load_dotenv
# Directory that contains this package file (.../data_fetchers/).
_here = Path(__file__).resolve().parent
# Probe the package's parent directory first, then each ancestor of the
# package directory in order, and load the first .env file that exists.
for _parent in (_here.parent, *_here.parents):
    _candidate = _parent / ".env"
    if not _candidate.exists():
        continue
    load_dotenv(_candidate)
    break
from .runner import fetch_all
from .price_validator import validate_price
__all__ = ["fetch_all", "validate_price"]
+13
View File
@@ -0,0 +1,13 @@
"""Common types and constants para data fetchers."""
class FetcherError(Exception):
    """Signal that a data fetcher failed.

    Per the original note, the runner is expected to catch this so that a
    single failing fetcher does not abort the pipeline (fail-soft).
    """
# User-Agent for outgoing HTTP requests (some APIs, such as Nominatim, require one)
USER_AGENT = "AR-House/0.1.0 (real-estate-analysis-tool; +https://localhost)"
# Default timeouts
DEFAULT_TIMEOUT = 15  # seconds, for HTTP requests
+78
View File
@@ -0,0 +1,78 @@
"""File-based cache para data fetchers. JSON on disk con TTL.
Estructura:
.cache/data_fetchers/<namespace>_<hash16>.json
Cada entry:
{"cached_at": <epoch_seconds>, "namespace": "<namespace>", "key": "<original_key>", "data": {...}}
TTL se evalua en get() — si la entrada esta vencida, devuelve None
(no la borra; la sobreescribe el siguiente set()).
"""
from __future__ import annotations
import hashlib
import json
import time
from pathlib import Path
from typing import Optional
class FileCache:
    """JSON-on-disk cache with a TTL that is checked on read.

    Every entry is stored as ``<cache_dir>/<namespace>_<hash16>.json`` where
    ``<namespace>`` is the sanitized namespace (non-alphanumerics replaced by
    ``_``, truncated to 24 characters) and ``<hash16>`` is the first 16 hex
    digits of the SHA-1 of the key. The file holds ``cached_at`` (epoch
    seconds), ``namespace``, ``key`` and ``data``.
    """

    # Number of SHA-1 hex digits kept in the file name.
    _DIGEST_LEN = 16
    # Maximum length of the sanitized namespace kept in the file name.
    _NAMESPACE_LEN = 24
    _HEX_DIGITS = "0123456789abcdef"

    def __init__(self, cache_dir: str | Path):
        """Remember *cache_dir* and create it (with parents) if missing."""
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    @classmethod
    def _safe_namespace(cls, namespace: str) -> str:
        """Return the file-name-safe namespace prefix used on disk."""
        cleaned = "".join(c if c.isalnum() else "_" for c in namespace)
        return cleaned[:cls._NAMESPACE_LEN]

    def _path(self, namespace: str, key: str) -> Path:
        """Return the file path that stores *key* inside *namespace*."""
        safe_key = hashlib.sha1(key.encode("utf-8")).hexdigest()[:self._DIGEST_LEN]
        return self.cache_dir / f"{self._safe_namespace(namespace)}_{safe_key}.json"

    def get(self, namespace: str, key: str, ttl_days: float) -> Optional[dict]:
        """Return the cached data if present and not older than *ttl_days*.

        Returns None when the entry is missing, unreadable, malformed or
        expired. Expired files are left in place; the next ``set()``
        overwrites them.
        """
        p = self._path(namespace, key)
        try:
            with p.open(encoding="utf-8") as f:
                entry = json.load(f)
        except (OSError, ValueError):
            # Missing file, unreadable file, bad encoding or invalid JSON.
            return None
        # A file that holds valid JSON of the wrong shape is treated as a miss
        # instead of raising AttributeError/TypeError at the caller.
        if not isinstance(entry, dict):
            return None
        cached_at = entry.get("cached_at", 0)
        if isinstance(cached_at, bool) or not isinstance(cached_at, (int, float)):
            return None
        age_days = (time.time() - cached_at) / 86400.0
        if age_days > ttl_days:
            return None
        return entry.get("data")

    def set(self, namespace: str, key: str, data: dict) -> None:
        """Store *data* under (*namespace*, *key*).

        Cache failures are non-fatal: I/O errors and payloads that cannot be
        serialized to JSON are silently dropped.
        """
        p = self._path(namespace, key)
        entry = {
            "cached_at": time.time(),
            "namespace": namespace,
            "key": key,
            "data": data,
        }
        try:
            # Serialize first so an unserializable payload never truncates an
            # existing entry.
            payload = json.dumps(entry, ensure_ascii=False, indent=2)
            p.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError):
            pass  # cache failures are non-fatal

    def _is_entry_of(self, file_name: str, prefix: str) -> bool:
        """True if *file_name* is exactly ``<prefix><hash16>.json``."""
        suffix = ".json"
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            return False
        digest = file_name[len(prefix):-len(suffix)]
        return len(digest) == self._DIGEST_LEN and all(
            c in self._HEX_DIGITS for c in digest
        )

    def clear(self, namespace: Optional[str] = None) -> int:
        """Delete cache files and return how many were removed.

        With a namespace, only that namespace's entries are removed. The
        namespace is sanitized the same way as when writing, and the file
        name must match exactly, so ``clear("geo")`` does not touch entries
        of a namespace such as ``geo_code``. Without a namespace (or with an
        empty one) every ``*.json`` file in the cache directory is removed.
        """
        prefix = f"{self._safe_namespace(namespace)}_" if namespace else None
        count = 0
        for p in self.cache_dir.glob("*.json"):
            if prefix is not None and not self._is_entry_of(p.name, prefix):
                continue
            try:
                p.unlink()
                count += 1
            except OSError:
                pass  # best-effort: a file we cannot delete is just not counted
        return count
+85
View File
@@ -0,0 +1,85 @@
"""US Census Geocoder - address -> lat/lng/county/state.
API gratis, no key, sin rate limits documentados (uso razonable).
Documentacion: https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
Limitacion: SOLO USA (incluye PR, GU, AS, MP, VI).
Devuelve dict con:
matched_address, lat, lng, city, state, zip, county_name, county_fips, state_fips
"""
from __future__ import annotations
import requests
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
CENSUS_URL = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress"
def fetch_geocode(address: str) -> dict:
    """Geocode a US address with the Census Geocoder "onelineaddress" endpoint.

    Args:
        address: Free-form one-line address. Must have at least 5
            non-blank characters.

    Returns:
        Dict with matched_address, lat, lng, city, state, zip, county_name,
        county_fips, state_fips, tract_geoid, tract_code, tract_name and
        county_code_only. Values are None when the response lacks them.
        Only the first address match is used.

    Raises:
        FetcherError: if the address is empty/too short, the HTTP request
            fails, the body is not JSON, or there is no address match.
    """
    if not address or len(address.strip()) < 5:
        raise FetcherError("address vacio o muy corto")

    params = {
        "address": address.strip(),
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "format": "json",
        # Census Tracts are requested in addition to Counties because the
        # tract fields below feed neighborhood_class.
        "layers": "Census Tracts,Counties,2020 Census Blocks",
    }
    headers = {"User-Agent": USER_AGENT}

    try:
        r = requests.get(CENSUS_URL, params=params, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"HTTP error: {e}") from e

    try:
        data = r.json()
    except ValueError as e:
        raise FetcherError(f"JSON parse error: {e}") from e

    # A body such as {"result": null}, or JSON that is not an object, must end
    # in the documented "no match" FetcherError rather than an AttributeError.
    result = data.get("result") if isinstance(data, dict) else None
    matches = (result.get("addressMatches") if isinstance(result, dict) else None) or []
    if not matches:
        raise FetcherError(f"No geocode match for: {address!r}")

    m = matches[0]
    coords = m.get("coordinates", {}) or {}
    comp = m.get("addressComponents", {}) or {}
    geos = m.get("geographies", {}) or {}

    # The layer key differs between vintages, so several spellings are tried.
    # NOTE(review): "County Subdivisions" entries are not counties; if that
    # fallback is ever hit, county_name/county_fips would describe a
    # subdivision — confirm it is wanted.
    counties = (
        geos.get("Counties")
        or geos.get("2020 Census Counties")
        or geos.get("County Subdivisions")
        or []
    )
    county = counties[0] if counties else {}

    # Census Tracts give neighborhood-level granularity.
    tracts = geos.get("Census Tracts") or geos.get("2020 Census Tracts") or []
    tract = tracts[0] if tracts else {}

    return {
        "matched_address": m.get("matchedAddress"),
        "lat": coords.get("y"),
        "lng": coords.get("x"),
        "city": comp.get("city"),
        "state": comp.get("state"),
        "zip": comp.get("zip"),
        "county_name": county.get("NAME") or county.get("BASENAME"),
        "county_fips": county.get("GEOID"),  # e.g. "12086"
        "state_fips": county.get("STATE"),  # e.g. "12"
        # Tract info used by neighborhood_class
        "tract_geoid": tract.get("GEOID"),  # e.g. "12086007608"
        "tract_code": tract.get("TRACT"),  # e.g. "007608" (6 digits, no state/county)
        "tract_name": tract.get("NAME"),  # e.g. "Census Tract 76.08"
        "county_code_only": county.get("COUNTY") or tract.get("COUNTY"),  # e.g. "086"
    }
+456
View File
@@ -0,0 +1,456 @@
"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.
Civitek OCRS (Online Court Records Search) es la plataforma JSF/PrimeFaces
que comparten 33 condados de FL. Una sola implementacion los cubre a todos.
COUNTIES COVERED (33):
Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
Washington.
NOT INCLUDED (use other adapters):
Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.
USAGE:
from data_fetchers.civitek_ocrs import fetch_civitek_court_records
result = fetch_civitek_court_records(
county_name="Hernando",
case_number="2024-CA-001234",
)
# → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}
TECHNICAL NOTES:
- Uses Playwright headless Chromium (free, ~$0 cost per query)
- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
- Rate limit: not stated by Civitek — target is 1 req/2s. NOTE: this module does not enforce that throttle itself; callers must space their requests.
"""
from __future__ import annotations
import re
import time
from datetime import datetime
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# COUNTY CODE MAPPING (Civitek 2-digit codes)
# ════════════════════════════════════════════════════════════════════════════
CIVITEK_COUNTY_CODES: dict[str, str] = {
    # Format: "County Name (canonical)": "NN" (2-digit Civitek code)
    "Baker": "02",
    "Bradford": "04",
    "Calhoun": "07",
    "Columbia": "12",
    "DeSoto": "14",
    "Dixie": "15",
    "Franklin": "19",
    "Gilchrist": "21",
    "Glades": "22",
    "Gulf": "23",
    "Hamilton": "24",
    "Hardee": "25",
    "Hendry": "26",
    "Hernando": "27",
    "Highlands": "28",
    "Holmes": "30",
    "Jackson": "32",
    "Jefferson": "33",
    "Lafayette": "34",
    "Levy": "38",
    "Liberty": "39",
    "Madison": "40",
    "Marion": "42",
    "Nassau": "45",
    "Okeechobee": "47",
    "Pasco": "51",
    "Putnam": "54",
    "Santa Rosa": "57",
    "Sumter": "60",
    "Union": "63",
    "Wakulla": "65",
    "Walton": "66",
    "Washington": "67",
}

# Case-insensitive index over CIVITEK_COUNTY_CODES, so "hernando",
# "HERNANDO" and "Hernando" all resolve to the same code.
_CIVITEK_CODES_BY_FOLDED_NAME: dict[str, str] = {
    name.casefold(): code for name, code in CIVITEK_COUNTY_CODES.items()
}


def _civitek_lookup_key(county_name: Optional[str]) -> str:
    """Return the lookup key for a county name.

    The key is case-folded, has runs of whitespace collapsed, and has a
    trailing "County" removed. Empty or None input gives "".
    """
    if not county_name:
        return ""
    key = " ".join(county_name.split()).casefold()
    suffix = " county"
    if key.endswith(suffix):
        key = key[:-len(suffix)]
    return key


def is_civitek_county(county_name: Optional[str]) -> bool:
    """True if the county is one of the 33 FL counties on Civitek.

    Matching ignores case and an optional trailing "County".
    """
    return _civitek_lookup_key(county_name) in _CIVITEK_CODES_BY_FOLDED_NAME


def civitek_code_for(county_name: str) -> Optional[str]:
    """Return the Civitek 2-digit code for a county name, or None.

    Matching ignores case and an optional trailing "County". None or an
    empty name returns None instead of raising.
    """
    return _CIVITEK_CODES_BY_FOLDED_NAME.get(_civitek_lookup_key(county_name))
# ════════════════════════════════════════════════════════════════════════════
# CASE NUMBER PARSER
# ════════════════════════════════════════════════════════════════════════════
# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
# Common formats observed in realauction.com deals:
# "2024-CA-001234"
# "23-2024-CA-001234"
# "2024CA001234"
# "2024-001234-CA"
# "27-2024-CA-001234" (court code prefix)
_CASE_PATTERNS = [
    # year-type-seq, with an optional 2-digit prefix and optional separators.
    # Because the separators are optional, this also covers the tight
    # "2024CA001234" form, so no separate pattern is needed for it.
    re.compile(r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})", re.IGNORECASE),
    # year-seq-type
    re.compile(r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)", re.IGNORECASE),
]


def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
    """Parse a FL case number into (year, sequence).

    The patterns are searched anywhere in the upper-cased, stripped input and
    the first pattern that matches wins. Leading zeros are stripped from the
    sequence (an all-zero sequence becomes "0").

    Returns None for empty, non-string or unparseable input.

    Examples:
        "2024-CA-001234"    → ("2024", "1234")
        "23-2024-CA-001234" → ("2024", "1234")
        "2024CA001234"      → ("2024", "1234")
        "2024-001234-CA"    → ("2024", "1234")
    """
    if not case_number or not isinstance(case_number, str):
        return None
    cn = case_number.strip().upper()
    for pat in _CASE_PATTERNS:
        m = pat.search(cn)
        if m:
            year = m.group("year")
            seq = m.group("seq").lstrip("0") or "0"
            return (year, seq)
    return None
# ════════════════════════════════════════════════════════════════════════════
# PUBLIC API
# ════════════════════════════════════════════════════════════════════════════
def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS by driving a Chromium browser.

    Provide ONE of:
        - case_number (e.g., "2024-CA-001234") → fastest, most precise
        - party_lastname (with optional firstname) → person search
        - business_name → business search
    If several are given, case_number wins, then business_name, then
    party_lastname.

    Args:
        county_name: County name, with or without a trailing " County".
        case_number: Case number; parsed with parse_case_number().
        party_lastname: Last name for a person search.
        party_firstname: Optional first name for a person search.
        business_name: Business name for a business search.
        headless: Passed to Chromium launch.
        timeout_seconds: Default Playwright timeout for page operations.

    Returns:
        Dict (intended to match the court_records.py contract):
        {
            "status": "FORECLOSURE_PENDING" | "CLEAN" | "UNKNOWN" |
                      "NOT_FOUND" | "ERROR",
            "county": str (normalized),
            "case_number_searched": str,
            "search_method": "case_number" | "person_name" | "business_name"
                             (None on early validation/scrape errors),
            "results": list of dicts (raw cases found),
            "case_data": dict (first result) | None,
            "lis_pendens": list (always empty here),
            "liens_inventory": dict (always empty here),
            "sources_used": ["civitek_ocrs"],
            "source_url": str,
            "errors": list of strings,
            "fetched_at": ISO timestamp,
        }
        Failures never raise; they are reported through "status"/"errors".
    """
    # Function-scope import: the module only imports `datetime` at file level.
    from datetime import timezone

    # datetime.utcnow() is deprecated; this produces the same naive-UTC
    # "...Z" text from an aware timestamp.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")

    # Validate county
    code = civitek_code_for(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        year = seq = None
        search_method = "business_name"
    else:
        year = seq = None
        search_method = "person_name"

    # Playwright is imported lazily so the module stays importable without it.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url
    status_from_results = "UNKNOWN"

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            # try/finally so the browser is closed on every exit path
            # (early return, timeout, unexpected exception).
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    errors.append("Disclaimer page didn't show 'I Agree' button")
                    return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                if search_method == "case_number":
                    # Switch to the Case Search tab
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Without the tab the year/sequence fields cannot be
                        # filled. Submitting the default form would search
                        # for nothing and could be reported as NOT_FOUND, so
                        # fail explicitly instead.
                        errors.append("Case Search tab not found")
                        return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                    case_tab.click()
                    page.wait_for_timeout(1500)
                    page.fill("#form\\:search_tab\\:year", year)
                    page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # Person Search tab is the default one
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                # Step 5: Submit (exact-text button first, looser match as fallback)
                search_btn = page.locator(
                    "button:has(.ui-button-text:text-is('Search'))"
                ).first
                if search_btn.count() == 0:
                    search_btn = page.locator("button:has-text('Search')").first
                search_btn.click()
                page.wait_for_timeout(6000)

                # Step 6: Capture up to 5 validation messages shown by the page
                err_msgs = page.locator(".ui-message-error, .ui-messages-error").all()
                for m in err_msgs[:5]:
                    try:
                        # Truncate before the duplicate check so a long
                        # message is not appended twice.
                        t = (m.inner_text() or "").strip()[:200]
                    except Exception:
                        continue  # best-effort: an unreadable message is skipped
                    if t and t not in errors:
                        errors.append(t)

                # Step 7: Parse results table
                results = _parse_results(page)
                final_url = page.url
            finally:
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
    except Exception as e:
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")

    # Determine status from results
    if errors and not results:
        scrape_failed = any(
            "crashed" in msg.lower() or "timeout" in msg.lower() for msg in errors
        )
        status_from_results = "ERROR" if scrape_failed else "NOT_FOUND"
    elif not results:
        status_from_results = "NOT_FOUND"
    else:
        # Has results — infer from the case_type of the FIRST row only.
        # NOTE(review): these are plain substring tests, so any case_type
        # text containing "CA" is reported as FORECLOSURE_PENDING even if the
        # case is not a foreclosure — confirm this heuristic is acceptable.
        first_type = (results[0].get("case_type") or "").upper()
        if "CA" in first_type or "CIVIL" in first_type:
            status_from_results = "FORECLOSURE_PENDING"  # CA cases include foreclosures
        elif "CF" in first_type or "FELONY" in first_type:
            status_from_results = "CLEAN"  # unrelated criminal
        else:
            status_from_results = "UNKNOWN"

    # Top result
    case_data = results[0] if results else None

    return {
        "status": status_from_results,
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        "case_data": case_data,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }
# ════════════════════════════════════════════════════════════════════════════
# Internal helpers
# ════════════════════════════════════════════════════════════════════════════
# Header keywords that mark a table as a case-results table.
_RESULT_TABLE_HINTS = ("case", "uniform", "filed", "party", "type")

# Ordered (field, keywords) rules that map a lower-cased header cell to a
# result field. Specific rules come before the generic "case" rule so that
# headers such as "Case Type" or "Case Status" are not taken for the case
# number.
_RESULT_HEADER_RULES = (
    ("case_number", ("uniform", "case number", "case no", "case #")),
    ("status", ("status",)),
    ("case_type", ("type", "court")),
    ("filed_date", ("filed", "date")),
    ("parties", ("party", "name", "defendant", "plaintiff")),
    ("case_number", ("case",)),
)


def _result_field_for_header(header: str) -> Optional[str]:
    """Return the result field a lower-cased header cell maps to, or None."""
    for field, keywords in _RESULT_HEADER_RULES:
        if any(kw in header for kw in keywords):
            return field
    return None


def _parse_results(page) -> list[dict]:
    """Parse the results table from a Civitek search results page.

    Scans every <table> on the page, in order. A table qualifies when it has
    at least two rows and its first row contains a case-related header. The
    rows of the first qualifying table that yields data are returned.

    Args:
        page: Page-like object; only locator()/all()/count()/first/nth()/
            inner_text() are used.

    Returns:
        List of dicts holding whichever of case_number, case_type,
        filed_date, parties and status could be mapped from the headers.
        Empty list if no table qualifies.
    """
    results: list[dict] = []
    for tbl in page.locator("table").all():
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            if row_count < 2:
                continue

            # Header row
            headers = [
                (h.inner_text() or "").strip().lower()
                for h in rows.first.locator("th, td").all()
            ]
            if not any(kw in h for h in headers for kw in _RESULT_TABLE_HINTS):
                continue

            # Index columns; the first column mapped to a field keeps it, so
            # a later look-alike header cannot overwrite it.
            col_idx: dict[str, int] = {}
            for i, h in enumerate(headers):
                field = _result_field_for_header(h)
                if field is not None:
                    col_idx.setdefault(field, i)

            # Data rows
            for r in range(1, row_count):
                cell_texts = [
                    (c.inner_text() or "").strip()
                    for c in rows.nth(r).locator("td").all()
                ]
                if not cell_texts:
                    continue
                row_data = {
                    k: cell_texts[i] for k, i in col_idx.items() if i < len(cell_texts)
                }
                if row_data:
                    results.append(row_data)
        except Exception:
            # Best-effort scrape: a table that fails mid-way keeps the rows
            # already read; a table that fails before any row is skipped.
            pass
        # Stop at the first table that produced rows so rows from different
        # tables are never mixed.
        if results:
            break
    return results
def _error_result(
    county: str,
    case_number_searched: str,
    error: str,
    fetched_at: str,
) -> dict:
    """Build the result payload for a failed lookup.

    The payload has the same keys as a successful result, with status
    "ERROR", no search method, empty result containers, an empty source URL
    and *error* as the only entry of "errors".
    """
    payload: dict = {
        "status": "ERROR",
        "county": county,
        "case_number_searched": case_number_searched,
        "search_method": None,
    }
    # Empty result containers (fresh objects on every call).
    payload.update(results=[], case_data=None, lis_pendens=[], liens_inventory={})
    payload["sources_used"] = ["civitek_ocrs"]
    payload["source_url"] = ""
    payload["errors"] = [error]
    payload["fetched_at"] = fetched_at
    return payload
# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Manual smoke test: run one lookup and print the result dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    cli.add_argument("--county", required=True, help="County name (e.g., Hernando)")
    # Optional search criteria, registered in the order they appear in --help.
    for flag, help_text in (
        ("--case", "Case number (e.g., 2024-CA-001234)"),
        ("--last-name", "Last name for person search"),
        ("--first-name", "First name (optional with last-name)"),
        ("--business", "Business name search"),
    ):
        cli.add_argument(flag, help=help_text)
    cli.add_argument("--no-headless", action="store_true", help="Show browser window")
    opts = cli.parse_args()

    lookup = fetch_civitek_court_records(
        county_name=opts.county,
        case_number=opts.case,
        party_lastname=opts.last_name,
        party_firstname=opts.first_name,
        business_name=opts.business,
        headless=not opts.no_headless,
    )
    print(json.dumps(lookup, indent=2, default=str))
+999
View File
@@ -0,0 +1,999 @@
"""court_records.py — Deterministic foreclosure detection via county clerk records.
PROBLEMA QUE RESUELVE:
El sistema heuristico (price_validator.py + property_value.py) HIPOTETIZA que un
listing sospechosamente bajo es foreclosure. Para CONFIRMAR deterministicamente
necesitamos consultar los court records publicos del condado:
- Lis pendens (notice of foreclosure filing)
- Code enforcement violations + liens
- Tax delinquency
ALCANCE WAVE 1.5A:
- Solo DUVAL (Jacksonville) implementado en esta version
- Otros condados: soft-fail con URL del clerk para lookup manual
- Replicacion a Miami-Dade / Broward / Palm Beach / Hillsborough en versiones
posteriores SI Duval funciona end-to-end.
STACK:
- Playwright headless Chromium (local, $0 por consulta)
- Fallback a Firecrawl si Playwright falla (opcional, requiere ENABLE_FIRECRAWL=true)
- User-Agent identificable + rate-limit 1 req/2s por dominio
- Cache TTL 7 dias (los procesos judiciales se mueven lento)
OPT-IN:
ENABLE_COURT_RECORDS=true en .env
"""
from __future__ import annotations
import os
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Per-domain rate limit (one request every 2s, per the stated rule)
_DOMAIN_LAST_REQUEST: dict[str, float] = {}
_RATE_LIMIT_SECONDS = 2.0
# Identifiable User-Agent (no spoofing — we are a legitimate service)
USER_AGENT = "AR-House/1.0 (real estate investment analysis; +https://ar-house.example/contact)"
# Counties with a scraper implemented in this version
SUPPORTED_COUNTIES = {"Duval", "duval"}
# ═══════════════════════════════════════════════════════════════════════════
# Wave 1.5A v1.2: Plaintiff classification + Lien survival analysis
# ═══════════════════════════════════════════════════════════════════════════
# Plaintiff categories (who is filing the foreclosure)
PLAINTIFF_TYPE_BANK_NATIONAL = "BANK_NATIONAL"  # Wells Fargo, BofA, Chase, Citi, etc
PLAINTIFF_TYPE_BANK_REGIONAL = "BANK_REGIONAL"  # Truist, Regions, BB&T, Fifth Third
PLAINTIFF_TYPE_CREDIT_UNION = "CREDIT_UNION"  # Navy Federal, VyStar, etc
PLAINTIFF_TYPE_NONBANK_MORTGAGE = "NONBANK_MORTGAGE"  # Quicken/Rocket, PHH, Mr. Cooper, Carrington
PLAINTIFF_TYPE_GSE = "GSE"  # Fannie Mae, Freddie Mac, Ginnie Mae
PLAINTIFF_TYPE_TRUSTEE = "TRUSTEE_MBS"  # Deutsche Bank AS Trustee, US Bank NA Trustee (MBS trusts)
PLAINTIFF_TYPE_IRS = "IRS_FEDERAL"  # Internal Revenue Service (federal tax)
PLAINTIFF_TYPE_STATE_TAX = "STATE_TAX"  # FL Dept of Revenue
PLAINTIFF_TYPE_HOA = "HOA_ASSOCIATION"  # Homeowners / Condo association
PLAINTIFF_TYPE_MUNICIPAL = "MUNICIPAL"  # City/County code enforcement, utility liens
PLAINTIFF_TYPE_HARD_MONEY = "HARD_MONEY_LENDER"  # LLC nonbank, private high-rate lender
PLAINTIFF_TYPE_PRIVATE = "PRIVATE_INDIVIDUAL"  # Private investor (named person)
PLAINTIFF_TYPE_OTHER = "OTHER"
PLAINTIFF_TYPE_UNKNOWN = "UNKNOWN"
# Keywords in the plaintiff's name, grouped by the category they map to.
# NOTE(review): classify_plaintiff tests these as plain substrings of the
# upper-cased name, so short entries such as "CHASE" or "HOA" also match
# inside longer words (e.g. "PURCHASE") — confirm that is acceptable.
_BANK_NATIONAL_KEYWORDS = (
    "WELLS FARGO", "BANK OF AMERICA", "CHASE", "JPMORGAN", "JP MORGAN",
    "CITIBANK", "CITI ", "CITI,", "U.S. BANK", "US BANK", "USBANK",
    "PNC BANK", "TD BANK", "HSBC", "CAPITAL ONE",
)
_BANK_REGIONAL_KEYWORDS = (
    "TRUIST", "REGIONS BANK", "BB&T", "BBT BANK", "FIFTH THIRD", "5/3 BANK",
    "SUNTRUST", "M&T BANK", "KEYBANK", "HUNTINGTON",
)
_CREDIT_UNION_KEYWORDS = ("CREDIT UNION", "VYSTAR", "NAVY FEDERAL", "FCU", "C.U.")
_NONBANK_MORTGAGE_KEYWORDS = (
    "QUICKEN", "ROCKET MORTGAGE", "PHH MORTGAGE", "MR. COOPER", "MR COOPER",
    "NATIONSTAR", "CARRINGTON", "FREEDOM MORTGAGE", "LOANDEPOT",
    "PENNYMAC", "NEW REZ", "NEWREZ", "SHELLPOINT", "OCWEN", "DITECH",
    "BAYVIEW", "SPECIALIZED LOAN SERVICING",
)
_GSE_KEYWORDS = ("FANNIE MAE", "FEDERAL NATIONAL MORTGAGE", "FNMA",
                 "FREDDIE MAC", "FEDERAL HOME LOAN MORTGAGE", "FHLMC",
                 "GINNIE MAE", "GNMA")
_TRUSTEE_KEYWORDS = ("AS TRUSTEE", "AS INDENTURE TRUSTEE", "TRUSTEE FOR",
                     "DEUTSCHE BANK NATIONAL", "BANK OF NEW YORK MELLON", "BNY MELLON",
                     "WILMINGTON TRUST", "WILMINGTON SAVINGS")
_IRS_KEYWORDS = ("INTERNAL REVENUE SERVICE", "I.R.S.", "U.S. INTERNAL REVENUE",
                 "UNITED STATES OF AMERICA")
_STATE_TAX_KEYWORDS = ("FLORIDA DEPARTMENT OF REVENUE", "FL DEPT OF REVENUE",
                       "STATE OF FLORIDA")
_HOA_KEYWORDS = ("HOMEOWNERS", "ASSOCIATION INC", "ASSOCIATION, INC", "CONDOMINIUM",
                 "PROPERTY OWNERS ASSOCIATION", " POA ", "HOA")
_MUNICIPAL_KEYWORDS = ("CITY OF ", "COUNTY OF ", "MUNICIPALITY", "CODE ENFORCEMENT",
                       "TAX COLLECTOR")
# Whole-word legal-entity markers. Used only for names that contain a comma,
# to tell "SMITH, JOHN" (a person) from "ACME CAPITAL, INC." (an entity).
_ENTITY_MARKER_RE = re.compile(
    r"\b(?:INC|INCORPORATED|CORP|CORPORATION|BANK|LLC|LTD|LP|LLP|COMPANY|TRUST|TRUSTEE|TRUSTEES)\b"
)


def classify_plaintiff(name: Optional[str]) -> dict:
    """Classify the plaintiff of a foreclosure from its name.

    Keyword groups are tested in a fixed order as substrings of the
    upper-cased name; the first group that matches wins (MBS trustee is
    tested before national bank so "... BANK AS TRUSTEE" is a trustee).

    Returns:
        {
            name: <input> (None when the input is empty or blank),
            type: PLAINTIFF_TYPE_*,
            category: "primary_lender" | "mbs_trustee" | "tax_authority" |
                      "association" | "government" | "unknown",
            is_original_loan_holder: bool | None (None if it cannot be determined),
            note: str (absent when the input is empty or blank),
        }
    """
    if not name or not name.strip():
        return {
            "name": None,
            "type": PLAINTIFF_TYPE_UNKNOWN,
            "category": "unknown",
            "is_original_loan_holder": None,
        }

    upper = name.upper()

    # (keywords, type, category, is_original_loan_holder, note), in priority
    # order. Built per call so the block depends on the module constants only
    # at call time.
    keyword_rules = (
        (_TRUSTEE_KEYWORDS, PLAINTIFF_TYPE_TRUSTEE, "mbs_trustee", False,
         "MBS trustee: el loan fue securitizado. El servicer real puede ser otra entity."),
        (_IRS_KEYWORDS, PLAINTIFF_TYPE_IRS, "tax_authority", False,
         "IRS federal tax lien. SOBREVIVE el foreclosure con 120-day right of redemption."),
        (_STATE_TAX_KEYWORDS, PLAINTIFF_TYPE_STATE_TAX, "tax_authority", False,
         "FL state tax lien. Tipicamente extinguible pero verificar con title search."),
        (_GSE_KEYWORDS, PLAINTIFF_TYPE_GSE, "primary_lender", False,
         "GSE (Fannie/Freddie/Ginnie). Compraron el loan al originador. Comun en MLS post-foreclosure como REO."),
        (_BANK_NATIONAL_KEYWORDS, PLAINTIFF_TYPE_BANK_NATIONAL, "primary_lender", True,
         "Banco nacional grande. Probablemente originador del loan. Procesos estandarizados."),
        (_BANK_REGIONAL_KEYWORDS, PLAINTIFF_TYPE_BANK_REGIONAL, "primary_lender", True,
         "Banco regional. Mas flexible para negociar short sale o cash-for-keys."),
        (_CREDIT_UNION_KEYWORDS, PLAINTIFF_TYPE_CREDIT_UNION, "primary_lender", True,
         "Credit union. Members-only, foreclosure menos frecuente, mas dispuestos a workout."),
        (_NONBANK_MORTGAGE_KEYWORDS, PLAINTIFF_TYPE_NONBANK_MORTGAGE, "primary_lender", False,
         "Non-bank mortgage servicer. Suele ser servicer asignado, no el originador. Investor real es otro."),
        (_HOA_KEYWORDS, PLAINTIFF_TYPE_HOA, "association", False,
         "HOA/Condo association. FL Statute 720.3085(2)(b) limita lo que sobrevive a 12 meses dues o 1% del mortgage."),
        (_MUNICIPAL_KEYWORDS, PLAINTIFF_TYPE_MUNICIPAL, "government", False,
         "Municipal lien (code enforcement / utility). SOBREVIVE el foreclosure — corre con la tierra."),
    )
    for keywords, plaintiff_type, category, is_original, note in keyword_rules:
        if any(kw in upper for kw in keywords):
            return {"name": name, "type": plaintiff_type,
                    "category": category,
                    "is_original_loan_holder": is_original,
                    "note": note}

    # LLC without a known keyword = probable hard money / private investor
    if "LLC" in upper or "L.L.C." in upper:
        return {"name": name, "type": PLAINTIFF_TYPE_HARD_MONEY,
                "category": "primary_lender",
                "is_original_loan_holder": True,
                "note": "LLC sin patron de banco/servicer conocido. Probable hard money lender o private investor. Tasas 8-15%, terminos cortos."}

    # Individual: either "LASTNAME, FIRSTNAME" style or a short name with no
    # entity marker. A comma alone is not enough: "ACME CAPITAL, INC." is an
    # entity, so comma names are rejected when they carry a whole-word entity
    # marker. Names without a comma keep the substring test.
    if "," in name:
        looks_individual = _ENTITY_MARKER_RE.search(upper) is None
    else:
        looks_individual = (
            not any(s in upper for s in ("INC", "CORP", "BANK", "LLC", "TRUST"))
            and len(name.split()) <= 4
        )
    if looks_individual:
        return {"name": name, "type": PLAINTIFF_TYPE_PRIVATE,
                "category": "primary_lender",
                "is_original_loan_holder": None,
                "note": "Individuo (no entity). Posible seller financing, family loan, o private money."}

    return {"name": name, "type": PLAINTIFF_TYPE_OTHER,
            "category": "unknown",
            "is_original_loan_holder": None,
            "note": "Nombre no matchea patrones conocidos. Revisar manualmente."}
# ═══════════════════════════════════════════════════════════════════════════
# Lien types + survival analysis
# ═══════════════════════════════════════════════════════════════════════════
# Lien types (following the usual US real-estate naming)
LIEN_TYPE_MORTGAGE_1ST = "MORTGAGE_1ST"
LIEN_TYPE_MORTGAGE_2ND = "MORTGAGE_2ND"
LIEN_TYPE_MORTGAGE_3RD = "MORTGAGE_3RD"
LIEN_TYPE_HELOC = "HELOC"
LIEN_TYPE_IRS_TAX = "IRS_TAX_LIEN"
LIEN_TYPE_STATE_TAX = "STATE_TAX_LIEN"
LIEN_TYPE_PROPERTY_TAX = "PROPERTY_TAX_LIEN"  # County property tax delinquency
LIEN_TYPE_HOA = "HOA_LIEN"
LIEN_TYPE_MECHANICS = "MECHANICS_LIEN"
LIEN_TYPE_MUNICIPAL = "MUNICIPAL_LIEN"  # Code enforcement, utility
LIEN_TYPE_JUDGMENT = "JUDGMENT_LIEN"
LIEN_TYPE_OTHER = "OTHER"
# Survival outcomes (values reported as "survives_foreclosure")
SURVIVES = "SURVIVES"
EXTINGUISHED = "EXTINGUISHED"
EXTINGUISHED_BY_THIS_ACTION = "EXTINGUISHED_BY_THIS_ACTION"  # plaintiff's own mortgage
UNCERTAIN = "UNCERTAIN"
def analyze_lien_survival(
    *,
    lien_type: str,
    is_plaintiff_lien: bool = False,
    is_senior_to_plaintiff: bool = False,
    lien_filing_date: Optional[str] = None,
    plaintiff_filing_date: Optional[str] = None,
) -> dict:
    """Decide whether a lien survives a Florida judicial foreclosure.

    Rules encoded below, checked in this order:
      1. The plaintiff's own lien -> EXTINGUISHED_BY_THIS_ACTION.
      2. Property tax, IRS tax, municipal and HOA liens -> SURVIVES,
         regardless of filing order.
      3. State tax liens, 2nd/3rd mortgages, HELOCs, mechanics and judgment
         liens, and a non-plaintiff 1st mortgage -> SURVIVES when
         ``is_senior_to_plaintiff`` is true, otherwise EXTINGUISHED.
      4. Anything else -> UNCERTAIN.

    Args:
        lien_type: One of the ``LIEN_TYPE_*`` constants.
        is_plaintiff_lien: True when this lien is the one being foreclosed.
        is_senior_to_plaintiff: True when the lien has priority over the
            plaintiff's lien.
        lien_filing_date: Accepted for interface compatibility; not read by
            the current implementation.
        plaintiff_filing_date: Accepted for interface compatibility; not read
            by the current implementation.

    Returns:
        A dict with keys ``survives_foreclosure`` (SURVIVES | EXTINGUISHED |
        EXTINGUISHED_BY_THIS_ACTION | UNCERTAIN), ``warning`` (str or None)
        and ``legal_basis`` (str).
    """

    def _verdict(outcome, warning, legal_basis):
        # Single place that fixes the shape (and key order) of the result.
        return {
            "survives_foreclosure": outcome,
            "warning": warning,
            "legal_basis": legal_basis,
        }

    # The foreclosing lien itself is wiped out by the sale it triggers.
    if is_plaintiff_lien:
        return _verdict(
            EXTINGUISHED_BY_THIS_ACTION,
            None,
            "Plaintiff's own mortgage is the subject of this foreclosure — extinguished by judicial sale.",
        )

    # Lien types that survive no matter when they were filed:
    # (lien type, warning, legal basis).
    always_surviving = (
        (
            LIEN_TYPE_PROPERTY_TAX,
            "Property tax delinquency tiene SUPER-PRIORIDAD. Sobrevive a TODO. Pagar inmediatamente post-cierre o el tax collector vende el property por tax deed.",
            "FL Statute 197.122 — ad valorem taxes constitute first lien superior to all other liens.",
        ),
        (
            LIEN_TYPE_IRS_TAX,
            "IRS federal tax lien SOBREVIVE el foreclosure. El IRS tiene 120-day right of redemption (puede recomprar el property pagando el bid + costos). Despues de 120 dias, el buyer queda dueno definitivamente. Sumar al MAB.",
            "26 USC 7425(d) — federal tax liens survive judicial sale with 120-day redemption period.",
        ),
        (
            LIEN_TYPE_MUNICIPAL,
            "Lien municipal (code enforcement / utility) SOBREVIVE — corre con la tierra. Sumar al MAB. Check code enforcement violations open antes de bidear.",
            "FL Statute 162.09 — code enforcement liens equivalent to civil judgment, runs with land.",
        ),
        (
            LIEN_TYPE_HOA,
            "HOA dues SOBREVIVE pero limitado por FL Statute 720.3085(2)(b): el nuevo owner debe 12 meses de dues o 1% del original mortgage, lo que sea menor. Si es condo: FL 718.116. Pedir HOA estoppel letter pre-bid.",
            "FL Statute 720.3085(2)(b) (HOA) o 718.116 (condo) — buyer hereda capped portion.",
        ),
    )
    for kind, warning, legal_basis in always_surviving:
        if lien_type == kind:
            return _verdict(SURVIVES, warning, legal_basis)

    # State tax liens follow plain chronological priority.
    if lien_type == LIEN_TYPE_STATE_TAX:
        if not is_senior_to_plaintiff:
            return _verdict(
                EXTINGUISHED,
                "State tax lien filed AFTER plaintiff's mortgage típicamente se extingue. Validar con title search.",
                "Junior liens (post-mortgage) extinguished by foreclosure of senior lien.",
            )
        return _verdict(
            SURVIVES,
            "State tax lien filed BEFORE plaintiff's mortgage → sobrevive.",
            "FL Statute 197.0 — chronological priority among non-super-priority liens.",
        )

    # Ordinary senior/junior analysis for subordinate mortgages, mechanics
    # and judgment liens.
    chronological_kinds = (
        LIEN_TYPE_MORTGAGE_2ND,
        LIEN_TYPE_MORTGAGE_3RD,
        LIEN_TYPE_HELOC,
        LIEN_TYPE_MECHANICS,
        LIEN_TYPE_JUDGMENT,
    )
    if lien_type in chronological_kinds:
        if not is_senior_to_plaintiff:
            return _verdict(
                EXTINGUISHED,
                None,
                f"{lien_type} junior al plaintiff's mortgage — extinguido por foreclosure judicial.",
            )
        return _verdict(
            SURVIVES,
            f"{lien_type} filed BEFORE plaintiff's mortgage → sobrevive. Sumar al MAB.",
            "Senior lien sobrevive foreclosure de lien junior.",
        )

    # A 1st mortgage that is not the plaintiff's: another mortgage exists
    # besides the one being foreclosed.
    if lien_type == LIEN_TYPE_MORTGAGE_1ST:
        if not is_senior_to_plaintiff:
            return _verdict(
                EXTINGUISHED,
                None,
                "1st mortgage junior al plaintiff (raro pero posible si plaintiff es property tax/super-priority).",
            )
        return _verdict(
            SURVIVES,
            "Existe un mortgage senior al plaintiff's lien. Buyer hereda ESTA hipoteca. Sumar saldo al MAB.",
            "Senior mortgage survives foreclosure of junior lien.",
        )

    # No automatic rule for this lien type.
    return _verdict(
        UNCERTAIN,
        f"Tipo de lien '{lien_type}' no tiene regla automatica. Title search profesional ($300-500) requerido.",
        "Default safety: assume UNCERTAIN para tipos no clasificados.",
    )
def _empty_liens_inventory(reason: str) -> dict:
"""Estructura vacía estandar para liens cuando v1.1 scraper no esta listo.
Mantiene el shape del output asi los downstream consumers no rompen.
"""
return {
"all_liens": [],
"lien_count": 0,
"total_surviving_debt": 0,
"investor_warning": (
"⚠️ Liens detail no disponible automaticamente (Wave 1.5A v1.1 deferred a Phase 3.5). "
"ANTES de cualquier oferta o bid: hacer **title search profesional** "
"($300-500 USD) en or.duvalclerk.com filtering por document types: "
"MTG (mortgages), NFTL (IRS federal tax liens), SATL (state tax), "
"NOC (mechanics), COD (code enforcement). Listing puede tener hasta "
"$30K-$100K en deudas heredables NO visibles aqui."
),
"detail_status": "PENDING_V1_1",
"detail_pending_reason": reason,
}
# Clerk website per county, keyed by county name without the "County" suffix.
# Used to build the manual-lookup hint in the soft-fail response for counties
# that have no scraper implemented.
COUNTY_CLERK_URLS = {
    "Duval": "https://www.duvalclerk.com/online-option/court-records",
    "Miami-Dade": "https://www2.miami-dadeclerk.com/ocs",
    "Broward": "https://officialrecords.broward.org",
    "Palm Beach": "https://www.mypalmbeachclerk.com/departments/records-services-division",
    "Hillsborough": "https://hillsclerk.com",
    "Orange": "https://myorangeclerk.com",
}
def _enable_court_records() -> bool:
return os.getenv("ENABLE_COURT_RECORDS", "false").lower() == "true"
def _rate_limit(domain: str) -> None:
    """Throttle requests to one domain.

    Sleeps for whatever is left of ``_RATE_LIMIT_SECONDS`` since the previous
    call recorded for ``domain``, then stamps the current time for that
    domain in ``_DOMAIN_LAST_REQUEST``.

    Args:
        domain: Key identifying the remote host being throttled.
    """
    elapsed = time.time() - _DOMAIN_LAST_REQUEST.get(domain, 0)
    if elapsed < _RATE_LIMIT_SECONDS:
        # Called too soon after the previous request: wait out the remainder.
        time.sleep(_RATE_LIMIT_SECONDS - elapsed)
    _DOMAIN_LAST_REQUEST[domain] = time.time()
# ═══════════════════════════════════════════════════════════════════════════
# Duval (Jacksonville) — implementacion completa
# ═══════════════════════════════════════════════════════════════════════════
# Diccionarios para parseo de address (Duval Property Appraiser tiene 5 campos)
_STREET_DIRECTIONS = {"N", "S", "E", "W", "NE", "NW", "SE", "SW",
"NORTH", "SOUTH", "EAST", "WEST"}
_STREET_SUFFIXES = {
"ST": "ST", "STREET": "ST",
"AVE": "AVE", "AVENUE": "AVE",
"RD": "RD", "ROAD": "RD",
"BLVD": "BLVD", "BOULEVARD": "BLVD",
"LN": "LN", "LANE": "LN",
"WAY": "WAY",
"DR": "DR", "DRIVE": "DR",
"CT": "CT", "COURT": "CT",
"PL": "PL", "PLACE": "PL",
"CIR": "CIR", "CIRCLE": "CIR",
"TER": "TER", "TERRACE": "TER",
"PKWY": "PKWY", "PARKWAY": "PKWY",
"HWY": "HWY", "HIGHWAY": "HWY",
"TRL": "TRL", "TRAIL": "TRL",
"XING": "XING", "CROSSING": "XING",
"ALY": "ALY", "ALLEY": "ALY",
"BND": "BND", "BEND": "BND",
}
def _parse_address_duval(address: str) -> Optional[dict]:
"""Parse "3245 N Pearl St, Jacksonville, FL 32206" into Duval form fields.
Returns:
{street_num: "3245", prefix: "N", name: "PEARL", suffix: "ST", zip: "32206"}
o None si no se pudo parsear.
"""
# Strip post-comma (city/state) y aislar street part
street_part = address.split(",")[0].strip()
# Capturar ZIP del original si lo hay
zip_match = re.search(r"\b(\d{5})(?:-\d{4})?\b", address)
zip_code = zip_match.group(1) if zip_match else ""
tokens = street_part.upper().split()
if len(tokens) < 2 or not tokens[0].isdigit():
return None
street_num = tokens[0]
rest = tokens[1:]
# Detectar prefix (direction) en el primer token despues del numero
prefix = ""
if rest and rest[0] in _STREET_DIRECTIONS:
# Normalize NORTH→N etc
prefix_raw = rest.pop(0)
prefix_map = {"NORTH": "N", "SOUTH": "S", "EAST": "E", "WEST": "W"}
prefix = prefix_map.get(prefix_raw, prefix_raw)
# Detectar suffix en el ultimo token
suffix = ""
if rest and rest[-1] in _STREET_SUFFIXES:
suffix = _STREET_SUFFIXES[rest.pop()]
name = " ".join(rest)
if not name:
return None
return {
"street_num": street_num,
"prefix": prefix,
"name": name,
"suffix": suffix,
"zip": zip_code,
}
def _fetch_property_owner_duval(address: str) -> tuple[Optional[dict], list[str]]:
    """Step 1 (Duval): look up the owner name and RE# for an address.

    Drives a headless Chromium (Playwright) against the Duval Property
    Appraiser basic search at
    https://paopropertysearch.coj.net/Basic/Search.aspx.

    Form fields (ASP.NET; found via DOM inspection per the original author):
      - ctl00$cphBody$tbStreetNumber -> house number
      - ctl00$cphBody$ddStreetPrefix -> select (N/S/E/W)
      - ctl00$cphBody$tbStreetName   -> street name (no prefix or suffix)
      - ctl00$cphBody$ddStreetSuffix -> select (ST/AVE/RD/...)
      - ctl00$cphBody$tbZipCode      -> optional ZIP
      - ctl00$cphBody$bSearch        -> submit button

    Results are read from Results.aspx.

    Args:
        address: Free-form street address; split into form fields by
            ``_parse_address_duval``.

    Returns:
        ``(data, errors)``. ``data`` is None when Playwright is not
        installed, the address cannot be parsed, the scrape raises, or
        neither an owner name nor an RE# could be extracted. Otherwise it is
        a dict with keys ``owner_name``, ``re_number``, ``year_built``,
        ``tax_assessed_value``, ``last_sale_date``, ``source`` and
        ``result_url``. In this version ``year_built``,
        ``tax_assessed_value`` and ``last_sale_date`` are always None (they
        are initialized but never assigned). ``errors`` is a list of
        human-readable messages and may be non-empty even when ``data`` is
        returned.
    """
    errors: list[str] = []

    # Playwright is imported lazily so a missing install becomes a soft
    # error instead of an import failure.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        errors.append(f"playwright no instalado: {e}")
        return None, errors

    parsed = _parse_address_duval(address)
    if not parsed:
        errors.append(f"No pude parsear el address (formato esperado: '<num> [prefix] <name> [suffix]'): {address}")
        return None, errors

    _rate_limit("paopropertysearch.coj.net")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(user_agent=USER_AGENT)
            page = context.new_page()
            page.set_default_timeout(15_000)

            # Load the search page and wait for the network to go idle
            # (rather than just DOMContentLoaded) so late-loading page
            # scripts have had a chance to run.
            page.goto("https://paopropertysearch.coj.net/Basic/Search.aspx",
                      wait_until="networkidle", timeout=20_000)

            # NOTE: per the original author's notes, the site's ASP.NET
            # WebForm_DoPostBackWithOptions helper is not reliably available,
            # so this function does not wait for it and submits the form
            # directly through form.submit() further below.

            # Fill the form using the element ids of the Duval search page.
            page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed["street_num"])
            if parsed["prefix"]:
                try:
                    page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(
                        value=parsed["prefix"]
                    )
                except Exception:
                    # Fallback: select by visible label instead of value.
                    try:
                        page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(
                            label=parsed["prefix"]
                        )
                    except Exception:
                        pass  # No match: leave empty and rely on the street-name match
            page.locator("#ctl00_cphBody_tbStreetName").fill(parsed["name"])
            if parsed["suffix"]:
                # Same value-then-label strategy as for the prefix.
                try:
                    page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(
                        value=parsed["suffix"]
                    )
                except Exception:
                    try:
                        page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(
                            label=parsed["suffix"]
                        )
                    except Exception:
                        pass
            if parsed["zip"]:
                # ZIP is optional; a failure to fill it is ignored.
                try:
                    page.locator("#ctl00_cphBody_tbZipCode").fill(parsed["zip"])
                except Exception:
                    pass

            # Submit through a JS form.submit() call, bypassing the WebForms
            # postback helper. The search button name is appended as a
            # hidden input so the server sees the request as a button click.
            submitted = False
            try:
                page.evaluate("""() => {
const form = document.forms[0] || document.querySelector('form');
if (!form) throw new Error('no form found');
form.action = 'Results.aspx';
// ASP.NET espera el button name como input para detectar el click
let hidden = document.createElement('input');
hidden.type = 'hidden';
hidden.name = 'ctl00$cphBody$bSearch';
hidden.value = 'Search';
form.appendChild(hidden);
form.submit();
}""")
                page.wait_for_url("**Results.aspx**", timeout=10_000)
                submitted = True
            except Exception as e:
                errors.append(f"Property Appraiser: form submit fallo: {e}")

            # Best effort: let the results page settle; a timeout is tolerated.
            try:
                page.wait_for_load_state("networkidle", timeout=10_000)
            except PlaywrightTimeout:
                pass

            current_url = page.url

            if not submitted:
                # NOTE(review): this message mentions three strategies
                # (click/Enter/JS-eval), but only the JS form.submit() path is
                # attempted in this function. It is also appended in addition
                # to the "form submit fallo" error above.
                errors.append(
                    f"Property Appraiser: submit no navego a Results.aspx (URL final: {current_url}). "
                    "ASP.NET WebForms postback fallo en las 3 estrategias (click/Enter/JS-eval)."
                )

            # Parse Results.aspx. Per the original author, the results table
            # has the columns:
            #   RE #, Name (Last First), Street #, Street Name, Type,
            #   Direction, Unit, City, Zip
            # A "No Results Found" text in the body means the property was
            # not found in the Duval PA database.
            body_text = page.locator("body").inner_text() if submitted else ""

            owner_name: Optional[str] = None
            re_number: Optional[str] = None
            year_built: Optional[int] = None
            tax_assessed: Optional[int] = None
            last_sale_date: Optional[str] = None

            no_results = "No Results Found" in body_text or "No information available" in body_text
            if no_results:
                errors.append(
                    f"Property Appraiser: 'No Results Found' para "
                    f"{parsed['street_num']} {parsed['prefix']} {parsed['name']} {parsed['suffix']}. "
                    "Address probablemente no existe en Duval PA database o esta fuera del condado."
                )
            elif submitted:
                # Read the results table through the DOM.
                try:
                    results_table = page.locator("table").first
                    rows = results_table.locator("tr").all()
                    if len(rows) >= 2:
                        # Row 0 = headers, rows 1+ = data.
                        # With several results, prefer the row whose street
                        # number (column index 2) matches exactly.
                        best_row = None
                        for r in rows[1:]:
                            cells = [(c.text_content() or "").strip() for c in r.locator("td").all()]
                            if len(cells) >= 9 and cells[2] == parsed["street_num"]:
                                # Exact street-number match.
                                best_row = cells
                                break
                        if not best_row:
                            # No exact match: fall back to the first data row.
                            cells_first = [(c.text_content() or "").strip() for c in rows[1].locator("td").all()]
                            if len(cells_first) >= 9:
                                best_row = cells_first
                        if best_row:
                            # Column 0 = RE #, column 1 = owner name; empty
                            # cells become None.
                            re_number = best_row[0] or None
                            owner_name = best_row[1] or None
                            # The remaining fields live on the detail page (TODO if needed).
                except Exception as e:
                    errors.append(f"Property Appraiser: error parseando tabla de resultados: {e}")

            browser.close()

        # Nothing usable was extracted.
        if not owner_name and not re_number:
            return None, errors

        return {
            "owner_name": owner_name,
            "re_number": re_number,
            "year_built": year_built,
            "tax_assessed_value": tax_assessed,
            "last_sale_date": last_sale_date,
            "source": "Duval Property Appraiser (paopropertysearch.coj.net)",
            "result_url": current_url,
        }, errors
    except Exception as e:
        # Any unexpected Playwright/scrape failure is reported as a soft error.
        errors.append(f"Property Appraiser Duval scrape error: {e}")
        return None, errors
def _fetch_lis_pendens_duval(owner_name: str, address: str) -> tuple[list[dict], list[str]]:
    """Step 2 (Duval): search Official Records for lis pendens under an owner name.

    Site: https://or.duvalclerk.com/

    Flow (found via DOM inspection per the original author):
      1. The landing page shows a disclaimer; click the accept button
         (id='btnButton').
      2. Navigate directly to /search/SearchTypeName.
      3. Fill Last Name (and First Name when available).
      4. Submit and wait for a results table.
      5. Keep rows whose text contains one of the lis pendens keywords.

    The owner name is expected in the Property Appraiser format
    "LASTNAME FIRSTNAME MIDDLE_INITIAL" (e.g. "JONES JOHN N"); the first
    token is used as last name and the second as first name.

    Args:
        owner_name: Owner name to search for.
        address: Not read by the current implementation.

    Returns:
        ``(matches, errors)``. Each match is a dict with ``doc_type``,
        ``all_columns_text`` and ``source_url``, plus ``filing_date`` and
        ``instrument_number`` when they could be extracted from the row.
        ``errors`` holds human-readable messages; note that a successful
        lookup with zero matches is also reported through ``errors`` (a
        message starting with "Sin matches").
    """
    errors: list[str] = []
    matches: list[dict] = []

    # Playwright is imported lazily so a missing install becomes a soft error.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        errors.append(f"playwright no instalado: {e}")
        return matches, errors

    _rate_limit("or.duvalclerk.com")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(user_agent=USER_AGENT)
            page = context.new_page()
            page.set_default_timeout(15_000)

            # Step 1: accept the disclaimer.
            page.goto("https://or.duvalclerk.com/", wait_until="networkidle", timeout=20_000)
            try:
                # The accept button has id='btnButton' (found via DOM inspection).
                page.locator("#btnButton").click()
                page.wait_for_load_state("networkidle", timeout=10_000)
            except Exception as e:
                errors.append(f"Official Records: error aceptando disclaimer: {e}")
                browser.close()
                return matches, errors

            # Step 2: go to the Name Search page.
            try:
                page.goto("https://or.duvalclerk.com/search/SearchTypeName",
                          wait_until="networkidle", timeout=15_000)
            except Exception as e:
                errors.append(f"Official Records: no pude navegar a SearchTypeName: {e}")
                browser.close()
                return matches, errors

            # Step 3: fill the name search form.
            # The Duval PA owner name comes as "LASTNAME FIRSTNAME M" or
            # "LASTNAME LASTNAME2 FIRSTNAME"; some owners are not persons
            # (e.g. "CITY OF JACKSONVILLE").
            # "JONES JOHN N" -> last=JONES, first=JOHN (the middle initial is not used).
            # NOTE(review): for "LASTNAME LASTNAME2 FIRSTNAME" and entity
            # names the second token is still used as the first name.
            parts = owner_name.strip().split()
            last_name = parts[0] if parts else owner_name
            first_name = parts[1] if len(parts) > 1 else ""

            # Candidate selectors for the name inputs, tried in order
            # (the original author identifies the site as an Acclaim Land
            # Records system).
            ln_selectors = [
                "input[name='lastName']", "input[id='lastName']",
                "input[name='LastName']", "input[id='LastName']",
                "input[name*='last']", "input[id*='last']",
            ]
            fn_selectors = [
                "input[name='firstName']", "input[id='firstName']",
                "input[name='FirstName']", "input[id='FirstName']",
                "input[name*='first']", "input[id*='first']",
            ]
            ln_filled = False
            for sel in ln_selectors:
                if page.locator(sel).count() > 0:
                    page.locator(sel).first.fill(last_name)
                    ln_filled = True
                    break
            if not ln_filled:
                # v1: degrade gracefully. Per the original author the form is
                # rendered dynamically by JS and needs more work (planned for
                # v1.1). The "Official Records v1" marker in this message is
                # what _fetch_duval looks for to detect the degraded lookup.
                errors.append(
                    "Official Records v1: Name Search form selectors no encontrados. "
                    "Lis pendens lookup automatico no disponible en este release. "
                    "Lookup manual: https://or.duvalclerk.com/search/SearchTypeName "
                    f"con last_name='{last_name}', first_name='{first_name}'"
                )
                browser.close()
                return matches, errors
            # The first name is filled only when one was parsed and a
            # matching input exists.
            for sel in fn_selectors:
                if page.locator(sel).count() > 0 and first_name:
                    page.locator(sel).first.fill(first_name)
                    break

            # Submit: click the first search button candidate that exists.
            search_btns = [
                "input[type='submit'][value*='Search']",
                "button:has-text('Search')",
                "input[type='button'][value*='Search']",
                "#searchButton", "#btnSearch", "button[type='submit']",
            ]
            clicked = False
            for sel in search_btns:
                try:
                    if page.locator(sel).count() > 0:
                        page.locator(sel).first.click()
                        clicked = True
                        break
                except Exception:
                    pass
            if not clicked:
                # Fallback: press Enter in the last-name input.
                try:
                    for sel in ln_selectors:
                        if page.locator(sel).count() > 0:
                            page.locator(sel).first.press("Enter")
                            clicked = True
                            break
                except Exception:
                    pass
            if not clicked:
                errors.append("Official Records: no encontre boton Search ni pude enviar via Enter")
                browser.close()
                return matches, errors

            # Best effort: let the results settle; a timeout is tolerated.
            try:
                page.wait_for_load_state("networkidle", timeout=15_000)
            except PlaywrightTimeout:
                pass

            current_url = page.url

            # Step 4: parse the results. Per the original author the table
            # typically has the columns: Doc Type, Recording Date, Party,
            # Instrument #, Book/Page. Rows mentioning a lis pendens keyword
            # are kept.
            try:
                # Wait for a table element to be present.
                page.wait_for_selector("table", timeout=8_000)
            except PlaywrightTimeout:
                errors.append(f"Official Records: tabla de resultados no apareció. URL: {current_url}")
                browser.close()
                return matches, errors

            # Scan every table row and keep the ones matching a keyword.
            # NOTE(review): " LP " only matches when surrounded by spaces in
            # the joined row text, so a row that starts or ends with "LP"
            # does not match. The doc_type extraction below uses the stripped
            # keyword, so any cell containing the letters "LP" can be picked
            # as the doc type.
            all_rows = page.locator("table tr").all()
            lp_keywords = ["LIS PENDENS", "LISPENDENS", " LP ", "FORECLOSURE"]
            for row in all_rows[1:]:  # skip header
                try:
                    cells = [(c.text_content() or "").strip() for c in row.locator("td").all()]
                    row_text = " ".join(cells).upper()
                    if any(kw in row_text for kw in lp_keywords):
                        # Build the match record for this row.
                        match = {
                            "doc_type": next((c for c in cells if any(kw.strip() in c.upper() for kw in lp_keywords)), "Lis Pendens"),
                            "all_columns_text": cells,
                            "source_url": current_url,
                        }
                        # Filing date: first M/D/YYYY-looking text in any cell.
                        for c in cells:
                            date_m = re.search(r"\d{1,2}/\d{1,2}/\d{4}", c)
                            if date_m:
                                match["filing_date"] = date_m.group(0)
                                break
                        # Instrument number: first cell starting with
                        # "NNNN-NNNN" (4+ digits each) or 8+ digits.
                        for c in cells:
                            inst_m = re.match(r"\d{4,}-\d{4,}", c) or re.match(r"\d{8,}", c)
                            if inst_m:
                                match["instrument_number"] = inst_m.group(0)
                                break
                        matches.append(match)
                except Exception:
                    # A row that cannot be read is skipped silently.
                    pass

            if not matches:
                # Having no matches is NOT a failure (the property may be
                # clean), but it is still reported through `errors`.
                errors.append(
                    f"Sin matches de Lis Pendens para owner '{owner_name}' en Duval Official Records. "
                    f"Esto puede significar: (a) la propiedad NO esta en foreclosure, o "
                    f"(b) el owner_name parseado no matchea el formato del clerk. URL final: {current_url}"
                )

            browser.close()
            return matches, errors
    except Exception as e:
        # Any unexpected Playwright/scrape failure is reported as a soft error.
        errors.append(f"Official Records Duval scrape error: {e}")
        return matches, errors
def _fetch_duval(address: str) -> dict:
"""Pipeline completo Duval: owner lookup + lis pendens search + liens inventory.
v1: solo Property Appraiser funciona. Lis pendens scraper devuelve estructura
vacia con warning. Liens inventory tambien vacia + warning.
v1.1 (deferred a Phase 3.5): popular lis_pendens + all_liens reales.
"""
errors: list[str] = []
sources_used: list[str] = []
# Step 1: owner name from Property Appraiser
owner_data, owner_errors = _fetch_property_owner_duval(address)
errors.extend(owner_errors)
if owner_data:
sources_used.append(owner_data.get("source", "Duval Property Appraiser"))
# Step 2: lis pendens lookup (solo si tenemos owner_name)
lp_matches = []
if owner_data and owner_data.get("owner_name"):
lp_matches, lp_errors = _fetch_lis_pendens_duval(
owner_data["owner_name"], address
)
errors.extend(lp_errors)
if not lp_errors or "Sin matches" in (lp_errors[0] if lp_errors else ""):
sources_used.append("Duval Official Records (or.duvalclerk.com)")
# Step 3 (Wave 1.5A v1.2): Liens inventory — DEFERRED a v1.1, devolver placeholder
# Cuando v1.1 funcione: _fetch_liens_duval(owner_data['owner_name'], owner_data['re_number'])
# devolvera la lista completa de liens via doc_type filters en or.duvalclerk.com.
liens_data = _empty_liens_inventory(
reason="Acclaim Land Records scraper deferred to v1.1. Lookup manual disponible."
)
# Step 4: Plaintiff classification (solo si hay lis pendens detectado)
plaintiff_info = None
if lp_matches:
# En v1.1 cuando se parsee correctamente, el lis pendens row tendra columns
# con plaintiff name. Por ahora, intentar extraer del primer match si esta.
first_lp = lp_matches[0] if lp_matches else {}
plaintiff_name_raw = (
first_lp.get("plaintiff")
or (first_lp.get("all_columns_text") or [None])[0]
)
plaintiff_info = classify_plaintiff(plaintiff_name_raw)
# Status determination
# v1: si el Lis Pendens lookup degrada gracefully, status='OWNER_VERIFIED'
# (sabemos que la propiedad existe + owner, pero NO podemos confirmar foreclosure).
# En v1.1 cuando Official Records funcione completamente, podra subir a 'CLEAN'.
lis_pendens_degraded = any(
"Official Records v1" in e for e in errors
)
if lp_matches:
status = "LIS_PENDENS_ACTIVE"
most_recent = sorted(
[m for m in lp_matches if m.get("filing_date")],
key=lambda m: m["filing_date"], reverse=True
)
most_recent_date = most_recent[0]["filing_date"] if most_recent else None
elif owner_data:
# Tenemos owner pero no pudimos verificar foreclosures
if lis_pendens_degraded:
status = "OWNER_VERIFIED" # PA OK, lis pendens manual
else:
status = "CLEAN" # Both PA + lis pendens lookups OK, no matches
most_recent_date = None
else:
status = "UNKNOWN"
most_recent_date = None
# Pull case_number from first lis_pendens match if available
case_number = None
if lp_matches:
first_lp = lp_matches[0]
case_number = (
first_lp.get("case_number")
or first_lp.get("instrument_number")
)
return {
"status": status,
"county": "Duval",
"address": address,
# Property Appraiser data
"owner_name": (owner_data or {}).get("owner_name"),
"re_number": (owner_data or {}).get("re_number"),
"tax_assessed_value": (owner_data or {}).get("tax_assessed_value"),
"year_built_official": (owner_data or {}).get("year_built"),
"last_sale_date": (owner_data or {}).get("last_sale_date"),
# Lis pendens detail
"lis_pendens": lp_matches,
"lis_pendens_count": len(lp_matches),
"most_recent_lis_pendens_date": most_recent_date,
"case_number": case_number,
# Wave 1.5A v1.2: Plaintiff + liens structured fields
"plaintiff": plaintiff_info,
"loan_origin": None, # v1.1 popula desde MTG records cuando funcione
"all_liens": liens_data["all_liens"],
"lien_count": liens_data["lien_count"],
"total_surviving_debt": liens_data["total_surviving_debt"],
"investor_warning": liens_data["investor_warning"],
"liens_detail_status": liens_data["detail_status"],
# Meta
"sources_used": sources_used,
"errors": errors,
"fetched_at": datetime.now(timezone.utc).isoformat(),
}
# ═══════════════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════════════
def fetch_court_records(
    *,
    address: str,
    county_name: Optional[str] = None,
) -> dict:
    """Entry point: dispatch the court-records lookup by county.

    Soft-fails (returns a dict, never raises on purpose) when scraping is
    disabled or the county has no scraper.

    Args:
        address: Street address of the property.
        county_name: County name, with or without a trailing "County", in
            any letter case.

    Returns:
        A dict whose ``status`` is one of:
          - ``DISABLED``: ``ENABLE_COURT_RECORDS`` is not "true".
          - ``NOT_IMPLEMENTED``: no scraper for this county; ``clerk_url``
            and ``recommendation`` point to a manual lookup.
          - ``LIS_PENDENS_ACTIVE`` | ``CLEAN`` | ``OWNER_VERIFIED`` |
            ``UNKNOWN``: produced by the Duval pipeline, together with
            owner data, ``lis_pendens`` and liens fields.
        Every variant carries ``county``, ``address``, ``sources_used``,
        ``errors`` and ``fetched_at``.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()

    if not _enable_court_records():
        return {
            "status": "DISABLED",
            "county": county_name,
            "address": address,
            "recommendation": (
                "Court records scraping deshabilitado. Activar ENABLE_COURT_RECORDS=true "
                "en .env para deteccion deterministica de foreclosure / lis pendens."
            ),
            "sources_used": [],
            "errors": [],
            "fetched_at": fetched_at,
        }

    # Normalize the county: drop a trailing "County" in any letter case, so
    # "Duval County", "duval county" and "DUVAL COUNTY" all dispatch alike.
    cn_normalized = (county_name or "").strip()
    county_suffix = " county"
    if cn_normalized.lower().endswith(county_suffix):
        cn_normalized = cn_normalized[: -len(county_suffix)].strip()

    if cn_normalized.lower() == "duval":
        return _fetch_duval(address)

    # Soft-fail for counties that have no scraper. The clerk URL lookup is
    # case-insensitive so "miami-dade" still finds its entry.
    clerk_urls_by_key = {name.lower(): url for name, url in COUNTY_CLERK_URLS.items()}
    clerk_url = clerk_urls_by_key.get(cn_normalized.lower(), "https://www.flclerks.com/")
    return {
        "status": "NOT_IMPLEMENTED",
        "county": cn_normalized,
        "address": address,
        "recommendation": (
            f"Court records scraper no implementado para {cn_normalized} todavia. "
            f"Lookup manual en {clerk_url}. Wave 1.5A v1 cubre solo Duval; "
            "Miami-Dade / Broward / Palm Beach / Hillsborough en versiones posteriores."
        ),
        "clerk_url": clerk_url,
        "sources_used": [],
        "errors": [],
        "fetched_at": fetched_at,
    }
+87
View File
@@ -0,0 +1,87 @@
"""FEMA NFHL flood zone lookup por lat/lng.
API publica: https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer
Layer 28 = "S_FLD_HAZ_AR" (Special Flood Hazard Areas).
Sin key requerida. Sin rate limits estrictos.
Devuelve dict con:
zone: "X" / "X (shaded)" / "A" / "AE" / "AH" / "AO" / "V" / "VE" / etc.
bfe: Base Flood Elevation (ft) o None
sfha: bool - True si esta en Special Flood Hazard Area
subtype: subzone description o None
"""
from __future__ import annotations
import requests
from .base import FetcherError, DEFAULT_TIMEOUT
# Query endpoint of layer 28 of the FEMA NFHL MapServer; used by fetch_flood().
FEMA_URL = "https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer/28/query"

# Zones that count as SFHA (Special Flood Hazard Area) according to FEMA.
# fetch_flood() sets "sfha" to True only for an exact match against this set.
SFHA_ZONES = {"A", "AE", "AH", "AO", "AR", "A99", "V", "VE", "VO"}
def fetch_flood(lat: float, lng: float) -> dict:
    """Look up the FEMA NFHL flood zone at a point.

    When the query returns no features, the point is reported as zone "X"
    (treated as outside any Special Flood Hazard Area).

    Args:
        lat: Latitude in WGS84 degrees.
        lng: Longitude in WGS84 degrees.

    Returns:
        A dict with ``zone``, ``bfe`` (None when absent or -9999), ``sfha``
        (True when the zone is in ``SFHA_ZONES``), ``subtype`` and
        ``source``.

    Raises:
        FetcherError: When coordinates are missing, the HTTP request fails,
            the body is not JSON, or FEMA returns an ``error`` payload.
    """
    if lat is None or lng is None:
        raise FetcherError("lat/lng requeridos")

    query = dict(
        geometry=f"{lng},{lat}",  # FEMA expects lng,lat (x,y)
        geometryType="esriGeometryPoint",
        inSR="4326",  # WGS84
        spatialRel="esriSpatialRelIntersects",
        outFields="FLD_ZONE,STATIC_BFE,ZONE_SUBTY",
        returnGeometry="false",
        f="json",
    )

    try:
        response = requests.get(FEMA_URL, params=query, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        raise FetcherError(f"HTTP error: {exc}") from exc

    try:
        payload = response.json()
    except ValueError as exc:
        raise FetcherError(f"JSON parse error: {exc}") from exc

    # FEMA answers with an "error" member when the query is invalid.
    if "error" in payload:
        raise FetcherError(f"FEMA API error: {payload['error']}")

    hits = payload.get("features", [])
    if hits:
        attributes = hits[0].get("attributes", {}) or {}
        flood_zone = (attributes.get("FLD_ZONE") or "unknown").strip()
        static_bfe = attributes.get("STATIC_BFE")
        # -9999 is FEMA's "not applicable" marker for the base flood elevation.
        if static_bfe is None or static_bfe == -9999:
            static_bfe = None
        return {
            "zone": flood_zone,
            "bfe": static_bfe,
            "sfha": flood_zone in SFHA_ZONES,
            "subtype": attributes.get("ZONE_SUBTY"),
            "source": "FEMA NFHL",
        }

    # No intersecting feature: report the low-risk default, zone X.
    return {
        "zone": "X",
        "bfe": None,
        "sfha": False,
        "subtype": None,
        "source": "FEMA NFHL (outside SFHA)",
    }
+134
View File
@@ -0,0 +1,134 @@
"""HUD Fair Market Rent lookup.
API: https://www.huduser.gov/portal/dataset/fmr-api.html
Requiere API key gratis: https://www.huduser.gov/hudapi/public/register
Flow:
1. GET /fmr/listCounties/{state} -> match county_name -> fips_code
2. GET /fmr/data/{fips_code}?year=YYYY -> Efficiency / 1BR / 2BR / 3BR / 4BR
Si HUD_API_KEY no esta en .env, raise FetcherError (caught por runner, fail-soft).
Devuelve dict con:
year, county, state,
fmr_efficiency, fmr_1br, fmr_2br, fmr_3br, fmr_4br (en USD/mes)
"""
from __future__ import annotations
import os
from datetime import datetime
import requests
from dotenv import load_dotenv
from .base import FetcherError, DEFAULT_TIMEOUT
# Base URL of the HUD User public API; fetch_fmr() appends the /fmr/... paths.
HUD_BASE = "https://www.huduser.gov/hudapi/public"
def _normalize_county_name(s: str) -> str:
"""Normaliza para comparar nombres: lowercase, sin sufijo 'County', sin espacios redundantes."""
if not s:
return ""
s = s.lower().strip()
if s.endswith(" county"):
s = s[:-7].strip()
return " ".join(s.split()) # collapse whitespace
def fetch_fmr(state: str, county_name: str, year: int | None = None) -> dict:
    """Fetch HUD Fair Market Rents for a US county.

    Flow: ``/fmr/listCounties/{state}`` is used to resolve the county name
    to its ``fips_code``; ``/fmr/data/{fips_code}?year=YYYY`` then provides
    the rents.

    Args:
        state: Two-letter state code (e.g. "FL", "TX").
        county_name: County name, with or without "County".
        year: FMR year; defaults to the current calendar year.

    Returns:
        A dict with ``year``, ``county``, ``state``, ``entity_id``,
        ``fmr_efficiency``, ``fmr_1br`` ... ``fmr_4br`` (None when the
        payload lacks the field) and ``source``.

    Raises:
        FetcherError: When ``HUD_API_KEY`` is missing, arguments are empty,
            an HTTP/JSON error occurs, the county is not found, or a
            response has an unexpected shape. No other exception type is
            raised on purpose, so callers can fail soft on FetcherError.
    """
    # The API key is read from the environment (.env is loaded by the package).
    api_key = os.getenv("HUD_API_KEY", "").strip()
    if not api_key:
        raise FetcherError("HUD_API_KEY no esta en .env. Registrate en https://www.huduser.gov/hudapi/public/register")

    if not state or not county_name:
        raise FetcherError(f"state y county_name son requeridos (got state={state!r}, county={county_name!r})")

    if year is None:
        year = datetime.now().year

    headers = {"Authorization": f"Bearer {api_key}"}

    # 1. listCounties: resolve the county to its entity id (fips_code).
    list_url = f"{HUD_BASE}/fmr/listCounties/{state}"
    try:
        r = requests.get(list_url, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"listCounties HTTP error: {e}") from e

    try:
        counties = r.json()
    except ValueError as e:
        raise FetcherError(f"listCounties JSON error: {e}") from e

    if not isinstance(counties, list):
        raise FetcherError(f"listCounties unexpected format: {type(counties).__name__}")

    # Entries that are not dicts are ignored instead of crashing on .get().
    county_rows = [c for c in counties if isinstance(c, dict)]
    target = _normalize_county_name(county_name)
    matched = next(
        (c for c in county_rows
         if _normalize_county_name(c.get("county_name", "")) == target),
        None,
    )

    if not matched:
        sample = [c.get("county_name") for c in county_rows[:5]]
        raise FetcherError(f"County '{county_name}' not in HUD list for {state}. Primeros 5: {sample}")

    entity_id = matched.get("fips_code")
    if not entity_id:
        raise FetcherError(f"Match found but no fips_code: {matched}")

    # 2. FMR data for the resolved entity.
    fmr_url = f"{HUD_BASE}/fmr/data/{entity_id}"
    try:
        r = requests.get(fmr_url, params={"year": year}, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"fmr/data HTTP error: {e}") from e

    try:
        payload = r.json()
    except ValueError as e:
        raise FetcherError(f"fmr/data JSON error: {e}") from e

    # The payload structure can vary, so several known shapes are tried.
    data = payload.get("data", payload) if isinstance(payload, dict) else {}
    if not isinstance(data, dict):
        # Keep the FetcherError-only contract instead of failing with an
        # AttributeError on data.get() below.
        raise FetcherError(f"fmr/data unexpected format: {type(data).__name__}")

    # basicdata may be a dict (simple county) or a list (metro with sub-areas);
    # for a list, the first entry is used.
    bd = data.get("basicdata")
    if isinstance(bd, list):
        bd = bd[0] if bd else {}
    if not isinstance(bd, dict):
        bd = data

    def _g(*keys):
        """Return the first non-None value found among the given keys."""
        for k in keys:
            v = bd.get(k)
            if v is not None:
                return v
        return None

    return {
        "year": year,
        "county": matched.get("county_name"),
        "state": state,
        "entity_id": entity_id,
        "fmr_efficiency": _g("Efficiency", "fmr_0"),
        "fmr_1br": _g("One-Bedroom", "fmr_1"),
        "fmr_2br": _g("Two-Bedroom", "fmr_2"),
        "fmr_3br": _g("Three-Bedroom", "fmr_3"),
        "fmr_4br": _g("Four-Bedroom", "fmr_4"),
        "source": "HUD User FMR API",
    }
+549
View File
@@ -0,0 +1,549 @@
"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos.
CRITICO - COMPLIANCE LEGAL:
La clasificacion se basa SOLO en indicadores economicos y datos publicos:
income, owner-occupancy, education attainment, vacancy, crime, days on market.
NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining.
Esta es una clasificacion ECONOMICA, no demografica.
Indicadores y pesos (max 100):
- median_household_income (Census ACS) 25%
- owner_occupied_pct (Census ACS) 20%
- education_attainment_pct_bachelor_plus (ACS) 20%
- crime_vs_national (FBI UCR) 15%
- vacancy_rate (Census ACS) 10%
- days_on_market_median (Firecrawl, opt-in) 10%
Graceful degradation: si un indicador no esta disponible (API key missing,
fetcher fallo), se redistribuye su peso entre los disponibles.
confidence_level (basado en CANTIDAD de indicadores disponibles):
- "high": 5-6 indicadores
- "medium": 3-4 indicadores
- "low": 1-2 indicadores
- "unclassified": 0 indicadores
"""
from __future__ import annotations
import os
import time
from datetime import datetime, timezone
from typing import Optional
import requests
from dotenv import load_dotenv
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# ─── Classification algorithm weights ───────────────────────────────────────
# Maximum points each indicator can contribute; the six weights sum to 100.
WEIGHTS = {
"income": 25,
"owner_occupied": 20,
"education": 20,
"crime": 15,
"vacancy": 10,
"dom": 10,
}
# ─── Census ACS variable codes ──────────────────────────────────────────────
# Friendly name -> ACS variable ID requested from the Census API.
ACS_VARS = {
"income": "B19013_001E", # Median household income (last 12 months)
"oo_count": "B25003_002E", # Owner-occupied housing units count
"occupied_total": "B25003_001E", # Total occupied housing units
"vacant_count": "B25002_003E", # Vacant housing units count
"housing_total": "B25002_001E", # Total housing units (occupied + vacant)
"home_value": "B25077_001E", # Median home value
"edu_total": "B15003_001E", # Total population 25+
"edu_bachelor": "B15003_022E", # Bachelor's degree
"edu_master": "B15003_023E", # Master's degree
"edu_prof": "B15003_024E", # Professional school degree
"edu_doctorate": "B15003_025E", # Doctorate degree
}
# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
# Used as the denominator for crime_vs_national. Update annually.
NATIONAL_VIOLENT_CRIME_PER_100K = 380.7
NATIONAL_PROPERTY_CRIME_PER_100K = 1954.4
# ═══════════════════════════════════════════════════════════════════════════
# Fetchers individuales (fail-soft)
# ═══════════════════════════════════════════════════════════════════════════
def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch Census ACS 5-Year indicators for the census tract in ``geocode``.

    Fail-soft: a missing API key, an incomplete geocode, an HTTP failure or a
    malformed payload is reported in the returned error list, not raised.

    Args:
        geocode: Mapping providing ``state_fips``, ``county_code_only`` and
            ``tract_code``.

    Returns:
        ``(indicators, errors)``. ``indicators`` may contain
        ``median_household_income``, ``owner_occupied_pct``, ``vacancy_rate``,
        ``median_home_value`` and ``education_attainment_pct_bachelor_plus``.
        A key is omitted when its source value is missing or suppressed.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("CENSUS_API_KEY", "").strip()
    if not api_key:
        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
        return out, errors

    state_fips = geocode.get("state_fips")
    county_code = geocode.get("county_code_only")
    tract_code = geocode.get("tract_code")
    if not state_fips or not county_code or not tract_code:
        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
        return out, errors

    url = "https://api.census.gov/data/2022/acs/acs5"
    # Request every variable in a single call (the API accepts up to 50).
    var_keys = ["NAME"] + list(ACS_VARS.values())
    params = {
        "get": ",".join(var_keys),
        "for": f"tract:{tract_code}",
        "in": f"state:{state_fips} county:{county_code}",
        "key": api_key,
    }
    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"Census ACS HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"Census ACS JSON: {e}")
        return out, errors

    if not data:
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors
    # The expected payload is [header_row, data_row]. Anything else (e.g. an
    # error object) is reported instead of crashing on data[0].
    if not isinstance(data, list):
        errors.append(f"Census ACS formato inesperado: {type(data).__name__}")
        return out, errors
    if len(data) < 2:
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors
    header, row = data[0], data[1]
    if not isinstance(header, list) or not isinstance(row, list):
        errors.append("Census ACS formato inesperado: filas no son listas")
        return out, errors

    idx = {col: i for i, col in enumerate(header)}

    def _f(col: str) -> Optional[float]:
        """Return column ``col`` of the data row as a non-negative float, else None."""
        try:
            v = row[idx[col]]
        except (KeyError, IndexError):
            return None
        if v is None or v == "" or v == "null":
            return None
        try:
            f = float(v)
        except (ValueError, TypeError):
            return None
        # Census uses negative sentinel values for "no data" / "suppressed".
        return f if f >= 0 else None

    # 1) Median household income
    income = _f(ACS_VARS["income"])
    if income is not None:
        out["median_household_income"] = round(income, 0)

    # 2) Owner-occupied percentage
    oo = _f(ACS_VARS["oo_count"])
    total = _f(ACS_VARS["occupied_total"])
    if oo is not None and total and total > 0:
        out["owner_occupied_pct"] = round(oo / total * 100, 1)

    # 3) Vacancy rate
    vacant = _f(ACS_VARS["vacant_count"])
    housing = _f(ACS_VARS["housing_total"])
    if vacant is not None and housing and housing > 0:
        out["vacancy_rate"] = round(vacant / housing * 100, 1)

    # 4) Median home value
    home_value = _f(ACS_VARS["home_value"])
    if home_value is not None:
        out["median_home_value"] = round(home_value, 0)

    # 5) Education attainment (% bachelor's or higher, age 25+).
    # If every degree count is missing the indicator is omitted, so that
    # suppressed data is not scored as "0% bachelor+".
    edu_total = _f(ACS_VARS["edu_total"])
    degree_counts = [
        _f(ACS_VARS[key])
        for key in ("edu_bachelor", "edu_master", "edu_prof", "edu_doctorate")
    ]
    if edu_total and any(count is not None for count in degree_counts):
        pct = sum(count or 0 for count in degree_counts) / edu_total * 100
        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)

    return out, errors
def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch a state-level crime ratio from the FBI Crime Data Explorer.

    Best-effort and fail-soft: any problem (missing key, HTTP/JSON failure,
    unexpected payload) is appended to the returned error list and the
    indicator is simply absent.

    Args:
        geocode: Mapping providing ``state`` (two-letter abbreviation, e.g. "FL").

    Returns:
        ``(indicators, errors)``. On success ``indicators`` holds
        ``crime_vs_national`` (state rate divided by the national rate; when
        property crime is available, violent crime is weighted 2/3 and
        property crime 1/3) and ``_crime_state_level_note``.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
    if not api_key:
        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
        return out, errors

    state_abbr = geocode.get("state")  # e.g. "FL"
    if not state_abbr:
        errors.append("state abbreviation faltante en geocode")
        return out, errors

    # State-level estimate endpoint: coarse, but better than no crime signal
    # (county-level data is complex to aggregate).
    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
    params = {
        "from": "2022",
        "to": "2022",
        "API_KEY": api_key,
    }
    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"FBI UCR HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"FBI UCR JSON: {e}")
        return out, errors

    # Accept several payload shapes. The type is checked BEFORE calling
    # .get(), so a bare-list payload no longer raises AttributeError.
    if isinstance(data, dict):
        estimates = data.get("estimates") or data.get("data") or []
    elif isinstance(data, list):
        estimates = data
    else:
        estimates = []
    if not estimates:
        errors.append("FBI UCR sin estimates en respuesta")
        return out, errors

    rec = estimates[0] if isinstance(estimates, list) else estimates
    if not isinstance(rec, dict):
        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
        return out, errors

    population = rec.get("population")
    violent = rec.get("violent_crime")
    property_c = rec.get("property_crime")  # optional; ratio falls back to violent only
    if not population or not violent:
        errors.append("FBI UCR sin population o violent_crime en estimate")
        return out, errors

    try:
        violent_per_100k = float(violent) / float(population) * 100000
        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K
        if property_c:
            property_per_100k = float(property_c) / float(population) * 100000
            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
            # Weighted average: violent crime counts 2/3, property crime 1/3.
            combined = (ratio_violent * 2 + ratio_property) / 3
        else:
            combined = ratio_violent
        out["crime_vs_national"] = round(combined, 2)
        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
    except (TypeError, ValueError, ZeroDivisionError) as e:
        # ZeroDivisionError covers truthy-but-zero populations such as "0".
        errors.append(f"FBI UCR calc error: {e}")

    return out, errors
def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
    """Placeholder for the median days-on-market lookup via Firecrawl.

    Opt-in only: intended to be called when ``include_dom=True`` because the
    real lookup will consume Firecrawl credits. The integration is not
    implemented yet, so this always yields no indicators plus one error entry
    describing the pending work.

    Returns:
        ``({}, [pending_message])``.
    """
    pending_notice = "DOM Firecrawl: implementacion pendiente Phase 3B paso 6"
    return {}, [pending_notice]
# ═══════════════════════════════════════════════════════════════════════════
# Algoritmo de clasificacion
# ═══════════════════════════════════════════════════════════════════════════
def _score_income(income: float) -> int:
    """Map median household income to points (max 25); higher income scores higher."""
    tiers = ((100000, 25), (60000, 18), (35000, 10))
    for floor, awarded in tiers:
        if income >= floor:
            return awarded
    return 3
def _score_owner_occupied(pct: float) -> int:
    """Map owner-occupied percentage to points (max 20); higher share scores higher."""
    tiers = ((80, 20), (60, 15), (40, 8))
    return next((awarded for floor, awarded in tiers if pct >= floor), 3)
def _score_education(pct_bach_plus: float) -> int:
    """Map the share of adults with a bachelor's degree or higher to points (max 20)."""
    if pct_bach_plus >= 50:
        awarded = 20
    elif pct_bach_plus >= 30:
        awarded = 14
    elif pct_bach_plus >= 15:
        awarded = 7
    else:
        awarded = 2
    return awarded
def _score_crime(ratio_vs_national: float) -> int:
    """Map a crime ratio (local / national) to points (max 15); lower ratio scores higher."""
    ceilings = ((0.7, 15), (1.0, 12), (1.5, 7))
    for ceiling, awarded in ceilings:
        if ratio_vs_national < ceiling:
            return awarded
    return 2
def _score_vacancy(pct: float) -> int:
    """Map vacancy rate (percent) to points (max 10); lower vacancy scores higher."""
    ceilings = ((3, 10), (6, 7), (10, 4))
    return next((awarded for ceiling, awarded in ceilings if pct < ceiling), 1)
def _score_dom(days: float) -> int:
    """Map median days-on-market to points (max 10); fewer days scores higher."""
    return 10 if days < 30 else (7 if days < 60 else (4 if days < 90 else 1))
def _classify(indicators: dict) -> dict:
    """Score the available indicators and derive the A/B/C/D class.

    Indicators whose value is ``None`` or absent are skipped. The points
    earned are rescaled against the summed weight of the indicators that
    were present, so a partial data set still yields a 0-100 score.

    Returns:
        Dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators_available`` and
        ``weight_coverage_pct``; ``raw_points`` is also included whenever at
        least one indicator was scored.
    """
    # (weight key, indicator field name, scoring function)
    scorers = (
        ("income", "median_household_income", _score_income),
        ("owner_occupied", "owner_occupied_pct", _score_owner_occupied),
        ("education", "education_attainment_pct_bachelor_plus", _score_education),
        ("crime", "crime_vs_national", _score_crime),
        ("vacancy", "vacancy_rate", _score_vacancy),
        ("dom", "days_on_market_median", _score_dom),
    )
    raw_points = {}
    for weight_key, field, scorer in scorers:
        reading = indicators.get(field)
        if reading is None:
            continue
        raw_points[weight_key] = scorer(reading)

    available = list(raw_points)
    if not available:
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators_available": [],
            "weight_coverage_pct": 0,
        }

    # Confidence depends only on HOW MANY indicators were available.
    count = len(available)
    if count <= 2:
        confidence = "low"
    elif count <= 4:
        confidence = "medium"
    else:
        confidence = "high"

    # Graceful degradation: rescale against the weight that was covered.
    covered_weight = sum(WEIGHTS[key] for key in available)
    earned = sum(raw_points.values())
    scaled = (earned / covered_weight) * 100

    letter = "D"
    for cutoff, grade in ((85, "A"), (65, "B"), (40, "C")):
        if scaled >= cutoff:
            letter = grade
            break

    return {
        "neighborhood_class": letter,
        "class_score": round(scaled, 1),
        "confidence_level": confidence,
        "indicators_available": available,
        "weight_coverage_pct": covered_weight,
        "raw_points": raw_points,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Investment implications por clase
# ═══════════════════════════════════════════════════════════════════════════
# Static investment guidance keyed by neighborhood class letter ("A".."D")
# plus "unclassified". Every entry carries the same five keys. The values are
# user-facing Spanish text and are returned verbatim by fetch_neighborhood().
INVESTMENT_IMPLICATIONS = {
"A": {
"buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
"section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
"appreciation_potential": "Alta - tipicamente supera inflacion",
"tenant_quality_expected": "Profesional, familias, muy estable",
"typical_strategies": ["Buy & Hold", "Apreciacion play", "Short-term rental premium"],
},
"B": {
"buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
"section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
"appreciation_potential": "Media-alta",
"tenant_quality_expected": "Profesional, familias, estable",
"typical_strategies": ["Buy & Hold", "Light BRRRR", "Section 8 si la matematica cierra"],
},
"C": {
"buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
"section_8_viability": "Alta - market rents cerca o por debajo de FMR",
"appreciation_potential": "Baja-media",
"tenant_quality_expected": "Working class, estabilidad mixta",
"typical_strategies": ["Section 8", "BRRRR", "Buy & Hold con management activo"],
},
"D": {
"buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
"section_8_viability": "Muy alta - Section 8 puede superar market rent",
"appreciation_potential": "Baja - depende de trayectoria del vecindario",
"tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
"typical_strategies": ["Section 8 (cash flow)", "BRRRR agresivo solo con exit a comprador de calidad"],
},
"unclassified": {
"buy_hold_viability": "No determinado - sin datos suficientes",
"section_8_viability": "No determinado",
"appreciation_potential": "No determinado",
"tenant_quality_expected": "No determinado",
"typical_strategies": [],
},
}
def _build_reasoning(indicators: dict, classification: dict) -> str:
    """Build the one-sentence justification shown next to the class letter.

    Args:
        indicators: Raw indicator values; absent or ``None`` entries are skipped.
        classification: Output of ``_classify`` (reads ``neighborhood_class``,
            ``class_score``, ``confidence_level`` and ``weight_coverage_pct``).

    Returns:
        A summary string, or a fixed "no data" message for the
        ``unclassified`` class.
    """
    letter = classification["neighborhood_class"]
    if letter == "unclassified":
        return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."

    # (indicator field, renderer) in the order the fragments are listed.
    renderers = (
        ("median_household_income", lambda v: f"median income ${v:,.0f}"),
        ("owner_occupied_pct", lambda v: f"owner-occupied {v:.0f}%"),
        ("education_attainment_pct_bachelor_plus", lambda v: f"bachelor+ {v:.0f}%"),
        ("crime_vs_national", lambda v: f"crime {v:.2f}x national"),
        ("vacancy_rate", lambda v: f"vacancy {v:.1f}%"),
        ("days_on_market_median", lambda v: f"DOM {v} dias"),
    )
    fragments = []
    for field, render in renderers:
        reading = indicators.get(field)
        if reading is not None:
            fragments.append(render(reading))
    indicator_str = ", ".join(fragments)

    score = classification["class_score"]
    conf = classification["confidence_level"]
    coverage = classification["weight_coverage_pct"]
    return (
        f"Clase {letter} (score {score}/100, confianza {conf}, "
        f"cobertura {coverage}% del peso). Indicadores: {indicator_str}."
    )
# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════
def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
    """Classify a neighborhood as A/B/C/D from objective economic indicators.

    Args:
        geocode: Output of ``census_geocode.fetch_geocode``; must provide
            ``state_fips``, ``county_code_only`` and ``tract_code``.
        include_dom: When True, also look up days-on-market via Firecrawl
            (spends credits). Defaults to False.

    Returns:
        Dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators``, ``indicators_available``,
        ``weight_coverage_pct``, ``class_reasoning``,
        ``investment_implications``, ``warnings``, ``data_sources``,
        ``tract_geoid``, ``tract_name``, ``fetched_at`` and ``errors``.
        The same keys are present when the geocode is unusable; in that case
        the class is ``"unclassified"``.
    """
    # .env was already loaded by data_fetchers/__init__.py on first import of
    # the package. load_dotenv() is not called here, to avoid conflicts when
    # the CWD differs from the project root.
    fetched_at = datetime.now(timezone.utc).isoformat()
    all_errors: list[str] = []
    data_sources: list[str] = []

    if not geocode or not geocode.get("state_fips"):
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators": {},
            "indicators_available": [],
            "weight_coverage_pct": 0,
            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
            "warnings": ["Geocode invalido o incompleto"],
            "data_sources": [],
            "tract_geoid": None,
            # Present so this result has the same schema as the normal return.
            "tract_name": None,
            "fetched_at": fetched_at,
            "errors": ["geocode_failed"],
        }

    # ─── Census ACS (income, ownership, vacancy, education, home value) ─────
    indicators: dict = {}
    census_data, errs = _fetch_census_acs(geocode)
    indicators.update(census_data)
    all_errors.extend(errs)
    if census_data:
        data_sources.append("US Census ACS 2022 5-Year")

    # ─── FBI UCR (1 indicator) ──────────────────────────────────────────────
    crime_data, errs = _fetch_fbi_crime(geocode)
    # Keys prefixed with "_" are auxiliary notes, not indicators.
    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
    all_errors.extend(errs)
    if crime_data:
        data_sources.append("FBI Crime Data Explorer (state-level)")

    # ─── Firecrawl DOM (1 indicator, opt-in) ────────────────────────────────
    if include_dom:
        dom_data, errs = _fetch_firecrawl_dom(geocode)
        indicators.update(dom_data)
        all_errors.extend(errs)
        if dom_data:
            data_sources.append("Firecrawl (Zillow DOM)")

    # ─── Classify ───────────────────────────────────────────────────────────
    classification = _classify(indicators)
    reasoning = _build_reasoning(indicators, classification)
    letter = classification["neighborhood_class"]

    # ─── Warnings ───────────────────────────────────────────────────────────
    warnings: list[str] = []
    if classification["confidence_level"] in ("low", "unclassified"):
        warnings.append(
            f"Confianza {classification['confidence_level']}: "
            f"solo {len(classification['indicators_available'])} indicadores disponibles."
        )
    if "_crime_state_level_note" in crime_data:
        warnings.append(crime_data["_crime_state_level_note"])

    return {
        "neighborhood_class": letter,
        "class_score": classification["class_score"],
        "confidence_level": classification["confidence_level"],
        "indicators": indicators,
        "indicators_available": classification["indicators_available"],
        "weight_coverage_pct": classification["weight_coverage_pct"],
        "class_reasoning": reasoning,
        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
        "warnings": warnings,
        "data_sources": data_sources,
        "tract_geoid": geocode.get("tract_geoid"),
        "tract_name": geocode.get("tract_name"),
        "fetched_at": fetched_at,
        "errors": all_errors,
    }
+218
View File
@@ -0,0 +1,218 @@
"""NOAA HURDAT2 - historial de huracanes Atlantico.
Dataset: https://www.nhc.noaa.gov/data/#hurdat
Format spec: https://www.nhc.noaa.gov/data/hurdat/hurdat2-format.pdf
Descarga el archivo en data/hurdat2.txt en el primer uso (lazy).
Re-descarga si tiene mas de 365 dias.
Para una direccion dada (lat/lng), devuelve huracanes que pasaron a menos
de N millas (default 150) en los ultimos K anos (default 20).
Returns:
{
"lookback_years": 20,
"max_distance_mi": 150,
"total_hurricanes_nearby": <int>,
"hurricanes": [
{"name": "Ian", "year": 2022, "category": 4,
"max_wind_mph": 155, "closest_pass_miles": 12},
...
]
}
"""
from __future__ import annotations
import math
import os
import time
from datetime import datetime
from pathlib import Path
import requests
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# Candidate URLs, tried in order (NOAA renames the file every year).
# If all of them fail, _download_hurdat2 raises FetcherError.
HURDAT2_URL_CANDIDATES = [
"https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2024-040425.txt",
"https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2023-051124.txt",
"https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2022-050423.txt",
]
# Age in days after which the local copy is considered stale and re-downloaded.
HURDAT2_MAX_AGE_DAYS = 365
def _saffir_simpson(max_wind_mph: float) -> int:
    """Return the Saffir-Simpson category for a max sustained wind in mph.

    Returns 0 for anything below hurricane strength (tropical storm or less).
    """
    # Lower wind bound of categories 5, 4, 3, 2, 1 (in that order).
    lower_bounds = (157, 130, 111, 96, 74)
    for offset, bound in enumerate(lower_bounds):
        if max_wind_mph >= bound:
            return 5 - offset
    return 0
def _haversine_mi(lat1: float, lng1: float, lat2: float, lng2: float) -> float:
    """Return the great-circle distance in miles between two lat/lng points.

    Inputs are decimal degrees. Uses the haversine formula with an Earth
    radius of 3958.8 miles.
    """
    R_MI = 3958.8
    p1 = math.radians(lat1)
    p2 = math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lng2 - lng1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return 2 * R_MI * math.asin(math.sqrt(a))
def _parse_coord(s: str) -> float | None:
    """Parse a HURDAT2-style coordinate such as '26.5N' or '80.3W'.

    South and West values are returned as negative numbers.

    Returns:
        The signed coordinate, or None when the text is too short, does not
        end in a hemisphere letter (N/S/E/W), or has a non-numeric magnitude.
    """
    s = s.strip()
    if len(s) < 2:
        return None
    hemisphere = s[-1].upper()
    # Without this check a bare number such as "26.5" would silently lose its
    # last character and parse as 26.0.
    if hemisphere not in ("N", "S", "E", "W"):
        return None
    try:
        val = float(s[:-1])
    except ValueError:
        return None
    return -val if hemisphere in ("S", "W") else val
def _download_hurdat2(dest_path: Path) -> None:
    """Download HURDAT2 to ``dest_path``, trying each candidate URL in order.

    A response is accepted only when it is HTTP 200 with more than 10,000
    characters of text. The payload is written to a sibling temporary file and
    then moved into place, so an interrupted write cannot leave a truncated
    ``dest_path`` that would later be trusted as a fresh copy.

    Raises:
        FetcherError: if no candidate URL yields an acceptable response.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    last_err = None
    for url in HURDAT2_URL_CANDIDATES:
        try:
            r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=30)
        except requests.RequestException as e:
            last_err = f"{url}: {e}"
            continue
        if r.status_code == 200 and len(r.text) > 10000:
            tmp_path.write_text(r.text, encoding="utf-8")
            tmp_path.replace(dest_path)
            return
        last_err = f"HTTP {r.status_code} from {url}"
    raise FetcherError(f"No pude descargar HURDAT2 desde ninguna URL. Ultimo error: {last_err}")
def _ensure_hurdat2_local(local_path: str | Path) -> Path:
    """Ensure a local HURDAT2 file exists and refresh it when stale.

    The file is downloaded when missing. When it is older than
    ``HURDAT2_MAX_AGE_DAYS`` a refresh is attempted; if that refresh fails,
    the existing stale copy is still returned rather than failing the fetch.

    Raises:
        FetcherError: only when no local copy exists and the download fails.
    """
    p = Path(local_path)
    if not p.exists():
        _download_hurdat2(p)
        return p

    age_days = (time.time() - p.stat().st_mtime) / 86400
    if age_days > HURDAT2_MAX_AGE_DAYS:
        try:
            _download_hurdat2(p)
        except FetcherError:
            # Deliberate best-effort: the candidate URLs are year-specific and
            # go stale, and an outdated track history is still more useful
            # than no hurricane data at all.
            pass
    return p
def fetch_hurricanes(
    lat: float,
    lng: float,
    years_back: int = 20,
    max_distance_mi: float = 150.0,
    hurdat2_path: str | Path = "data/hurdat2.txt",
) -> dict:
    """Find hurricanes whose track passed near ``(lat, lng)`` in recent years.

    A storm is "near" when at least one of its track points lies within
    ``max_distance_mi`` of the location. Only storms whose peak wind over the
    whole track reaches category 1 or higher are reported, and only storms
    from the last ``years_back`` years (by the year in the ATCF id).

    Returns:
        Dict with ``lookback_years``, ``max_distance_mi``,
        ``total_hurricanes_nearby``, ``hurricanes`` (newest and strongest
        first) and ``source``.

    Raises:
        FetcherError: if ``lat`` or ``lng`` is None, or the data file cannot
            be obtained.
    """
    if lat is None or lng is None:
        raise FetcherError("lat/lng requeridos")

    dataset = _ensure_hurdat2_local(hurdat2_path)
    earliest_year = datetime.now().year - years_back
    rows = dataset.read_text(encoding="utf-8", errors="replace").splitlines()
    total_rows = len(rows)

    matches = []
    cursor = 0
    while cursor < total_rows:
        fields = [f.strip() for f in rows[cursor].strip().split(",")]
        # Header rows look like: AL112022, IAN, 65,
        is_header = (
            len(fields) >= 3
            and fields[0].startswith("AL")
            and len(fields[0]) >= 8
        )
        if not is_header:
            cursor += 1
            continue

        storm_name = fields[1]
        try:
            record_count = int(fields[2])
            season = int(fields[0][4:8])
        except (ValueError, IndexError):
            cursor += 1
            continue

        after_storm = cursor + 1 + record_count
        if season < earliest_year:
            # Jump over every track row of this storm.
            cursor = after_storm
            continue

        peak_wind_kt = 0
        nearest_mi = float("inf")
        first_track = cursor + 1
        for raw_row in rows[first_track:first_track + max(record_count, 0)]:
            cols = [c.strip() for c in raw_row.split(",")]
            if len(cols) < 7:
                continue
            # 0:date 1:time 2:record_id 3:status 4:lat 5:lng 6:wind
            track_lat = _parse_coord(cols[4])
            track_lng = _parse_coord(cols[5])
            try:
                knots = int(cols[6])
            except ValueError:
                knots = 0
            if track_lat is None or track_lng is None:
                continue
            nearest_mi = min(nearest_mi, _haversine_mi(lat, lng, track_lat, track_lng))
            peak_wind_kt = max(peak_wind_kt, knots)

        peak_wind_mph = peak_wind_kt * 1.15078  # kt -> mph
        storm_category = _saffir_simpson(peak_wind_mph)
        # Keep only hurricanes (cat 1+) that passed within range.
        if storm_category >= 1 and nearest_mi <= max_distance_mi:
            matches.append({
                "name": storm_name if storm_name else "UNNAMED",
                "year": season,
                "category": storm_category,
                "max_wind_mph": int(round(peak_wind_mph)),
                "closest_pass_miles": int(round(nearest_mi)),
            })
        cursor = after_storm

    # Newest first, then strongest first.
    matches.sort(key=lambda h: (h["year"], h["category"]), reverse=True)
    return {
        "lookback_years": years_back,
        "max_distance_mi": max_distance_mi,
        "total_hurricanes_nearby": len(matches),
        "hurricanes": matches,
        "source": "NOAA HURDAT2",
    }
+399
View File
@@ -0,0 +1,399 @@
"""data_fetchers/owner_classifier.py — Clasifica owner_name del PA en tipos.
Estrategia de negocio: cuando el owner es un BANCO (BoA, Wells Fargo, etc.),
hay oportunidad de REO direct outreach — comprar por debajo de market.
USAGE:
from data_fetchers.owner_classifier import classify_owner, build_reo_signal
classification = classify_owner("BANK OF AMERICA NA TRSTEE")
# → {type, category, confidence, normalized, evidence}
reo = build_reo_signal(
owner_classification=classification,
just_value=322580,
assessed_value=228560,
taxes_paid_last=5256.59,
)
# → {is_reo_opportunity, suggested_offer_low, suggested_offer_high, justification_es, ...}
"""
from __future__ import annotations
import re
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# Owner-type patterns (priority order — first match wins)
# ════════════════════════════════════════════════════════════════════════════
# Lender keyword tables, keyed by lender type. classify_owner() walks the
# categories in insertion order and returns on the first substring match, so
# earlier categories shadow later ones.
# NOTE(review): "WELLS FARGO BANK NA TRSTEE", "U.S. BANK TRUST" and
# "U.S. BANK NATIONAL ASSOCIATION TRUSTEE" (MBS_TRUSTEE) can never win, because
# BANK_NATIONAL already matches their "WELLS FARGO" / "U.S. BANK" prefixes.
# NOTE(review): the bare "TRSTEE" keyword matches any owner text containing
# that abbreviation, presumably including private trustees — confirm that
# flagging those as lenders is intended.
LENDER_PATTERNS = {
# Big national banks
"BANK_NATIONAL": [
"BANK OF AMERICA", "WELLS FARGO", "JPMORGAN", "CHASE BANK", "CHASE MORTGAGE",
"U.S. BANK", "US BANK", "USBANK", "PNC BANK", "TRUIST", "CITIBANK",
"CITIMORTGAGE", "REGIONS BANK", "FIFTH THIRD", "KEY BANK",
"HUNTINGTON NATIONAL", "SUNTRUST", "BMO HARRIS", "TD BANK",
],
# GSE / Federal agencies (foreclosure → these guys hold inventory)
"GSE_FEDERAL": [
"FEDERAL HOME LOAN MORTGAGE", "FREDDIE MAC", "FREDDIEMAC",
"FEDERAL NATIONAL MORTGAGE", "FANNIE MAE", "FANNIEMAE",
"SECRETARY OF HUD", "SECRETARY OF HOUSING", "U.S. DEPARTMENT OF HOUSING",
"VA SECRETARY", "VETERANS AFFAIRS",
"SECRETARY OF VETERANS AFFAIRS",
],
# MBS / Trustee banks (Mortgage-Backed Securities trustees)
"MBS_TRUSTEE": [
"DEUTSCHE BANK NATIONAL TRUST", "BANK OF NEW YORK MELLON",
"U.S. BANK TRUST", "U.S. BANK NATIONAL ASSOCIATION TRUSTEE",
"WILMINGTON SAVINGS", "WILMINGTON TRUST",
" AS TRUSTEE FOR", "NA TRSTEE", "TRSTEE",
"CHRISTIANA TRUST", "WELLS FARGO BANK NA TRSTEE",
"HSBC BANK USA NATIONAL", "HSBC BANK USA",
],
# Loan servicers (often own REO too)
"SERVICER": [
"BAYVIEW LOAN SERVICING", "SHELLPOINT MORTGAGE", "NEWREZ",
"MR. COOPER", "MR COOPER", "SPECIALIZED LOAN SERVICING", "PHH MORTGAGE",
"OCWEN", "SELENE FINANCE", "RUSHMORE LOAN", "FAY SERVICING",
"CARRINGTON MORTGAGE", "PENNYMAC", "FREEDOM MORTGAGE",
],
# Fintech lenders
"FINTECH_LENDER": [
"ROCKET MORTGAGE", "QUICKEN LOANS", "BETTER MORTGAGE", "LOANDEPOT",
"GUILD MORTGAGE", "AMERIHOME",
],
# Community banks / regional (FL-specific common)
"BANK_REGIONAL": [
"SEACOAST BANK", "VALLEY NATIONAL", "FIRST HORIZON",
"CENTENNIAL BANK", "BANKUNITED", "AMERIS BANK", "SYNOVUS",
"ATLANTIC CAPITAL", "PROFESSIONAL BANK",
],
# Tax certificate holders (acquired via tax deed)
"TAX_CERTIFICATE_HOLDER": [
"TAX CERTIFICATE", "TAX DEED HOLDER", "FLORIDA TAX CERTIFICATE",
],
# Insurance / pension (sometimes own real estate)
"INSURANCE_PENSION": [
"STATE FARM", "ALLSTATE", "PRUDENTIAL", "MASS MUTUAL", "METLIFE",
],
}
# Government entities (NOT REO opportunities, usually held for public use)
# Entries ending in a space ("FPL ", "DOT ") only match when followed by a space.
GOVERNMENT_PATTERNS = [
"STATE OF FLORIDA", "FLORIDA DEPT", "FLORIDA DEPARTMENT",
"CITY OF", "COUNTY OF", "TOWN OF", "VILLAGE OF",
"SCHOOL BOARD", "SCHOOL DISTRICT", "MUNICIPALITY OF",
"UNITED STATES OF AMERICA", "U.S. POSTAL", "U.S. ARMY",
"FLORIDA POWER", "FPL ", "WATER MANAGEMENT DISTRICT",
"DEPARTMENT OF TRANSPORTATION", "DOT ",
]
# Non-profit / religious (rare REO scenarios)
NONPROFIT_PATTERNS = [
"CHURCH OF", "CATHOLIC", "BAPTIST", "METHODIST",
"FOUNDATION", "MINISTRIES", "RELIGIOUS",
"HABITAT FOR HUMANITY", "REDEVELOPMENT",
"NON-PROFIT", "NONPROFIT", "RED CROSS",
]
# LLC patterns (investor-owned, possible negotiation)
# Matching is plain substring search: a leading space only requires a
# preceding space, so " INC" also matches inside longer words (" INCOME").
LLC_PATTERNS = [
" LLC", "LLC ", "L.L.C.", "L.L.C ",
" INC", " INC.", "INCORPORATED",
" CORP", "CORPORATION", " LTD",
" LP", "LIMITED PARTNERSHIP",
]
# Trust patterns (family trust vs MBS trust — context matters)
TRUST_PATTERNS = [
" TRUST", "TRUSTEE", " TR ", "LIVING TRUST", "FAMILY TRUST",
"REVOCABLE TRUST", "IRREVOCABLE TRUST",
]
# Individual indicators (LE = Life Estate, REM = Remainderman, etc.)
INDIVIDUAL_INDICATORS = [
" LE", " REM", " H/W", "HUSBAND", "WIFE",
"& W ", " &W ", "&H ", " EST", "ESTATE OF",
]
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def classify_owner(owner_name: Optional[str], co_owners: Optional[list[str]] = None) -> dict:
    """Classify a property owner name into a business category.

    Checks run in priority order and the first decisive match wins: lender,
    government, non-profit, trust and/or LLC/corporation, then an
    individual-name heuristic. Matching is case-insensitive substring search
    over the primary owner plus any co-owners.

    Args:
        owner_name: Primary owner name from the property appraiser record.
        co_owners: Optional additional owner names; falsy entries are ignored.

    Returns:
        {
            "type": str (category code),
            "category": str (business-level),
            "is_lender": bool,
            "is_government": bool,
            "is_individual": bool,
            "is_investor_entity": bool,
            "is_trust": bool,
            "confidence": float (0-1),
            "matched_keyword": str | None,
            "normalized": str (uppercase, stripped; owners joined by " | "),
            "evidence": [str],
        }
        With a falsy ``owner_name`` the defaults are returned unchanged
        (type "UNKNOWN", confidence 0.0).
    """
    out = {
        "type": "UNKNOWN",
        "category": "unknown",
        "is_lender": False,
        "is_government": False,
        "is_individual": False,
        "is_investor_entity": False,
        "is_trust": False,
        "confidence": 0.0,
        "matched_keyword": None,
        "normalized": "",
        "evidence": [],
    }
    if not owner_name:
        return out

    # Normalize every owner separately so the separator never leaks into the
    # word-count heuristic below.
    names = [owner_name.upper().strip()]
    if co_owners:
        names.extend(c.upper().strip() for c in co_owners if c)
    names = [n for n in names if n]
    full_text = " | ".join(names)
    out["normalized"] = full_text

    # Several patterns end in a space to emulate a word boundary ("DOT ",
    # " TR ", "LLC "). Padding the haystack lets them match at end of text,
    # e.g. "SMITH JOHN TR".
    haystack = full_text + " "

    def _first_match(patterns):
        """Return the first pattern contained in the haystack, or None."""
        return next((p for p in patterns if p.upper() in haystack), None)

    # 1. Lender categories (highest priority — REO opportunity)
    for lender_type, patterns in LENDER_PATTERNS.items():
        hit = _first_match(patterns)
        if hit is not None:
            out["type"] = lender_type
            out["category"] = "lender"
            out["is_lender"] = True
            out["matched_keyword"] = hit
            out["confidence"] = 0.95
            out["evidence"].append(f"matched lender keyword '{hit}'")
            return out

    # 2. Government entities (not REO, but flagged)
    hit = _first_match(GOVERNMENT_PATTERNS)
    if hit is not None:
        out["type"] = "GOVERNMENT"
        out["category"] = "government"
        out["is_government"] = True
        out["matched_keyword"] = hit
        out["confidence"] = 0.90
        out["evidence"].append(f"matched government keyword '{hit}'")
        return out

    # 3. Non-profit
    hit = _first_match(NONPROFIT_PATTERNS)
    if hit is not None:
        out["type"] = "NONPROFIT"
        out["category"] = "nonprofit"
        out["matched_keyword"] = hit
        out["confidence"] = 0.85
        out["evidence"].append(f"matched nonprofit keyword '{hit}'")
        return out

    # 4. Trust (usually a family trust if MBS_TRUSTEE did not catch it above).
    # No early return: the owner may also be an LLC/corporation.
    trust_hit = _first_match(TRUST_PATTERNS)
    if trust_hit is not None:
        out["is_trust"] = True
        out["type"] = "FAMILY_TRUST"
        out["category"] = "trust"
        out["matched_keyword"] = trust_hit
        out["confidence"] = 0.80
        out["evidence"].append("Trust keyword detected (likely family/estate trust)")

    # 5. LLC / Corporation. A trust match keeps its type; only the flag and
    # evidence are added on top.
    corp_hit = _first_match(LLC_PATTERNS)
    if corp_hit is not None:
        out["is_investor_entity"] = True
        if out["type"] == "UNKNOWN":
            out["type"] = "LLC_OR_CORP"
            out["category"] = "investor_entity"
            out["matched_keyword"] = corp_hit
            out["confidence"] = 0.85
        out["evidence"].append("matched LLC/corp keyword")
        return out
    if trust_hit is not None:
        return out

    # 6. Individual heuristic: comma ("LASTNAME, FIRSTNAME"), an individual
    # indicator, or short digit-free names. Words are counted per owner so
    # co-owners do not push a plain personal name over the 5-word limit.
    has_comma = "," in haystack
    has_individual = any(ind in haystack for ind in INDIVIDUAL_INDICATORS)
    per_name_words = [n.replace(",", " ").split() for n in names]
    total_words = sum(len(words) for words in per_name_words)
    has_numbers = any(
        ch.isdigit() for words in per_name_words for word in words for ch in word
    )
    short_names = total_words >= 2 and all(len(words) <= 5 for words in per_name_words)
    if has_comma or has_individual or (short_names and not has_numbers):
        out["type"] = "INDIVIDUAL"
        out["category"] = "individual"
        out["is_individual"] = True
        out["confidence"] = 0.70
        out["evidence"].append("Name pattern matches individual (comma format or 2-5 words)")
        return out

    # Default: unknown
    out["type"] = "UNKNOWN"
    out["category"] = "unknown"
    out["confidence"] = 0.10
    out["evidence"].append("No clear pattern matched")
    return out
def build_reo_signal(
    *,
    owner_classification: dict,
    just_value: Optional[float] = None,
    assessed_value: Optional[float] = None,
    listing_price: Optional[float] = None,
    taxes_paid_last: Optional[float] = None,
    mailing_address: Optional[str] = None,
) -> dict:
    """Build the REO direct-outreach opportunity signal for a lender-owned parcel.

    When the owner classification is flagged as a lender, this suggests a
    direct offer range and attaches a strategy, a Spanish-language
    justification and a contact hint for the lender types BANK_NATIONAL,
    GSE_FEDERAL, MBS_TRUSTEE and SERVICER. Any other lender type only gets
    the opportunity flag and (when a reference value exists) the offer range.

    Args:
        owner_classification: Classification dict (the CLI in this module
            passes the result of ``classify_owner``). Only the ``"is_lender"``
            and ``"type"`` keys are read.
        just_value: Just/market value; second choice for the offer base and
            first choice for the market reference.
        assessed_value: Assessed value; first choice for the offer base.
        listing_price: Current listing price; last-resort base / reference.
        taxes_paid_last: Last tax bill, quoted in the BANK_NATIONAL
            justification (falls back to 5000 when missing or zero).
        mailing_address: Owner mailing address, quoted in the BANK_NATIONAL
            contact hint.

    Returns:
        {
            "is_reo_opportunity": bool,
            "lender_type": str | None,
            "strategy": str | None,
            "suggested_offer_low": int | None,
            "suggested_offer_high": int | None,
            "discount_pct_vs_market": float | None,
            "justification_es": str | None,
            "outreach_contact_hint": str | None,
        }
        The offer/discount fields stay ``None`` when no positive reference
        value is available; the justification then describes the offer as a
        percentage band instead of dollar amounts.
    """
    out = {
        "is_reo_opportunity": False,
        "lender_type": None,
        "strategy": None,
        "suggested_offer_low": None,
        "suggested_offer_high": None,
        "discount_pct_vs_market": None,
        "justification_es": None,
        "outreach_contact_hint": None,
    }
    if not owner_classification.get("is_lender"):
        return out

    lender_type = owner_classification.get("type")
    out["lender_type"] = lender_type
    out["is_reo_opportunity"] = True

    # Offer range: 85-92% of the first available reference value
    # (assessed, then just/market, then listing price).
    base = assessed_value or just_value or listing_price or 0
    if base > 0:
        offer_low = int(base * 0.85)
        offer_high = int(base * 0.92)
        out["suggested_offer_low"] = offer_low
        out["suggested_offer_high"] = offer_high
        market_ref = just_value or listing_price or base
        if market_ref > 0:
            mid = (offer_low + offer_high) / 2
            out["discount_pct_vs_market"] = round((1 - mid / market_ref) * 100, 1)
        offer_range = f"${offer_low:,}-${offer_high:,}"
    else:
        # No usable value: previously the lender branches below crashed with
        # UnboundLocalError on offer_low/offer_high. Describe the band instead.
        offer_range = "85-92% del valor de referencia (valor no disponible)"

    # Only mention the discount when it was actually computed (avoids "None%").
    discount = out["discount_pct_vs_market"]
    discount_note = f" ({discount}% bajo market)" if discount is not None else ""

    # Strategy + justification by lender type
    if lender_type == "BANK_NATIONAL":
        out["strategy"] = "Direct REO desk outreach"
        out["justification_es"] = (
            f"Banco nacional como owner = REO post-foreclosure en su balance. "
            f"REO desks tienen quota mensual de disposicion. Aceptan ofertas direct "
            f"para evitar MLS + 6% commission. "
            f"Ofrece {offer_range}{discount_note}. "
            f"Si cubris taxes pendientes (~${taxes_paid_last or 5000:,.0f}/año) y cerras cash o "
            f"conventional ready, alta probabilidad de aceptacion."
        )
        out["outreach_contact_hint"] = (
            "Contact: search '{bank} REO disposition' o 'REO asset manager' en LinkedIn. "
            "Mail oficial: {mailing_address} (REO department typically receives correspondence here)."
        ).format(
            bank=lender_type.replace("_", " ").title(),
            mailing_address=mailing_address or "ver PA record",
        )
    elif lender_type == "GSE_FEDERAL":
        out["strategy"] = "Fannie/Freddie HomePath / HUD HomeStore"
        out["justification_es"] = (
            f"GSE (Fannie/Freddie/HUD) como owner. Suelen vender via canales "
            f"oficiales: HomePath.com (Fannie), HomeSteps.com (Freddie), HUDHomeStore.gov. "
            f"Periodo Owner-Occupied first ~15-30 dias, despues investors. "
            f"Si la propiedad lleva >30 dias unsold, oferta bajo asking es aceptable. "
            f"Sugerido {offer_range}."
        )
        out["outreach_contact_hint"] = (
            "Buscar la propiedad en HomePath.com (Fannie) / HomeSteps.com (Freddie) / "
            "HUDHomeStore.gov para ver listing oficial + plazos."
        )
    elif lender_type == "MBS_TRUSTEE":
        out["strategy"] = "Trustee-held REO (MBS securitization)"
        out["justification_es"] = (
            f"Trustee bank de un MBS (mortgage-backed security). Propiedad fue "
            f"foreclosed y entro al inventory del trust. El trustee delega a un "
            f"servicer (BAYVIEW/SHELLPOINT/etc) para la liquidacion. "
            f"Mas burocratico que un REO de bank-direct pero similar dinamica. "
            f"Oferta sugerida {offer_range}."
        )
        out["outreach_contact_hint"] = (
            "Identificar el servicer (suele estar en el mailing address o documento de transferencia). "
            "Contactar al servicer's REO/loss mitigation department."
        )
    elif lender_type == "SERVICER":
        out["strategy"] = "Servicer-held REO outreach"
        out["justification_es"] = (
            f"Loan servicer como owner = post-foreclosure inventory. Servicers tienen "
            f"presion por compliance + balance sheet para liquidar. "
            f"Oferta sugerida {offer_range}."
        )
        out["outreach_contact_hint"] = "Contact servicer's REO disposition department."
    return out
# ════════════════════════════════════════════════════════════════════════════
# CLI
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Manual CLI: classify one owner name, then derive the REO signal from it.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Owner classifier + REO signal")
    cli.add_argument("owner_name", help="Owner name from PA (e.g. 'BANK OF AMERICA NA TRSTEE')")
    cli.add_argument("--co-owners", nargs="*", help="Additional co-owners")
    # The four monetary options share the same shape, so register them in a loop
    # (order is kept so --help output is unchanged).
    for flag, description in (
        ("--just-value", "Just/market value"),
        ("--assessed-value", "Assessed value"),
        ("--listing-price", "Current listing price"),
        ("--taxes-paid", "Taxes paid last year"),
    ):
        cli.add_argument(flag, type=float, help=description)
    cli.add_argument("--mailing", help="Mailing address (for REO contact hint)")
    opts = cli.parse_args()

    classification = classify_owner(opts.owner_name, co_owners=opts.co_owners)
    print("=== CLASSIFICATION ===")
    print(json.dumps(classification, indent=2, default=str))

    reo_signal = build_reo_signal(
        owner_classification=classification,
        just_value=opts.just_value,
        assessed_value=opts.assessed_value,
        listing_price=opts.listing_price,
        taxes_paid_last=opts.taxes_paid,
        mailing_address=opts.mailing,
    )
    print("\n=== REO SIGNAL ===")
    print(json.dumps(reo_signal, indent=2, default=str))
+461
View File
@@ -0,0 +1,461 @@
"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor.
Extrae TODO lo publico de bcpa.net para construir un Property Snapshot Report ($15):
- Owner + mailing address
- Property address + neighborhood
- Year built, sqft, use code, units
- Just/Market value, Assessed/SOH value, by year (3 anios)
- Taxes paid (3 anios)
- Tax breakdown por district (County / School Board / Municipal / Independent)
- Exemptions (homestead, senior, vet, disabled, etc.)
- Photo URL
- Legal description
USAGE:
from data_fetchers.pa_broward import fetch_broward_pa_record
record = fetch_broward_pa_record(parcel_id="484226062150")
# record["owner_name"], record["just_value"], record["sales_history"]...
TECHNICAL:
- bcpa.net es Angular SPA — usar Playwright, NO requests/curl
- wait_until="domcontentloaded" + 25s sleep (NO networkidle, nunca termina)
- Element IDs son ESTABLES (data-bound by Angular, NO autogenerados como JSF)
- Per-folio latency: ~28-32s
- Free (Playwright local, no API cost)
"""
from __future__ import annotations
import re
import time
from datetime import datetime
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# Field ID mapping — confirmed via probe on folio 484226062150
# ════════════════════════════════════════════════════════════════════════════
# Single-value scalar fields: result key -> DOM element id on the bcpa.net
# record page. fetch_broward_pa_record sends this whole mapping to the page
# and reads each element's trimmed textContent ('' when the element is absent).
# NOTE(review): a few ids do not match the key they feed ("actualAgeId" ->
# year_built, "soh*" -> assessed value, "assessed*" -> taxes paid). This is
# presumably how the site labels them — re-verify against the live page if
# values look swapped.
_SCALAR_IDS = {
    "folio_number": "folioNumberId",
    "owner_name": "ownerNameId",
    "owner_name_2": "ownerName2Id",
    "mailing_address": "mailingAddressId",
    "situs_address": "situsAddressId",
    "neighborhood": "neighborhood",
    "use_code": "useCodeId",
    "millage_code": "millageCodeId",
    "adj_bldg_sqft": "bldgSqFTId",
    "under_air_sqft": "bldgUnderAirFootageId",
    "effective_year": "effectiveAgeId",
    "year_built": "actualAgeId",
    "units_beds_baths": "unitsBedsBathsId",
    "legal_description": "legalDescId",
    "homestead_flag": "homesteadFlagId",
    # Current year values (auto-current year, e.g. 2026)
    "current_tax_year": "currentTaxYearId",
    "land_value_current": "landCurrentYearId",
    "bldg_value_current": "bldgCurrentYearId",
    "just_value_current": "justCurrentYearId",
    "assessed_value_current": "sohCurrentYearId",
    # Last year (e.g. 2025)
    "last_tax_year": "lastTaxYearId",
    "land_value_last": "landLastYearId",
    "bldg_value_last": "bldgLastYearId",
    "just_value_last": "justLastYearId",
    "assessed_value_last": "sohLastYearId",
    "taxes_paid_last": "assessedLastYearId",
    # Two years ago (e.g. 2024)
    "two_years_ago_tax_year": "lastTwoTaxYearId",
    "land_value_2yr": "landLasttwoYearsId",
    "bldg_value_2yr": "bldgLasttwoYearsId",
    "just_value_2yr": "justLasttwoYearsId",
    "assessed_value_2yr": "sohLasttwoYearsId",
    "taxes_paid_2yr": "assessedLasttwoYearsId",
}
# Tax breakdown by district (current year).
# Shape: {district name: {field name: DOM element id}}. The nested result of
# reading these ids becomes result["tax_breakdown"] in fetch_broward_pa_record.
# Note that only "county" and "municipal" carry a "senior" entry; the
# "school_board" and "independent" districts have no senior-exemption id here.
_DISTRICT_IDS = {
    # district name: {field: id}
    "county": {
        "just_value": "justValueCounty",
        "portability": "portabilityValueCounty",
        "assessed_soh": "sohValueCounty",
        "homestead": "he1AmountCounty",
        "add_homestead": "he2AmountCounty",
        "widow_vet_dis": "wvdAmountCounty",
        "senior": "seniorExemptionCounty",
        "exemption_type": "mexAmountCounty",
        "affordable_housing": "ahAmountCounty",
        "taxable": "taxableAmountCounty",
    },
    "school_board": {
        "just_value": "justValueSchoolBoard",
        "portability": "portabilityValueSchoolBoard",
        "assessed_soh": "sohValueSchoolBoard",
        "homestead": "he1AmountSchoolBoard",
        "add_homestead": "he2AmountSchoolBoard",
        "widow_vet_dis": "wvdAmountSchoolBoard",
        "exemption_type": "mexAmountSchoolBoard",
        "affordable_housing": "ahAmountSchoolBoard",
        "taxable": "taxableAmountSchoolBoard",
    },
    "municipal": {
        "just_value": "justValueMunicipal",
        "portability": "portabilityValueMunicipal",
        "assessed_soh": "sohValueMunicipal",
        "homestead": "he1AmountMunicipal",
        "add_homestead": "he2AmountMunicipal",
        "widow_vet_dis": "wvdAmountMunicipal",
        "senior": "seniorExemptionMunicipal",
        "exemption_type": "mexAmountMunicipal",
        "affordable_housing": "ahAmountMunicipal",
        "taxable": "taxableAmountMunicipal",
    },
    "independent": {
        "just_value": "justValueIndependent",
        "portability": "portabilityValueIndependent",
        "assessed_soh": "sohValueIndependent",
        "homestead": "he1AmountIndependent",
        "add_homestead": "he2AmountIndependent",
        "widow_vet_dis": "wvdAmountIndependent",
        "exemption_type": "mexAmountIndependent",
        "affordable_housing": "ahAmountIndependent",
        "taxable": "taxableAmountIndependent",
    },
}
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def fetch_broward_pa_record(
    parcel_id: str,
    timeout_seconds: int = 45,
    wait_after_load: int = 25,
) -> dict:
    """Fetch the full Broward Property Appraiser record for one folio.

    Opens the bcpa.net record page in headless Chromium (Playwright), waits
    for the SPA to settle, reads every element listed in ``_SCALAR_IDS`` and
    ``_DISTRICT_IDS``, scrapes the sales-history table and the property
    photo, then normalizes the raw text into numbers and grouped dicts.

    The function is fail-soft: it never raises for fetch problems. Failures
    are reported through the ``"errors"`` list of the returned dict.

    Args:
        parcel_id: bcpa folio (e.g. "484226062150"). Surrounding whitespace
            is ignored; an empty/missing value yields an error result.
        timeout_seconds: default timeout applied to Playwright operations.
        wait_after_load: fixed settle time in seconds after
            ``domcontentloaded`` (default 25).

    Returns:
        {
            "folio_number": str,
            "owner_name": str (may include " % " corp marker),
            "owner_name_2": str (continuation line),
            "mailing_address": str,
            "situs_address": str,
            "neighborhood": str,
            "use_code": str (e.g. "01-01 Single Family"),
            "year_built": int,
            "effective_year": int,
            "adj_bldg_sqft": int,
            "under_air_sqft": int,
            "millage_code": str,
            "legal_description": str,
            "homestead_active": bool,
            "current_year": {tax_year, land_value, bldg_value, just_value,
                             assessed_value},
            "last_year": {... same ..., taxes_paid},
            "two_years_ago": {... same ..., taxes_paid},
            "tax_breakdown": {
                "county": {just_value, portability, assessed_soh,
                           homestead, add_homestead, widow_vet_dis, senior,
                           exemption_type, affordable_housing, taxable},
                "school_board": {...},
                "municipal": {...},
                "independent": {...},
            },
            "sales_history": [
                {date, type, qualified_disqualified, price, book_page_or_cin}, ...
            ],
            "photo_url": str | None,
            "source_url": str,
            "source_api_url": str,
            "fetched_at": ISO-8601 UTC timestamp ending in "Z",
            "errors": [str],
        }
        Numeric fields whose page element rendered empty are left as ``""``
        (they are only coerced when non-empty). On an unexpected exception a
        truncated traceback is stored under ``"_trace"``.
    """
    # Local import keeps the module's import block untouched; utcnow() is
    # deprecated, so build the same "...Z" string from an aware timestamp.
    from datetime import timezone

    fetched_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Tolerate padded or non-str folios instead of crashing on .strip().
    folio = str(parcel_id).strip() if parcel_id else ""
    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"
    result = {
        "folio_number": parcel_id,
        "errors": [],
        "source_url": url,
        "source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={folio}",
        "fetched_at": fetched_at,
    }
    if not folio:
        result["errors"].append("no parcel_id provided")
        return result
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                    viewport={"width": 1400, "height": 1000},
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)
                page.goto(url, wait_until="domcontentloaded")
                time.sleep(wait_after_load)
                # Wait until at least one scalar populates (sentinel: actualAgeId = year built)
                try:
                    page.wait_for_function(
                        "() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }",
                        timeout=10000,
                    )
                except Exception:
                    # Best-effort: if the sentinel didn't populate, give the
                    # page a little more time and extract anyway. An empty
                    # extraction is reported after post-processing.
                    time.sleep(5)

                # Extract all scalar fields in one JS call (faster than per-locator)
                scalar_values = page.evaluate(
                    """(ids) => {
                        const out = {};
                        for (const [key, id] of Object.entries(ids)) {
                            const el = document.getElementById(id);
                            out[key] = el ? (el.textContent || '').trim() : '';
                        }
                        return out;
                    }""",
                    _SCALAR_IDS,
                )
                district_values = page.evaluate(
                    """(districts) => {
                        const out = {};
                        for (const [name, fields] of Object.entries(districts)) {
                            out[name] = {};
                            for (const [field, id] of Object.entries(fields)) {
                                const el = document.getElementById(id);
                                out[name][field] = el ? (el.textContent || '').trim() : '';
                            }
                        }
                        return out;
                    }""",
                    _DISTRICT_IDS,
                )
                # Extract sales history table (PrimaryProperty Sales Information)
                sales_history = page.evaluate("""
                    () => {
                        const out = [];
                        // The sales table has rows with class containing dates/types
                        // Look for the table with header "Date | Type | Qualified | Price | Book/Page"
                        const tables = document.querySelectorAll('table');
                        for (const tbl of tables) {
                            const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
                            if (!hdrCells || hdrCells.length < 4) continue;
                            const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
                            const isSalesHdr = hdrText.some(h => h.includes('date')) &&
                                hdrText.some(h => h.includes('type')) &&
                                hdrText.some(h => h.includes('price')) &&
                                hdrText.some(h => h.includes('qualified'));
                            if (!isSalesHdr) continue;
                            // Parse data rows
                            const rows = tbl.querySelectorAll('tr');
                            for (let i = 1; i < rows.length; i++) {
                                const cells = rows[i].querySelectorAll('td');
                                if (cells.length < 4) continue;
                                const r = {};
                                cells.forEach((c, idx) => {
                                    const h = hdrText[idx] || `col${idx}`;
                                    r[h] = (c.textContent || '').trim();
                                });
                                // Skip empty rows
                                if (Object.values(r).some(v => v && v.length > 0)) {
                                    out.push(r);
                                }
                            }
                            if (out.length > 0) break;
                        }
                        return out;
                    }
                """)
                # Photo URL
                photo_urls = page.evaluate("""
                    () => Array.from(document.querySelectorAll('img'))
                        .filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)
                        .map(i => i.src)
                """)
            finally:
                # Close the browser on the error path too, not only on success.
                browser.close()

        # A page that never rendered used to come back as an apparently
        # successful record full of empty strings; surface it explicitly.
        if not (scalar_values.get("owner_name") or scalar_values.get("year_built")):
            result["errors"].append(
                "record did not render (owner name and year built are empty); "
                "folio may be invalid or the page did not finish loading"
            )

        # ─── Post-process scalars ─────────────────────────────────────
        result.update({k: _clean(v) for k, v in scalar_values.items()})
        # Coerce numeric fields (only when the element had text)
        for k in ("year_built", "effective_year", "current_tax_year", "last_tax_year",
                  "two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"):
            v = result.get(k, "")
            if v:
                result[k] = _to_int(v)
        for k in ("land_value_current", "bldg_value_current", "just_value_current",
                  "assessed_value_current", "land_value_last", "bldg_value_last",
                  "just_value_last", "assessed_value_last",
                  "land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"):
            v = result.get(k, "")
            if v:
                result[k] = _money_to_int(v)
        for k in ("taxes_paid_last", "taxes_paid_2yr"):
            v = result.get(k, "")
            if v:
                result[k] = _money_to_float(v)

        # ─── Structured groupings for downstream consumers ───────────
        result["current_year"] = {
            "tax_year": result.get("current_tax_year"),
            "land_value": result.get("land_value_current"),
            "bldg_value": result.get("bldg_value_current"),
            "just_value": result.get("just_value_current"),
            "assessed_value": result.get("assessed_value_current"),
        }
        result["last_year"] = {
            "tax_year": result.get("last_tax_year"),
            "land_value": result.get("land_value_last"),
            "bldg_value": result.get("bldg_value_last"),
            "just_value": result.get("just_value_last"),
            "assessed_value": result.get("assessed_value_last"),
            "taxes_paid": result.get("taxes_paid_last"),
        }
        result["two_years_ago"] = {
            "tax_year": result.get("two_years_ago_tax_year"),
            "land_value": result.get("land_value_2yr"),
            "bldg_value": result.get("bldg_value_2yr"),
            "just_value": result.get("just_value_2yr"),
            "assessed_value": result.get("assessed_value_2yr"),
            "taxes_paid": result.get("taxes_paid_2yr"),
        }

        # Tax breakdown: money-looking cells become ints, anything else is
        # kept as cleaned text.
        result["tax_breakdown"] = {}
        for district, fields in district_values.items():
            result["tax_breakdown"][district] = {
                k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v)
                for k, v in fields.items()
            }

        # Sales history cleanup
        result["sales_history"] = []
        for s in sales_history:
            # Normalize key names from possibly varied headers
            norm = {
                "date": _clean(s.get("date", "")),
                "type": _clean(s.get("type", "")),
                "qualified_disqualified": _clean(s.get("qualified/disqualified", s.get("qualified", ""))),
                "price": _money_to_int(s.get("price", "")) if s.get("price") else None,
                "book_page_or_cin": _clean(s.get("book/page or cin", s.get("book/page", ""))),
            }
            if any(norm.values()):
                result["sales_history"].append(norm)

        # Homestead boolean (flag is " , N" or " , Y")
        hf = result.get("homestead_flag", "")
        result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper()
        # Photo: first qualifying image, if any
        result["photo_url"] = photo_urls[0] if photo_urls else None
    except PWTimeout as e:
        result["errors"].append(f"timeout: {e}")
    except Exception as e:
        # Fail-soft boundary: record the failure instead of aborting the pipeline.
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]
    return result
# ════════════════════════════════════════════════════════════════════════════
# Helpers
# ════════════════════════════════════════════════════════════════════════════
def _clean(s: str) -> str:
    """Return *s* with whitespace runs collapsed to one space and ends trimmed.

    Non-string inputs (e.g. ``None`` or numbers) are returned unchanged.
    """
    if isinstance(s, str):
        collapsed = re.sub(r"\s+", " ", s)
        return collapsed.strip()
    return s
def _to_int(s: str) -> Optional[int]:
    """Convert text such as '1969' or '1,199' to an int.

    Everything except digits and '-' is discarded before conversion.
    Returns None for empty input or when the remainder is not a valid integer.
    """
    if not s:
        return None
    digits = re.sub(r"[^\d-]", "", s)
    if not digits:
        return None
    try:
        return int(digits)
    except ValueError:
        return None
def _money_to_int(s: str) -> Optional[int]:
    """Convert a money string such as '$322,580' to a whole number (322580).

    Keeps only digits, '.' and '-', parses the rest as a float and truncates
    toward zero. Returns None for empty input or unparseable text.
    """
    if not s:
        return None
    numeric = re.sub(r"[^\d.-]", "", s)
    if numeric in ("", "-"):
        return None
    try:
        return int(float(numeric))
    except ValueError:
        return None
def _money_to_float(s: str) -> Optional[float]:
    """Convert a money string such as '$5,256.59' to a float (5256.59).

    Keeps only digits, '.' and '-'. Returns None for empty input or when the
    remaining text is not a valid number.
    """
    if not s:
        return None
    numeric = re.sub(r"[^\d.-]", "", s)
    if numeric in ("", "-"):
        return None
    try:
        return float(numeric)
    except ValueError:
        return None
# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Manual CLI: fetch one folio and dump the record as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Broward PA full record fetcher")
    cli.add_argument("parcel_id", help="Folio number (e.g. 484226062150)")
    cli.add_argument("--wait", type=int, default=25, help="SPA settle seconds (default 25)")
    opts = cli.parse_args()

    print(
        json.dumps(
            fetch_broward_pa_record(opts.parcel_id, wait_after_load=opts.wait),
            indent=2,
            default=str,
        )
    )
+820
View File
@@ -0,0 +1,820 @@
"""data_fetchers/pa_duval.py — Full Duval County Property Appraiser extractor.
Sitio: https://paopropertysearch.coj.net (ASP.NET WebForms)
Flow: Search.aspx → Results.aspx → Detail.aspx?ParcelNumber=XXX
Extrae todo lo publico del Duval PA para construir un Property Snapshot Report:
- Owner name(s)
- Property address + subdivision + legal description
- Building: type, year_built, sqft heated/total, bedrooms, bathrooms,
exterior wall, roof type, interior flooring
- Values: just/market, assessed, exemptions (3-year history)
- Tax breakdown por taxing district
- Sales history completa (book/page, date, price, deed type, qualified status)
- Homestead exemption (key signal: owner-occupant vs investor)
- Land details (zoning, lot size, use code)
- Extra features (fireplace, pool, etc.)
USAGE:
from data_fetchers.pa_duval import fetch_duval_pa_record
rec = fetch_duval_pa_record(address="2352 SCENIC VIEW CT", zip_code="32218")
# rec["year_built"], rec["sales_history"], rec["homestead_active"]...
TECHNICAL:
- ASP.NET WebForms con WebForm_DoPostBackWithOptions (compat IE8)
- Element IDs ESTABLES (no autogenerados)
- Per-search latency: ~10-15s (entry → search → results → detail)
- Free (Playwright local, no API cost)
"""
from __future__ import annotations
import re
import time
from datetime import datetime, timezone
from typing import Optional
# Browser-like User-Agent string given to the Playwright browser context.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
# Root of the Duval Property Appraiser search site; detail-page URLs are built from it.
_BASE_URL = "https://paopropertysearch.coj.net"
# Basic search form — the first page fetch_duval_pa_record navigates to.
_SEARCH_URL = f"{_BASE_URL}/Basic/Search.aspx"
# Address parser for the Duval ASP.NET search form fields.
# Accepts long-form (COURT, STREET) and abbreviated (CT, ST) suffixes. The
# caller strips city/state/zip first (everything after the first comma).
#
# Groups: num (house number), prefix (optional directional), name, suffix.
# - The name may start with a digit so numbered streets ("5TH AVE",
#   "103RD ST") parse instead of being rejected.
# - The name is matched greedily so the LAST suffix-like word is taken as the
#   suffix: "OLD ST AUGUSTINE RD" → name "OLD ST AUGUSTINE", suffix "RD"
#   (a lazy match stopped at the first one and produced name "OLD", suffix "ST").
_ADDRESS_RE = re.compile(
    r"^\s*(?P<num>\d+)\s+"
    r"(?:(?P<prefix>N|S|E|W|NE|NW|SE|SW)\s+)?"
    r"(?P<name>[A-Z\d][A-Z\s\d\-']*)"
    r"\s+(?P<suffix>"
    r"ST|STREET|AVE|AV|AVENUE|RD|ROAD|DR|DRIVE|CT|COURT|CIR|CIRCLE|"
    r"LN|LANE|BLVD|BOULEVARD|HWY|HIGHWAY|WAY|PL|PLACE|PKY|PKWY|PARKWAY|"
    r"TRL|TRAIL|TER|TERRACE|LOOP|RUN|ALY|ALLEY|XING|CROSSING"
    r")\b",
    re.IGNORECASE,
)
# Map long-form / variant street suffixes → the abbreviation passed as the
# ASP.NET ddStreetSuffix option value. Suffixes not listed here (already
# abbreviated ones such as "ST" or "WAY") are passed through unchanged by
# _parse_address.
_SUFFIX_NORMALIZE = {
    "STREET": "ST", "AVENUE": "AVE", "AV": "AVE", "ROAD": "RD",
    "DRIVE": "DR", "COURT": "CT", "CIRCLE": "CIR", "LANE": "LN",
    "BOULEVARD": "BLVD", "HIGHWAY": "HWY", "PLACE": "PL",
    "PARKWAY": "PKWY", "PKY": "PKWY", "TRAIL": "TRL", "TERRACE": "TER",
    "ALLEY": "ALY", "CROSSING": "XING",
}
# ════════════════════════════════════════════════════════════════════════════
# Field ID mapping (confirmed via probe on 2352 SCENIC VIEW CT)
# ════════════════════════════════════════════════════════════════════════════
# Result key -> DOM element id on the Duval Detail.aspx page.
# fetch_duval_pa_record sends this mapping to the page in a single evaluate
# call and reads each element's trimmed textContent ('' when absent).
# The "repeaterBuilding_ctl00" / "repeaterOwnerInformation_ctl00" ids address
# the first item of each repeater — presumably the first building / first
# owner only; TODO confirm for multi-building parcels.
_SCALAR_IDS = {
    "owner_name": "ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
    "site_address_line1": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine1",
    "site_address_line2": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine2",
    "building_type": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingType",
    "year_built": "ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
    "building_value": "ctl00_cphBody_repeaterBuilding_ctl00_lblBldgValue",
    # Tax values current + last year
    "tax_last_year_just": "ctl00_cphBody_lblTaxLastYearJustValue",
    "tax_last_year_assessed": "ctl00_cphBody_lblTaxLastYearAssessedValue",
    "tax_last_year_exemptions": "ctl00_cphBody_lblTaxLastYearExemptions",
    "tax_last_year_taxable": "ctl00_cphBody_lblTaxLastYearTaxableValue",
    "tax_current_year_just": "ctl00_cphBody_lblTaxCurrentYearJustValue",
    "tax_current_year_assessed": "ctl00_cphBody_lblTaxCurrentYearAssessedValue",
    "tax_current_year_exemptions": "ctl00_cphBody_lblTaxCurrentYearExemptions",
    "tax_current_year_taxable": "ctl00_cphBody_lblTaxCurrentYearTaxableValue",
    # Values from main values table (no current "in progress" year)
    "assessed_value_3": "ctl00_cphBody_lblAssessedValue3",
    "taxable_value_school": "ctl00_cphBody_lblTaxableValueSchool",
}
def _parse_address(address: str) -> Optional[dict]:
    """Split a street address into the fields of the Duval search form.

    Only the text before the first comma is parsed (city/state/zip are
    dropped) and it is upper-cased before matching against ``_ADDRESS_RE``.

    Accepted shapes:
        '2352 SCENIC VIEW CT'                             → plain
        '2352 SCENIC VIEW Court, Jacksonville, FL 32218'  → with city/state/zip
        '123 N MAIN ST, Anytown, FL'                      → with directional prefix

    Returns:
        ``{"street_num", "prefix", "name", "suffix"}`` where ``prefix`` is
        None when absent and ``suffix`` is normalized via
        ``_SUFFIX_NORMALIZE``; None when the address is empty or does not match.
    """
    if not address:
        return None

    # Keep only the street line (everything before the first comma).
    street_line, _, _ = address.partition(",")
    match = _ADDRESS_RE.search(street_line.strip().upper())
    if match is None:
        return None

    raw_prefix = (match.group("prefix") or "").strip().upper()
    raw_suffix = (match.group("suffix") or "").strip().upper()
    if raw_suffix:
        suffix = _SUFFIX_NORMALIZE.get(raw_suffix, raw_suffix)
    else:
        suffix = None

    return {
        "street_num": match.group("num"),
        "prefix": raw_prefix or None,
        "name": match.group("name").strip(),
        "suffix": suffix,
    }
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def fetch_duval_pa_record(
address: Optional[str] = None,
parcel_id: Optional[str] = None,
zip_code: Optional[str] = None,
timeout_seconds: int = 30,
listing_price: Optional[float] = None,
) -> dict:
"""Fetch full Duval PA record by address OR parcel_id (RE#).
Args:
address: street address (e.g. "2352 SCENIC VIEW CT")
parcel_id: Duval RE# (e.g. "044273-0370") — preferred si lo tenes
zip_code: optional zip filter
timeout_seconds: max wait per Playwright op
Returns:
Dict con TODOS los campos publicos. Si fallo, dict tiene 'errors'.
Key fields:
- owner_name, owner_full_address (mailing)
- site_address, parcel_id (RE#), subdivision
- year_built (ENTERO), building_type, sqft_heated, sqft_gross,
sqft_garage, bedrooms, baths, stories
- exterior_wall, roof_struct, roofing_cover, interior_wall, int_flooring
- just_value_current, assessed_value_current, taxable_current,
exemption_current
- just_value_last, assessed_value_last, taxes_billed_last
- homestead_active (bool — exemptions >= $25,000 = homestead)
- sales_history: [{date, price, deed_type, qualified, book_page,
vacant_improved}, ...]
- extra_features: [{code, description, value}, ...]
- land: {zoning, use_code, lot_acres, lot_total_sqft}
- legal_description: str
- tax_breakdown: [{district, assessed, exempt, taxable, tax_amt}, ...]
- source_url: str (detail page URL)
- fetched_at: ISO timestamp
"""
fetched_at = datetime.now(timezone.utc).isoformat()
result = {
"county": "Duval",
"source": "Duval Property Appraiser (paopropertysearch.coj.net)",
"fetched_at": fetched_at,
"errors": [],
}
if not address and not parcel_id:
result["errors"].append("no address or parcel_id provided")
return result
try:
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
except ImportError:
result["errors"].append("playwright not installed")
return result
parsed_addr = _parse_address(address) if address else None
if address and not parsed_addr:
result["errors"].append(f"could not parse address '{address}' (need format: 'NUM [PREFIX] NAME SUFFIX')")
return result
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
ctx = browser.new_context(user_agent=USER_AGENT)
page = ctx.new_page()
page.set_default_timeout(timeout_seconds * 1000)
page.goto(_SEARCH_URL, wait_until="load", timeout=timeout_seconds * 1000)
time.sleep(2)
if parcel_id:
pid_clean = parcel_id.replace("-", "").strip()
detail_url = f"{_BASE_URL}/Basic/Detail.aspx?RE={pid_clean}"
try:
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
except Exception:
# If 'load' times out, fall back to 'commit' (page has navigated)
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
time.sleep(5)
else:
# Search by address fields
page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed_addr["street_num"])
if parsed_addr["prefix"]:
try:
page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(value=parsed_addr["prefix"])
except Exception:
pass
page.locator("#ctl00_cphBody_tbStreetName").fill(parsed_addr["name"])
if parsed_addr["suffix"]:
try:
page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(value=parsed_addr["suffix"])
except Exception:
pass
if zip_code:
try:
page.locator("#ctl00_cphBody_tbZipCode").fill(zip_code)
except Exception:
pass
page.locator("#ctl00_cphBody_bSearch").click()
page.wait_for_timeout(4000)
# If results table → extract href from first row link and
# navigate directly (Playwright click + navigation hangs on ASP.NET)
rs_url = page.url
if "Results.aspx" in rs_url:
detail_href = _extract_detail_href_with_retry(page, max_retries=2)
if not detail_href:
body_preview = page.inner_text("body")[:300].replace("\n", " ")
result["errors"].append(
f"results page returned but no Detail.aspx link found "
f"(url={page.url}, body_preview={body_preview!r})"
)
browser.close()
return result
# Build absolute URL and navigate directly (no click)
if detail_href.startswith("/"):
detail_url = f"{_BASE_URL}{detail_href}"
elif detail_href.startswith("http"):
detail_url = detail_href
else:
detail_url = f"{_BASE_URL}/Basic/{detail_href}"
try:
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
except Exception:
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
time.sleep(5)
elif "Detail.aspx" not in page.url:
result["errors"].append(f"unexpected URL after search: {page.url}")
browser.close()
return result
# We should now be on Detail.aspx
if "Detail.aspx" not in page.url:
result["errors"].append(f"failed to reach Detail page, URL: {page.url}")
browser.close()
return result
result["source_url"] = page.url
# Wait for KEY element to confirm full render before extracting.
# Retry once on server error (Duval intermittent 500s).
render_ok = False
for attempt in range(3):
try:
page.wait_for_selector(
"#ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
state="attached",
timeout=20000,
)
render_ok = True
break
except Exception:
# Try owner sentinel
try:
page.wait_for_selector(
"#ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
state="attached",
timeout=10000,
)
render_ok = True
break
except Exception:
# Check if server error → retry
body = page.inner_text("body")[:500]
if "Server Error" in body or "Runtime Error" in body:
if attempt < 2:
time.sleep(8 * (attempt + 1))
try:
page.reload(wait_until="load", timeout=30000)
except Exception:
pass
continue
break
if not render_ok:
result["errors"].append("detail page didn't render expected elements (server slow or error)")
# Extract all scalar fields in one JS call
scalars = page.evaluate(
"""(ids) => {
const out = {};
for (const [k, id] of Object.entries(ids)) {
const el = document.getElementById(id);
out[k] = el ? (el.textContent || '').trim() : '';
}
return out;
}""",
_SCALAR_IDS,
)
# Extract sales history
sales_history = page.evaluate("""
() => {
const out = [];
document.querySelectorAll('table').forEach((tbl) => {
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
if (!hdrCells || hdrCells.length < 4) return;
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
const isSalesHdr = hdrText.some(h => h.includes('sale date') || h.includes('book/page') || h.includes('deed instrument'));
if (!isSalesHdr) return;
const rows = tbl.querySelectorAll('tr');
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 4) continue;
const r = {};
cells.forEach((c, idx) => {
const h = hdrText[idx] || `col${idx}`;
r[h] = (c.textContent || '').trim();
});
if (Object.values(r).some(v => v && v.length > 0)) out.push(r);
}
});
return out;
}
""")
# Extract building area
building_area = page.evaluate("""
() => {
const out = {};
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingArea');
if (!grid) return out;
const rows = grid.querySelectorAll('tr');
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 4) continue;
const type = (cells[0].textContent || '').trim();
const gross = (cells[1].textContent || '').trim();
const heated = (cells[2].textContent || '').trim();
const effective = (cells[3].textContent || '').trim();
if (type) out[type] = { gross, heated, effective };
}
return out;
}
""")
# Extract building attributes (beds, baths, stories)
attrs = page.evaluate("""
() => {
const out = {};
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingAttributes');
if (!grid) return out;
const rows = grid.querySelectorAll('tr');
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 2) continue;
const element = (cells[0].textContent || '').trim();
const code = (cells[1].textContent || '').trim();
if (element) out[element] = code;
}
return out;
}
""")
# Extract building structural elements (roof, walls, flooring)
# NOTE: these come from the same building section, different grid
structural = page.evaluate("""
() => {
const out = {};
// Find any grid in building section with Element/Code/Detail headers
document.querySelectorAll('table').forEach((tbl) => {
const hdrs = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
if (!hdrs) return;
const ht = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
if (!(ht.includes('element') && ht.includes('code') && ht.includes('detail'))) return;
// Skip the simpler attributes table (only 3 fields)
const rows = tbl.querySelectorAll('tr');
if (rows.length < 4) return;
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 3) continue;
const element = (cells[0].textContent || '').trim();
const detail = (cells[2].textContent || '').trim();
if (element && detail) {
if (out[element]) {
out[element] += '; ' + detail;
} else {
out[element] = detail;
}
}
}
});
return out;
}
""")
# Extract main property identity (RE#, subdivision, etc.) from top table
top_props = page.evaluate("""
() => {
const out = {};
document.querySelectorAll('table').forEach((tbl) => {
const rows = tbl.querySelectorAll('tr');
if (rows.length < 3) return;
// Top table has key:value rows (2 cells per row)
// Heuristic: first cell ends with ':' or matches known labels
const knownLabels = ['re #','re#','tax district','property use',
'# of buildings','legal desc','subdivision','total area'];
let matchCount = 0;
const candidate = {};
for (const tr of rows) {
const cells = tr.querySelectorAll('td, th');
if (cells.length !== 2) continue;
const k = (cells[0].textContent || '').trim().toLowerCase().replace(/:$/, '');
const v = (cells[1].textContent || '').trim();
if (k && v && knownLabels.some(kw => k.startsWith(kw))) {
matchCount++;
candidate[k] = v;
}
}
if (matchCount >= 3) {
Object.assign(out, candidate);
}
});
return out;
}
""")
# Land details
land = page.evaluate("""
() => {
const out = {};
const grid = document.getElementById('ctl00_cphBody_gridLand');
if (!grid) return out;
const rows = grid.querySelectorAll('tr');
if (rows.length < 2) return out;
const hdrs = rows[0].querySelectorAll('th, td');
const hdrText = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
const dataRow = rows[1].querySelectorAll('td');
hdrText.forEach((h, i) => {
if (dataRow[i]) out[h] = (dataRow[i].textContent || '').trim();
});
return out;
}
""")
# Extra features (fireplace, pool, deck, etc.)
features = page.evaluate("""
() => {
const out = [];
const grid = document.getElementById('ctl00_cphBody_gridExtraFeatures');
if (!grid) return out;
const rows = grid.querySelectorAll('tr');
for (let i = 1; i < rows.length; i++) {
const cells = rows[i].querySelectorAll('td');
if (cells.length < 5) continue;
out.push({
code: (cells[1]?.textContent || '').trim(),
description: (cells[2]?.textContent || '').trim(),
units: (cells[6]?.textContent || '').trim(),
value: (cells[7]?.textContent || '').trim(),
});
}
return out;
}
""")
browser.close()
# ─── Post-process ─────────────────────────────────────────────
result.update({k: _clean(v) for k, v in scalars.items()})
# Numeric conversions
for k in ("year_built",):
v = result.get(k, "")
if v:
result[k] = _to_int(v)
for k in ("building_value", "tax_last_year_just", "tax_last_year_assessed",
"tax_last_year_exemptions", "tax_last_year_taxable",
"tax_current_year_just", "tax_current_year_assessed",
"tax_current_year_exemptions", "tax_current_year_taxable",
"assessed_value_3", "taxable_value_school"):
v = result.get(k, "")
if v:
result[k] = _money_to_int(v)
# Parcel id / subdivision / etc from top props
result["parcel_id"] = top_props.get("re #", "") or top_props.get("re#", "")
result["tax_district"] = top_props.get("tax district", "")
result["property_use"] = top_props.get("property use", "")
result["num_buildings"] = top_props.get("# of buildings", "")
result["subdivision"] = top_props.get("subdivision", "")
result["lot_total_sqft"] = _to_int(top_props.get("total area", "") or "0")
# Building area summary
result["building_area_grid"] = building_area
result["sqft_heated"] = _to_int(
(building_area.get("Base Area") or {}).get("heated", "0") or
(building_area.get("Total") or {}).get("heated", "0") or "0"
)
result["sqft_gross"] = _to_int(
(building_area.get("Total") or {}).get("gross", "0") or "0"
)
result["sqft_garage"] = _to_int(
(building_area.get("Finished Garage") or {}).get("gross", "0") or "0"
)
# Attributes: beds/baths/stories
def _attr_to_num(s):
if not s:
return None
try:
return float(s.split(".")[0]) if "." in s else float(s)
except (ValueError, TypeError):
return None
result["bedrooms"] = _attr_to_num(attrs.get("Bedrooms", ""))
result["baths"] = _attr_to_num(attrs.get("Baths", ""))
result["stories"] = _attr_to_num(attrs.get("Stories", ""))
result["units"] = _attr_to_num(attrs.get("Rooms / Units", ""))
# Structural elements
result["exterior_wall"] = structural.get("Exterior Wall", "")
result["roof_struct"] = structural.get("Roof Struct", "")
result["roofing_cover"] = structural.get("Roofing Cover", "")
result["interior_wall"] = structural.get("Interior Wall", "")
result["int_flooring"] = structural.get("Int Flooring", "")
# Sales history normalized
result["sales_history"] = []
for s in sales_history:
record = {
"book_page": _clean(s.get("book/page", "")),
"date": _clean(s.get("sale date", "")),
"price": _money_to_int(s.get("sale price", "") or "0"),
"deed_type": _clean(s.get("deed instrument type code", "") or s.get("deed type", "")),
"qualified": _clean(s.get("qualified/unqualified", "") or s.get("qualified", "")),
"vacant_improved": _clean(s.get("vacant/improved", "")),
}
if any(record.values()):
result["sales_history"].append(record)
# Land details
result["land"] = {
"use_code": land.get("code", ""),
"use_description": land.get("use description", ""),
"zoning": land.get("zoning assessment", ""),
"front": land.get("front", ""),
"depth": land.get("depth", ""),
"land_units": land.get("land units", ""),
"land_type": land.get("land type", ""),
"land_value": _money_to_int(land.get("land value", "") or "0"),
}
# Extra features (fireplace, pool, etc.)
result["extra_features"] = features
# Homestead detection: exemptions >= $25K = primary residence with HX
ex_last = result.get("tax_last_year_exemptions") or 0
ex_curr = result.get("tax_current_year_exemptions") or 0
result["homestead_active"] = (ex_last >= 25000) or (ex_curr >= 25000)
result["homestead_amount_current"] = ex_curr
result["homestead_amount_last"] = ex_last
# Convenience: most recent qualified sale price
qualified_sales = [s for s in result["sales_history"]
if s.get("qualified", "").lower().startswith("qualified")
and s.get("price", 0) and s["price"] >= 1000]
result["most_recent_qualified_sale"] = qualified_sales[0] if qualified_sales else None
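# Effective renovation signal (computed by _detect_renovation_pattern):
# a historical flip is flagged when the most recent qualified sale is >=25%
# above the previous qualified sale and the two are <=30 months apart; when a
# listing price is given, a flip-in-progress is flagged when the listing is
# >=15% above a qualified sale made within the last 18 months.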
renov_signal = _detect_renovation_pattern(
result["sales_history"], listing_price=listing_price,
)
result["renovation_signal"] = renov_signal
except PWTimeout as e:
result["errors"].append(f"timeout: {e}")
except Exception as e:
import traceback
result["errors"].append(f"{type(e).__name__}: {e}")
result["_trace"] = traceback.format_exc()[:600]
return result
# ════════════════════════════════════════════════════════════════════════════
# Helpers — server retry / detail link extraction
# ════════════════════════════════════════════════════════════════════════════
def _extract_detail_href_with_retry(page, max_retries: int = 2) -> Optional[str]:
    """Return the first ``Detail.aspx`` href found on the results page.

    Each attempt sleeps 3 seconds, waits (best effort) for a Detail link to be
    attached, and reads the first matching ``href``. When no link is present
    and the first 500 characters of the body look like a server error page,
    the page is reloaded after a linear backoff (5s, 10s, ...) and the lookup
    is retried. The original author notes that Duval PA intermittently answers
    with 500 errors ("wait operation timed out") when rate-limited.

    Args:
        page: Playwright page currently showing the search results.
        max_retries: Number of reloads allowed after the first attempt.

    Returns:
        The raw ``href`` attribute value, or None when no Detail link exists
        (or the retries were exhausted).
    """
    error_markers = ("Server Error", "wait operation timed out", "Runtime Error")
    attempt = 0
    while attempt <= max_retries:
        # Give the results page time to render before probing it.
        time.sleep(3)
        try:
            page.wait_for_selector("a[href*='Detail.aspx']", state="attached", timeout=15000)
        except Exception:
            # Best effort: a timeout here is diagnosed below via the body text.
            pass
        detail_href = page.evaluate("""
            () => {
                const links = document.querySelectorAll("a[href*='Detail.aspx']");
                return links.length > 0 ? links[0].getAttribute('href') : null;
            }
        """)
        if detail_href:
            return detail_href
        page_head = page.inner_text("body")[:500]
        looks_like_server_error = any(marker in page_head for marker in error_markers)
        if not looks_like_server_error or attempt >= max_retries:
            # Either the link genuinely is not there, or we are out of retries.
            return None
        # Server hiccup: back off, reload the search and try again.
        time.sleep(5 * (attempt + 1))
        try:
            page.reload(wait_until="load", timeout=30000)
        except Exception:
            pass
        attempt += 1
    return None
# ════════════════════════════════════════════════════════════════════════════
# Helpers
# ════════════════════════════════════════════════════════════════════════════
def _clean(s) -> str:
    """Collapse every whitespace run in *s* to one space and trim the ends.

    Non-string values (None, numbers, ...) are returned unchanged.
    """
    if isinstance(s, str):
        collapsed = re.sub(r"\s+", " ", s)
        return collapsed.strip()
    return s
def _to_int(s) -> Optional[int]:
    """Parse an integer out of scraped text such as ``"1,234"`` or ``"1985"``.

    Thousands separators, units and other non-digit characters are ignored.
    A decimal fraction is truncated rather than concatenated, so
    ``"1,234.56"`` yields ``1234`` (not ``123456``).

    Args:
        s: Raw value (usually a string); any falsy value yields None.

    Returns:
        The parsed integer, or None when no integer can be extracted.
    """
    if not s:
        return None
    # Drop ".digits" that directly follow a digit BEFORE stripping non-digits;
    # otherwise the fraction would be glued onto the integer part.
    without_fraction = re.sub(r"(?<=\d)\.\d+", "", str(s))
    cleaned = re.sub(r"[^\d-]", "", without_fraction)
    try:
        return int(cleaned) if cleaned else None
    except ValueError:
        # e.g. "044273-0370": a hyphen in the middle is not a valid integer.
        return None
def _money_to_int(s) -> Optional[int]:
    """Convert a currency string such as ``"$1,234.56"`` to whole dollars.

    Everything except digits, ``.`` and ``-`` is discarded; the remainder is
    parsed as a float and truncated toward zero. Falsy input, an empty
    remainder, a lone ``-`` or an unparsable remainder yield None.
    """
    if not s:
        return None
    numeric = re.sub(r"[^\d.-]", "", str(s))
    if numeric in ("", "-"):
        return None
    try:
        amount = float(numeric)
    except ValueError:
        return None
    return int(amount)
def _detect_renovation_pattern(sales: list[dict], listing_price: Optional[float] = None) -> dict:
    """Heuristic: detect flip / renovation / flip-in-progress patterns.

    Only sales whose ``"qualified"`` text starts with "qualified"
    (case-insensitive) and whose price is at least $1,000 are considered.

    Pattern A (historical flip): the most recent qualified sale is >=25% above
    the previous qualified sale and the two are <=30 months apart.
    Pattern B (flip in progress): ``listing_price`` is >=15% above the most
    recent qualified sale and that sale happened within the last 18 months.

    Args:
        sales: Sales history, most recent first. Records are read through the
            keys ``"qualified"``, ``"price"`` and ``"date"`` (``MM/DD/YYYY``).
        listing_price: Optional current listing price; enables Pattern B.

    Returns:
        dict with the keys ``is_flip_pattern``, ``is_flip_in_progress``,
        ``evidence``, ``most_recent_qualified``, ``prior_qualified``,
        ``value_increase_pct``, ``months_between``, ``listing_premium_pct``,
        ``months_since_recent_sale`` and ``interpretation_es``. Fields that
        cannot be computed keep their defaults (False / "" / None).
    """
    out = {
        "is_flip_pattern": False,
        "is_flip_in_progress": False,
        "evidence": "",
        "most_recent_qualified": None,
        "prior_qualified": None,
        "value_increase_pct": None,
        "months_between": None,
        "listing_premium_pct": None,
        "months_since_recent_sale": None,
        "interpretation_es": None,
    }

    def _parse_sale_date(record: dict) -> Optional[datetime]:
        # A missing or malformed date disables the month math instead of raising.
        try:
            return datetime.strptime(record["date"], "%m/%d/%Y")
        except (ValueError, TypeError, KeyError):
            return None

    # `or ""` tolerates records whose "qualified" value is None.
    qualified = [s for s in sales
                 if (s.get("qualified") or "").lower().startswith("qualified")
                 and s.get("price", 0) and s["price"] >= 1000]
    if not qualified:
        return out
    recent = qualified[0]
    out["most_recent_qualified"] = recent

    # ─── Pattern A: historical flip (prior qualified → recent qualified) ─────
    if len(qualified) >= 2:
        prior = qualified[1]
        out["prior_qualified"] = prior
        try:
            increase = (recent["price"] - prior["price"]) / prior["price"] * 100
            out["value_increase_pct"] = round(increase, 1)
        except (TypeError, ZeroDivisionError):
            pass
        recent_date = _parse_sale_date(recent)
        prior_date = _parse_sale_date(prior)
        if recent_date is not None and prior_date is not None:
            # Approximate months as 30-day buckets.
            out["months_between"] = abs((recent_date - prior_date).days) // 30
        increase_pct = out["value_increase_pct"]
        months_apart = out["months_between"]
        # Compare against None, not truthiness: two sales less than 30 days
        # apart give months_between == 0, which is the fastest possible flip
        # and must not be skipped.
        if (increase_pct is not None and months_apart is not None
                and increase_pct >= 25 and months_apart <= 30):
            out["is_flip_pattern"] = True
            out["evidence"] = (
                f"+{out['value_increase_pct']}% in {out['months_between']} months "
                f"({prior['date']} ${prior['price']:,} -> {recent['date']} ${recent['price']:,})"
            )

    # ─── Pattern B: FLIP-IN-PROGRESS (recent qualified << current listing) ──
    if listing_price and listing_price > 0 and recent.get("price", 0) > 0:
        try:
            premium = (listing_price - recent["price"]) / recent["price"] * 100
            out["listing_premium_pct"] = round(premium, 1)
        except (TypeError, ZeroDivisionError):
            pass
        recent_date = _parse_sale_date(recent)
        if recent_date is not None:
            out["months_since_recent_sale"] = abs((datetime.now() - recent_date).days) // 30
        premium_pct = out["listing_premium_pct"]
        months_since = out["months_since_recent_sale"]
        # Recent qualified sale is 15%+ below the listing AND happened within
        # the last 18 months (typical flip turnaround).
        if (premium_pct is not None and premium_pct >= 15
                and months_since is not None and months_since <= 18):
            out["is_flip_in_progress"] = True
            if out["evidence"]:
                out["evidence"] += " | "
            out["evidence"] += (
                f"FLIP-IN-PROGRESS: owner bought ${recent['price']:,} on {recent['date']} "
                f"({out['months_since_recent_sale']}mo ago), listing ${listing_price:,.0f} "
                f"(+{out['listing_premium_pct']}%)"
            )

    # ─── Spanish interpretation (user-facing text, kept verbatim) ───────────
    if out["is_flip_in_progress"] and out["is_flip_pattern"]:
        out["interpretation_es"] = (
            "PATRON DE FLIP REPETIDO: la propiedad ya fue flipped una vez en el "
            "historial. El owner actual la compro reciente y la lista mucho mas "
            "alto. Probable renovacion reciente -> precio refleja inversion. "
            "Si comprador final, esperate negociacion dura del owner (necesita "
            "recuperar costos de rehab + margen)."
        )
    elif out["is_flip_in_progress"]:
        out["interpretation_es"] = (
            f"FLIP-IN-PROGRESS: el owner compro hace {out['months_since_recent_sale']}mo "
            f"a ${recent['price']:,} y lista a ${listing_price:,.0f} (+{out['listing_premium_pct']:.0f}%). "
            "Probable renovacion en el medio. Precio incluye trabajo. Negociar dificil — "
            "owner tiene 'sunk cost' del rehab. Validar condicion real con inspeccion."
        )
    elif out["is_flip_pattern"]:
        out["interpretation_es"] = (
            f"HISTORIAL DE FLIP: la propiedad subio +{out['value_increase_pct']}% en "
            f"{out['months_between']}mo (sale prior). Indica renovacion previa."
        )
    return out
# ════════════════════════════════════════════════════════════════════════════
# CLI
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Command-line entry point: fetch one Duval PA record and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Duval PA full record fetcher")
    for flag, help_text in (
        ("--address", "Street address (e.g. '2352 SCENIC VIEW CT')"),
        ("--parcel", "RE# (e.g. '044273-0370')"),
        ("--zip", "Optional ZIP filter"),
    ):
        cli.add_argument(flag, help=help_text)
    opts = cli.parse_args()
    # At least one way of identifying the property is mandatory.
    if not (opts.address or opts.parcel):
        cli.error("--address or --parcel required")
    record = fetch_duval_pa_record(
        address=opts.address,
        parcel_id=opts.parcel,
        zip_code=opts.zip,
    )
    # default=str stringifies any value json cannot encode natively.
    print(json.dumps(record, indent=2, default=str))
+404
View File
@@ -0,0 +1,404 @@
"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor.
Sitio: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI)
Deep link: /PropertySearch/#/?folio={folio_no_dashes}
Extrae todo lo publico del Miami-Dade PA via los components Angular:
- pa-propertyinformation: folio, sub-division, address, owner, mailing,
PA primary zone, primary land use, beds/baths/half, floors, living units,
living area, adjusted area, lot size, year built
- pa-salesinformation: sales history (date, price, OR book-page, qualification,
previous owner)
- pa-assessmentinformation: land/building/extra/market/assessed 3 anios
- pa-taxablevalueinformation: COUNTY/SCHOOL/etc exemption + taxable
- pa-benefitsinformation: homestead + other exemptions
- pa-legaldescription: legal description completa
USAGE:
from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470")
# rec["owner_name"], rec["year_built"], rec["sales_history"]...
"""
from __future__ import annotations
import re
import time
from datetime import datetime, timezone
from typing import Optional
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
_BASE_URL = "https://apps.miamidadepa.gov/PropertySearch"
# ════════════════════════════════════════════════════════════════════════════
# Text parsing helpers — labels are on left, values on right (newline separated)
# ════════════════════════════════════════════════════════════════════════════
def _grab_after_label(text: str, label: str) -> Optional[str]:
    """Return the value that follows *label* in *text*, or None.

    The label is matched literally and case-insensitively. The value is the
    rest of the label's line (after an optional ``:`` / tab separator) or,
    when nothing but whitespace follows the label on its line, the next
    non-blank line. The returned value is stripped.
    """
    if not (text and label):
        return None
    found = re.search(
        rf"{re.escape(label)}\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)",
        text,
        re.IGNORECASE,
    )
    return found.group(1).strip() if found else None
def _to_int(s) -> Optional[int]:
    """Parse an integer out of scraped text such as ``"1,846 Sq.Ft"``.

    Thousands separators, units and other non-digit characters are ignored.
    A decimal fraction is truncated rather than concatenated, so
    ``"10,890.5 Sq.Ft"`` yields ``10890`` (not ``108905``).

    Args:
        s: Raw value (usually a string); any falsy value yields None.

    Returns:
        The parsed integer, or None when no integer can be extracted.
    """
    if not s:
        return None
    # Drop ".digits" that directly follow a digit BEFORE stripping non-digits;
    # otherwise the fraction would be glued onto the integer part. The "." in
    # "Sq.Ft" is untouched because it does not follow a digit.
    without_fraction = re.sub(r"(?<=\d)\.\d+", "", str(s))
    cleaned = re.sub(r"[^\d-]", "", without_fraction)
    try:
        return int(cleaned) if cleaned else None
    except ValueError:
        # A hyphen anywhere but the front is not a valid integer.
        return None
def _money_to_int(s) -> Optional[int]:
    """Turn a dollar amount like ``"$350,000"`` into an int (truncating cents).

    Returns None for falsy input and for text that leaves nothing parsable
    once everything except digits, ``.`` and ``-`` has been removed.
    """
    if not s:
        return None
    stripped = re.sub(r"[^\d.-]", "", str(s))
    has_number = bool(stripped) and stripped != "-"
    if not has_number:
        return None
    try:
        return int(float(stripped))
    except ValueError:
        return None
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def _md_sale_is_qualified(qualification: str) -> bool:
    """Return True when a sale's qualification text denotes a qualified sale."""
    text = (qualification or "").lower()
    # Negated forms are rejected first: "unqualified", "disqualified" and
    # "not qualified" all contain the substring "qual".
    if re.search(r"\b(?:dis|un|non)[-\s]?qual|\bnot\s+qual", text):
        return False
    # Anchor at a word start so unrelated words such as "equal" do not count.
    return re.search(r"\bqual", text) is not None


def fetch_miami_dade_pa_record(
    parcel_id: Optional[str] = None,
    address: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch the Miami-Dade Property Appraiser record for one parcel.

    Drives a headless Chromium through Playwright: opens the folio deep link
    (or performs an address search), waits for the ``pa-propertyinformation``
    component to render, extracts the text/tables of the ``pa-*`` components
    and parses them into a flat dict.

    Never raises: failures are reported in ``result["errors"]`` (and, for
    unexpected exceptions, a truncated traceback in ``result["_trace"]``).

    Args:
        parcel_id: Folio number, with or without dashes
            (e.g. "31-2202-034-2470" or "3122020342470"). Takes precedence
            over ``address`` when both are given.
        address: Alternative search by address (the original author notes it
            is less reliable in this portal).
        timeout_seconds: Default Playwright timeout per operation. The wait
            for the detail page to render uses its own fixed 20 s timeout.
        listing_price: Optional listing price, forwarded to the renovation
            heuristic to enable flip-in-progress detection.

    Returns:
        dict that always contains ``county``, ``source``, ``fetched_at`` and
        ``errors``; on success also the parsed property, sales, assessment,
        homestead, legal-description and renovation-signal fields plus the
        raw section texts.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Miami-Dade",
        "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)",
        "fetched_at": fetched_at,
        "errors": [],
    }
    if not parcel_id and not address:
        result["errors"].append("no parcel_id or address provided")
        return result
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        result["errors"].append("playwright not installed")
        return result
    # Normalize folio (no dashes for URL)
    folio_clean = (parcel_id or "").replace("-", "").strip()
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            ctx = browser.new_context(user_agent=USER_AGENT)
            page = ctx.new_page()
            page.set_default_timeout(timeout_seconds * 1000)
            if folio_clean:
                # Deep link by folio
                url = f"{_BASE_URL}/#/?folio={folio_clean}"
                page.goto(url, wait_until="domcontentloaded")
            else:
                # Search by address — landing page + fill form
                page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded")
                time.sleep(5)
                # Address tab is default. Fill kendo-textbox[formcontrolname='address']
                addr_input = page.locator("kendo-textbox[formcontrolname='address'] input").first
                addr_input.fill(address or "")
                page.locator("button[aria-label='Search button']").first.click()
            # Wait for property info to render
            try:
                page.wait_for_function(
                    "() => document.querySelector('pa-propertyinformation') "
                    "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')",
                    timeout=20000,
                )
            except Exception as e:
                result["errors"].append(f"detail page didn't render: {e}")
                browser.close()
                return result
            # Short grace period for the remaining components to fill in.
            time.sleep(2)
            result["source_url"] = page.url
            # Extract text from each pa-component
            sections = page.evaluate("""
                () => {
                    const out = {};
                    const components = [
                        'pa-propertyinformation','pa-salesinformation',
                        'pa-assessmentinformation','pa-taxablevalueinformation',
                        'pa-benefitsinformation','pa-legaldescription',
                        'pa-additionalinformation',
                    ];
                    for (const tag of components) {
                        const el = document.querySelector(tag);
                        out[tag] = el ? (el.innerText || '').trim() : '';
                    }
                    return out;
                }
            """)
            # Also extract sales history table rows
            sales_rows = page.evaluate("""
                () => {
                    const out = [];
                    const sec = document.querySelector('pa-salesinformation');
                    if (!sec) return out;
                    const tbl = sec.querySelector('table');
                    if (!tbl) return out;
                    const rows = tbl.querySelectorAll('tr');
                    for (let i = 1; i < rows.length; i++) {
                        const cells = rows[i].querySelectorAll('td');
                        if (cells.length < 4) continue;
                        out.push({
                            date: (cells[0]?.textContent || '').trim(),
                            price: (cells[1]?.textContent || '').trim(),
                            book_page: (cells[2]?.textContent || '').trim(),
                            qualification: (cells[3]?.textContent || '').trim(),
                            previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '',
                        });
                    }
                    return out;
                }
            """)
            # Extract assessment table (3 years)
            # Header row: find the row whose first cell text is "Year".
            assessment_rows = page.evaluate("""
                () => {
                    const out = {};
                    const sec = document.querySelector('pa-assessmentinformation');
                    if (!sec) return out;
                    const tables = sec.querySelectorAll('table');
                    if (tables.length === 0) return out;
                    // Find header row in any table
                    let years = [];
                    let headerRowIdx = -1;
                    let chosenTbl = null;
                    for (const tbl of tables) {
                        const rows = tbl.querySelectorAll('tr');
                        for (let i = 0; i < rows.length; i++) {
                            const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase();
                            if (firstCell === 'year') {
                                const headerCells = rows[i].querySelectorAll('th, td');
                                years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1);
                                headerRowIdx = i;
                                chosenTbl = tbl;
                                break;
                            }
                        }
                        if (chosenTbl) break;
                    }
                    if (!chosenTbl || years.length === 0) return out;
                    const rows = chosenTbl.querySelectorAll('tr');
                    for (let i = headerRowIdx + 1; i < rows.length; i++) {
                        const cells = rows[i].querySelectorAll('td, th');
                        if (cells.length < 2) continue;
                        const label = (cells[0]?.textContent || '').trim();
                        const values = {};
                        for (let j = 1; j < cells.length && j-1 < years.length; j++) {
                            values[years[j-1]] = (cells[j].textContent || '').trim();
                        }
                        if (label) out[label] = values;
                    }
                    return out;
                }
            """)
            # Extract taxable value table (by district) — raw text only
            taxable_rows = page.evaluate("""
                () => {
                    const out = {};
                    const sec = document.querySelector('pa-taxablevalueinformation');
                    if (!sec) return out;
                    out._text = (sec.innerText || '').trim().substring(0, 2000);
                    return out;
                }
            """)
            browser.close()

        # ─── Post-process — parse via text labels ─────────────────────
        # Everything below works on the already-extracted text, so it runs
        # after the browser has been released.
        prop_text = sections.get("pa-propertyinformation", "")
        result["parcel_id"] = _grab_after_label(prop_text, "Folio")
        result["subdivision"] = _grab_after_label(prop_text, "Sub-Division")
        # Address: "Property Address\n{addr}"
        addr_block_match = re.search(
            r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE,
        )
        if addr_block_match:
            result["site_address"] = addr_block_match.group(1).strip()
        # Owner: "Owner\n{name(s)}"
        owner_match = re.search(
            r"Owner\s*\n([^\n]+(?:\n[^\n]+)?)", prop_text, re.IGNORECASE,
        )
        if owner_match:
            owner_text = owner_match.group(1).strip()
            # Split on newline for multiple owners
            lines = [l.strip() for l in owner_text.split("\n") if l.strip()]
            # The regex optimistically grabs a second line for co-owners; if
            # that line is the "Mailing Address" label (parsed separately
            # below) it must not be reported as an owner.
            lines = [l for l in lines if l.lower().rstrip(":") != "mailing address"]
            result["owner_name"] = lines[0] if lines else None
            result["co_owners"] = lines[1:] if len(lines) > 1 else []
        mailing_match = re.search(
            r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE,
        )
        if mailing_match:
            result["mailing_address"] = re.sub(
                r"\s+", " ", mailing_match.group(1).strip(),
            )
        result["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone")
        result["use_code"] = _grab_after_label(prop_text, "Primary Land Use")
        result["use_description"] = result.get("use_code")
        beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half")
        if beds_baths:
            parts = [p.strip() for p in beds_baths.split("/")]
            try:
                result["bedrooms"] = int(parts[0]) if parts[0] else None
            except (ValueError, IndexError):
                result["bedrooms"] = None
            try:
                result["baths"] = float(parts[1]) if len(parts) > 1 and parts[1] else None
            except (ValueError, IndexError):
                result["baths"] = None
        result["floors"] = _to_int(_grab_after_label(prop_text, "Floors"))
        result["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units"))
        living_area = _grab_after_label(prop_text, "Living Area")
        result["sqft_heated"] = _to_int(living_area) if living_area else None
        adj_area = _grab_after_label(prop_text, "Adjusted Area")
        result["sqft_total"] = _to_int(adj_area) if adj_area else None
        lot_size = _grab_after_label(prop_text, "Lot Size")
        result["lot_total_sqft"] = _to_int(lot_size) if lot_size else None
        result["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built"))

        # Sales history — clean each row
        result["sales_history"] = []
        for r in sales_rows:
            date_str = r.get("date", "")
            price_str = r.get("price", "")
            # Skip header rows / non-data
            if not date_str or "Sale" in date_str or date_str.lower() == "previous sale":
                continue
            qualification = r.get("qualification", "")
            result["sales_history"].append({
                "date": date_str,
                "price": _money_to_int(price_str),
                "book_page": r.get("book_page", ""),
                "qualification": qualification,
                "previous_owner": r.get("previous_owner", ""),
                # Approximate Duval-compatible 'qualified' flag
                "qualified": "Qualified" if _md_sale_is_qualified(qualification) else "Unqualified",
            })
        # Most recent qualified sale
        qualified = [s for s in result["sales_history"]
                     if s.get("qualified", "").startswith("Qualified")
                     and s.get("price", 0) and s["price"] >= 1000]
        result["most_recent_qualified_sale"] = qualified[0] if qualified else None

        # Assessment values per year, keyed by row label then year string,
        # e.g. {"Market Value": {"2025": "$...", ...}, ...}
        result["assessment_table"] = assessment_rows
        years_present = []
        for label_dict in assessment_rows.values():
            if isinstance(label_dict, dict):
                for y in label_dict.keys():
                    if y and y not in years_present:
                        years_present.append(y)
        # Pick most recent year as current, the one before as last
        years_present_sorted = sorted([y for y in years_present if y.isdigit()], reverse=True)
        current_year = years_present_sorted[0] if years_present_sorted else None
        last_year = years_present_sorted[1] if len(years_present_sorted) > 1 else None

        def _val(label, year):
            # Dollar amount for one assessment row/year, or None when absent.
            if year and assessment_rows.get(label):
                return _money_to_int(assessment_rows[label].get(year, "0"))
            return None

        result["just_value_current"] = _val("Market Value", current_year)
        result["assessed_value_current"] = _val("Assessed Value", current_year)
        result["just_value_last"] = _val("Market Value", last_year)
        result["assessed_value_last"] = _val("Assessed Value", last_year)
        result["tax_year_current"] = int(current_year) if current_year else None
        result["tax_year_last"] = int(last_year) if last_year else None

        # Homestead detection from benefits section text
        benefits_text = sections.get("pa-benefitsinformation", "") or ""
        result["homestead_active"] = "homestead" in benefits_text.lower() and "$" in benefits_text
        # Legal description (heading stripped, capped at 500 chars)
        legal_text = sections.get("pa-legaldescription", "") or ""
        result["legal_description"] = re.sub(
            r"^Legal Description\s*\n",
            "",
            legal_text.strip(),
        )[:500] if legal_text else None

        # Renovation signal — shared heuristic lives in the Duval module
        from data_fetchers.pa_duval import _detect_renovation_pattern
        result["renovation_signal"] = _detect_renovation_pattern(
            result["sales_history"], listing_price=listing_price,
        )
        # Raw sections for advanced consumers
        result["_raw_sections"] = sections
        result["_raw_taxable_text"] = taxable_rows.get("_text", "")
    except Exception as e:
        # Fail-soft boundary: report the failure instead of raising.
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]
    return result
# ════════════════════════════════════════════════════════════════════════════
# CLI
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Command-line entry point: fetch one Miami-Dade PA record and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher")
    for flag, help_text in (
        ("--parcel", "Folio number (e.g. '31-2202-034-2470')"),
        ("--address", "Alternative address search"),
    ):
        cli.add_argument(flag, help=help_text)
    opts = cli.parse_args()
    # At least one way of identifying the property is mandatory.
    if not (opts.parcel or opts.address):
        cli.error("--parcel or --address required")
    record = fetch_miami_dade_pa_record(parcel_id=opts.parcel, address=opts.address)
    # default=str stringifies any value json cannot encode natively.
    print(json.dumps(record, indent=2, default=str))
+409
View File
@@ -0,0 +1,409 @@
"""data_fetchers/pa_palm_beach.py — Full Palm Beach PA extractor.
Sitio: https://pbcpao.gov (server-rendered HTML + jQuery, no SPA)
Deep link: /Property/Details?parcelId={parcelId}
VENTAJA: NO necesita Playwright. urllib + HTMLParser stdlib = rapidisimo.
"""
from __future__ import annotations
import re
import urllib.request
from datetime import datetime, timezone
from html.parser import HTMLParser
from typing import Optional
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
_BASE_URL = "https://pbcpao.gov"
# ════════════════════════════════════════════════════════════════════════════
# HTML text extractor (skip script/style)
# ════════════════════════════════════════════════════════════════════════════
class _TextExtractor(HTMLParser):
    """Collect the text chunks of an HTML document, skipping script/style/noscript.

    After ``feed()``, ``parts`` holds every non-blank text node (stripped), in
    document order.
    """

    def __init__(self):
        super().__init__()
        # True while inside a <script>, <style> or <noscript> element.
        self.in_skip = False
        self.parts: list[str] = []

    def handle_starttag(self, tag, attrs):
        # Entering a skipped element turns the flag on; other tags leave it as is.
        self.in_skip = self.in_skip or tag in ("script", "style", "noscript")

    def handle_endtag(self, tag):
        # Leaving a skipped element turns the flag off; other tags leave it as is.
        self.in_skip = self.in_skip and tag not in ("script", "style", "noscript")

    def handle_data(self, d):
        if self.in_skip:
            return
        chunk = d.strip()
        if chunk:
            self.parts.append(chunk)
# ════════════════════════════════════════════════════════════════════════════
# Tables extractor (table → list of rows)
# ════════════════════════════════════════════════════════════════════════════
class _TableExtractor(HTMLParser):
    """Extract every HTML table as a list of rows, each row a list of cell texts.

    After ``feed()``, ``tables`` is ``[table][row][cell] -> str``. Cell text
    has whitespace runs collapsed to one space; ``<br>`` inside a cell counts
    as a space. Empty rows and empty tables are dropped.
    """

    def __init__(self):
        super().__init__()
        # Parser position flags.
        self.in_table = False
        self.in_tr = False
        self.in_cell = False
        self.in_skip = False  # inside <script>/<style>
        # Accumulators for the cell / row / table currently being read.
        self.current_row: list[str] = []
        self.current_cell = ""
        self.current_table: list[list[str]] = []
        self.tables: list[list[list[str]]] = []

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self.in_skip = True
            return
        if tag == "table":
            self.in_table = True
            self.current_table = []
            return
        if tag == "tr":
            if self.in_table:
                self.in_tr = True
                self.current_row = []
            return
        if tag in ("td", "th"):
            if self.in_tr:
                self.in_cell = True
                self.current_cell = ""
            return
        if tag == "br" and self.in_cell:
            # Keep text on both sides of a line break from running together.
            self.current_cell += " "

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            self.in_skip = False
            return
        if tag == "table":
            if self.current_table:
                self.tables.append(self.current_table)
            self.in_table = False
            return
        if tag == "tr":
            if self.in_tr:
                if self.current_row:
                    self.current_table.append(self.current_row)
                self.in_tr = False
            return
        if tag in ("td", "th") and self.in_cell:
            cell_text = re.sub(r"\s+", " ", self.current_cell).strip()
            self.current_row.append(cell_text)
            self.in_cell = False

    def handle_data(self, d):
        if self.in_skip or not self.in_cell:
            return
        self.current_cell += d
# ════════════════════════════════════════════════════════════════════════════
# Helpers
# ════════════════════════════════════════════════════════════════════════════
def _grab_after(text: str, label: str, max_len: int = 80) -> Optional[str]:
    """Return the value that follows the first occurrence of *label* in *text*.

    Only the ``max_len`` characters after the label are inspected. Within that
    window the value ends at the first run of two or more whitespace
    characters, at what looks like the start of the next upper-case label, or
    at the end of the window, whichever comes first.

    Returns None when *text* or *label* is empty or the label is absent; may
    return "" when nothing follows the label.
    """
    if not (text and label):
        return None
    start = text.find(label)
    if start == -1:
        return None
    value_start = start + len(label)
    window = text[value_start:value_start + max_len].strip()
    # Lazy capture: stop at a wide gap, at "<space>UPPERCASE... word" (heuristic
    # for the next label), or at the end of the window.
    hit = re.match(r"\s*([^\n]+?)(?:\s{2,}|\s+[A-Z][A-Z\s]+\s+[A-Za-z]+|$)", window)
    if hit is None:
        # Nothing matched (e.g. empty window): fall back to the first line.
        return window.split("\n")[0].strip()
    return hit.group(1).strip()
def _to_int(s) -> Optional[int]:
    """Parse an integer out of scraped text such as ``"1,234"`` or ``"2,150 sqft"``.

    Thousands separators, units and other non-digit characters are ignored.
    A decimal fraction is truncated rather than concatenated, so
    ``"0.2500"`` yields ``0`` and ``"1,234.56"`` yields ``1234``
    (not ``123456``).

    Args:
        s: Raw value (usually a string); any falsy value yields None.

    Returns:
        The parsed integer, or None when no integer can be extracted.
    """
    if not s:
        return None
    # Drop ".digits" that directly follow a digit BEFORE stripping non-digits;
    # otherwise the fraction would be glued onto the integer part.
    without_fraction = re.sub(r"(?<=\d)\.\d+", "", str(s))
    cleaned = re.sub(r"[^\d-]", "", without_fraction)
    try:
        return int(cleaned) if cleaned else None
    except ValueError:
        # e.g. a dashed PCN: hyphens in the middle are not a valid integer.
        return None
def _money_to_int(s) -> Optional[int]:
    """Parse a currency string (``"$1,250,000"``) into whole dollars.

    Keeps only digits, ``.`` and ``-``, parses the rest as a float and
    truncates it. Returns None for falsy input or when nothing parsable
    remains.
    """
    digits = re.sub(r"[^\d.-]", "", str(s)) if s else ""
    if digits == "" or digits == "-":
        return None
    try:
        value = float(digits)
    except ValueError:
        return None
    return int(value)
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def fetch_palm_beach_pa_record(
    parcel_id: str,
    timeout_seconds: int = 30,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch and parse a Palm Beach Property Appraiser record by parcel id (PCN).

    The details page is downloaded with a plain HTTP GET and parsed two ways:
    scalar fields come from regexes over the flattened page text, while
    assessment values and sales history come from the HTML tables.

    Args:
        parcel_id: PCN, with or without dashes (e.g. "00414232000003080").
        timeout_seconds: HTTP timeout in seconds.
        listing_price: forwarded to the renovation/flip detector.

    Returns:
        A dict that always contains ``county``, ``source``, ``fetched_at`` and
        ``errors``. On failure (no parcel id, HTTP error, parcel not found) the
        function returns early with a message in ``errors`` and no parsed
        fields. On success it adds whichever scalar fields were found plus
        ``homestead_active``, the current/last-year values,
        ``assessment_history``, ``sales_history``,
        ``most_recent_qualified_sale`` and ``renovation_signal``.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Palm Beach",
        "source": "Palm Beach County Property Appraiser (pbcpao.gov)",
        "fetched_at": fetched_at,
        "errors": [],
    }
    if not parcel_id:
        result["errors"].append("no parcel_id provided")
        return result

    pcn_clean = parcel_id.replace("-", "").strip()
    url = f"{_BASE_URL}/Property/Details?parcelId={pcn_clean}"
    result["source_url"] = url

    # HTTP fetch (no Playwright). Any failure is reported, never raised.
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=timeout_seconds) as resp:
            html = resp.read().decode("utf-8", errors="ignore")
    except Exception as e:
        result["errors"].append(f"HTTP fetch failed: {type(e).__name__}: {e}")
        return result

    # Detect "no property found" near the top of the page (lowercase once).
    page_head = html.lower()[:5000]
    if "no property" in page_head or "not found" in page_head:
        result["errors"].append("parcel not found in PA records")
        return result

    # Flat text for the scalar regexes.
    text_extractor = _TextExtractor()
    text_extractor.feed(html)
    flat = " ".join(text_extractor.parts)

    # Tables for assessment values and sales history.
    tbl_extractor = _TableExtractor()
    tbl_extractor.feed(html)

    # ─── Parse scalars from flat text ──────────────────────────────────────
    # Layout: "Owner Name <NAME> Property Control Number ..."
    m = re.search(
        r"Owner Name\s+([A-Z][A-Z\s,'.\-&]+?)(?=\s+(?:Property Control|Mailing|Current|Tax|Subdivision|Total))",
        flat,
    )
    if m:
        result["owner_name"] = m.group(1).strip()

    # Property Control Number — formatted as XX-XX-XX-XX-XX-XXX-XXXX.
    # Fall back to the caller's value when the page does not show one.
    m = re.search(r"Property Control Number\s+([\d\-]+)", flat)
    if m:
        result["parcel_id"] = m.group(1).strip()
    else:
        result["parcel_id"] = parcel_id

    # Year built
    m = re.search(r"Year Built\s+(\d{4})", flat)
    if m:
        result["year_built"] = int(m.group(1))

    # Beds / baths
    m = re.search(r"Bed\s*Rooms\s+(\d+)", flat, re.IGNORECASE)
    if m:
        result["bedrooms"] = int(m.group(1))
    m = re.search(r"Full Baths\s+(\d+)", flat, re.IGNORECASE)
    full_b = int(m.group(1)) if m else 0
    m = re.search(r"Half Baths\s+(\d+)", flat, re.IGNORECASE)
    half_b = int(m.group(1)) if m else 0
    if full_b or half_b:
        result["baths"] = float(full_b) + (0.5 * half_b)
        result["baths_full"] = full_b
        result["baths_half"] = half_b

    # Square footage
    m = re.search(r"Total Square Footage\s+(\d[\d,]*)", flat) or re.search(r"Square Footage\s+(\d[\d,]*)", flat)
    if m:
        result["sqft_total"] = _to_int(m.group(1))
    m = re.search(r"Area Under Air\s+(\d[\d,]*)", flat)
    if m:
        result["sqft_heated"] = _to_int(m.group(1))

    # Lot acres
    m = re.search(r"Acres\s+([\d.]+)", flat)
    if m:
        try:
            result["lot_acres"] = float(m.group(1))
        except ValueError:
            # e.g. "1.2.3" or a lone "." — leave the field unset
            pass

    # Property use code + zoning. A "?" in the page text is rendered as " - ".
    m = re.search(r"Property Use Code\s+([\w\d\?\.\-\s]+?)(?:\s+Zoning)", flat)
    if m:
        result["use_code"] = m.group(1).replace("?", " - ").strip()
    # The code token must be matched greedily: with a lazy quantifier followed
    # only by an optional group, the match stopped after a single character.
    m = re.search(r"Zoning\s+([\w\d\?\-]+(?:\s+\([^)]+\))?)", flat)
    if m:
        result["zoning"] = m.group(1).replace("?", " - ").strip()

    # Subdivision
    m = re.search(r"Subdivision\s+([A-Z0-9 ,'.\-]+?)(?=\s+Official Records|Sale Date|Legal Description|$)", flat)
    if m:
        sub = m.group(1).strip()
        result["subdivision"] = sub if sub else None

    # Legal description (capped at 300 characters)
    m = re.search(r"Legal Description\s+([^\n]+?)(?=\s+Show Full Map|Show More|Nearby|Owner INFORMATION|$)", flat)
    if m:
        result["legal_description"] = m.group(1).strip()[:300]

    # Roof / interior info: an upper-case value that ends before the next
    # capitalized label.
    for label, key in [
        ("Air Condition Desc.", "ac_description"),
        ("Heat Type", "heat_type"),
        ("Heat Fuel", "heat_fuel"),
        ("Roof Structure", "roof_struct"),
        ("Roof Cover", "roof_cover"),
        ("Interior Wall 1", "interior_wall"),
    ]:
        m = re.search(rf"{re.escape(label)}\s+([A-Z][A-Z &/\-]+?)(?=\s+[A-Z][a-z])", flat)
        if m:
            result[key] = m.group(1).strip()

    # Site address: shown after the "Location Address" header.
    m = re.search(r"Location Address\s+([^\n]+?)(?=\s+Subdivision|Owner|Property Use|$)", flat)
    if m:
        result["site_address"] = m.group(1).strip()

    # Homestead detection: an exemption amount or an explicit "Current Homestead Yes".
    result["homestead_active"] = bool(
        re.search(r"Homestead Exemption\s+\$[\d,]+|Current Homestead\s*Yes",
                  flat, re.IGNORECASE)
    )

    # ─── Tax/Assessment values from tables ─────────────────────────────────
    # Rows look like:  "Tax Year            2025      2024      2023 ..."
    #                  "Total Market Value  $758,298  $762,232  ..."
    #                  "Total Assessed Value ..."
    tax_years = []
    market_vals: dict[str, int] = {}
    assessed_vals: dict[str, int] = {}
    improvement_vals: dict[str, int] = {}
    for tbl in tbl_extractor.tables:
        for row in tbl:
            if not row:
                continue
            first = row[0].lower()
            if first == "tax year":
                tax_years = [c for c in row[1:] if c]
                continue
            if "market value" in first or "total market" in first:
                target = market_vals
            elif first == "assessed value" or "total assessed" in first:
                target = assessed_vals
            elif "improvement value" in first:
                target = improvement_vals
            else:
                continue
            # Pair each value with the year header at the same position;
            # values beyond the known years are ignored.
            for year, v in zip(tax_years, row[1:]):
                target[year] = _money_to_int(v) or 0

    # Pick the two most recent numeric years.
    valid_years = sorted([y for y in tax_years if y.isdigit()], reverse=True)
    current_year = valid_years[0] if valid_years else None
    last_year = valid_years[1] if len(valid_years) > 1 else None
    result["just_value_current"] = market_vals.get(current_year) if current_year else None
    result["assessed_value_current"] = assessed_vals.get(current_year) if current_year else None
    result["just_value_last"] = market_vals.get(last_year) if last_year else None
    result["assessed_value_last"] = assessed_vals.get(last_year) if last_year else None
    result["tax_year_current"] = int(current_year) if current_year else None
    result["tax_year_last"] = int(last_year) if last_year else None
    result["assessment_history"] = {
        "market": market_vals,
        "assessed": assessed_vals,
        "improvement": improvement_vals,
    }

    # ─── Sales history from tables ─────────────────────────────────────────
    def _cell(row, idx):
        """Return row[idx], or "" when the column is missing."""
        return row[idx] if 0 <= idx < len(row) else ""

    sales: list[dict] = []
    for tbl in tbl_extractor.tables:
        if not tbl or len(tbl) < 2:
            continue
        hdr = [c.lower() for c in tbl[0]]
        # Sales table heuristic: header has "Sale[s] Date" and "Price".
        # PB uses "Sales Date" (with 's'), some sites use "Sale Date".
        has_sale_date = any(("sale date" in h or "sales date" in h) for h in hdr)
        if not (has_sale_date and any("price" in h for h in hdr)):
            continue
        idx_date = next((i for i, h in enumerate(hdr)
                         if "sale date" in h or "sales date" in h), -1)
        idx_price = next((i for i, h in enumerate(hdr) if "price" in h), -1)
        idx_book = next((i for i, h in enumerate(hdr) if "book" in h or h.startswith("or")), -1)
        idx_qual = next((i for i, h in enumerate(hdr)
                         if "qualified" in h or h == "sale type" or h == "type"), -1)
        for row in tbl[1:]:
            if len(row) < 2:
                continue
            d = _cell(row, idx_date)
            p = _cell(row, idx_price)
            if not d and not p:
                continue
            qual_raw = _cell(row, idx_qual)
            price = _money_to_int(p)
            # Palm Beach uses "Sale Type" not "qualified/disqualified".
            # Treat WARRANTY DEED with price >= 50K as Qualified (typical PB convention).
            # CERT OF TITLE = foreclosure deed = Unqualified.
            # QUIT CLAIM with low price = Unqualified.
            q_low = qual_raw.lower()
            if "warranty deed" in q_low and (price or 0) >= 50000:
                qualified_flag = "Qualified"
            elif "qualified" in q_low and "disqualified" not in q_low:
                qualified_flag = "Qualified"
            else:
                qualified_flag = "Unqualified"
            sales.append({
                "date": d,
                "price": price,
                "book_page": _cell(row, idx_book),
                "qualification": qual_raw,
                "deed_type": qual_raw,
                "qualified": qualified_flag,
            })
    result["sales_history"] = sales

    # Most recent qualified sale: first qualifying row in table order.
    qualified = [s for s in sales
                 if s.get("qualified", "").startswith("Qualified")
                 and s.get("price", 0) and s["price"] >= 1000]
    result["most_recent_qualified_sale"] = qualified[0] if qualified else None

    # Renovation signal (shared detector from the Duval adapter).
    from data_fetchers.pa_duval import _detect_renovation_pattern
    result["renovation_signal"] = _detect_renovation_pattern(
        sales, listing_price=listing_price,
    )
    return result
# ════════════════════════════════════════════════════════════════════════════
# CLI
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Command-line entry point: fetch one record and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Palm Beach PA full record fetcher")
    cli.add_argument("--parcel", required=True, help="PCN (e.g. '00414232000003080')")
    opts = cli.parse_args()
    record = fetch_palm_beach_pa_record(parcel_id=opts.parcel)
    print(json.dumps(record, indent=2, default=str))
+145
View File
@@ -0,0 +1,145 @@
"""data_fetchers/pa_photo_lookup.py — Buscar fotos de propiedad en sitios PA (gratis).
PROPOSITO:
Los County Property Appraisers (PA) de Florida tienen fotos de las propiedades.
Acceso público vía Playwright (cero costo Firecrawl).
Es la alternativa GRATIS a `zillow_photo_lookup` (que cuesta 1 credit por property).
COVERAGE actual:
- Broward (bcpa.net): ✓ tested, 100% hit rate en 3-sample
- Miami-Dade (miamidadepa.gov): ✗ solo aerial, no street photo
- Duval (paopropertysearch.coj.net): pendiente investigar URL correcta
- Otros counties: stub para Phase 3.5.B
USO:
from data_fetchers.pa_photo_lookup import fetch_pa_photo
url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
"""
from __future__ import annotations
from typing import Optional
def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Look up a property photo URL on a County Property Appraiser site.

    Args:
        county: county name (e.g. "Broward", "Miami-Dade", "Duval"); matched
            case-insensitively, with a trailing " county" removed and spaces
            turned into underscores.
        parcel_id: county-specific parcel/folio number.
        timeout_seconds: timeout handed to the county fetcher.

    Returns:
        ``(photo_url, metadata)``. ``photo_url`` is None when there is no
        parcel id, no fetcher for the county, or the fetcher raised.
        ``metadata`` has the keys ``county``, ``parcel_id``, ``source`` and
        ``error``. Exceptions from the fetcher are never propagated.
    """
    meta = dict(county=county, parcel_id=parcel_id, source=None, error=None)

    if not parcel_id:
        meta["error"] = "no parcel_id"
        return None, meta

    county_key = (county or "").lower().replace(" county", "").strip().replace(" ", "_")
    fetcher = _FETCHERS.get(county_key)
    if not fetcher:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS.keys())})"
        return None, meta

    try:
        url, source_name = fetcher(parcel_id, timeout_seconds)
    except Exception as e:
        # Fail-soft boundary: report the failure in the metadata.
        meta["error"] = f"{type(e).__name__}: {e}"
        return None, meta

    meta["source"] = source_name
    return url, meta
# ────────────────────────────────────────────────────────────────────────────
# Per-county implementations
# ────────────────────────────────────────────────────────────────────────────
_CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Fetch the first property photo URL from the Broward bcpa.net SPA.

    Opens ``https://web.bcpa.net/bcpaclient/#/Record-Search?folio=<parcel_id>``
    in headless Chromium, waits for the page to render, and collects ``<img>``
    sources that contain ``/Photographs/`` and are wider than 200 px.

    Returns:
        ``(photo_url_or_None, "bcpa.net")``.
    """
    from playwright.sync_api import sync_playwright
    import time

    timeout_ms = timeout_seconds * 1000
    record_url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
    photo_query = (
        "Array.from(document.querySelectorAll('img'))"
        ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
        ".map(i => i.src)"
    )

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=_CHROME_UA,
                viewport={"width": 1280, "height": 900},
            )
            page = context.new_page()
            page.set_default_timeout(timeout_ms)
            page.goto(record_url, wait_until="networkidle", timeout=timeout_ms)
            time.sleep(7)  # SPA render delay
            matches = page.evaluate(photo_query)
            found = matches[0] if matches else None
        finally:
            browser.close()
    return found, "bcpa.net"
def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
"""Optimized batch fetcher for Broward.
Re-uses browser across folios (single session) for speed.
Returns: {parcel_id: photo_url or None}
"""
from playwright.sync_api import sync_playwright
import time
out: dict[str, Optional[str]] = {}
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
for parcel_id in parcel_ids:
page = ctx.new_page()
page.set_default_timeout(timeout_seconds * 1000)
url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
try:
page.goto(url, wait_until="networkidle", timeout=timeout_seconds * 1000)
time.sleep(7)
photos = page.evaluate(
"Array.from(document.querySelectorAll('img'))"
".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
".map(i => i.src)"
)
out[parcel_id] = photos[0] if photos else None
except Exception:
out[parcel_id] = None
page.close()
browser.close()
return out
# Registry of per-county photo fetchers, keyed by the normalized county name
# that fetch_pa_photo computes (lower-case, " county" removed, spaces -> "_").
# Each fetcher is called as fetcher(parcel_id, timeout_seconds).
_FETCHERS = {
"broward": _fetch_broward,
# TODO Phase 3.5.B:
# "miami-dade": _fetch_miami_dade, (only aerial, no street photo — skip)
# "duval": _fetch_duval,
# "hillsborough": _fetch_hillsborough,
# "orange": _fetch_orange,
}
def list_supported_counties() -> list[str]:
    """Return the registered county keys of the PA photo fetchers, sorted."""
    supported = list(_FETCHERS)
    supported.sort()
    return supported
+465
View File
@@ -0,0 +1,465 @@
"""price_validator.py - detecta discrepancia listing vs market estimates.
PROBLEMA QUE RESUELVE:
Deal con listing $70K mientras Zillow Zestimate dice $280K. Cap rate sale 18%.
Sistema procede a calcular como deal normal sin detectar el RED FLAG mas obvio:
ese precio bajo casi siempre indica problema oculto heredable (liens, foreclosure,
damage, code violations, title issues, etc.).
OBJETIVO:
Detectar discrepancia >30% entre listing price y market value estimates.
Alertar al usuario MASIVAMENTE (CRITICAL_RED_FLAG) con lista de razones probables
y due diligence obligatoria.
LOGICA:
- discrepancy < 10% → NORMAL (listing dentro de rango razonable)
- 10-30% → WARNING (listing fuera de rango pero no escandalo)
- ≥30% → CRITICAL_RED_FLAG (algo huele mal — investigar antes de seguir)
FUENTES de market value:
1. Zillow Zestimate (Firecrawl ~3 credits) — opt-in con ENABLE_FIRECRAWL_PRICE_CHECK
2. Redfin estimate (Firecrawl ~3 credits) — opt-in
3. Tax Assessed Value (county scraper, gratis cuando funcione)
4. Existing comps via property_value.py si ya estan computados
FAIL-SOFT: si no hay ninguna fuente disponible, retorna status='unknown' con warning.
"""
from __future__ import annotations
import os
import re
from datetime import datetime, timezone
from typing import Optional
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# Thresholds, in percent, applied to the largest absolute listing-vs-estimate
# discrepancy: below NORMAL -> "NORMAL", below WARNING -> "WARNING",
# otherwise "CRITICAL_RED_FLAG".
NORMAL_THRESHOLD_PCT = 10.0
WARNING_THRESHOLD_PCT = 30.0
# Tax assessed → market value typical ratio in FL
TAX_TO_MARKET_RATIO = 0.85 # tax assessed value is usually about 85% of market value
def _firecrawl_price_check_enabled() -> bool:
"""Flag separado de comps. Default OFF para no quemar credits."""
flag = os.getenv("ENABLE_FIRECRAWL_PRICE_CHECK", "false").lower() == "true"
has_key = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
return flag and has_key
# ═══════════════════════════════════════════════════════════════════════════
# Fetchers de market value
# ═══════════════════════════════════════════════════════════════════════════
def fetch_zillow_zestimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Zillow Zestimate by scraping the property page via Firecrawl.

    Consumes Firecrawl credits (about 3 per call, per the module notes), so it
    only runs when the price-check flag and API key are configured.

    Args:
        address: full property address; punctuation is dropped and spaces
            become dashes to build the Zillow URL slug.

    Returns:
        ``(zestimate, errors)``. ``zestimate`` is None when the lookup is
        disabled, the scrape fails, no "Zestimate $..." pattern is found, or
        the value falls outside 20,000–50,000,000. ``errors`` explains why.
        Nothing is raised.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append(
            "Firecrawl price check deshabilitado. "
            "Setear ENABLE_FIRECRAWL_PRICE_CHECK=true en .env para activar."
        )
        return None, errors
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente en .env")
        return None, errors

    # Zillow address search URL.
    # Format: https://www.zillow.com/homes/{address-with-dashes}_rb/
    addr_slug = re.sub(r"[^\w\s]", "", address).replace(" ", "-")
    url = f"https://www.zillow.com/homes/{addr_slug}_rb/"
    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        if not result or not hasattr(result, "markdown"):
            errors.append("Firecrawl Zillow: respuesta vacia")
            return None, errors
        md = result.markdown or ""
    except Exception as e:
        errors.append(f"Firecrawl Zillow error: {e}")
        return None, errors

    # Parser: the word "Zestimate" followed by the nearest dollar amount.
    # No upper bound on the amount's length: a cap of 9 characters cut
    # "$10,000,000" down to "10,000,00" (= 1,000,000), which then passed the
    # range check below as a wrong value.
    m = re.search(
        r"zestimate[^\$]*\$([\d,]{4,})",
        md,
        re.IGNORECASE,
    )
    if not m:
        errors.append("Zillow markdown OK pero patron 'Zestimate $XXX' no encontrado")
        return None, errors
    try:
        zestimate = int(m.group(1).replace(",", ""))
    except ValueError as e:
        # e.g. a run made only of commas
        errors.append(f"Parse Zestimate: {e}")
        return None, errors
    if 20_000 <= zestimate <= 50_000_000:
        return zestimate, errors
    errors.append(f"Zestimate fuera de rango razonable: ${zestimate}")
    return None, errors
def fetch_redfin_estimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Redfin Estimate by scraping Redfin via Firecrawl.

    Mirrors ``fetch_zillow_zestimate``; consumes Firecrawl credits (about 3
    per call, per the module notes) and only runs when the price-check flag
    and API key are configured.

    Args:
        address: full property address, slugified for the Redfin query.

    Returns:
        ``(estimate, errors)``. ``estimate`` is None when the lookup is
        disabled, the scrape fails, no "Redfin Estimate $..." pattern is
        found, or the value falls outside 20,000–50,000,000. ``errors``
        explains why. Nothing is raised.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append("Firecrawl price check deshabilitado (ENABLE_FIRECRAWL_PRICE_CHECK=true)")
        return None, errors
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente")
        return None, errors

    # Redfin search by address.
    addr_slug = re.sub(r"[^\w\s]", "", address).replace(" ", "-")
    url = f"https://www.redfin.com/?location={addr_slug}"
    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        # `or ""` guards against markdown=None, which would otherwise reach
        # re.search below and raise TypeError outside this try block.
        md = (result.markdown or "") if result and hasattr(result, "markdown") else ""
    except Exception as e:
        errors.append(f"Firecrawl Redfin error: {e}")
        return None, errors

    # No upper bound on the amount's length: a cap of 9 characters cut
    # "$10,000,000" down to "10,000,00" and produced a wrong in-range value.
    m = re.search(
        r"redfin estimate[^\$]*\$([\d,]{4,})",
        md,
        re.IGNORECASE,
    )
    if not m:
        errors.append("Redfin: patron 'Redfin Estimate' no encontrado")
        return None, errors
    try:
        est = int(m.group(1).replace(",", ""))
    except ValueError as e:
        # Report the parse failure instead of returning silently.
        errors.append(f"Parse Redfin estimate: {e}")
        return None, errors
    if 20_000 <= est <= 50_000_000:
        return est, errors
    # Report the rejection so callers can see why no estimate was used.
    errors.append(f"Redfin estimate fuera de rango razonable: ${est}")
    return None, errors
# ═══════════════════════════════════════════════════════════════════════════
# Posibles razones de discrepancia (educacion al usuario)
# ═══════════════════════════════════════════════════════════════════════════
# Bug 6: distressed-deal hypotheses, ordered by likelihood in Florida.
# When the listing is a <$150K SFR and status=UNKNOWN, these are the PRIMARY
# CAUSES the system must surface BEFORE the 12 general reasons.
# Frequency derived from public data: ~60% of <$150K SFR listings in Florida
# are foreclosure-related (auction, REO, pre-foreclosure short sale) or tax deed.
# The strings below are user-facing text and are shown as written.
DISTRESSED_HYPOTHESIS_REASONS = [
"🥇 FORECLOSURE AUCTION — listing puede ser el opening bid en la subasta judicial. "
"Lookup obligatorio: lis pendens en CCIS del condado (clerk online).",
"🥈 REO (Real Estate Owned) — el banco recupero la propiedad post-foreclosure y la "
"lista as-is cash-quick-close. Comun en bancos chicos / credit unions.",
"🥉 TAX DEED — el condado vendio el certificado por tax delinquency severa. "
"1-year redemption period donde el ex-owner puede recomprar.",
"Pre-foreclosure short sale — owner intenta vender antes de la subasta. "
"Requiere aprobacion del lender (puede llevar 3-6 meses).",
"Wholesale assignment — el wholesaler tiene el deal bajo contrato y vende el contrato. "
"Puede haber issues con marketable title.",
"Probate / estate sale — heirs liquidando rapido. Requiere certificado del juez.",
]
# General reasons a listing may sit far below market value. validate_price
# returns them as "possible_reasons" when the listing is critically below the
# estimates, and appends them after the distressed hypotheses for an
# unvalidated listing under $150K. User-facing text, shown as written.
POSSIBLE_RED_FLAG_REASONS = [
"Tax delinquency severa (property tax + interes acumulado puede ser >20% del valor)",
"IRS lien sobre el owner (federal tax lien, 120-day right of redemption)",
"Code enforcement violations grandes (municipalidad puede tener liens de $50K+)",
"Foreclosure en curso (lis pendens publico) — el seller intenta vender antes de subasta",
"Damage severo no fotografiado (fire, water, structural) que requiere $50K-$200K rehab",
"Title issues (clouds en el chain — heirs no identificados, divorce sin completar, fraud)",
"Bankruptcy quick-sale (trustee debe liquidar rapido, precio bajo para cerrar)",
"Wholesaler problem deal (el wholesaler le bajo el precio porque tuvo issues con buyers anteriores)",
"HOA litigation pendiente — lender no presta hasta resolver",
"Open insurance claims que el buyer hereda",
"Polybutylene plumbing + electrical Federal Pacific (re-pipe + repanel costoso)",
"Inhabitable / no certificate of occupancy (puede ser ilegal alquilar tal como esta)",
]
# Due-diligence checklist that validate_price returns as
# "mandatory_investigation" alongside the red-flag reasons.
# User-facing text, shown as written.
MANDATORY_INVESTIGATION_LIST = [
"Court records search (county clerk: lis pendens, foreclosure docket, civil suits)",
"Tax collector / appraiser: verificar pagos al dia + assessed value",
"Code enforcement check con la municipalidad: violations + liens",
"Property records: chain of title del county recorder",
"Title search profesional ($300-$500) ANTES de hacer oferta",
"Drive-by inspection (sin entrar): nivel de mantenimiento exterior, signos de damage",
"PACER bankruptcy search (federal): auto-stay del owner puede invalidar transferencia",
"Permits buscador: openings sin cerrar pueden tener implicaciones legales",
"Open insurance claims: pedir disclosure al seller",
]
# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════
def validate_price(
    *,
    address: str,
    listing_price: float,
    tax_assessed_value: Optional[float] = None,
    existing_comps_estimate: Optional[float] = None,
    existing_comps_confidence: Optional[str] = None,  # Bug 4: "high"|"medium"|"low"|None
    existing_comps_sources: Optional[list] = None,  # Bug 4: list of source labels
    neighborhood_class: Optional[str] = None,  # Bug 6: A|B|C|D|None
    use_firecrawl: Optional[bool] = None,
) -> dict:
    """Validate ``listing_price`` against the available market-value estimates.

    Args:
        address: full property address (used for the Zillow/Redfin lookups).
        listing_price: the listed price.
        tax_assessed_value: optional tax-assessed value; converted to an
            implied market value by dividing by ``TAX_TO_MARKET_RATIO``.
        existing_comps_estimate: optional mid estimate from property_value.py.
        existing_comps_confidence: confidence of that estimate
            ("high"/"medium"/"low"). A "low" estimate is NOT used as a
            baseline (Bug 4 fix).
        existing_comps_sources: source labels of that estimate. If they only
            describe a deductions/heuristic fallback, the estimate is NOT used
            as a baseline (Bug 4).
        neighborhood_class: A/B/C/D class, matched case-insensitively. Only
            used to word the recommendation when nothing could be validated
            and the listing is under $150K (Bug 6).
        use_firecrawl: True/False forces the Zillow + Redfin lookups on/off;
            None defers to ``_firecrawl_price_check_enabled()``.

    Returns:
        A dict with ``status`` (NORMAL | WARNING | CRITICAL_RED_FLAG |
        UNKNOWN), ``listing_price``, ``market_estimates``,
        ``max_discrepancy_pct``, ``possible_reasons``,
        ``mandatory_investigation``, ``recommendation``, ``sources_used``,
        ``rejected_sources``, ``errors`` and ``fetched_at``. When at least one
        estimate exists it also has ``discrepancies_pct``,
        ``signed_max_discrepancy_pct`` and ``direction``; the UNKNOWN result
        has ``min_discrepancy_pct`` and ``suspicious_low_listing`` instead.
        The returned lists are fresh copies, safe for the caller to mutate.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    errors: list[str] = []
    sources_used: list[str] = []
    estimates: dict[str, Optional[int]] = {
        "zillow_zestimate": None,
        "redfin_estimate": None,
        "tax_implied_market": None,
        "comps_mid": None,
    }
    rejected_sources: list[str] = []  # Bug 4: tracks discarded sources

    # 1. Tax assessed → implied market value (FL ratio ~85%)
    if tax_assessed_value and tax_assessed_value > 1000:
        estimates["tax_implied_market"] = int(tax_assessed_value / TAX_TO_MARKET_RATIO)
        sources_used.append(f"Tax assessed → market implied (${tax_assessed_value:,.0f} / {TAX_TO_MARKET_RATIO})")

    # 2. Existing comps estimate (from property_value.py) — WITH QUALITY CHECK (Bug 4)
    if existing_comps_estimate and existing_comps_estimate > 1000:
        # Reject when confidence is "low": property_value.py had no real data
        # and fell back to the deductions heuristic. Using that as a baseline
        # produced an INVERTED direction (seen in the Jacksonville test).
        is_heuristic_only = False
        if existing_comps_sources:
            srcs_str = " | ".join(str(s) for s in existing_comps_sources).lower()
            # Heuristic-only: mentions a deduction/heuristic and no real source.
            is_heuristic_only = (
                ("deduction" in srcs_str or "heurística" in srcs_str or "heuristica" in srcs_str)
                and "comp" not in srcs_str
                and "tax" not in srcs_str
                and "zillow" not in srcs_str
                and "redfin" not in srcs_str
            )
        if existing_comps_confidence == "low" or is_heuristic_only:
            rejected_sources.append(
                f"property_value comps_mid descartado: confidence={existing_comps_confidence}, "
                f"sources={existing_comps_sources} — fallback heuristico no es baseline valido"
            )
            errors.append(
                "property_value estimate descartado por baja calidad (heuristic-only). "
                "Para validacion confiable: activar ENABLE_FIRECRAWL_COMPS o esperar tax_assessed scraper."
            )
        else:
            estimates["comps_mid"] = int(existing_comps_estimate)
            sources_used.append(f"Comps mid (confidence={existing_comps_confidence or 'unknown'}, ${existing_comps_estimate:,.0f})")

    # 3. Firecrawl lookups (Zillow Zestimate + Redfin Estimate)
    if use_firecrawl is None:
        do_firecrawl = _firecrawl_price_check_enabled()
    else:
        do_firecrawl = use_firecrawl
    if do_firecrawl:
        z, z_errors = fetch_zillow_zestimate(address)
        if z:
            estimates["zillow_zestimate"] = z
            sources_used.append(f"Zillow Zestimate (${z:,.0f})")
        errors.extend(z_errors)
        r, r_errors = fetch_redfin_estimate(address)
        if r:
            estimates["redfin_estimate"] = r
            sources_used.append(f"Redfin Estimate (${r:,.0f})")
        errors.extend(r_errors)

    # 4. No usable estimate → UNKNOWN
    if not any(estimates.values()):
        # Bug 6: an unvalidated but suspiciously low listing surfaces the
        # distressed hypotheses (foreclosure / tax_deed / REO / pre-foreclosure
        # short sale). Only the price threshold is checked here; the
        # neighborhood class just changes the wording of the note.
        suspicious_low_listing = listing_price < 150_000
        class_key = (neighborhood_class or "").upper()
        possible = []
        investigation = []
        recommendation_text = (
            "No se pudo validar el precio contra fuentes de mercado confiables. "
            "Activar ENABLE_FIRECRAWL_PRICE_CHECK + ENABLE_FIRECRAWL_COMPS en .env "
            "o esperar el tax_assessed scraper para validacion automatica. "
            "Considera lookup manual en Zillow/Redfin antes de proceder."
        )
        if suspicious_low_listing:
            # A <$150K SFR listing in Florida is treated as statistically
            # rare, so the distressed hypotheses come first, followed by the
            # general reasons.
            possible = DISTRESSED_HYPOTHESIS_REASONS + POSSIBLE_RED_FLAG_REASONS
            investigation = list(MANDATORY_INVESTIGATION_LIST)
            if class_key in ("C", "D"):
                class_note = (
                    f"Vecindario Class {neighborhood_class} (income bajo) — listing en este rango "
                    "puede ser market-rate. Pero foreclosure tampoco esta descartado: en Class D "
                    "FL, el porcentaje de foreclosures es ~3x el promedio nacional."
                )
            elif class_key in ("A", "B"):
                class_note = (
                    f"Vecindario Class {neighborhood_class} (income medio/alto) — listing tan bajo "
                    "es PROBABLEMENTE deal distressed. Investigar lis pendens en CCIS antes de proceder."
                )
            else:
                # neighborhood_class unknown
                class_note = (
                    "Neighborhood class no disponible — no se puede inferir si el listing es "
                    "market-rate-para-la-zona o distressed."
                )
            recommendation_text = (
                f"⚠️ Listing ${listing_price:,.0f} es estadisticamente raro para SFR en Florida "
                f"(<$150K). {class_note} "
                "HIPOTESIS PRIMARIA: deal distressed (foreclosure, tax_deed, REO, short sale, "
                "pre-foreclosure). Re-verificar deal_type del usuario, hacer court records lookup "
                "(lis pendens en CCIS del condado), y tratar este analisis como PRELIMINAR hasta "
                "confirmar el status real."
            )
        return {
            "status": "UNKNOWN",
            "listing_price": int(listing_price),
            "market_estimates": estimates,
            "max_discrepancy_pct": None,
            "min_discrepancy_pct": None,
            "possible_reasons": possible,
            "mandatory_investigation": investigation,
            "recommendation": recommendation_text,
            "sources_used": sources_used,
            "rejected_sources": rejected_sources,
            "suspicious_low_listing": suspicious_low_listing,
            "errors": errors,
            "fetched_at": fetched_at,
        }

    # Discrepancy % vs each estimate (negative = listing < market, positive = listing > market)
    discrepancies = {}
    for src, val in estimates.items():
        if val:
            disc_pct = (listing_price - val) / val * 100
            discrepancies[src] = round(disc_pct, 1)

    # Largest ABSOLUTE discrepancy = the most alarming one (above or below market)
    max_abs_disc = max(abs(d) for d in discrepancies.values())
    # Same discrepancy with its sign, to report the direction
    signed_max = max(discrepancies.values(), key=abs)

    # Status
    if max_abs_disc < NORMAL_THRESHOLD_PCT:
        status = "NORMAL"
        recommendation = (
            f"Listing dentro de ±{NORMAL_THRESHOLD_PCT}% de market estimates. "
            "Procede con analisis financiero estandar."
        )
        possible_reasons = []
        investigation = []
    elif max_abs_disc < WARNING_THRESHOLD_PCT:
        status = "WARNING"
        direction = "sobre" if signed_max > 0 else "bajo"
        recommendation = (
            f"Listing {abs(signed_max):.0f}% {direction} el market estimate. "
            "Verifica condiciones del deal antes de proceder. "
            "Si listing > market: probable inflación del seller. "
            "Si listing < market: investigar razon (motivacion legitima vs problema oculto)."
        )
        possible_reasons = []
        investigation = []
    else:
        status = "CRITICAL_RED_FLAG"
        if signed_max < 0:
            # Listing < market — the dangerous hidden-problem case
            recommendation = (
                f"🚨 LISTING ${listing_price:,.0f} esta {abs(signed_max):.0f}% BAJO el market estimate. "
                "Esto NO es un 'gran deal' por default — es una RED FLAG masiva. "
                "El precio bajo casi siempre indica problema oculto heredable. "
                "NO procedas con analisis financiero estandar hasta entender el POR QUE del precio bajo. "
                "Cap rate alto en este contexto puede ser ilusion — los costos heredables pueden destruir el deal."
            )
            # Copies, so callers cannot mutate the module-level lists.
            possible_reasons = list(POSSIBLE_RED_FLAG_REASONS)
            investigation = list(MANDATORY_INVESTIGATION_LIST)
        else:
            # Listing > market — classic inflated asking price, not dangerous
            recommendation = (
                f"Listing ${listing_price:,.0f} esta {abs(signed_max):.0f}% SOBRE el market estimate. "
                "Probable inflacion del seller. Oferta agresiva justificada. "
                "Si declinan, walk away — hay deals mejores."
            )
            possible_reasons = []
            investigation = []

    return {
        "status": status,
        "listing_price": int(listing_price),
        "market_estimates": estimates,
        "discrepancies_pct": discrepancies,
        "max_discrepancy_pct": round(max_abs_disc, 1),
        "signed_max_discrepancy_pct": round(signed_max, 1),
        "direction": "listing_BELOW_market" if signed_max < 0 else "listing_ABOVE_market" if signed_max > 0 else "match",
        "possible_reasons": possible_reasons,
        "mandatory_investigation": investigation,
        "recommendation": recommendation,
        "sources_used": sources_used,
        # Previously dropped here even though it was tracked above.
        "rejected_sources": rejected_sources,
        "errors": errors,
        "fetched_at": fetched_at,
    }
+545
View File
@@ -0,0 +1,545 @@
"""data_fetchers/property_appraiser.py — Unified PA router.
Source of Truth para CUALQUIER propiedad en USA. El pre-screening llama a
esta funcion COMO PRIMER paso para evitar inferencias erroneas sobre listing
data (Zillow puede mentir, ser viejo, o estar incompleto).
USAGE:
from data_fetchers.property_appraiser import fetch_pa_record, is_pa_supported
if is_pa_supported(county_name, state):
record = fetch_pa_record(
address="2352 SCENIC VIEW CT",
parcel_id=None,
county_name="Duval",
state="FL",
zip_code="32218",
)
UNIFIED RETURN SCHEMA (cada adapter llena lo que pueda; campos faltantes = None):
{
# Identity
"parcel_id": str, # County-specific RE#/folio
"owner_name": str, # Primary owner
"co_owners": [str], # If multiple
"mailing_address": str,
"site_address": str, # Property address
"owner_address_mismatch": bool, # mailing != site (absentee owner)
# Building
"year_built": int,
"effective_year_built": int | None, # If county exposes renovations
"sqft_heated": int,
"sqft_total": int,
"lot_acres": float,
"lot_total_sqft": int,
"bedrooms": int,
"baths": float,
"stories": float,
"building_type": str,
"roof_type": str,
"exterior_wall": str,
"interior_wall": str,
"int_flooring": str,
# Values
"just_value_current": int,
"assessed_value_current": int,
"taxable_value_current": int,
"exemption_current": int,
"just_value_last": int,
"assessed_value_last": int,
"taxes_paid_last": float,
"tax_year_current": int,
"tax_year_last": int,
# Owner signals
"homestead_active": bool, # primary residence flag
"homestead_amount": int,
# Sales history (most recent first)
"sales_history": [
{
"date": "MM/DD/YYYY",
"price": int,
"deed_type": str, # "Warranty Deed", "Quit Claim", etc.
"qualified": str, # "Qualified" | "Unqualified"
"vacant_improved": str, # "Vacant" | "Improved"
"book_page": str,
}, ...
],
# Improvements / permits (when county exposes)
"improvements": [
{"year": int, "type": str, "permit": str | None}, ...
],
# Computed signals
"most_recent_qualified_sale": dict | None,
"renovation_signal": { # Heuristic flip/renov detection
"is_flip_pattern": bool,
"evidence": str,
"value_increase_pct": float,
"months_between": int,
},
# Land
"zoning": str,
"use_code": str,
"use_description": str,
# Metadata
"county": str,
"state": str,
"source": str, # "Duval PA (paopropertysearch.coj.net)" etc.
"source_url": str,
"fetched_at": ISO timestamp,
"errors": [str],
}
"""
from __future__ import annotations
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# County → adapter mapping
# ════════════════════════════════════════════════════════════════════════════
# Counties that have a FULL extractor (one returning the rich unified schema).
# When another county gets an adapter, register it here and add the matching
# branch in fetch_pa_record.
_SUPPORTED_COUNTIES: dict[str, str] = {
# Key: lower-cased county name without the " county" suffix; value: state code.
"duval": "FL",
"broward": "FL",
"miami-dade": "FL",
"palm beach": "FL",
}
def is_pa_supported(county_name: Optional[str], state: Optional[str]) -> bool:
    """Return True when a full Property Appraiser extractor exists for the county.

    The county name is lower-cased, a " county" suffix is dropped and
    surrounding whitespace is trimmed before the lookup. The spelling
    "miami dade" (no hyphen) is accepted as an alias of "miami-dade";
    ``fetch_pa_record`` already routes both spellings, but the gate used to
    reject the unhyphenated one, which made that route unreachable.

    Args:
        county_name: County name such as "Duval" or "Broward County".
        state: State code; compared case-insensitively, ignoring surrounding
            whitespace.

    Returns:
        True only when the county is registered in ``_SUPPORTED_COUNTIES`` for
        that state. False for empty or missing arguments.
    """
    if not county_name or not state:
        return False
    key = county_name.lower().replace(" county", "").strip()
    if key == "miami dade":
        # Canonical registry key is hyphenated.
        key = "miami-dade"
    return _SUPPORTED_COUNTIES.get(key) == state.strip().upper()
def list_supported_counties() -> list[tuple[str, str]]:
    """Return the registered counties as ``(county_key, state_code)`` pairs.

    The pairs are taken straight from ``_SUPPORTED_COUNTIES`` in registration
    order; a new list is built on every call.
    """
    return list(_SUPPORTED_COUNTIES.items())
# ════════════════════════════════════════════════════════════════════════════
# Main entry point
# ════════════════════════════════════════════════════════════════════════════
def fetch_pa_record(
    *,
    county_name: Optional[str],
    state: Optional[str] = "FL",
    address: Optional[str] = None,
    parcel_id: Optional[str] = None,
    zip_code: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> Optional[dict]:
    """Route a Property Appraiser lookup to the adapter of the given county.

    Args:
        county_name: County name, with or without a trailing " County".
        state: State code, checked through ``is_pa_supported``.
        address: Street address; forwarded to the Duval and Miami-Dade adapters.
        parcel_id: County parcel/folio identifier; forwarded to every adapter.
        zip_code: ZIP code; forwarded to the Duval adapter only.
        timeout_seconds: Timeout value handed to the adapter.
        listing_price: Optional asking price, forwarded to the Duval,
            Miami-Dade and Palm Beach adapters. Per the original note it
            enables flip-in-progress detection (a recent qualified sale far
            below the listing price).

    Returns:
        The adapter's unified dict, or ``None`` when the county is not
        supported.
    """
    if not is_pa_supported(county_name, state):
        return None

    county_key = county_name.lower().replace(" county", "").strip()
    record: Optional[dict] = None
    if county_key == "duval":
        record = _fetch_duval(
            address=address,
            parcel_id=parcel_id,
            zip_code=zip_code,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        )
    elif county_key == "broward":
        # The Broward adapter only takes the folio and the timeout.
        record = _fetch_broward(parcel_id=parcel_id, timeout_seconds=timeout_seconds)
    elif county_key in ("miami-dade", "miami dade"):
        record = _fetch_miami_dade(
            parcel_id=parcel_id,
            address=address,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        )
    elif county_key == "palm beach":
        record = _fetch_palm_beach(
            parcel_id=parcel_id,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        )
    return record
# ════════════════════════════════════════════════════════════════════════════
# Adapter wrappers (normalize per-county output to unified schema)
# ════════════════════════════════════════════════════════════════════════════
def _fetch_duval(
    *,
    address: Optional[str],
    parcel_id: Optional[str],
    zip_code: Optional[str],
    timeout_seconds: int,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch a Duval County record and normalize it to the unified PA schema.

    The lookup is delegated to ``data_fetchers.pa_duval.fetch_duval_pa_record``,
    imported lazily so that a missing adapter degrades to an error dict.

    Args:
        address: Street address forwarded to the adapter.
        parcel_id: Parcel identifier forwarded to the adapter.
        zip_code: ZIP code forwarded to the adapter.
        timeout_seconds: Timeout forwarded to the adapter.
        listing_price: Optional asking price forwarded to the adapter.

    Returns:
        The unified dict, with the adapter's raw record under ``"_raw"``. When
        the adapter module cannot be imported, or the adapter does not return
        a dict, only ``county``, ``state`` and ``errors`` are present.
    """
    try:
        from data_fetchers.pa_duval import fetch_duval_pa_record
    except ImportError as e:
        return {
            "county": "Duval",
            "state": "FL",
            "errors": [f"pa_duval module import failed: {e}"],
        }
    raw = fetch_duval_pa_record(
        address=address,
        parcel_id=parcel_id,
        zip_code=zip_code,
        timeout_seconds=timeout_seconds,
        listing_price=listing_price,
    )
    if not isinstance(raw, dict):
        # Without this guard every raw.get() below raises AttributeError.
        return {
            "county": "Duval",
            "state": "FL",
            "errors": ["pa_duval returned no record"],
        }

    # Site address arrives split over two lines; missing parts are skipped and
    # an entirely missing address is reported as None, like every other
    # missing field of the unified schema.
    site_addr = " ".join(
        filter(None, [raw.get("site_address_line1"), raw.get("site_address_line2")])
    ).strip()

    # Bedrooms may come back as a number or as text such as "3.0"; anything
    # unparseable (or zero) becomes None instead of raising.
    try:
        bedrooms = int(float(raw.get("bedrooms") or 0)) or None
    except (TypeError, ValueError):
        bedrooms = None

    land = raw.get("land") or {}
    return {
        # Identity
        "parcel_id": raw.get("parcel_id"),
        "owner_name": raw.get("owner_name"),
        "co_owners": [],  # Duval shows one owner; multi-owner detection pending
        "mailing_address": None,  # not in current detail extraction
        "site_address": site_addr or None,
        "owner_address_mismatch": None,
        # Building
        "year_built": raw.get("year_built"),
        "effective_year_built": None,  # Duval doesn't expose explicitly
        "sqft_heated": raw.get("sqft_heated"),
        "sqft_total": raw.get("sqft_gross"),
        "lot_acres": None,
        "lot_total_sqft": raw.get("lot_total_sqft"),
        "bedrooms": bedrooms,
        "baths": raw.get("baths"),
        "stories": raw.get("stories"),
        "building_type": raw.get("building_type"),
        "roof_type": raw.get("roof_struct"),
        "roofing_cover": raw.get("roofing_cover"),
        "exterior_wall": raw.get("exterior_wall"),
        "interior_wall": raw.get("interior_wall"),
        "int_flooring": raw.get("int_flooring"),
        # Values
        "just_value_current": raw.get("tax_current_year_just"),
        "assessed_value_current": raw.get("tax_current_year_assessed"),
        "taxable_value_current": raw.get("tax_current_year_taxable"),
        "exemption_current": raw.get("tax_current_year_exemptions"),
        "just_value_last": raw.get("tax_last_year_just"),
        "assessed_value_last": raw.get("tax_last_year_assessed"),
        "taxable_value_last": raw.get("tax_last_year_taxable"),
        "taxes_paid_last": None,  # Duval doesn't show direct tax amount here
        "tax_year_current": None,
        "tax_year_last": None,
        # Owner signals
        "homestead_active": raw.get("homestead_active"),
        "homestead_amount": raw.get("homestead_amount_current"),
        # Sales history
        "sales_history": raw.get("sales_history", []),
        "most_recent_qualified_sale": raw.get("most_recent_qualified_sale"),
        # Computed signals
        "renovation_signal": raw.get("renovation_signal"),
        # Improvements: not exposed formally by Duval (backlog item)
        "improvements": [],
        # Land
        "zoning": land.get("zoning"),
        "use_code": raw.get("property_use") or "",
        "use_description": land.get("use_description"),
        "subdivision": raw.get("subdivision"),
        "legal_description": None,  # raw legal grid is not flattened here
        # Metadata
        "county": "Duval",
        "state": "FL",
        "source": raw.get("source"),
        "source_url": raw.get("source_url"),
        "fetched_at": raw.get("fetched_at"),
        "errors": raw.get("errors", []),
        # Raw passthrough for advanced consumers
        "_raw": raw,
    }
def _fetch_broward(*, parcel_id: Optional[str], timeout_seconds: int) -> dict:
    """Fetch a Broward County record and normalize it to the unified PA schema.

    The lookup is delegated to
    ``data_fetchers.pa_broward.fetch_broward_pa_record`` (imported lazily).

    Args:
        parcel_id: Broward folio number. Required; address search is not
            supported by this adapter.
        timeout_seconds: Timeout forwarded to the adapter.

    Returns:
        The unified dict, with the adapter's raw record under ``"_raw"``. When
        the folio is missing, the adapter module cannot be imported, or the
        adapter does not return a dict, only ``county``, ``state`` and
        ``errors`` are present.
    """
    if not parcel_id:
        return {
            "county": "Broward",
            "state": "FL",
            "errors": ["Broward PA needs parcel_id (folio); address search not yet supported"],
        }
    try:
        from data_fetchers.pa_broward import fetch_broward_pa_record
    except ImportError as e:
        return {
            "county": "Broward",
            "state": "FL",
            "errors": [f"pa_broward module import failed: {e}"],
        }
    raw = fetch_broward_pa_record(parcel_id, timeout_seconds=timeout_seconds)
    if not isinstance(raw, dict):
        # Without this guard every raw.get() below raises AttributeError.
        return {
            "county": "Broward",
            "state": "FL",
            "errors": ["pa_broward returned no record"],
        }

    cy = raw.get("current_year") or {}
    ly = raw.get("last_year") or {}
    # "county" may be present but None; treat that like a missing section.
    county_tax = (raw.get("tax_breakdown") or {}).get("county") or {}

    # The owner name may continue on a second line.
    owner_full = raw.get("owner_name") or ""
    if raw.get("owner_name_2"):
        owner_full = f"{owner_full} {raw['owner_name_2']}".strip()

    # Absentee-owner heuristic: compare only the first token of each address.
    # Tokenizing first keeps whitespace-only addresses from raising IndexError.
    mailing_tokens = (raw.get("mailing_address") or "").upper().split()
    site_tokens = (raw.get("situs_address") or "").upper().split()
    owner_addr_mismatch = bool(
        mailing_tokens and site_tokens and mailing_tokens[0] != site_tokens[0]
    )

    def _number(text: str) -> Optional[float]:
        """Parse a plain non-negative decimal such as "3" or "2.50", else None."""
        text = text.strip()
        if not text.replace(".", "").isdigit():
            return None
        try:
            return float(text)
        except ValueError:  # e.g. "1.2.3"
            return None

    # units_beds_baths is a "units / beds / baths" triple.
    beds = baths = None
    ubb = (raw.get("units_beds_baths") or "").split("/")
    if len(ubb) >= 3:
        beds_value = _number(ubb[1])
        # Accept "3" and "3.0" as 3; a fractional bed count stays None.
        if beds_value is not None and beds_value.is_integer():
            beds = int(beds_value)
        baths = _number(ubb[2])

    return {
        "parcel_id": raw.get("folio_number"),
        "owner_name": owner_full,
        "co_owners": [],
        "mailing_address": raw.get("mailing_address"),
        "site_address": raw.get("situs_address"),
        "owner_address_mismatch": owner_addr_mismatch,
        # Building
        "year_built": raw.get("year_built"),
        "effective_year_built": raw.get("effective_year"),
        "sqft_heated": raw.get("under_air_sqft"),
        "sqft_total": raw.get("adj_bldg_sqft"),
        "lot_acres": None,
        "lot_total_sqft": None,
        "bedrooms": beds,
        "baths": baths,
        "stories": None,
        "building_type": raw.get("use_code"),
        "roof_type": None,
        "roofing_cover": None,
        "exterior_wall": None,
        "interior_wall": None,
        "int_flooring": None,
        # Values
        "just_value_current": cy.get("just_value"),
        "assessed_value_current": cy.get("assessed_value"),
        "taxable_value_current": county_tax.get("taxable"),
        "exemption_current": county_tax.get("homestead", 0),
        "just_value_last": ly.get("just_value"),
        "assessed_value_last": ly.get("assessed_value"),
        "taxable_value_last": None,
        "taxes_paid_last": ly.get("taxes_paid"),
        "tax_year_current": cy.get("tax_year"),
        "tax_year_last": ly.get("tax_year"),
        # Owner signals
        "homestead_active": raw.get("homestead_active"),
        "homestead_amount": county_tax.get("homestead", 0),
        # Sales history
        "sales_history": raw.get("sales_history", []),
        "most_recent_qualified_sale": None,  # not separately calculated in pa_broward
        # Computed
        "renovation_signal": None,  # pa_broward doesn't compute this yet
        "improvements": [],
        # Land
        "zoning": None,
        "use_code": raw.get("use_code"),
        "use_description": raw.get("use_code"),
        "subdivision": raw.get("neighborhood"),
        "legal_description": raw.get("legal_description"),
        # Metadata
        "county": "Broward",
        "state": "FL",
        "source": "Broward Property Appraiser (bcpa.net)",
        "source_url": raw.get("source_url"),
        "fetched_at": raw.get("fetched_at"),
        "errors": raw.get("errors", []),
        "_raw": raw,
    }
def _fetch_palm_beach(
    *,
    parcel_id: Optional[str],
    timeout_seconds: int,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch a Palm Beach County record and map it onto the unified PA schema.

    The lookup is delegated to
    ``data_fetchers.pa_palm_beach.fetch_palm_beach_pa_record`` (imported
    lazily).

    Args:
        parcel_id: Palm Beach parcel control number (PCN). Required; address
            search is not supported by this adapter.
        timeout_seconds: Timeout forwarded to the adapter.
        listing_price: Optional asking price forwarded to the adapter.

    Returns:
        The unified dict, with the adapter's raw record under ``"_raw"``. When
        the PCN is missing or the adapter module cannot be imported, only
        ``county``, ``state`` and ``errors`` are present.
    """
    if not parcel_id:
        return {
            "county": "Palm Beach",
            "state": "FL",
            "errors": ["Palm Beach PA needs parcel_id (PCN); address search not yet supported"],
        }
    try:
        from data_fetchers.pa_palm_beach import fetch_palm_beach_pa_record
    except ImportError as e:
        return {
            "county": "Palm Beach",
            "state": "FL",
            "errors": [f"pa_palm_beach module import failed: {e}"],
        }

    raw = fetch_palm_beach_pa_record(
        parcel_id=parcel_id,
        timeout_seconds=timeout_seconds,
        listing_price=listing_price,
    )
    pick = raw.get
    # The use code doubles as building type and use description.
    use_code = pick("use_code")

    # Per the original note: the flat parser exposes no separate mailing
    # address (so no absentee-owner check), and site_address may carry
    # "Municipality" noise.
    identity = {
        "parcel_id": pick("parcel_id"),
        "owner_name": pick("owner_name"),
        "co_owners": [],
        "mailing_address": None,
        "site_address": pick("site_address"),
        "owner_address_mismatch": None,
    }
    building = {
        "year_built": pick("year_built"),
        "effective_year_built": None,
        "sqft_heated": pick("sqft_heated"),
        "sqft_total": pick("sqft_total"),
        "lot_acres": pick("lot_acres"),
        "lot_total_sqft": None,
        "bedrooms": pick("bedrooms"),
        "baths": pick("baths"),
        "stories": None,
        "building_type": use_code,
        "roof_type": pick("roof_struct"),
        "roofing_cover": pick("roof_cover"),
        "exterior_wall": None,
        "interior_wall": pick("interior_wall"),
        "int_flooring": None,
    }
    values = {
        "just_value_current": pick("just_value_current"),
        "assessed_value_current": pick("assessed_value_current"),
        "taxable_value_current": None,
        "exemption_current": None,
        "just_value_last": pick("just_value_last"),
        "assessed_value_last": pick("assessed_value_last"),
        "taxable_value_last": None,
        "taxes_paid_last": None,
        "tax_year_current": pick("tax_year_current"),
        "tax_year_last": pick("tax_year_last"),
    }
    owner_signals = {
        "homestead_active": pick("homestead_active"),
        "homestead_amount": None,
    }
    sales = {
        "sales_history": pick("sales_history", []),
        "most_recent_qualified_sale": pick("most_recent_qualified_sale"),
        "renovation_signal": pick("renovation_signal"),
        "improvements": [],
    }
    land = {
        "zoning": pick("zoning"),
        "use_code": use_code,
        "use_description": use_code,
        "subdivision": pick("subdivision"),
        "legal_description": pick("legal_description"),
    }
    metadata = {
        "county": "Palm Beach",
        "state": "FL",
        "source": pick("source"),
        "source_url": pick("source_url"),
        "fetched_at": pick("fetched_at"),
        "errors": pick("errors", []),
    }
    # Sections are merged in schema order; the raw record rides along last.
    return {
        **identity,
        **building,
        **values,
        **owner_signals,
        **sales,
        **land,
        **metadata,
        "_raw": raw,
    }
def _fetch_miami_dade(
    *,
    parcel_id: Optional[str],
    address: Optional[str],
    timeout_seconds: int,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch a Miami-Dade County record and map it onto the unified PA schema.

    The lookup is delegated to
    ``data_fetchers.pa_miami_dade.fetch_miami_dade_pa_record`` (imported
    lazily).

    Args:
        parcel_id: Parcel/folio identifier forwarded to the adapter.
        address: Street address forwarded to the adapter.
        timeout_seconds: Timeout forwarded to the adapter.
        listing_price: Optional asking price forwarded to the adapter.

    Returns:
        The unified dict, with the adapter's raw record under ``"_raw"``. When
        the adapter module cannot be imported, only ``county``, ``state`` and
        ``errors`` are present.
    """
    try:
        from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
    except ImportError as e:
        return {
            "county": "Miami-Dade",
            "state": "FL",
            "errors": [f"pa_miami_dade module import failed: {e}"],
        }

    raw = fetch_miami_dade_pa_record(
        parcel_id=parcel_id,
        address=address,
        timeout_seconds=timeout_seconds,
        listing_price=listing_price,
    )
    pick = raw.get
    use_code = pick("use_code")

    identity = {
        "parcel_id": pick("parcel_id"),
        "owner_name": pick("owner_name"),
        "co_owners": pick("co_owners", []),
        "mailing_address": pick("mailing_address"),
        "site_address": pick("site_address"),
        # No site-vs-mailing comparison is computed for this county yet.
        "owner_address_mismatch": None,
    }
    building = {
        "year_built": pick("year_built"),
        "effective_year_built": None,  # not exposed explicitly by the adapter
        "sqft_heated": pick("sqft_heated"),
        "sqft_total": pick("sqft_total"),
        "lot_acres": None,
        "lot_total_sqft": pick("lot_total_sqft"),
        "bedrooms": pick("bedrooms"),
        "baths": pick("baths"),
        "stories": pick("floors"),
        "building_type": use_code,
        "roof_type": None,
        "roofing_cover": None,
        "exterior_wall": None,
        "interior_wall": None,
        "int_flooring": None,
    }
    values = {
        "just_value_current": pick("just_value_current"),
        "assessed_value_current": pick("assessed_value_current"),
        "taxable_value_current": None,  # taxable-value section not parsed yet
        "exemption_current": None,
        "just_value_last": pick("just_value_last"),
        "assessed_value_last": pick("assessed_value_last"),
        "taxable_value_last": None,
        "taxes_paid_last": None,
        "tax_year_current": pick("tax_year_current"),
        "tax_year_last": pick("tax_year_last"),
    }
    owner_signals = {
        "homestead_active": pick("homestead_active"),
        "homestead_amount": None,
    }
    sales = {
        "sales_history": pick("sales_history", []),
        "most_recent_qualified_sale": pick("most_recent_qualified_sale"),
        "renovation_signal": pick("renovation_signal"),
        "improvements": [],
    }
    land = {
        "zoning": pick("pa_primary_zone"),
        "use_code": use_code,
        "use_description": pick("use_description"),
        "subdivision": pick("subdivision"),
        "legal_description": pick("legal_description"),
    }
    metadata = {
        "county": "Miami-Dade",
        "state": "FL",
        "source": pick("source"),
        "source_url": pick("source_url"),
        "fetched_at": pick("fetched_at"),
        "errors": pick("errors", []),
    }
    # Sections are merged in schema order; the raw record rides along last.
    return {
        **identity,
        **building,
        **values,
        **owner_signals,
        **sales,
        **land,
        **metadata,
        "_raw": raw,
    }
+682
View File
@@ -0,0 +1,682 @@
"""Property value fetcher para Wave 2 (ValueEstimator).
Objetivo: estimar valor real de una propiedad combinando:
1. Tax assessed value (gratis, county-specific scraping)
2. Comparables recently sold (Firecrawl, MAX 5 OPT-IN para no quemar credits)
3. Deductions automaticas por edad del inmueble (AC, roof, plumbing, panel)
OUTPUT consolidado:
{
"listing_price": int,
"tax_assessed_value": int | None,
"comps_used": [{address, sold_price, sold_date, sqft, $/sqft}, ...],
"estimated_value": {"low": int, "mid": int, "high": int, "confidence": str},
"price_per_sqft_comps_avg": float | None,
"price_per_sqft_subject": float,
"overpriced_pct": float | None,
"inflation_score": float, # 0-10
"deductions": {"ac": int, "roof": int, "plumbing": int, "panel": int, "total": int},
"market_trend": {"direction": str, "evidence": str},
"sources_used": [...],
"fetch_errors": [...],
}
FAIL-SOFT:
- Sin Firecrawl key o ENABLE_FIRECRAWL_COMPS=false → comps_used=[], confidence baja
- Sin tax assessed → tax_assessed_value=None, fallback a comps
- Sin nada → estimacion basada SOLO en deductions vs listing
"""
from __future__ import annotations
import os
import re
from datetime import datetime, timezone
from typing import Optional
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
# ─── Age-based deductions (Florida real estate norms, per the original note) ──
# Amounts applied by calculate_age_deductions; presumably US dollars — confirm.
DEDUCTION_AC = 6_000 # Old central AC (built <2010) with no evidence of a new unit
DEDUCTION_ROOF = 10_000 # Old shingle roof (built <2005)
DEDUCTION_PLUMBING_POLYBUTYLENE = 12_000 # Polybutylene plumbing risk (1978-1995 FL)
DEDUCTION_ELECTRICAL_PANEL = 5_000 # Federal Pacific / Zinsco panels (<1990)
# ─── Keywords that mark an item as renewed (suppress the matching deduction) ──
# Bug fix 2026-05-15: the system used to check only "new ac"/"ac nuevo".
# It now detects the variants that really show up in listings: "BRAND NEW ROOF",
# "Updated/Remodeled", "Fully updated throughout", "AC replaced 2023", etc.
# Keys are the deduction items; values are matched as lower-case substrings.
NEW_ITEM_KEYWORDS = {
"ac": [
"new ac", "new a/c", "new hvac", "new a.c.", "new air conditioning",
"ac replaced", "a/c replaced", "hvac replaced", "ac unit replaced",
"new air handler", "ac nuevo", "a/c nuevo", "hvac nuevo",
"ac (20", "ac 20", "a/c (20", # "AC (2023)" or "AC 2023"
"newer ac", "newer a/c", "newer hvac",
"recently replaced ac", "recently replaced a/c", "recently replaced hvac",
],
"roof": [
"new roof", "brand new roof", "brand-new roof", "newer roof",
"roof replaced", "roof recently replaced", "recent roof",
"roof 20", "roof (20", "new shingles", "metal roof installed",
"roof installed 20", "roof nuevo", "techo nuevo",
"replaced roof", "roof was replaced", "roof replacement",
],
"plumbing": [
"re-piped", "repiped", "re piped", "pex plumbing", "new plumbing",
"plumbing replaced", "plumbing nuevo", "fully repiped",
"copper plumbing", "plumbing updated", "new pipes",
],
"panel": [
"new panel", "panel upgraded", "panel replaced", "new electrical",
"electrical updated", "200 amp", "new wiring", "rewired",
"panel nuevo", "panel electrico nuevo", "upgraded electrical",
],
}
# Global keywords meaning "fully renovated" → ZERO deductions for every item.
RENOVATED_GLOBAL_KEYWORDS = [
"updated/remodeled", # Zillow explicit condition tag
"fully updated", "fully renovated", "completely renovated", "completely remodeled",
"move-in ready", "move in ready", "turnkey", "turn key", "turn-key",
"totally renovated", "totally updated",
"renovacion completa", "completamente renovada", "lista para mudarse",
"newly renovated", "newly remodeled",
"fully remodeled",
]
# condition_status values (compared lower-cased and stripped, exact match)
# that mark the property as renovated.
RENOVATED_CONDITION_VALUES = {
"updated/remodeled", "remodeled", "renovated", "updated",
"new construction", "newly built",
}
def _matches_any_keyword(text: str, keywords: list[str]) -> Optional[str]:
    """Return the first keyword found in *text*, or None.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word. Keywords are tried in list order and the keyword is
    returned exactly as spelled in *keywords*. Empty or missing text never
    matches.
    """
    if not text:
        return None
    haystack = text.lower()
    return next((kw for kw in keywords if kw.lower() in haystack), None)
# ─── Comps estimation ──────────────────────────────────────────────────────
# COMPS_MAX_COUNT: default cap on comps returned by fetch_zillow_comps.
# COMPS_LOOKBACK_DAYS: presumably the sold-date window for comps; not
#   referenced in the code visible here — TODO confirm where it is applied.
# COMPS_SQFT_TOLERANCE_PCT: allowed +/- sqft deviation from the subject.
COMPS_MAX_COUNT = 5
COMPS_LOOKBACK_DAYS = 180
COMPS_SQFT_TOLERANCE_PCT = 0.15
PRICE_LOW_PCT = 0.92 # estimated_value['low'] = mid * 0.92
PRICE_HIGH_PCT = 1.08 # estimated_value['high'] = mid * 1.08
def calculate_age_deductions(
    year_built: int,
    photo_findings_text: str = "",
    listing_description: str = "",
    condition_status: str = "",
    features_special: Optional[list] = None,
) -> dict:
    """Compute automatic value deductions driven by the age of the property.

    Three stages, in order:

    1. If ``condition_status`` (lower-cased, stripped) is one of
       ``RENOVATED_CONDITION_VALUES``, every deduction is skipped.
    2. If the combined listing text contains one of
       ``RENOVATED_GLOBAL_KEYWORDS``, every deduction is skipped.
    3. Otherwise each item is charged when the construction year triggers it
       and no item-specific "renewed" keyword is found in the combined text:
       AC for year < 2010, roof for year < 2005, polybutylene plumbing for
       1978 <= year <= 1995, electrical panel for year < 1990.

    Args:
        year_built: Construction year. A falsy or non-positive value yields
            zero deductions (after the two global checks).
        photo_findings_text: Photo-inspection notes; may mention renewed items.
        listing_description: Full listing description.
        condition_status: Listing condition tag, e.g. "Updated/Remodeled".
        features_special: Highlight tags of the listing, e.g.
            ["BRAND NEW ROOF", "NEW AC"].

    Returns:
        Dict with ``ac``, ``roof``, ``plumbing``, ``panel``, ``total``,
        ``_skipped_global``, ``_suppressed_items`` and ``_reasons``. When a
        global skip applies, ``_skip_reason`` is included as well and every
        item is listed as suppressed.
    """
    items = ("ac", "roof", "plumbing", "panel")
    amounts: dict = dict.fromkeys(items, 0)

    # One haystack for all keyword checks.
    haystack = " ".join([
        photo_findings_text or "",
        listing_description or "",
        " ".join(features_special or []),
    ])

    def _all_skipped(skip_reason: str, item_reason: str) -> dict:
        """Build the result used when every deduction is waived."""
        return {
            **amounts, "total": 0,
            "_skipped_global": True,
            "_skip_reason": skip_reason,
            "_suppressed_items": list(items),
            "_reasons": {item: item_reason for item in items},
        }

    # Stage 1: explicit condition tag.
    normalized_status = (condition_status or "").lower().strip()
    if normalized_status in RENOVATED_CONDITION_VALUES:
        return _all_skipped(
            f"condition_status='{condition_status}' (Zillow tag)",
            f"condition={condition_status}",
        )

    # Stage 2: "fully renovated"-style wording anywhere in the texts.
    global_hit = _matches_any_keyword(haystack, RENOVATED_GLOBAL_KEYWORDS)
    if global_hit:
        return _all_skipped(
            f"keyword '{global_hit}' detected in listing",
            f"keyword:{global_hit}",
        )

    suppressed: list[str] = []
    reasons: dict[str, str] = {}

    # Stage 3: per-item rules; skipped entirely for an unknown year.
    if year_built and year_built > 0:
        rules = (
            ("ac", year_built < 2010, DEDUCTION_AC),
            ("roof", year_built < 2005, DEDUCTION_ROOF),
            ("plumbing", 1978 <= year_built <= 1995, DEDUCTION_PLUMBING_POLYBUTYLENE),
            ("panel", year_built < 1990, DEDUCTION_ELECTRICAL_PANEL),
        )
        for item, age_triggers, amount in rules:
            if not age_triggers:
                continue
            item_hit = _matches_any_keyword(haystack, NEW_ITEM_KEYWORDS[item])
            if item_hit:
                suppressed.append(item)
                reasons[item] = f"keyword:{item_hit}"
            else:
                amounts[item] = amount

    amounts["total"] = sum(amounts[item] for item in items)
    amounts["_skipped_global"] = False
    amounts["_suppressed_items"] = suppressed
    amounts["_reasons"] = reasons
    return amounts
# ═══════════════════════════════════════════════════════════════════════════
# Tax Assessed Value (county-specific scrapers)
# ═══════════════════════════════════════════════════════════════════════════
def fetch_tax_assessed_miami_dade(address: str) -> Optional[dict]:
    """Placeholder for a Miami-Dade property-appraiser scraper; always None.

    TODO Wave 2 follow-up: implement Playwright scraping of
    https://www.miamidade.gov/pa/property_search.asp

    Planned steps (from the original note):
        1. Search by address.
        2. Parse the results and open the property card.
        3. Extract assessed_value, market_value and sale_history.

    Original estimate: ~30s per lookup, cached for 30 days.

    Until this exists the function returns None and callers work from comps
    and deductions without a tax-assessed value.
    """
    return None
# Explicit set of counties that HAVE a real scraper (not a stub).
# Today: Broward works through pa_broward.py (full PA record extraction).
# Pending: Miami-Dade, Palm Beach, Orange, Hillsborough, Pinellas... (custom adapters)
# qPublic (~30 rural counties) — blocked by Cloudflare
# Entries are lower-cased county names without the " county" suffix.
_TAX_ASSESSED_IMPLEMENTED_COUNTIES: set[str] = {
"broward",
}
def is_tax_assessed_supported(county_name: Optional[str], state: Optional[str]) -> bool:
    """Return True when a real (non-stub) tax-assessed scraper exists.

    Lets callers tell "not implemented" (our decision) apart from "not found"
    (we searched and the property was not there).

    The state is compared case-insensitively and ignoring surrounding
    whitespace, so "fl" and " FL " are accepted like "FL". The county name is
    lower-cased and stripped of a " county" suffix before the lookup in
    ``_TAX_ASSESSED_IMPLEMENTED_COUNTIES``.

    Args:
        county_name: County name such as "Broward" or "Broward County".
        state: State code; only Florida is supported today.

    Returns:
        False for missing arguments, a non-Florida state or a county without
        a real scraper.
    """
    if not county_name or not state:
        return False
    if state.strip().upper() != "FL":
        return False
    cname = county_name.lower().replace(" county", "").strip()
    return cname in _TAX_ASSESSED_IMPLEMENTED_COUNTIES
def fetch_tax_assessed(
    address: str,
    county_name: Optional[str],
    state: Optional[str],
    parcel_id: Optional[str] = None,
) -> Optional[dict]:
    """Dispatch a tax-assessed lookup to the scraper of the given county.

    Args:
        address: Street address, used by adapters that support address search.
        county_name: County name, e.g. "Broward" or "Miami-Dade".
        state: State code; gated through ``is_tax_assessed_supported``.
        parcel_id: County-specific folio number. Required for Broward, whose
            adapter cannot search by address.

    Returns:
        The adapter's dict (legacy keys such as assessed_value, market_value,
        year_built, sqft, owner_name, source, plus extended fields when the
        adapter provides them), or ``None`` when the county is not
        implemented, the required folio is missing, or the adapter returned
        nothing. Use ``is_tax_assessed_supported`` to tell "not implemented"
        from "not found".
    """
    if is_tax_assessed_supported(county_name, state):
        county_key = (county_name or "").lower().replace(" county", "").strip()
        if county_key == "broward":
            # No folio means no reliable lookup for Broward.
            return _fetch_broward(parcel_id) if parcel_id else None
        if any(alias in county_key for alias in ("miami-dade", "miami dade")):
            return fetch_tax_assessed_miami_dade(address)
    # Unsupported county, or a supported one without a dispatch branch.
    return None
def _fetch_broward(parcel_id: str) -> Optional[dict]:
    """Fetch a Broward record and expose it under the legacy value contract.

    The lookup is delegated to
    ``data_fetchers.pa_broward.fetch_broward_pa_record`` (imported lazily).

    A record that reports errors is still used when it carries a just value.
    The value is looked up where this function itself reads it
    (``current_year`` / ``last_year``) in addition to the top-level
    ``just_value_current`` key, so a partial record is no longer discarded
    merely because ``errors`` is non-empty.

    Args:
        parcel_id: Broward folio number.

    Returns:
        Dict with the legacy keys (assessed_value, market_value, just_value,
        year_built, sqft, beds, baths, owner_name, source) plus extended
        fields, or ``None`` when the adapter cannot be imported, returns
        nothing, or failed without any just value.
    """
    try:
        from data_fetchers.pa_broward import fetch_broward_pa_record
    except ImportError:
        return None
    rec = fetch_broward_pa_record(parcel_id)
    if not rec:
        return None

    # "or {}" also covers keys that are present but hold None.
    cy = rec.get("current_year") or {}
    ly = rec.get("last_year") or {}

    has_value = bool(
        rec.get("just_value_current") or cy.get("just_value") or ly.get("just_value")
    )
    if rec.get("errors") and not has_value:
        # Adapter failed AND there is no usable value to fall back on.
        return None

    # The owner name may continue on a second line
    # (e.g. "BANK OF AMERICA NA TRSTEE" + "% CORP REAL ESTATE ASSMT").
    owner_full = rec.get("owner_name", "") or ""
    if rec.get("owner_name_2"):
        owner_full = f"{owner_full} {rec['owner_name_2']}".strip()

    def _number(text: str) -> Optional[float]:
        """Parse a plain non-negative decimal such as "3" or "2.50", else None."""
        text = text.strip()
        if not text.replace(".", "").isdigit():
            return None
        try:
            return float(text)
        except ValueError:  # e.g. "1.2.3"
            return None

    # units_beds_baths looks like "1 / 3 / 2.50" (units / beds / baths).
    beds = baths = None
    ubb = (rec.get("units_beds_baths") or "").split("/")
    if len(ubb) >= 3:
        beds_value = _number(ubb[1])
        # Accept "3" and "3.0" as 3; a fractional bed count stays None.
        if beds_value is not None and beds_value.is_integer():
            beds = int(beds_value)
        baths = _number(ubb[2])

    return {
        # ─── Legacy contract (back-compat with the orchestrator/LLM payload) ──
        "assessed_value": cy.get("assessed_value") or ly.get("assessed_value"),
        "market_value": cy.get("just_value") or ly.get("just_value"),
        "just_value": cy.get("just_value") or ly.get("just_value"),
        "year_built": rec.get("year_built"),
        "sqft": rec.get("under_air_sqft") or rec.get("adj_bldg_sqft"),
        "beds": beds,
        "baths": baths,
        "owner_name": owner_full,
        "source": "bcpa.net",
        # ─── Extended fields (Property Snapshot Report inputs) ────────────────
        "folio_number": rec.get("folio_number"),
        "mailing_address": rec.get("mailing_address"),
        "situs_address": rec.get("situs_address"),
        "neighborhood": rec.get("neighborhood"),
        "use_code": rec.get("use_code"),
        "millage_code": rec.get("millage_code"),
        "legal_description": rec.get("legal_description"),
        "adj_bldg_sqft": rec.get("adj_bldg_sqft"),
        "under_air_sqft": rec.get("under_air_sqft"),
        "effective_year": rec.get("effective_year"),
        "homestead_active": rec.get("homestead_active", False),
        "taxes_paid_last_year": ly.get("taxes_paid"),
        "tax_year_last": ly.get("tax_year"),
        "tax_year_current": cy.get("tax_year"),
        "current_year_values": cy,
        "last_year_values": ly,
        "two_years_ago_values": rec.get("two_years_ago", {}),
        "tax_breakdown": rec.get("tax_breakdown", {}),
        "sales_history": rec.get("sales_history", []),
        "photo_url": rec.get("photo_url"),
        "source_url": rec.get("source_url"),
        "source_api_url": rec.get("source_api_url"),
        "fetched_at": rec.get("fetched_at"),
    }
# ═══════════════════════════════════════════════════════════════════════════
# Firecrawl comps (OPT-IN para no quemar credits)
# ═══════════════════════════════════════════════════════════════════════════
def _firecrawl_enabled() -> bool:
    """Tell whether Firecrawl comps are switched on and usable.

    True only when the ``ENABLE_FIRECRAWL_COMPS`` environment variable equals
    "true" (case-insensitive) and ``FIRECRAWL_API_KEY`` is set to a non-blank
    value.
    """
    if os.getenv("ENABLE_FIRECRAWL_COMPS", "false").lower() != "true":
        return False
    return os.getenv("FIRECRAWL_API_KEY", "").strip() != ""
def fetch_zillow_comps(
    zip_code: str,
    beds: int,
    baths: float,
    sqft: int,
    max_count: int = COMPS_MAX_COUNT,
) -> tuple[list[dict], list[str]]:
    """Scrape Zillow's recently-sold page for a ZIP through Firecrawl.

    Opt-in: nothing is fetched unless ``_firecrawl_enabled()`` is true
    (ENABLE_FIRECRAWL_COMPS=true plus an API key). Every failure is reported
    through the returned error list; the Firecrawl call itself never raises
    out of this function.

    Args:
        zip_code: ZIP code inserted into the recently-sold URL.
        beds: Subject bedrooms, forwarded to the markdown parser.
        baths: Subject bathrooms, forwarded to the markdown parser.
        sqft: Subject square footage, forwarded to the parser as its target.
        max_count: Maximum number of comps returned.

    Returns:
        ``(comps, errors)``. Each comp is a dict produced by
        ``_parse_zillow_markdown``; ``comps`` is cut to ``max_count`` entries.
    """
    if not _firecrawl_enabled():
        return [], [
            "Firecrawl comps deshabilitado. Setear ENABLE_FIRECRAWL_COMPS=true en .env para activar."
        ]

    try:
        from firecrawl import FirecrawlApp
    except ImportError as exc:
        return [], [f"firecrawl-py no importable: {exc}"]

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        return [], ["FIRECRAWL_API_KEY ausente en .env"]

    # Zillow "recently sold" listing page for the ZIP.
    sold_url = f"https://www.zillow.com/homes/recently_sold/{zip_code}_rb/"
    try:
        client = FirecrawlApp(api_key=api_key)
        # Per the original note: SDK v2+ exposes .scrape() (formerly
        # .scrape_url()) and returns a document with a .markdown attribute.
        document = client.scrape(sold_url, formats=["markdown"])
        if not document or not hasattr(document, "markdown"):
            return [], ["Firecrawl devolvio resultado vacio"]
        markdown = document.markdown or ""
    except Exception as exc:
        return [], [f"Firecrawl scrape error: {exc}"]

    # Best-effort extraction of price / sqft / address / sold date blocks.
    parsed = _parse_zillow_markdown(markdown, beds=beds, baths=baths, sqft_target=sqft)
    problems: list[str] = []
    if not parsed:
        problems.append("Firecrawl OK pero parser no extrajo comps (Zillow cambio formato?)")
    return parsed[:max_count], problems
def _parse_zillow_markdown(md: str, beds: int, baths: float, sqft_target: int) -> list[dict]:
    """Best-effort parser for Zillow "recently sold" markdown.

    The markdown is split into segments (blank lines or ``---`` rules). A segment
    of 50 to 2000 characters yields one comp when it contains both a ``$`` price
    and a square-footage figure that pass the sanity filters.

    Args:
        md: Markdown text returned by the scrape.
        beds: Subject bedrooms. Accepted for interface symmetry; not used for
            filtering by the current implementation.
        baths: Subject bathrooms. Accepted for interface symmetry; not used for
            filtering by the current implementation.
        sqft_target: Subject living area. When > 0, comps whose sqft differs from
            it by more than COMPS_SQFT_TOLERANCE_PCT are dropped.

    Returns:
        List of comp dicts with keys ``address``, ``sold_price``, ``sold_date``,
        ``sqft``, ``beds_text`` and ``price_per_sqft``. Missing sold date or bed
        count is reported as ``"?"``.
    """
    comps = []
    # Capture the WHOLE run of digits/commas. A bounded quantifier would truncate
    # long numbers ("$12,500,000" -> 1,250,000; "10,450 sqft" -> 450), letting
    # out-of-range listings slip through the sanity filters below.
    price_pattern = re.compile(r"\$([\d,]{3,})")
    sqft_pattern = re.compile(r"([\d,]{3,})\s*sq(?:\.|uare)?\s*ft", re.IGNORECASE)
    sold_pattern = re.compile(r"(sold|vendido)[\s:]+([0-9/.-]+)", re.IGNORECASE)
    bed_pattern = re.compile(r"([\d.]+)\s*(?:bd|bed)", re.IGNORECASE)
    # Address heuristic: a markdown link label containing a common street suffix.
    addr_pattern = re.compile(
        r"\[([^\]]+(?:St|Ave|Rd|Dr|Ln|Way|Blvd|Ct|Ter|Pl)[^\]]*)\]", re.IGNORECASE
    )

    # Each segment is expected to hold roughly one listing.
    segments = re.split(r"\n\n+|---+", md)
    for seg in segments:
        if len(seg) < 50 or len(seg) > 2000:
            continue
        prices = price_pattern.findall(seg)
        sqfts = sqft_pattern.findall(seg)
        if not prices or not sqfts:
            continue
        try:
            # Only the first price / sqft figure of the segment is considered.
            price = int(prices[0].replace(",", ""))
            sqft = int(sqfts[0].replace(",", ""))
        except (ValueError, IndexError):
            # e.g. a run made only of commas
            continue

        # Sanity filters: plausible price and size for a single home.
        if price < 30_000 or price > 5_000_000:
            continue
        if sqft < 400 or sqft > 8_000:
            continue

        # Keep only comps whose size is close to the subject's.
        if sqft_target > 0:
            ratio = sqft / sqft_target
            if ratio < (1 - COMPS_SQFT_TOLERANCE_PCT) or ratio > (1 + COMPS_SQFT_TOLERANCE_PCT):
                continue

        sold_match = sold_pattern.search(seg)
        sold_date = sold_match.group(2) if sold_match else "?"
        bed_match = bed_pattern.search(seg)
        comp_beds = bed_match.group(1) if bed_match else "?"
        addr_match = addr_pattern.search(seg)
        address = addr_match.group(1) if addr_match else "(direccion no parseada)"

        comps.append({
            "address": address,
            "sold_price": price,
            "sold_date": sold_date,
            "sqft": sqft,
            "beds_text": comp_beds,
            # sqft >= 400 is guaranteed by the filter above, so no zero division.
            "price_per_sqft": round(price / sqft, 2),
        })
    return comps
# ═══════════════════════════════════════════════════════════════════════════
# Estimacion de valor y inflation score
# ═══════════════════════════════════════════════════════════════════════════
def estimate_value_from_comps(comps: list[dict], subject_sqft: int) -> tuple[Optional[int], Optional[float]]:
    """Estimate the subject's mid value from the average $/sqft of the comps.

    Args:
        comps: Comp dicts; only entries with a positive ``price_per_sqft`` count.
        subject_sqft: Living area of the subject property.

    Returns:
        ``(estimated_mid, avg_price_per_sqft)`` where ``estimated_mid`` is the
        truncated product of the average $/sqft and ``subject_sqft`` and the
        average is rounded to 2 decimals. ``(None, None)`` when there are no
        comps, the subject sqft is not positive, or no comp has a usable $/sqft.
    """
    if not comps or subject_sqft <= 0:
        return None, None

    usable = []
    for comp in comps:
        if comp.get("price_per_sqft", 0) > 0:
            usable.append(comp["price_per_sqft"])
    if not usable:
        return None, None

    mean_ppsqft = sum(usable) / len(usable)
    return int(mean_ppsqft * subject_sqft), round(mean_ppsqft, 2)
def calculate_inflation_score(listing_price: float, estimated_mid: float) -> float:
    """Score from 0 to 10 for how far the listing price sits above the estimated value.

    The premium over ``estimated_mid`` is mapped linearly onto the scale:
        0.0  -> listing at or below the estimated value
        5.0  -> listing 15% above the estimated value
        10.0 -> listing 30% or more above the estimated value
    A non-positive ``estimated_mid`` cannot be compared against and yields the
    neutral default 5.0.
    """
    if estimated_mid <= 0:
        return 5.0  # no usable reference value: neutral default

    premium = (listing_price - estimated_mid) / estimated_mid
    if premium <= 0:
        return 0.0
    return 10.0 if premium >= 0.30 else round(premium / 0.30 * 10, 1)
# ═══════════════════════════════════════════════════════════════════════════
# API publica
# ═══════════════════════════════════════════════════════════════════════════
def fetch_property_value(
    *,
    address: str,
    listing_price: float,
    sqft: int,
    beds: int,
    baths: float,
    year_built: int,
    zip_code: Optional[str] = None,
    county_name: Optional[str] = None,
    state: Optional[str] = None,
    photo_findings_text: str = "",
    listing_description: str = "",
    condition_status: str = "",
    features_special: Optional[list] = None,
    include_firecrawl_comps: Optional[bool] = None,
) -> dict:
    """Entry point: combine tax assessed value, comps and deductions into one dict.

    ``listing_description``, ``condition_status`` and ``features_special`` are
    forwarded to ``calculate_age_deductions`` so it can detect renovated listings
    (e.g. condition='Updated/Remodeled' or description='Fully updated throughout,
    BRAND NEW ROOF, NEW AC') and suppress false deductions (bug fix 2026-05-15).

    Args:
        include_firecrawl_comps:
            None  -> use the ENABLE_FIRECRAWL_COMPS flag from .env (default false)
            True  -> force the Firecrawl call (consumes credits)
            False -> skip Firecrawl
            In every case Firecrawl is only called when ``zip_code`` is provided.

    Returns:
        Dict with ``listing_price``, ``tax_assessed_value``, ``comps_used``,
        ``estimated_value`` (low/mid/high/confidence), ``price_per_sqft_comps_avg``,
        ``price_per_sqft_subject``, ``overpriced_pct``, ``inflation_score``,
        ``deductions``, ``market_trend``, ``sources_used``, ``fetch_errors``,
        ``firecrawl_used`` and ``fetched_at`` (UTC ISO-8601 timestamp).
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    sources_used: list[str] = []
    errors: list[str] = []

    # 1. Age-based deductions (always computed, no external call); they honour
    #    the condition status and renovation keywords.
    deductions = calculate_age_deductions(
        year_built=year_built,
        photo_findings_text=photo_findings_text,
        listing_description=listing_description,
        condition_status=condition_status,
        features_special=features_special,
    )
    if deductions["total"] > 0:
        sources_used.append("Deductions por edad (heuristica FL)")
    elif deductions.get("_skipped_global"):
        sources_used.append(
            f"Deductions SKIPPED (renovated: {deductions.get('_skip_reason', '?')})"
        )

    # 2. Tax assessed value (original note: Miami-Dade stub for now).
    tax_assessed_data = fetch_tax_assessed(address, county_name, state)
    tax_assessed_value = tax_assessed_data.get("assessed_value") if tax_assessed_data else None
    if tax_assessed_value:
        sources_used.append(f"Tax assessed ({county_name})")
    elif tax_assessed_data:
        # A payload without a usable value must not be credited as a source,
        # because it contributes neither to the estimate nor to the confidence.
        errors.append(f"Tax assessed para {county_name or '?'} sin assessed_value utilizable")
    else:
        errors.append(f"Tax assessed no disponible para {county_name or '?'} (scraper pendiente Wave 2 follow-up)")

    # 3. Firecrawl comps (opt-in).
    if include_firecrawl_comps is None:
        do_firecrawl = _firecrawl_enabled()
    else:
        do_firecrawl = include_firecrawl_comps
    comps: list[dict] = []
    if do_firecrawl:
        if zip_code:
            comps, comp_errors = fetch_zillow_comps(zip_code, beds, baths, sqft)
            errors.extend(comp_errors)
            if comps:
                sources_used.append(f"Zillow recently sold ({len(comps)} comps via Firecrawl)")
        else:
            # Surface the skip instead of silently returning no comps.
            errors.append("Firecrawl comps omitido: zip_code faltante")

    # 4. Estimated value.
    estimated_mid_from_comps, avg_ppsqft = estimate_value_from_comps(comps, sqft)
    price_per_sqft_subject = round(listing_price / sqft, 2) if sqft > 0 else 0

    # Average the available market-value candidates, then subtract deductions.
    candidates_mid = []
    if estimated_mid_from_comps:
        candidates_mid.append(estimated_mid_from_comps)
    if tax_assessed_value:
        # Tax assessed value in FL is usually ~85% of market value.
        candidates_mid.append(int(tax_assessed_value / 0.85))
    if candidates_mid:
        estimated_mid = int(sum(candidates_mid) / len(candidates_mid)) - deductions["total"]
    else:
        # Fallback: listing price minus deductions, with very low confidence.
        estimated_mid = max(0, int(listing_price) - deductions["total"])
    estimated_mid = max(estimated_mid, 1)  # never zero, so the divisions below are safe
    estimated_low = int(estimated_mid * PRICE_LOW_PCT)
    estimated_high = int(estimated_mid * PRICE_HIGH_PCT)

    # Confidence depends on which sources were available.
    if comps and tax_assessed_value:
        confidence = "high"
    elif comps or tax_assessed_value:
        confidence = "medium"
    else:
        confidence = "low"

    # estimated_mid >= 1 here, so the percentage is always defined.
    overpriced_pct = round((listing_price - estimated_mid) / estimated_mid * 100, 1)
    inflation_score = calculate_inflation_score(listing_price, estimated_mid)

    return {
        "listing_price": int(listing_price),
        "tax_assessed_value": tax_assessed_value,
        "comps_used": comps,
        "estimated_value": {
            "low": estimated_low,
            "mid": estimated_mid,
            "high": estimated_high,
            "confidence": confidence,
        },
        "price_per_sqft_comps_avg": avg_ppsqft,
        "price_per_sqft_subject": price_per_sqft_subject,
        "overpriced_pct": overpriced_pct,
        "inflation_score": inflation_score,
        "deductions": deductions,
        "market_trend": {
            "direction": "unknown",
            "evidence": "Para detectar tendencia requiere historial de comps (no implementado en MVP)",
        },
        "sources_used": sources_used,
        "fetch_errors": errors,
        # True only when Firecrawl was requested AND produced at least one comp.
        "firecrawl_used": do_firecrawl and bool(comps),
        "fetched_at": fetched_at,
    }
+233
View File
@@ -0,0 +1,233 @@
"""Runner for the data fetchers.

Flow:
1. Geocode (sequential) - without coordinates FEMA and NOAA cannot be queried.
2. FEMA + HUD + NOAA + neighborhood + court records in parallel
   (ThreadPoolExecutor, max 5 workers).

Fail-soft in every fetcher: if one fails, its field is left as {} and the failure
is recorded in fetch_errors. The pipeline never aborts.

Output schema:
{
"geocode": {matched_address, lat, lng, city, state, zip, county_name, county_fips, state_fips} | {}
"flood": {zone, bfe, sfha, subtype, source} | {}
"fmr": {year, county, state, fmr_efficiency, fmr_1br..fmr_4br, source} | {}
"hurricanes": [{name, year, category, max_wind_mph, closest_pass_miles}, ...]
"hurricanes_summary": {lookback_years, max_distance_mi, total_hurricanes_nearby, source}
"neighborhood": {neighborhood_class, confidence_level, ...} | {}
"court_records": {...} | {"status": "DISABLED", ...}  # Wave 1.5A; returned at least when geocoding succeeds
"fetch_errors": ["geocode: ...", "hud: ...", ...]  # human-readable explanations
"duration_seconds": float
}
"""
from __future__ import annotations
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Callable, Optional, TYPE_CHECKING
from .base import FetcherError
from .cache import FileCache
from .census_geocode import fetch_geocode
from .fema_flood import fetch_flood
from .hud_fmr import fetch_fmr
from .noaa_hurricanes import fetch_hurricanes
from .neighborhood_class import fetch_neighborhood
from .court_records import fetch_court_records, _enable_court_records # Wave 1.5A
# ABSOLUTE paths anchored to the project root (not relative to the caller's CWD),
# so the cache and the data files resolve to the same location no matter where
# fetch_all() is called from.
# The project root is the parent of the directory containing this file.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT / ".cache" / "data_fetchers"
DEFAULT_HURDAT2_PATH = _PROJECT_ROOT / "data" / "hurdat2.txt"
if TYPE_CHECKING:
from orchestrator import DealInputs
# Cache TTL per namespace (days); each value is passed to the cache lookup by _safe().
TTL = {
    "geocode": 30,
    "fema": 30,
    "hud_fmr": 365,  # changes yearly
    "hurricanes": 30,
    "neighborhood": 90,  # ACS and crime data change slowly
    "court_records": 7,  # Wave 1.5A: court proceedings move slowly
}
def _emit(cb: Optional[Callable[[str], None]], msg: str) -> None:
    """Forward ``msg`` to the status callback; do nothing when no callback is set."""
    if not cb:
        return
    cb(msg)
def _safe(
    cache: FileCache,
    namespace: str,
    cache_key: str,
    ttl: float,
    func: Callable[[], dict],
    errors: list,
    error_prefix: str,
) -> dict:
    """Fail-soft fetch wrapper: serve from cache, otherwise call ``func``.

    A cached value (anything other than None) is returned as-is. On a miss,
    ``func`` is called and its result is stored in the cache and returned. Any
    exception raised by the call or by the cache write is converted into a
    message appended to ``errors`` (prefixed with ``error_prefix``) and an empty
    dict is returned instead.
    """
    hit = cache.get(namespace, cache_key, ttl)
    if hit is not None:
        return hit

    try:
        fresh = func()
        cache.set(namespace, cache_key, fresh)
    except FetcherError as exc:
        # Expected fetcher failure: report its message verbatim.
        errors.append(f"{error_prefix}: {exc}")
        return {}
    except Exception as exc:
        # Anything else is reported together with its exception type.
        errors.append(f"{error_prefix}: unexpected {type(exc).__name__}: {exc}")
        return {}
    else:
        return fresh
def fetch_all(
    deal: "DealInputs",
    status_cb: Optional[Callable[[str], None]] = None,
    cache_dir: str | Path | None = None,
    hurdat2_path: str | Path | None = None,
    include_neighborhood_dom: bool = False,
) -> dict:
    """Fetch all verified data for a deal.

    Geocoding runs first because the other fetchers need its coordinates, state
    and county; the remaining fetchers then run in parallel. Every fetcher is
    fail-soft: a failure leaves its field empty and adds a message to
    ``fetch_errors``.

    Args:
        deal: Deal inputs; only ``deal.address`` is read here.
        status_cb: Optional callback that receives progress messages.
        cache_dir: Cache directory; defaults to DEFAULT_CACHE_DIR (anchored to
            the project, independent of the caller's CWD).
        hurdat2_path: HURDAT2 data file; defaults to DEFAULT_HURDAT2_PATH.
        include_neighborhood_dom: Forwarded to ``fetch_neighborhood`` as
            ``include_dom`` and included in the neighborhood cache key.

    Returns:
        Dict with keys ``geocode``, ``flood``, ``fmr``, ``hurricanes``,
        ``hurricanes_summary``, ``neighborhood``, ``court_records``,
        ``fetch_errors`` and ``duration_seconds``. The same keys are returned when
        geocoding fails, with the dependent fields left empty.
    """
    # Default to absolute project paths (not relative to the caller's CWD).
    if cache_dir is None:
        cache_dir = DEFAULT_CACHE_DIR
    if hurdat2_path is None:
        hurdat2_path = DEFAULT_HURDAT2_PATH
    t0 = time.perf_counter()
    cache = FileCache(cache_dir)
    errors: list[str] = []

    # --- 1. Geocode (sequential, everything else depends on it) -------------
    _emit(status_cb, "Geocodificando direccion (Census)...")
    geocode = _safe(
        cache, "geocode", deal.address, TTL["geocode"],
        lambda: fetch_geocode(deal.address),
        errors, "geocode",
    )
    if not geocode or not geocode.get("lat") or not geocode.get("lng"):
        _emit(status_cb, " Geocodificacion fallo - omitiendo FEMA/NOAA/HUD/neighborhood")
        return {
            "geocode": geocode,
            "flood": {},
            "fmr": {},
            "hurricanes": [],
            "hurricanes_summary": {},
            "neighborhood": {},
            # Same key set as the success path, so consumers can index
            # "court_records" without special-casing a geocode failure.
            "court_records": {},
            "fetch_errors": errors + (["geocode_failed_no_coords"] if not errors else []),
            "duration_seconds": round(time.perf_counter() - t0, 2),
        }

    lat = float(geocode["lat"])
    lng = float(geocode["lng"])
    state = geocode.get("state", "")
    county_name = geocode.get("county_name", "")
    _emit(
        status_cb,
        f" OK: {geocode.get('matched_address', '?')} | "
        f"{county_name}, {state} | ({lat:.4f}, {lng:.4f})"
    )

    # --- 2. FEMA + HUD + NOAA + neighborhood + court records in parallel ----
    _emit(status_cb, "Fetching FEMA / HUD / NOAA / Neighborhood en paralelo...")

    def task_fema():
        return _safe(
            cache, "fema", f"{lat:.5f},{lng:.5f}", TTL["fema"],
            lambda: fetch_flood(lat, lng),
            errors, "fema",
        )

    def task_hud():
        # The HUD lookup is keyed by state + county; without both it cannot run.
        if not state or not county_name:
            errors.append("hud: state o county_name faltantes en geocode")
            return {}
        return _safe(
            cache, "hud_fmr", f"{state}|{county_name}", TTL["hud_fmr"],
            lambda: fetch_fmr(state, county_name),
            errors, "hud",
        )

    def task_noaa():
        return _safe(
            cache, "hurricanes", f"{lat:.4f},{lng:.4f}", TTL["hurricanes"],
            lambda: fetch_hurricanes(lat, lng, years_back=20, hurdat2_path=hurdat2_path),
            errors, "noaa",
        )

    def task_neighborhood():
        tract = geocode.get("tract_geoid") or "no_tract"
        return _safe(
            cache, "neighborhood", f"{tract}|dom={include_neighborhood_dom}", TTL["neighborhood"],
            lambda: fetch_neighborhood(geocode, include_dom=include_neighborhood_dom),
            errors, "neighborhood",
        )

    # Wave 1.5A: court records (opt-in via ENABLE_COURT_RECORDS=true).
    # NOTE(review): the original comment says only Duval County is supported in
    # v1 and other counties soft-fail; no county check is made here, so that is
    # presumably handled inside fetch_court_records - confirm.
    def task_court_records():
        if not _enable_court_records():
            return {"status": "DISABLED",
                    "recommendation": "Activar ENABLE_COURT_RECORDS=true en .env para "
                                      "deteccion deterministica de foreclosure/lis pendens."}
        return _safe(
            cache, "court_records", f"{deal.address}|{county_name}", TTL["court_records"],
            lambda: fetch_court_records(address=deal.address, county_name=county_name),
            errors, "court_records",
        )

    with ThreadPoolExecutor(max_workers=5) as ex:
        f_fema = ex.submit(task_fema)
        f_hud = ex.submit(task_hud)
        f_noaa = ex.submit(task_noaa)
        f_nbh = ex.submit(task_neighborhood)
        f_court = ex.submit(task_court_records)
        flood = f_fema.result()
        fmr = f_hud.result()
        noaa_data = f_noaa.result()
        neighborhood = f_nbh.result()
        court_records = f_court.result()

    # The NOAA payload carries the hurricane list plus summary fields; split them.
    hurricanes = noaa_data.get("hurricanes", []) if isinstance(noaa_data, dict) else []
    hurricanes_summary = {
        k: v for k, v in (noaa_data or {}).items() if k != "hurricanes"
    }

    # Summary log line.
    f_zone = flood.get("zone", "N/A") if flood else "N/A"
    h3 = fmr.get("fmr_3br", "N/A") if fmr else "N/A"
    n_hur = len(hurricanes)
    nbh_class = neighborhood.get("neighborhood_class", "?") if neighborhood else "?"
    nbh_conf = neighborhood.get("confidence_level", "?") if neighborhood else "?"
    _emit(status_cb, f" Datos: FEMA={f_zone}, HUD 3BR=${h3}, {n_hur} huracanes, Nbh={nbh_class}({nbh_conf})")
    if errors:
        _emit(status_cb, f" Fetcher errors: {len(errors)} (continuamos con datos parciales)")

    return {
        "geocode": geocode,
        "flood": flood,
        "fmr": fmr,
        "hurricanes": hurricanes,
        "hurricanes_summary": hurricanes_summary,
        "neighborhood": neighborhood,
        "court_records": court_records,  # Wave 1.5A
        "fetch_errors": errors,
        "duration_seconds": round(time.perf_counter() - t0, 2),
    }
+127
View File
@@ -0,0 +1,127 @@
"""data_fetchers/zillow_photo_lookup.py — Look up Zillow photos by address.

PURPOSE:
The county clerk scrapers (Miami-Dade, Duval, Broward, etc.) do not expose
property photos. However, Zillow has photos for almost any address in the USA
(even for off-market foreclosures).

Strategy:
1. Build the Zillow address search URL: https://www.zillow.com/homes/{slug}_rb/
2. Scrape it to markdown with Firecrawl
3. Run a regex over the markdown to extract photos.zillowstatic.com URLs
4. Return (list[str] of URLs, metadata dict); the list is capped at max_photos (default 1)

COST: 1 Firecrawl credit per address lookup.

USAGE:
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
urls, meta = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL")
# urls → ["https://photos.zillowstatic.com/fp/X.jpg"]
"""
from __future__ import annotations
import os
import re
from typing import Optional
# Photo URL pattern (Zillow CDN): matches markdown image tags "![alt](url)" whose
# URL is on photos.zillowstatic.com and ends in .webp/.jpg/.png/.jpeg
# (case-insensitive). Group 1 captures the URL.
_PHOTO_PAT = re.compile(
    r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)",
    re.IGNORECASE,
)
def _build_address_search_url(address: str) -> str:
    """Build the Zillow address search URL for ``address``.

    Format: https://www.zillow.com/homes/{slug}_rb/

    The slug is the upper-cased address with commas and periods removed and
    whitespace runs collapsed to single dashes. Any remaining character that is
    not URL-safe (e.g. the "#" of a unit number, which would otherwise start a
    URL fragment and drop the rest of the path) is percent-encoded.

    e.g. "2837 BLACK BUCK CIR, JACKSONVILLE, FL"
      -> https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    s = address.upper().replace(",", "").replace(".", "")
    s = re.sub(r"\s+", "-", s.strip())
    s = re.sub(r"-+", "-", s)
    # Letters, digits and "-" pass through unchanged; everything else is escaped.
    return f"https://www.zillow.com/homes/{quote(s, safe='')}_rb/"
def fetch_zillow_photos_by_address(
    address: str,
    max_photos: int = 1,  # Only the main photo by default; the user can see the rest on Zillow.
    debug: bool = False,
) -> tuple[list[str], dict]:
    """Fetch photo URLs from the Zillow address search page.

    Args:
        address: Full street address; must be at least 5 characters after stripping.
        max_photos: Maximum number of photo URLs to return (negative treated as 0).
        debug: When True, photos are returned even if the address sanity check
            fails (``meta["address_matched_in_md"]`` still reports the check).

    Returns:
        ``(photo_urls, metadata)`` where metadata has the keys ``url_attempted``,
        ``address_matched_in_md``, ``credits_used``, ``error`` and
        ``markdown_size``.

    Caveats:
        - If the address is not found in the scraped page, returns ([], meta)
          with an explanatory ``meta["error"]``.
        - If Firecrawl fails, returns ([], {"error": ...}); nothing is raised.
        - The caller should persist the result and NOT retry on an empty list
          (it would spend credits without gaining anything).
    """
    meta = {
        "url_attempted": None,
        "address_matched_in_md": False,
        "credits_used": 0,
        "error": None,
        "markdown_size": 0,
    }
    if not address or len(address.strip()) < 5:
        meta["error"] = "address too short / empty"
        return [], meta

    # Strip so a whitespace-only key counts as missing (as fetch_zillow_comps does).
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        meta["error"] = "FIRECRAWL_API_KEY not configured"
        return [], meta

    url = _build_address_search_url(address)
    meta["url_attempted"] = url
    try:
        from firecrawl import FirecrawlApp
        app = FirecrawlApp(api_key=api_key)
        resp = app.scrape(url, formats=["markdown"])
        # The scrape call returned, so record the credit before inspecting the
        # payload; a malformed payload must not hide that it was spent.
        meta["credits_used"] = 1
        raw_md = resp.markdown if hasattr(resp, "markdown") else resp.get("markdown", "")
        # A None markdown (empty page) is treated as empty text, not a failure.
        md = raw_md or ""
        meta["markdown_size"] = len(md)
    except Exception as e:
        meta["error"] = f"firecrawl error: {type(e).__name__}: {e}"
        return [], meta

    # Sanity check that the page is about this address (Zillow sometimes returns
    # a "no results" page or a different property): the street number AND at
    # least one distinctive word of the address must appear in the markdown.
    addr_upper = address.upper()
    street_num_match = re.match(r"(\d+)", addr_upper.strip())
    street_num = street_num_match.group(1) if street_num_match else ""
    addr_in_md = bool(street_num and street_num in md)
    if addr_in_md:
        md_upper = md.upper()  # hoisted: compared against several words
        # First three address words of 4+ letters
        words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4]
        addr_in_md = any(w in md_upper for w in words[:3])
    meta["address_matched_in_md"] = addr_in_md

    # Extract photo URLs, de-duplicated while keeping first-seen order.
    unique = list(dict.fromkeys(_PHOTO_PAT.findall(md)))

    # Only return photos if the address matched (defensive), unless debugging.
    if not addr_in_md and not debug:
        meta["error"] = "address not matched in Zillow markdown (no result page)"
        return [], meta
    return unique[:max(0, max_photos)], meta