feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
"""Data fetchers para AR-House.
|
||||
|
||||
Obtiene datos reales de fuentes oficiales ANTES de pasar el deal a los agentes Ollama.
|
||||
Asi los agentes razonan sobre datos verificados (FEMA, HUD, NOAA, Census ACS) en vez de inventar.
|
||||
|
||||
Uso principal:
|
||||
from data_fetchers.runner import fetch_all
|
||||
data = fetch_all(deal, status_cb=...)
|
||||
# data = {"geocode": {...}, "flood": {...}, "fmr": {...},
|
||||
# "hurricanes": [...], "neighborhood": {...}, "fetch_errors": [...]}
|
||||
|
||||
Fail-soft: si algun fetcher falla, devuelve dict vacio en su campo y agrega a fetch_errors.
|
||||
El pipeline NO se aborta.
|
||||
|
||||
Compliance: la clasificacion de vecindarios usa SOLO indicadores economicos objetivos
|
||||
(income, owner-occupancy, education, vacancy, crime, days-on-market). NUNCA demografia
|
||||
racial. Esto cumple con Fair Housing Act federal.
|
||||
"""
|
||||
|
||||
# Cargar .env ANTES de los imports — buscando desde este archivo upwards.
|
||||
# Asi los fetchers (Census, HUD, FBI) encuentran las API keys aunque el caller
|
||||
# este corriendo desde otro CWD.
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Locate the nearest .env by walking up from the directory that contains this
# package, and load the first one found. Doing this before the fetcher imports
# lets them see their API keys regardless of the caller's working directory.
_here = Path(__file__).resolve().parent  # .../data_fetchers/
# dict.fromkeys() drops the repeated first ancestor while keeping search order.
for _parent in dict.fromkeys([_here.parent, *_here.parents]):
    _candidate = _parent / ".env"
    if not _candidate.exists():
        continue
    load_dotenv(_candidate)
    break
|
||||
|
||||
from .runner import fetch_all
|
||||
from .price_validator import validate_price
|
||||
|
||||
__all__ = ["fetch_all", "validate_price"]
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Common types and constants para data fetchers."""
|
||||
|
||||
|
||||
class FetcherError(Exception):
    """Signal that a data fetcher could not produce a result.

    Fetchers raise this for any failure; the runner is expected to catch it
    so that one failing source does not abort the whole pipeline (fail-soft).
    """
|
||||
|
||||
|
||||
# User-Agent header sent with outbound HTTP requests
# (some APIs, e.g. Nominatim, require one).
USER_AGENT = "AR-House/0.1.0 (real-estate-analysis-tool; +https://localhost)"

# Default timeout for HTTP requests, in seconds.
DEFAULT_TIMEOUT = 15
|
||||
@@ -0,0 +1,78 @@
|
||||
"""File-based cache para data fetchers. JSON on disk con TTL.
|
||||
|
||||
Estructura:
|
||||
.cache/data_fetchers/<namespace>_<hash16>.json
|
||||
|
||||
Cada entry:
|
||||
{"cached_at": <epoch_seconds>, "key": "<original_key>", "data": {...}}
|
||||
|
||||
TTL se evalua en get() — si la entrada esta vencida, devuelve None
|
||||
(no la borra; la sobreescribe el siguiente set()).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class FileCache:
    """JSON-on-disk cache with a TTL that is checked at read time.

    Each entry is stored as ``<cache_dir>/<namespace>_<hash16>.json`` with the
    shape ``{"cached_at": <epoch seconds>, "namespace": ..., "key": ...,
    "data": ...}``. Expired entries are not deleted by ``get()``; the next
    ``set()`` for the same key overwrites them.
    """

    def __init__(self, cache_dir: str | Path):
        """Remember ``cache_dir`` and create it (with parents) if missing."""
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _safe_namespace(namespace: str) -> str:
        """Return the filename-safe form of ``namespace`` used on disk.

        Non-alphanumeric characters become ``_`` and the result is cut to 24
        characters. ``_path()`` and ``clear()`` must both use this so they
        agree on file names.
        """
        return "".join(c if c.isalnum() else "_" for c in namespace)[:24]

    def _path(self, namespace: str, key: str) -> Path:
        """Return the cache file path for ``(namespace, key)``."""
        safe_key = hashlib.sha1(key.encode("utf-8")).hexdigest()[:16]
        return self.cache_dir / f"{self._safe_namespace(namespace)}_{safe_key}.json"

    def get(self, namespace: str, key: str, ttl_days: float) -> Optional[dict]:
        """Return the cached data if present, readable and not expired.

        Returns ``None`` when the file is missing, unreadable, malformed,
        belongs to a different key, or is older than ``ttl_days``.
        """
        p = self._path(namespace, key)
        try:
            with p.open(encoding="utf-8") as f:
                entry = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, invalid JSON or invalid UTF-8.
            return None

        if not isinstance(entry, dict):
            return None
        # The file name only carries 16 hex chars of the hash; make sure the
        # entry really belongs to this key. Entries without "key" are accepted.
        if entry.get("key", key) != key:
            return None

        cached_at = entry.get("cached_at", 0)
        if isinstance(cached_at, bool) or not isinstance(cached_at, (int, float)):
            return None
        age_days = (time.time() - cached_at) / 86400.0
        if age_days > ttl_days:
            return None
        return entry.get("data")

    def set(self, namespace: str, key: str, data: dict) -> None:
        """Store ``data`` under ``(namespace, key)``.

        Caching is best-effort: write errors and non-serializable data are
        ignored on purpose so that a cache problem never fails the caller.
        """
        p = self._path(namespace, key)
        entry = {
            "cached_at": time.time(),
            "namespace": namespace,
            "key": key,
            "data": data,
        }
        try:
            # Serialize first so a failure never leaves a half-written file.
            payload = json.dumps(entry, ensure_ascii=False, indent=2)
            p.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError):
            pass  # cache failures are non-fatal

    def clear(self, namespace: Optional[str] = None) -> int:
        """Delete cache entries and return how many files were removed.

        With a non-empty ``namespace`` only that namespace's entries are
        removed; otherwise every ``*.json`` file in the cache directory is.
        """
        if namespace:
            # Match exactly "<safe_ns>_<16 hex chars>.json" so that clearing
            # "ns" does not also delete entries of a namespace like "ns_other".
            pattern = f"{self._safe_namespace(namespace)}_{'[0-9a-f]' * 16}.json"
        else:
            pattern = "*.json"

        count = 0
        for p in self.cache_dir.glob(pattern):
            try:
                p.unlink()
                count += 1
            except OSError:
                pass  # best-effort: skip files that cannot be removed
        return count
|
||||
@@ -0,0 +1,85 @@
|
||||
"""US Census Geocoder - address -> lat/lng/county/state.
|
||||
|
||||
API gratis, no key, sin rate limits documentados (uso razonable).
|
||||
Documentacion: https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
|
||||
|
||||
Limitacion: SOLO USA (incluye PR, GU, AS, MP, VI).
|
||||
|
||||
Devuelve dict con:
|
||||
matched_address, lat, lng, city, state, zip, county_name, county_fips, state_fips
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import requests
|
||||
|
||||
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
CENSUS_URL = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress"
|
||||
|
||||
|
||||
def fetch_geocode(address: str) -> dict:
    """Geocode a US address through the Census Geocoder "onelineaddress" endpoint.

    Args:
        address: Free-form, single-line address. Must contain at least five
            non-blank characters.

    Returns:
        A dict with ``matched_address``, ``lat``, ``lng``, ``city``, ``state``,
        ``zip``, ``county_name``, ``county_fips``, ``state_fips``,
        ``tract_geoid``, ``tract_code``, ``tract_name`` and
        ``county_code_only``. Any field missing from the response is ``None``.

    Raises:
        FetcherError: if the address is empty or too short, the HTTP request
            fails, the body is not JSON, or there is no address match.
    """
    cleaned = address.strip() if isinstance(address, str) else ""
    if len(cleaned) < 5:
        raise FetcherError("address vacio o muy corto")

    params = {
        "address": cleaned,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "format": "json",
        # Census Tracts are requested in addition to Counties because the
        # tract ids feed the neighborhood classification.
        "layers": "Census Tracts,Counties,2020 Census Blocks",
    }
    headers = {"User-Agent": USER_AGENT}

    try:
        r = requests.get(CENSUS_URL, params=params, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"HTTP error: {e}") from e

    try:
        data = r.json()
    except ValueError as e:
        raise FetcherError(f"JSON parse error: {e}") from e

    # Tolerate a null or oddly shaped payload: report it as "no match"
    # instead of leaking an AttributeError past the fail-soft handling.
    result = data.get("result") if isinstance(data, dict) else None
    matches = result.get("addressMatches") if isinstance(result, dict) else None
    if not isinstance(matches, list) or not matches:
        raise FetcherError(f"No geocode match for: {address!r}")

    m = matches[0] if isinstance(matches[0], dict) else {}
    coords = m.get("coordinates", {}) or {}
    comp = m.get("addressComponents", {}) or {}
    geos = m.get("geographies", {}) or {}

    # The layer name varies between vintages ("Counties" or a suffixed form).
    counties = (
        geos.get("Counties")
        or geos.get("2020 Census Counties")
        or geos.get("County Subdivisions")
        or []
    )
    county = counties[0] if counties else {}

    tracts = geos.get("Census Tracts") or geos.get("2020 Census Tracts") or []
    tract = tracts[0] if tracts else {}

    state_fips = county.get("STATE") or tract.get("STATE")
    county_code = county.get("COUNTY") or tract.get("COUNTY")
    # Build the county FIPS from STATE + COUNTY when both are known. For a
    # real county record this equals GEOID; for the "County Subdivisions"
    # fallback GEOID identifies the subdivision, so it must not be used as the
    # county FIPS.
    # NOTE(review): with that fallback, county_name is still the
    # subdivision's name -- confirm whether consumers rely on it.
    if state_fips and county_code:
        county_fips = f"{state_fips}{county_code}"
    else:
        county_fips = county.get("GEOID")

    return {
        "matched_address": m.get("matchedAddress"),
        "lat": coords.get("y"),
        "lng": coords.get("x"),
        "city": comp.get("city"),
        "state": comp.get("state"),
        "zip": comp.get("zip"),
        "county_name": county.get("NAME") or county.get("BASENAME"),
        "county_fips": county_fips,  # e.g. "12086"
        "state_fips": state_fips,  # e.g. "12"
        # Tract info for neighborhood classification
        "tract_geoid": tract.get("GEOID"),  # e.g. "12086007608"
        "tract_code": tract.get("TRACT"),  # e.g. "007608" (6 digits, no state/county)
        "tract_name": tract.get("NAME"),  # e.g. "Census Tract 76.08"
        "county_code_only": county_code,  # e.g. "086" (3-digit county part only)
    }
|
||||
@@ -0,0 +1,456 @@
|
||||
"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.
|
||||
|
||||
Civitek OCRS (Online Court Records Search) es la plataforma JSF/PrimeFaces
|
||||
que comparten 33 condados de FL. Una sola implementacion los cubre a todos.
|
||||
|
||||
COUNTIES COVERED (33):
|
||||
Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
|
||||
Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
|
||||
Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
|
||||
Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
|
||||
Washington.
|
||||
|
||||
NOT INCLUDED (use other adapters):
|
||||
Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
|
||||
Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
|
||||
Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
|
||||
Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.civitek_ocrs import fetch_civitek_court_records
|
||||
result = fetch_civitek_court_records(
|
||||
county_name="Hernando",
|
||||
case_number="2024-CA-001234",
|
||||
)
|
||||
# → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}
|
||||
|
||||
TECHNICAL NOTES:
|
||||
- Uses Playwright headless Chromium (free, ~$0 cost per query)
|
||||
- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
|
||||
- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
|
||||
- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
|
||||
- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
|
||||
- Rate limit: not stated by Civitek — we self-throttle to 1 req/2s
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# COUNTY CODE MAPPING (Civitek 2-digit codes)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
CIVITEK_COUNTY_CODES: dict[str, str] = {
    # Format: "County Name (canonical)": "NN" (2-digit Civitek code)
    "Baker": "02",
    "Bradford": "04",
    "Calhoun": "07",
    "Columbia": "12",
    "DeSoto": "14",
    "Dixie": "15",
    "Franklin": "19",
    "Gilchrist": "21",
    "Glades": "22",
    "Gulf": "23",
    "Hamilton": "24",
    "Hardee": "25",
    "Hendry": "26",
    "Hernando": "27",
    "Highlands": "28",
    "Holmes": "30",
    "Jackson": "32",
    "Jefferson": "33",
    "Lafayette": "34",
    "Levy": "38",
    "Liberty": "39",
    "Madison": "40",
    "Marion": "42",
    "Nassau": "45",
    "Okeechobee": "47",
    "Pasco": "51",
    "Putnam": "54",
    "Santa Rosa": "57",
    "Sumter": "60",
    "Union": "63",
    "Wakulla": "65",
    "Walton": "66",
    "Washington": "67",
}


def _county_lookup_key(county_name: Optional[str]) -> str:
    """Reduce a county name to a comparison key.

    The key is case-folded, has any " county" suffix removed and keeps only
    alphanumeric characters, so "Hernando County", "HERNANDO" and "De Soto"
    compare equal to their canonical spellings. Non-strings give "".
    """
    if not isinstance(county_name, str):
        return ""
    lowered = county_name.strip().casefold().replace(" county", "")
    return "".join(ch for ch in lowered if ch.isalnum())


def is_civitek_county(county_name: Optional[str]) -> bool:
    """Return True if ``county_name`` is one of the Civitek counties.

    Matching ignores case, spacing and an optional "County" suffix.
    ``None`` or an empty name gives False.
    """
    return civitek_code_for(county_name) is not None


def civitek_code_for(county_name: str) -> Optional[str]:
    """Return the 2-digit Civitek code for a county name, or None.

    Matching ignores case, spacing and an optional "County" suffix.
    ``None`` or an empty name gives None instead of raising.
    """
    wanted = _county_lookup_key(county_name)
    if not wanted:
        return None
    # Scan the table on every call (33 entries) so the result always reflects
    # the current contents of CIVITEK_COUNTY_CODES.
    for canonical, code in CIVITEK_COUNTY_CODES.items():
        if _county_lookup_key(canonical) == wanted:
            return code
    return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CASE NUMBER PARSER
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
|
||||
# Common formats observed in realauction.com deals:
|
||||
# "2024-CA-001234"
|
||||
# "23-2024-CA-001234"
|
||||
# "2024CA001234"
|
||||
# "2024-001234-CA"
|
||||
# "27-2024-CA-001234" (court code prefix)
|
||||
_CASE_PATTERNS = [
    # year-type-seq (optional 2-digit prefix, optional separators)
    re.compile(r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})", re.IGNORECASE),
    # year-seq-type
    re.compile(r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)", re.IGNORECASE),
    # tight: yearTypeNNNNNN
    # NOTE: every string this matches is already matched by the first pattern
    # (its separators and prefix are optional); kept so the list is unchanged.
    re.compile(r"(?P<year>20\d{2})(?:CA|CC|CF|MM|DR|CP)(?P<seq>\d{3,8})", re.IGNORECASE),
]


def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
    """Parse a FL case number into ``(year, sequence)``.

    The input is stripped and upper-cased, then tried against each pattern in
    ``_CASE_PATTERNS``; the first match wins. Leading zeros are removed from
    the sequence (an all-zero sequence becomes "0").

    Args:
        case_number: Case number text such as "2024-CA-001234".

    Returns:
        ``(year, sequence)`` as strings, or ``None`` when the input is empty,
        not a string, or matches no known shape.

    Examples:
        "2024-CA-001234"     → ("2024", "1234")
        "23-2024-CA-001234"  → ("2024", "1234")
        "2024CA001234"       → ("2024", "1234")
        "2024-001234-CA"     → ("2024", "1234")
    """
    if not case_number or not isinstance(case_number, str):
        return None
    text = case_number.strip().upper()
    for pattern in _CASE_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        # NOTE(review): zero-stripping is what the code has always done;
        # confirm the Civitek sequence field accepts the unpadded form.
        sequence = match.group("seq").lstrip("0") or "0"
        return (match.group("year"), sequence)
    return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# PUBLIC API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _infer_civitek_status(results: list, errors: list, scrape_failed: bool) -> str:
    """Map parsed rows plus collected errors to a status string."""
    if not results:
        # Page-level messages mentioning a crash/timeout are still treated as
        # errors, as before; scrape_failed covers failures of the flow itself.
        looks_fatal = any(
            "crashed" in msg.lower() or "timeout" in msg.lower() for msg in errors
        )
        return "ERROR" if (scrape_failed or looks_fatal) else "NOT_FOUND"

    first_type = (results[0].get("case_type") or "").upper()
    # Match the CA / CF codes as standalone tokens: a plain substring test
    # would also fire on words such as "MEDICAL" or "LOCAL".
    if re.search(r"(?<![A-Z])CA(?![A-Z])", first_type) or "CIVIL" in first_type:
        # NOTE(review): CA cases include foreclosures but are not only
        # foreclosures; this stays a coarse signal.
        return "FORECLOSURE_PENDING"
    if re.search(r"(?<![A-Z])CF(?![A-Z])", first_type) or "FELONY" in first_type:
        return "CLEAN"  # unrelated criminal
    return "UNKNOWN"


def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS.

    Provide ONE of:
      - case_number (e.g., "2024-CA-001234")   → fastest, most precise
      - party_lastname (with optional firstname) → person search
      - business_name                            → business search
    When several are given, case_number wins, then business_name.

    This function never raises for lookup failures; problems are reported
    through ``status`` and ``errors``. It performs no request throttling.

    Args:
        county_name: County name, with or without a " County" suffix.
        case_number: Case number to parse into year + sequence.
        party_lastname: Last name for a person search.
        party_firstname: Optional first name for a person search.
        business_name: Business name for a business search.
        headless: Run Chromium without a visible window.
        timeout_seconds: Default Playwright timeout for each page action.

    Returns dict matching court_records.py contract:
      {
        "status": "CLEAN" | "LIS_PENDENS_ACTIVE" | "FORECLOSURE_PENDING" |
                  "FORECLOSURE_COMPLETE" | "OWNER_VERIFIED" | "UNKNOWN" |
                  "NOT_FOUND" | "ERROR",
        "county": str (normalized),
        "case_number_searched": str,
        "search_method": "case_number" | "person_name" | "business_name",
        "results": list of dicts (raw cases found),
        "case_data": dict (top result) | None,
        "lis_pendens": list,
        "liens_inventory": dict,
        "sources_used": ["civitek_ocrs"],
        "source_url": str,
        "errors": list of strings,
        "fetched_at": ISO timestamp,
      }
    """
    # Local import: the module only imports the `datetime` class.
    from datetime import timezone

    # Same "...Z" text as the deprecated datetime.utcnow().isoformat() + "Z".
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")

    # Validate county
    code = civitek_code_for(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        year = seq = None
        search_method = "business_name"
    else:
        year = seq = None
        search_method = "person_name"

    # Playwright is imported lazily so the module loads without it.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url
    # True when the scraping flow itself broke, as opposed to the site
    # answering "nothing found".
    scrape_failed = False

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    errors.append("Disclaimer page didn't show 'I Agree' button")
                    return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                form_ready = True
                if search_method == "case_number":
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Submitting now would send an empty form from the
                        # default tab; report a scraper failure instead.
                        errors.append("Case Search tab not found")
                        form_ready = False
                        scrape_failed = True
                    else:
                        case_tab.click()
                        page.wait_for_timeout(1500)
                        page.fill("#form\\:search_tab\\:year", year)
                        page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # No tab switch: the person search form is filled as-is.
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                if form_ready:
                    # Step 5: Submit (exact-text button first, then a looser match)
                    search_btn = page.locator(
                        "button:has(.ui-button-text:text-is('Search'))"
                    ).first
                    if search_btn.count() == 0:
                        search_btn = page.locator("button:has-text('Search')").first
                    search_btn.click()
                    page.wait_for_timeout(6000)

                    # Step 6: Capture validation errors shown by the page
                    err_msgs = page.locator(".ui-message-error, .ui-messages-error").all()
                    for m in err_msgs[:5]:
                        try:
                            # Truncate before the duplicate check so long
                            # messages are de-duplicated too.
                            t = (m.inner_text() or "").strip()[:200]
                            if t and t not in errors:
                                errors.append(t)
                        except Exception:
                            pass  # best-effort: an unreadable message is skipped

                    # Step 7: Parse results table
                    results = _parse_results(page)
                    final_url = page.url
            finally:
                # Close the browser on every path, including early return
                # and exceptions.
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
        scrape_failed = True
    except Exception as e:
        # Top-level boundary: any scraper failure becomes an ERROR result.
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")
        scrape_failed = True

    status_from_results = _infer_civitek_status(results, errors, scrape_failed)

    # Top result
    case_data = results[0] if results else None

    return {
        "status": status_from_results,
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        "case_data": case_data,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Internal helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _classify_result_header(header: str) -> Optional[str]:
    """Map one lower-cased header cell to a result field name, or None.

    The more specific keywords are tested before the generic "case", so that
    "case type" and "case status" are not mistaken for the case number.
    """
    if "status" in header:
        return "status"
    if any(kw in header for kw in ("party", "name", "defendant", "plaintiff")):
        return "parties"
    if "filed" in header or "date" in header:
        return "filed_date"
    if "type" in header or "court" in header:
        return "case_type"
    if "case" in header or "uniform" in header:
        return "case_number"
    return None


def _parse_results(page) -> list[dict]:
    """Parse the results table from a Civitek search results page.

    Scans every ``<table>`` on the page. A table is treated as the results
    table when its first row has a header containing "case", "uniform",
    "filed", "party" or "type". Parsing stops at the first table that yields
    rows.

    Args:
        page: Page object exposing the Playwright locator API used below.

    Returns:
        One dict per data row with any of the keys ``case_number``,
        ``case_type``, ``filed_date``, ``parties`` and ``status``; an empty
        list when no results table is found.
    """
    results: list[dict] = []
    for tbl in page.locator("table").all():
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            if row_count < 2:
                continue  # need a header row plus at least one data row

            # Header row
            headers = [
                (h.inner_text() or "").strip().lower()
                for h in rows.first.locator("th, td").all()
            ]
            if not any(
                any(kw in h for kw in ("case", "uniform", "filed", "party", "type"))
                for h in headers
            ):
                continue

            # Index columns; the first column claiming a field keeps it.
            col_idx: dict = {}
            for i, h in enumerate(headers):
                field = _classify_result_header(h)
                if field is not None:
                    col_idx.setdefault(field, i)

            # Data rows
            for r in range(1, row_count):
                cells = rows.nth(r).locator("td").all()
                if not cells:
                    continue
                cell_texts = [(c.inner_text() or "").strip() for c in cells]
                row_data = {
                    field: cell_texts[i]
                    for field, i in col_idx.items()
                    if i < len(cell_texts)
                }
                if row_data:
                    results.append(row_data)
        except Exception:
            # Best-effort: a table that cannot be read is skipped.
            continue
        # If we found a results table with rows, stop
        if results:
            break
    return results
|
||||
|
||||
|
||||
def _error_result(
    county: str,
    case_number_searched: str,
    error: str,
    fetched_at: str,
) -> dict:
    """Build the result payload for a lookup that failed before any search ran.

    The payload has the same keys as a successful result, with ``status`` set
    to "ERROR", empty result containers, no search method, an empty source URL
    and ``error`` as the only entry in ``errors``.
    """
    return dict(
        status="ERROR",
        county=county,
        case_number_searched=case_number_searched,
        search_method=None,
        results=[],
        case_data=None,
        lis_pendens=[],
        liens_inventory={},
        sources_used=["civitek_ocrs"],
        source_url="",
        errors=[error],
        fetched_at=fetched_at,
    )
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI for manual testing
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: run one lookup and print the result dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    cli.add_argument("--county", required=True, help="County name (e.g., Hernando)")
    for flag, help_text in (
        ("--case", "Case number (e.g., 2024-CA-001234)"),
        ("--last-name", "Last name for person search"),
        ("--first-name", "First name (optional with last-name)"),
        ("--business", "Business name search"),
    ):
        cli.add_argument(flag, help=help_text)
    cli.add_argument("--no-headless", action="store_true", help="Show browser window")
    opts = cli.parse_args()

    outcome = fetch_civitek_court_records(
        county_name=opts.county,
        case_number=opts.case,
        party_lastname=opts.last_name,
        party_firstname=opts.first_name,
        business_name=opts.business,
        headless=not opts.no_headless,
    )
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(outcome, indent=2, default=str))
|
||||
@@ -0,0 +1,999 @@
|
||||
"""court_records.py — Deterministic foreclosure detection via county clerk records.
|
||||
|
||||
PROBLEMA QUE RESUELVE:
|
||||
El sistema heuristico (price_validator.py + property_value.py) HIPOTETIZA que un
|
||||
listing sospechosamente bajo es foreclosure. Para CONFIRMAR deterministicamente
|
||||
necesitamos consultar los court records publicos del condado:
|
||||
- Lis pendens (notice of foreclosure filing)
|
||||
- Code enforcement violations + liens
|
||||
- Tax delinquency
|
||||
|
||||
ALCANCE WAVE 1.5A:
|
||||
- Solo DUVAL (Jacksonville) implementado en esta version
|
||||
- Otros condados: soft-fail con URL del clerk para lookup manual
|
||||
- Replicacion a Miami-Dade / Broward / Palm Beach / Hillsborough en versiones
|
||||
posteriores SI Duval funciona end-to-end.
|
||||
|
||||
STACK:
|
||||
- Playwright headless Chromium (local, $0 por consulta)
|
||||
- Fallback a Firecrawl si Playwright falla (opcional, requiere ENABLE_FIRECRAWL=true)
|
||||
- User-Agent identificable + rate-limit 1 req/2s por dominio
|
||||
- Cache TTL 7 dias (los procesos judiciales se mueven lento)
|
||||
|
||||
OPT-IN:
|
||||
ENABLE_COURT_RECORDS=true en .env
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Per-domain rate limiting: time of the last request per domain, and the
# minimum spacing between requests (one request every 2 seconds).
_DOMAIN_LAST_REQUEST: dict[str, float] = {}
_RATE_LIMIT_SECONDS = 2.0

# Identifiable User-Agent (not spoofed: this is a legitimate service).
USER_AGENT = "AR-House/1.0 (real estate investment analysis; +https://ar-house.example/contact)"

# Counties that have a scraper implemented in this version.
SUPPORTED_COUNTIES = {"Duval", "duval"}


# ═══════════════════════════════════════════════════════════════════════════
# Wave 1.5A v1.2: Plaintiff classification + Lien survival analysis
# ═══════════════════════════════════════════════════════════════════════════

# Plaintiff categories (who is suing for the foreclosure).
PLAINTIFF_TYPE_BANK_NATIONAL = "BANK_NATIONAL"        # Wells Fargo, BofA, Chase, Citi, etc
PLAINTIFF_TYPE_BANK_REGIONAL = "BANK_REGIONAL"        # Truist, Regions, BB&T, Fifth Third
PLAINTIFF_TYPE_CREDIT_UNION = "CREDIT_UNION"          # Navy Federal, VyStar, etc
PLAINTIFF_TYPE_NONBANK_MORTGAGE = "NONBANK_MORTGAGE"  # Quicken/Rocket, PHH, Mr. Cooper, Carrington
PLAINTIFF_TYPE_GSE = "GSE"                            # Fannie Mae, Freddie Mac, Ginnie Mae
PLAINTIFF_TYPE_TRUSTEE = "TRUSTEE_MBS"                # Deutsche Bank AS Trustee, US Bank NA Trustee (MBS trusts)
PLAINTIFF_TYPE_IRS = "IRS_FEDERAL"                    # Internal Revenue Service (federal tax)
PLAINTIFF_TYPE_STATE_TAX = "STATE_TAX"                # FL Dept of Revenue
PLAINTIFF_TYPE_HOA = "HOA_ASSOCIATION"                # Homeowners / Condo association
PLAINTIFF_TYPE_MUNICIPAL = "MUNICIPAL"                # City/County code enforcement, utility liens
PLAINTIFF_TYPE_HARD_MONEY = "HARD_MONEY_LENDER"       # LLC nonbank, private high-rate lender
PLAINTIFF_TYPE_PRIVATE = "PRIVATE_INDIVIDUAL"         # Private investor (named person)
PLAINTIFF_TYPE_OTHER = "OTHER"
PLAINTIFF_TYPE_UNKNOWN = "UNKNOWN"

# Keywords looked for in a plaintiff's name, grouped by the category they map to.
_BANK_NATIONAL_KEYWORDS = (
    "WELLS FARGO",
    "BANK OF AMERICA",
    "CHASE",
    "JPMORGAN",
    "JP MORGAN",
    "CITIBANK",
    "CITI ",
    "CITI,",
    "U.S. BANK",
    "US BANK",
    "USBANK",
    "PNC BANK",
    "TD BANK",
    "HSBC",
    "CAPITAL ONE",
)
_BANK_REGIONAL_KEYWORDS = (
    "TRUIST",
    "REGIONS BANK",
    "BB&T",
    "BBT BANK",
    "FIFTH THIRD",
    "5/3 BANK",
    "SUNTRUST",
    "M&T BANK",
    "KEYBANK",
    "HUNTINGTON",
)
_CREDIT_UNION_KEYWORDS = (
    "CREDIT UNION",
    "VYSTAR",
    "NAVY FEDERAL",
    "FCU",
    "C.U.",
)
_NONBANK_MORTGAGE_KEYWORDS = (
    "QUICKEN",
    "ROCKET MORTGAGE",
    "PHH MORTGAGE",
    "MR. COOPER",
    "MR COOPER",
    "NATIONSTAR",
    "CARRINGTON",
    "FREEDOM MORTGAGE",
    "LOANDEPOT",
    "PENNYMAC",
    "NEW REZ",
    "NEWREZ",
    "SHELLPOINT",
    "OCWEN",
    "DITECH",
    "BAYVIEW",
    "SPECIALIZED LOAN SERVICING",
)
_GSE_KEYWORDS = (
    "FANNIE MAE",
    "FEDERAL NATIONAL MORTGAGE",
    "FNMA",
    "FREDDIE MAC",
    "FEDERAL HOME LOAN MORTGAGE",
    "FHLMC",
    "GINNIE MAE",
    "GNMA",
)
_TRUSTEE_KEYWORDS = (
    "AS TRUSTEE",
    "AS INDENTURE TRUSTEE",
    "TRUSTEE FOR",
    "DEUTSCHE BANK NATIONAL",
    "BANK OF NEW YORK MELLON",
    "BNY MELLON",
    "WILMINGTON TRUST",
    "WILMINGTON SAVINGS",
)
_IRS_KEYWORDS = (
    "INTERNAL REVENUE SERVICE",
    "I.R.S.",
    "U.S. INTERNAL REVENUE",
    "UNITED STATES OF AMERICA",
)
_STATE_TAX_KEYWORDS = (
    "FLORIDA DEPARTMENT OF REVENUE",
    "FL DEPT OF REVENUE",
    "STATE OF FLORIDA",
)
_HOA_KEYWORDS = (
    "HOMEOWNERS",
    "ASSOCIATION INC",
    "ASSOCIATION, INC",
    "CONDOMINIUM",
    "PROPERTY OWNERS ASSOCIATION",
    " POA ",
    "HOA",
)
_MUNICIPAL_KEYWORDS = (
    "CITY OF ",
    "COUNTY OF ",
    "MUNICIPALITY",
    "CODE ENFORCEMENT",
    "TAX COLLECTOR",
)
|
||||
|
||||
|
||||
def classify_plaintiff(name: Optional[str]) -> dict:
|
||||
"""Clasifica el plaintiff de un foreclosure por su nombre.
|
||||
|
||||
Returns:
|
||||
{
|
||||
name: <input>,
|
||||
type: PLAINTIFF_TYPE_*,
|
||||
category: "primary_lender" | "junior_lienholder" | "tax_authority" |
|
||||
"association" | "government" | "unknown",
|
||||
is_original_loan_holder: bool | None (None si no se puede determinar)
|
||||
}
|
||||
"""
|
||||
if not name:
|
||||
return {
|
||||
"name": None,
|
||||
"type": PLAINTIFF_TYPE_UNKNOWN,
|
||||
"category": "unknown",
|
||||
"is_original_loan_holder": None,
|
||||
}
|
||||
|
||||
upper = name.upper()
|
||||
|
||||
# Order matters: more specific patterns first (trustee MBS antes que bank national)
|
||||
if any(kw in upper for kw in _TRUSTEE_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_TRUSTEE,
|
||||
"category": "mbs_trustee",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "MBS trustee: el loan fue securitizado. El servicer real puede ser otra entity."}
|
||||
if any(kw in upper for kw in _IRS_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_IRS,
|
||||
"category": "tax_authority",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "IRS federal tax lien. SOBREVIVE el foreclosure con 120-day right of redemption."}
|
||||
if any(kw in upper for kw in _STATE_TAX_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_STATE_TAX,
|
||||
"category": "tax_authority",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "FL state tax lien. Tipicamente extinguible pero verificar con title search."}
|
||||
if any(kw in upper for kw in _GSE_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_GSE,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "GSE (Fannie/Freddie/Ginnie). Compraron el loan al originador. Comun en MLS post-foreclosure como REO."}
|
||||
if any(kw in upper for kw in _BANK_NATIONAL_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_BANK_NATIONAL,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": True,
|
||||
"note": "Banco nacional grande. Probablemente originador del loan. Procesos estandarizados."}
|
||||
if any(kw in upper for kw in _BANK_REGIONAL_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_BANK_REGIONAL,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": True,
|
||||
"note": "Banco regional. Mas flexible para negociar short sale o cash-for-keys."}
|
||||
if any(kw in upper for kw in _CREDIT_UNION_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_CREDIT_UNION,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": True,
|
||||
"note": "Credit union. Members-only, foreclosure menos frecuente, mas dispuestos a workout."}
|
||||
if any(kw in upper for kw in _NONBANK_MORTGAGE_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_NONBANK_MORTGAGE,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "Non-bank mortgage servicer. Suele ser servicer asignado, no el originador. Investor real es otro."}
|
||||
if any(kw in upper for kw in _HOA_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_HOA,
|
||||
"category": "association",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "HOA/Condo association. FL Statute 720.3085(2)(b) limita lo que sobrevive a 12 meses dues o 1% del mortgage."}
|
||||
if any(kw in upper for kw in _MUNICIPAL_KEYWORDS):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_MUNICIPAL,
|
||||
"category": "government",
|
||||
"is_original_loan_holder": False,
|
||||
"note": "Municipal lien (code enforcement / utility). SOBREVIVE el foreclosure — corre con la tierra."}
|
||||
|
||||
# LLC sin keyword conocido = probable hard money / private investor
|
||||
if "LLC" in upper or "L.L.C." in upper:
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_HARD_MONEY,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": True,
|
||||
"note": "LLC sin patron de banco/servicer conocido. Probable hard money lender o private investor. Tasas 8-15%, terminos cortos."}
|
||||
|
||||
# Si el nombre tiene formato "APELLIDO, NOMBRE" o sin entity → individual
|
||||
if "," in name or (not any(s in upper for s in ("INC", "CORP", "BANK", "LLC", "TRUST")) and len(name.split()) <= 4):
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_PRIVATE,
|
||||
"category": "primary_lender",
|
||||
"is_original_loan_holder": None,
|
||||
"note": "Individuo (no entity). Posible seller financing, family loan, o private money."}
|
||||
|
||||
return {"name": name, "type": PLAINTIFF_TYPE_OTHER,
|
||||
"category": "unknown",
|
||||
"is_original_loan_holder": None,
|
||||
"note": "Nombre no matchea patrones conocidos. Revisar manualmente."}
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Lien types + survival analysis
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Lien types (following the usual US real-estate convention)
LIEN_TYPE_MORTGAGE_1ST = "MORTGAGE_1ST"
LIEN_TYPE_MORTGAGE_2ND = "MORTGAGE_2ND"
LIEN_TYPE_MORTGAGE_3RD = "MORTGAGE_3RD"
LIEN_TYPE_HELOC = "HELOC"
LIEN_TYPE_IRS_TAX = "IRS_TAX_LIEN"
LIEN_TYPE_STATE_TAX = "STATE_TAX_LIEN"
LIEN_TYPE_PROPERTY_TAX = "PROPERTY_TAX_LIEN"   # County property tax delinquency
LIEN_TYPE_HOA = "HOA_LIEN"
LIEN_TYPE_MECHANICS = "MECHANICS_LIEN"
LIEN_TYPE_MUNICIPAL = "MUNICIPAL_LIEN"         # Code enforcement, utility
LIEN_TYPE_JUDGMENT = "JUDGMENT_LIEN"
LIEN_TYPE_OTHER = "OTHER"

# Survival outcomes
SURVIVES = "SURVIVES"
EXTINGUISHED = "EXTINGUISHED"
EXTINGUISHED_BY_THIS_ACTION = "EXTINGUISHED_BY_THIS_ACTION"  # plaintiff's own mortgage
UNCERTAIN = "UNCERTAIN"


def analyze_lien_survival(
    *,
    lien_type: str,
    is_plaintiff_lien: bool = False,
    is_senior_to_plaintiff: bool = False,
    lien_filing_date: Optional[str] = None,
    plaintiff_filing_date: Optional[str] = None,
) -> dict:
    """Decide whether a lien survives a Florida judicial foreclosure.

    Rules as implemented, evaluated in this order:
      1. The plaintiff's own lien          -> EXTINGUISHED_BY_THIS_ACTION
      2. Property tax, IRS tax, municipal
         and HOA liens                     -> SURVIVES regardless of seniority
      3. State tax lien                    -> SURVIVES if senior, else EXTINGUISHED
      4. 2nd/3rd mortgage, HELOC,
         mechanics, judgment liens         -> SURVIVES if senior, else EXTINGUISHED
      5. 1st mortgage (not the plaintiff's) -> SURVIVES if senior, else EXTINGUISHED
      6. Anything else                     -> UNCERTAIN

    Args:
        lien_type: One of the LIEN_TYPE_* constants; other values yield UNCERTAIN.
        is_plaintiff_lien: True when this lien is the one being foreclosed.
        is_senior_to_plaintiff: True when the lien outranks the plaintiff's lien.
        lien_filing_date: Accepted but not read by the current implementation.
        plaintiff_filing_date: Accepted but not read by the current
            implementation; seniority comes only from ``is_senior_to_plaintiff``.

    Returns:
        dict with keys ``survives_foreclosure`` (SURVIVES | EXTINGUISHED |
        EXTINGUISHED_BY_THIS_ACTION | UNCERTAIN), ``warning`` (str or None)
        and ``legal_basis`` (str).
    """
    def verdict(outcome, warning, legal_basis):
        return {
            "survives_foreclosure": outcome,
            "warning": warning,
            "legal_basis": legal_basis,
        }

    # The foreclosing lien itself is wiped out by the sale it triggers.
    if is_plaintiff_lien:
        return verdict(
            EXTINGUISHED_BY_THIS_ACTION,
            None,
            "Plaintiff's own mortgage is the subject of this foreclosure — extinguished by judicial sale.",
        )

    # Lien types that survive no matter when they were filed.
    always_surviving = (
        (
            LIEN_TYPE_PROPERTY_TAX,
            "Property tax delinquency tiene SUPER-PRIORIDAD. Sobrevive a TODO. Pagar inmediatamente post-cierre o el tax collector vende el property por tax deed.",
            "FL Statute 197.122 — ad valorem taxes constitute first lien superior to all other liens.",
        ),
        (
            LIEN_TYPE_IRS_TAX,
            "IRS federal tax lien SOBREVIVE el foreclosure. El IRS tiene 120-day right of redemption (puede recomprar el property pagando el bid + costos). Despues de 120 dias, el buyer queda dueno definitivamente. Sumar al MAB.",
            "26 USC 7425(d) — federal tax liens survive judicial sale with 120-day redemption period.",
        ),
        (
            LIEN_TYPE_MUNICIPAL,
            "Lien municipal (code enforcement / utility) SOBREVIVE — corre con la tierra. Sumar al MAB. Check code enforcement violations open antes de bidear.",
            "FL Statute 162.09 — code enforcement liens equivalent to civil judgment, runs with land.",
        ),
        (
            LIEN_TYPE_HOA,
            "HOA dues SOBREVIVE pero limitado por FL Statute 720.3085(2)(b): el nuevo owner debe 12 meses de dues o 1% del original mortgage, lo que sea menor. Si es condo: FL 718.116. Pedir HOA estoppel letter pre-bid.",
            "FL Statute 720.3085(2)(b) (HOA) o 718.116 (condo) — buyer hereda capped portion.",
        ),
    )
    for kind, caution, basis in always_surviving:
        if lien_type == kind:
            return verdict(SURVIVES, caution, basis)

    senior = is_senior_to_plaintiff

    # State tax liens follow filing order.
    if lien_type == LIEN_TYPE_STATE_TAX:
        if senior:
            # NOTE(review): "197.0" does not look like a complete statute section — confirm the citation.
            return verdict(
                SURVIVES,
                "State tax lien filed BEFORE plaintiff's mortgage → sobrevive.",
                "FL Statute 197.0 — chronological priority among non-super-priority liens.",
            )
        return verdict(
            EXTINGUISHED,
            "State tax lien filed AFTER plaintiff's mortgage típicamente se extingue. Validar con title search.",
            "Junior liens (post-mortgage) extinguished by foreclosure of senior lien.",
        )

    # Ordinary junior/senior analysis for subordinate mortgages, mechanics and judgment liens.
    chronological_types = (
        LIEN_TYPE_MORTGAGE_2ND,
        LIEN_TYPE_MORTGAGE_3RD,
        LIEN_TYPE_HELOC,
        LIEN_TYPE_MECHANICS,
        LIEN_TYPE_JUDGMENT,
    )
    if lien_type in chronological_types:
        if senior:
            return verdict(
                SURVIVES,
                f"{lien_type} filed BEFORE plaintiff's mortgage → sobrevive. Sumar al MAB.",
                "Senior lien sobrevive foreclosure de lien junior.",
            )
        return verdict(
            EXTINGUISHED,
            None,
            f"{lien_type} junior al plaintiff's mortgage — extinguido por foreclosure judicial.",
        )

    # A first mortgage that is not the plaintiff's: another mortgage is in play.
    if lien_type == LIEN_TYPE_MORTGAGE_1ST:
        if senior:
            return verdict(
                SURVIVES,
                "Existe un mortgage senior al plaintiff's lien. Buyer hereda ESTA hipoteca. Sumar saldo al MAB.",
                "Senior mortgage survives foreclosure of junior lien.",
            )
        return verdict(
            EXTINGUISHED,
            None,
            "1st mortgage junior al plaintiff (raro pero posible si plaintiff es property tax/super-priority).",
        )

    # No rule for this lien type: refuse to guess.
    return verdict(
        UNCERTAIN,
        f"Tipo de lien '{lien_type}' no tiene regla automatica. Title search profesional ($300-500) requerido.",
        "Default safety: assume UNCERTAIN para tipos no clasificados.",
    )
|
||||
|
||||
|
||||
def _empty_liens_inventory(reason: str) -> dict:
    """Build the placeholder liens inventory used while the v1.1 scraper is not ready.

    The result has the same keys a populated inventory is expected to have, so
    downstream consumers keep working; it carries a warning telling the
    investor to order a professional title search.

    Args:
        reason: Why the detail is unavailable; returned as ``detail_pending_reason``.

    Returns:
        dict with an empty ``all_liens`` list, zeroed counters, the investor
        warning, ``detail_status`` "PENDING_V1_1" and the given reason.
    """
    warning = (
        "⚠️ Liens detail no disponible automaticamente (Wave 1.5A v1.1 deferred a Phase 3.5). "
        "ANTES de cualquier oferta o bid: hacer **title search profesional** "
        "($300-500 USD) en or.duvalclerk.com filtering por document types: "
        "MTG (mortgages), NFTL (IRS federal tax liens), SATL (state tax), "
        "NOC (mechanics), COD (code enforcement). Listing puede tener hasta "
        "$30K-$100K en deudas heredables NO visibles aqui."
    )
    inventory = {}
    inventory["all_liens"] = []
    inventory["lien_count"] = 0
    inventory["total_surviving_debt"] = 0
    inventory["investor_warning"] = warning
    inventory["detail_status"] = "PENDING_V1_1"
    inventory["detail_pending_reason"] = reason
    return inventory
|
||||
|
||||
# Clerk website per county, keyed by county name (used in soft-fail messages
# for counties that are not implemented yet).
COUNTY_CLERK_URLS = {
    "Duval": "https://www.duvalclerk.com/online-option/court-records",
    "Miami-Dade": "https://www2.miami-dadeclerk.com/ocs",
    "Broward": "https://officialrecords.broward.org",
    "Palm Beach": "https://www.mypalmbeachclerk.com/departments/records-services-division",
    "Hillsborough": "https://hillsclerk.com",
    "Orange": "https://myorangeclerk.com",
}
|
||||
|
||||
|
||||
def _enable_court_records() -> bool:
    """Return True only when env var ENABLE_COURT_RECORDS equals "true" (case-insensitive).

    An unset variable counts as "false".
    """
    flag = os.getenv("ENABLE_COURT_RECORDS", "false")
    return flag.lower() == "true"
|
||||
|
||||
|
||||
def _rate_limit(domain: str) -> None:
    """Throttle requests per domain.

    If the previous request recorded for ``domain`` was less than
    ``_RATE_LIMIT_SECONDS`` ago, sleep for the remainder of that interval.
    The current time is then stored in ``_DOMAIN_LAST_REQUEST`` as the
    domain's latest request time.  A domain never seen before is treated as
    last requested at time 0.
    """
    elapsed = time.time() - _DOMAIN_LAST_REQUEST.get(domain, 0)
    if elapsed < _RATE_LIMIT_SECONDS:
        time.sleep(_RATE_LIMIT_SECONDS - elapsed)
    # Stamp after any sleep so the next caller measures from the real request time.
    _DOMAIN_LAST_REQUEST[domain] = time.time()
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Duval (Jacksonville) — implementacion completa
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Lookup tables for address parsing (the Duval Property Appraiser form has 5 fields).
_STREET_DIRECTIONS = {"N", "S", "E", "W", "NE", "NW", "SE", "SW",
                      "NORTH", "SOUTH", "EAST", "WEST"}
# Any accepted spelling of a street type -> the abbreviation returned as "suffix".
_STREET_SUFFIXES = {
    "ST": "ST", "STREET": "ST",
    "AVE": "AVE", "AVENUE": "AVE",
    "RD": "RD", "ROAD": "RD",
    "BLVD": "BLVD", "BOULEVARD": "BLVD",
    "LN": "LN", "LANE": "LN",
    "WAY": "WAY",
    "DR": "DR", "DRIVE": "DR",
    "CT": "CT", "COURT": "CT",
    "PL": "PL", "PLACE": "PL",
    "CIR": "CIR", "CIRCLE": "CIR",
    "TER": "TER", "TERRACE": "TER",
    "PKWY": "PKWY", "PARKWAY": "PKWY",
    "HWY": "HWY", "HIGHWAY": "HWY",
    "TRL": "TRL", "TRAIL": "TRL",
    "XING": "XING", "CROSSING": "XING",
    "ALY": "ALY", "ALLEY": "ALY",
    "BND": "BND", "BEND": "BND",
}
# Spelled-out directions -> abbreviation; already-abbreviated ones pass through.
_DIRECTION_ABBREVIATIONS = {"NORTH": "N", "SOUTH": "S", "EAST": "E", "WEST": "W"}
# 5-digit ZIP, optionally followed by a "-1234" extension (only the 5 digits are captured).
_ZIP_RE = re.compile(r"\b(\d{5})(?:-\d{4})?\b")


def _parse_address_duval(address: str) -> Optional[dict]:
    """Parse "3245 N Pearl St, Jacksonville, FL 32206" into Duval form fields.

    Only the text before the first comma is treated as the street part.  It
    must start with an all-digit street number followed by at least one more
    token.  The ZIP is looked for in everything after the street number, and
    the last 5-digit group wins, so a 5-digit street number is never mistaken
    for the ZIP.

    Args:
        address: Free-form address string.

    Returns:
        ``{"street_num", "prefix", "name", "suffix", "zip"}`` with upper-cased
        values ("prefix", "suffix" and "zip" may be empty strings), or None
        when the address is empty or cannot be parsed.
    """
    if not address:
        return None

    # Drop everything after the first comma (city/state) to isolate the street part.
    street_part = address.split(",")[0].strip()
    tokens = street_part.upper().split()
    if len(tokens) < 2 or not tokens[0].isdigit():
        return None

    street_num = tokens[0]
    rest = tokens[1:]

    # Search for the ZIP only past the street number: "12345 Main St" has no ZIP.
    number_end = address.find(street_num) + len(street_num)
    zip_candidates = _ZIP_RE.findall(address[number_end:])
    zip_code = zip_candidates[-1] if zip_candidates else ""

    # A direction right after the number is the prefix (NORTH -> N, etc.).
    prefix = ""
    prefix_raw = ""
    if rest and rest[0] in _STREET_DIRECTIONS:
        prefix_raw = rest.pop(0)
        prefix = _DIRECTION_ABBREVIATIONS.get(prefix_raw, prefix_raw)

    # A known street type in last position is the suffix.
    suffix = ""
    if rest and rest[-1] in _STREET_SUFFIXES:
        suffix = _STREET_SUFFIXES[rest.pop()]

    name = " ".join(rest)
    if not name and prefix_raw:
        # "100 West St": nothing is left for a name, so the direction word
        # was the street name all along.
        name, prefix = prefix_raw, ""
    if not name:
        return None

    return {
        "street_num": street_num,
        "prefix": prefix,
        "name": name,
        "suffix": suffix,
        "zip": zip_code,
    }
|
||||
|
||||
|
||||
def _fetch_property_owner_duval(address: str) -> tuple[Optional[dict], list[str]]:
    """Step 1 (Duval): look up the owner name and RE# of an address on the Property Appraiser site.

    Site: https://paopropertysearch.coj.net/Basic/Search.aspx
    Form fields (ASP.NET, found via DOM inspection):
      - ctl00$cphBody$tbStreetNumber -> street number
      - ctl00$cphBody$ddStreetPrefix -> select (N/S/E/W)
      - ctl00$cphBody$tbStreetName   -> name (without prefix or suffix)
      - ctl00$cphBody$ddStreetSuffix -> select (ST/AVE/RD/...)
      - ctl00$cphBody$tbZipCode      -> optional ZIP
      - ctl00$cphBody$bSearch        -> submit
    Results are read from the first table on Results.aspx.

    Args:
        address: Free-form address; parsed with ``_parse_address_duval``.

    Returns:
        ``(info, errors)``.  ``info`` is None when Playwright is missing, the
        address cannot be parsed, the scrape fails, or neither an owner name
        nor an RE# was found.  Otherwise it is a dict with ``owner_name``,
        ``re_number``, ``year_built``, ``tax_assessed_value``,
        ``last_sale_date`` (these three are always None for now: they live on
        the detail page, which is not scraped), ``source`` and ``result_url``.
        ``errors`` lists human-readable problems and may be non-empty even
        when ``info`` is returned.
    """
    errors: list[str] = []
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        errors.append(f"playwright no instalado: {e}")
        return None, errors

    parsed = _parse_address_duval(address)
    if not parsed:
        errors.append(f"No pude parsear el address (formato esperado: '<num> [prefix] <name> [suffix]'): {address}")
        return None, errors

    _rate_limit("paopropertysearch.coj.net")

    def select_best_effort(page, selector, wanted):
        """Choose `wanted` in a <select>, by value and then by label; give up silently."""
        for how in ("value", "label"):
            try:
                page.locator(selector).select_option(**{how: wanted})
                return
            except Exception:
                # No such option: try the next strategy, or leave the select
                # empty and rely on the street-name match.
                continue

    def row_cells(row):
        """Stripped text of every <td> in a table row."""
        return [(c.text_content() or "").strip() for c in row.locator("td").all()]

    owner_name: Optional[str] = None
    re_number: Optional[str] = None
    current_url = ""

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.set_default_timeout(15_000)

                # Wait for 'networkidle' rather than 'domcontentloaded': this is an
                # ASP.NET WebForms page whose scripts finish loading after
                # DOMContentLoaded. The original notes disagree on whether
                # WebForm_DoPostBackWithOptions ever becomes available here, so the
                # submit below does not depend on it.
                page.goto("https://paopropertysearch.coj.net/Basic/Search.aspx",
                          wait_until="networkidle", timeout=20_000)

                # Fill the form using the site's real element ids.
                page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed["street_num"])
                if parsed["prefix"]:
                    select_best_effort(page, "#ctl00_cphBody_ddStreetPrefix", parsed["prefix"])
                page.locator("#ctl00_cphBody_tbStreetName").fill(parsed["name"])
                if parsed["suffix"]:
                    select_best_effort(page, "#ctl00_cphBody_ddStreetSuffix", parsed["suffix"])
                if parsed["zip"]:
                    try:
                        page.locator("#ctl00_cphBody_tbZipCode").fill(parsed["zip"])
                    except Exception:
                        pass  # the ZIP field is optional; search without it

                # Submit through a plain JS form.submit(), bypassing the WebForms
                # postback helper. A hidden input named like the Search button is
                # added so the server sees the button as clicked.
                submitted = False
                try:
                    page.evaluate("""() => {
                        const form = document.forms[0] || document.querySelector('form');
                        if (!form) throw new Error('no form found');
                        form.action = 'Results.aspx';
                        // ASP.NET espera el button name como input para detectar el click
                        let hidden = document.createElement('input');
                        hidden.type = 'hidden';
                        hidden.name = 'ctl00$cphBody$bSearch';
                        hidden.value = 'Search';
                        form.appendChild(hidden);
                        form.submit();
                    }""")
                    page.wait_for_url("**Results.aspx**", timeout=10_000)
                    submitted = True
                except Exception as e:
                    errors.append(f"Property Appraiser: form submit fallo: {e}")

                try:
                    page.wait_for_load_state("networkidle", timeout=10_000)
                except PlaywrightTimeout:
                    pass  # parse whatever has loaded so far

                current_url = page.url

                if not submitted:
                    # Only the JS form.submit() strategy is attempted above, so
                    # the message names that one strategy.
                    errors.append(
                        f"Property Appraiser: submit no navego a Results.aspx (URL final: {current_url}). "
                        "ASP.NET WebForms postback via JS form.submit() fallo."
                    )

                # Results.aspx has a table with known columns:
                # RE #, Name (Last First), Street #, Street Name, Type, Direction, Unit, City, Zip
                body_text = page.locator("body").inner_text() if submitted else ""
                no_results = "No Results Found" in body_text or "No information available" in body_text

                if no_results:
                    errors.append(
                        f"Property Appraiser: 'No Results Found' para "
                        f"{parsed['street_num']} {parsed['prefix']} {parsed['name']} {parsed['suffix']}. "
                        "Address probablemente no existe en Duval PA database o esta fuera del condado."
                    )
                elif submitted:
                    try:
                        rows = page.locator("table").first.locator("tr").all()
                        if len(rows) >= 2:
                            # Row 0 is the header. Prefer the row whose street
                            # number matches exactly.
                            best_row = None
                            for r in rows[1:]:
                                cells = row_cells(r)
                                if len(cells) >= 9 and cells[2] == parsed["street_num"]:
                                    best_row = cells
                                    break
                            if not best_row:
                                # NOTE(review): with no exact match the first data row is
                                # used, which may belong to a different property.
                                first_cells = row_cells(rows[1])
                                if len(first_cells) >= 9:
                                    best_row = first_cells

                            if best_row:
                                re_number = best_row[0] or None
                                owner_name = best_row[1] or None
                                # Other fields live on the detail page (TODO if needed).
                    except Exception as e:
                        errors.append(f"Property Appraiser: error parseando tabla de resultados: {e}")
            finally:
                # Close the browser on every path, including exceptions.
                browser.close()
    except Exception as e:
        errors.append(f"Property Appraiser Duval scrape error: {e}")
        return None, errors

    if not owner_name and not re_number:
        return None, errors

    return {
        "owner_name": owner_name,
        "re_number": re_number,
        "year_built": None,
        "tax_assessed_value": None,
        "last_sale_date": None,
        "source": "Duval Property Appraiser (paopropertysearch.coj.net)",
        "result_url": current_url,
    }, errors
|
||||
|
||||
|
||||
# Marks a result row as a lis pendens / foreclosure filing. "LP" must be a
# whole word so that names such as "RALPH" do not count.
_LIS_PENDENS_RE = re.compile(r"LIS PENDENS|LISPENDENS|FORECLOSURE|\bLP\b")


def _parse_lis_pendens_row(cells: list[str], source_url: str) -> Optional[dict]:
    """Turn one results-table row into a lis pendens match, or None if it is not one."""
    if not _LIS_PENDENS_RE.search(" ".join(cells).upper()):
        return None

    match = {
        # The cell that carries the keyword, or a generic label when the
        # keyword only shows up across adjacent cells.
        "doc_type": next((c for c in cells if _LIS_PENDENS_RE.search(c.upper())), "Lis Pendens"),
        "all_columns_text": cells,
        "source_url": source_url,
    }
    # First M/D/YYYY date found in any cell.
    for c in cells:
        date_m = re.search(r"\d{1,2}/\d{1,2}/\d{4}", c)
        if date_m:
            match["filing_date"] = date_m.group(0)
            break
    # First cell starting with something shaped like an instrument number.
    for c in cells:
        inst_m = re.match(r"\d{4,}-\d{4,}", c) or re.match(r"\d{8,}", c)
        if inst_m:
            match["instrument_number"] = inst_m.group(0)
            break
    return match


def _fetch_lis_pendens_duval(owner_name: str, address: str) -> tuple[list[dict], list[str]]:
    """Step 2 (Duval): search Official Records for lis pendens filed against an owner.

    Site: https://or.duvalclerk.com/
    Flow (found via DOM inspection):
      1. The landing page shows a disclaimer form; click the accept button (id='btnButton').
      2. Go straight to /search/SearchTypeName.
      3. Fill last name and, when available, first name.
      4. Submit, then scan every table row for a lis pendens / foreclosure keyword.

    The owner name from the Property Appraiser is expected as
    "LASTNAME FIRSTNAME MIDDLE_INITIAL" (e.g. "JONES JOHN N"): the first word
    is used as the last name and the second as the first name, so an entity
    such as "CITY OF JACKSONVILLE" is split the same way.

    Args:
        owner_name: Owner name to search for.
        address: Not used by the current implementation.

    Returns:
        ``(matches, errors)``.  Each match has ``doc_type``,
        ``all_columns_text`` and ``source_url``, plus ``filing_date`` and
        ``instrument_number`` when they could be extracted.  Finding nothing
        is reported as a message in ``errors`` that starts with "Sin matches".
    """
    errors: list[str] = []
    matches: list[dict] = []

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        errors.append(f"playwright no instalado: {e}")
        return matches, errors

    _rate_limit("or.duvalclerk.com")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.set_default_timeout(15_000)

                # Step 1: accept the disclaimer.
                page.goto("https://or.duvalclerk.com/", wait_until="networkidle", timeout=20_000)
                try:
                    page.locator("#btnButton").click()
                    page.wait_for_load_state("networkidle", timeout=10_000)
                except Exception as e:
                    errors.append(f"Official Records: error aceptando disclaimer: {e}")
                    return matches, errors

                # Step 2: open the name search.
                try:
                    page.goto("https://or.duvalclerk.com/search/SearchTypeName",
                              wait_until="networkidle", timeout=15_000)
                except Exception as e:
                    errors.append(f"Official Records: no pude navegar a SearchTypeName: {e}")
                    return matches, errors

                # Step 3: fill the form. "JONES JOHN N" -> last=JONES, first=JOHN.
                parts = owner_name.strip().split()
                last_name = parts[0] if parts else owner_name
                first_name = parts[1] if len(parts) > 1 else ""

                # Candidate selectors for the name inputs; the first one present wins.
                ln_selectors = [
                    "input[name='lastName']", "input[id='lastName']",
                    "input[name='LastName']", "input[id='LastName']",
                    "input[name*='last']", "input[id*='last']",
                ]
                fn_selectors = [
                    "input[name='firstName']", "input[id='firstName']",
                    "input[name='FirstName']", "input[id='FirstName']",
                    "input[name*='first']", "input[id*='first']",
                ]

                ln_filled = False
                for sel in ln_selectors:
                    if page.locator(sel).count() > 0:
                        page.locator(sel).first.fill(last_name)
                        ln_filled = True
                        break

                if not ln_filled:
                    # Degrade gracefully and point the user at the manual lookup.
                    errors.append(
                        "Official Records v1: Name Search form selectors no encontrados. "
                        "Lis pendens lookup automatico no disponible en este release. "
                        "Lookup manual: https://or.duvalclerk.com/search/SearchTypeName "
                        f"con last_name='{last_name}', first_name='{first_name}'"
                    )
                    return matches, errors

                if first_name:
                    for sel in fn_selectors:
                        if page.locator(sel).count() > 0:
                            page.locator(sel).first.fill(first_name)
                            break

                # Submit: click the first search button found...
                search_btns = [
                    "input[type='submit'][value*='Search']",
                    "button:has-text('Search')",
                    "input[type='button'][value*='Search']",
                    "#searchButton", "#btnSearch", "button[type='submit']",
                ]
                clicked = False
                for sel in search_btns:
                    try:
                        if page.locator(sel).count() > 0:
                            page.locator(sel).first.click()
                            clicked = True
                            break
                    except Exception:
                        pass  # this candidate failed; try the next selector

                # ...or press Enter in the last-name input as a fallback.
                if not clicked:
                    try:
                        for sel in ln_selectors:
                            if page.locator(sel).count() > 0:
                                page.locator(sel).first.press("Enter")
                                clicked = True
                                break
                    except Exception:
                        pass  # reported just below through `clicked`

                if not clicked:
                    errors.append("Official Records: no encontre boton Search ni pude enviar via Enter")
                    return matches, errors

                try:
                    page.wait_for_load_state("networkidle", timeout=15_000)
                except PlaywrightTimeout:
                    pass  # continue with whatever has loaded

                current_url = page.url

                # Step 4: wait for the results grid, then scan its rows.
                try:
                    page.wait_for_selector("table", timeout=8_000)
                except PlaywrightTimeout:
                    errors.append(f"Official Records: tabla de resultados no apareció. URL: {current_url}")
                    return matches, errors

                all_rows = page.locator("table tr").all()
                for row in all_rows[1:]:  # skip header
                    try:
                        cells = [(c.text_content() or "").strip() for c in row.locator("td").all()]
                        match = _parse_lis_pendens_row(cells, current_url)
                    except Exception:
                        continue  # best effort: one unreadable row must not lose the rest
                    if match is not None:
                        matches.append(match)

                if not matches:
                    # No match is not a failure (the property may be clean), but it
                    # is reported so the caller can explain the empty result.
                    errors.append(
                        f"Sin matches de Lis Pendens para owner '{owner_name}' en Duval Official Records. "
                        f"Esto puede significar: (a) la propiedad NO esta en foreclosure, o "
                        f"(b) el owner_name parseado no matchea el formato del clerk. URL final: {current_url}"
                    )

                return matches, errors
            finally:
                # Single cleanup point for every exit path above, exceptions included.
                browser.close()

    except Exception as e:
        errors.append(f"Official Records Duval scrape error: {e}")
        return matches, errors
|
||||
|
||||
|
||||
def _fetch_duval(address: str) -> dict:
    """Run the Duval County pipeline: owner lookup, lis pendens search, liens placeholder.

    Steps:
      1. Look up the owner via ``_fetch_property_owner_duval``.
      2. If an owner name was found, search lis pendens filings via
         ``_fetch_lis_pendens_duval``.
      3. Fill the liens fields from ``_empty_liens_inventory`` (the real liens
         scraper is deferred to v1.1).
      4. Classify the plaintiff of the first lis pendens match, if any.

    Status values produced here:
      - ``LIS_PENDENS_ACTIVE``: at least one lis pendens match was returned.
      - ``CLEAN``: owner found AND the lis pendens lookup ran to completion
        without matches.
      - ``OWNER_VERIFIED``: owner found, but the lis pendens lookup was skipped
        (no owner name), degraded, or failed, so a foreclosure cannot be ruled out.
      - ``UNKNOWN``: no owner data was obtained.

    Args:
        address: Property address, passed through to both lookups.

    Returns:
        dict with ``status``, ``county``, ``address``, the Property Appraiser
        fields, lis pendens details, plaintiff/liens fields and the meta fields
        ``sources_used``, ``errors`` and ``fetched_at`` (UTC ISO-8601).
    """

    def _filing_date_key(raw):
        # Filing dates are MM/DD/YYYY strings; comparing them as text would rank
        # "12/01/2020" above "01/15/2024". Unparseable values sort last.
        try:
            return datetime.strptime(raw, "%m/%d/%Y")
        except (TypeError, ValueError):
            return datetime.min

    errors: list[str] = []
    sources_used: list[str] = []

    # Step 1: owner name from the Property Appraiser.
    owner_data, owner_errors = _fetch_property_owner_duval(address)
    errors.extend(owner_errors)
    if owner_data:
        sources_used.append(owner_data.get("source", "Duval Property Appraiser"))

    # Step 2: lis pendens lookup (only possible once we have an owner name).
    lp_matches = []
    lp_lookup_ok = False
    if owner_data and owner_data.get("owner_name"):
        lp_matches, lp_errors = _fetch_lis_pendens_duval(
            owner_data["owner_name"], address
        )
        errors.extend(lp_errors)
        # A "Sin matches" entry is informational: the search itself succeeded.
        lp_lookup_ok = not lp_errors or "Sin matches" in lp_errors[0]
        if lp_lookup_ok:
            sources_used.append("Duval Official Records (or.duvalclerk.com)")

    # Step 3 (Wave 1.5A v1.2): liens inventory is deferred to v1.1, so a
    # placeholder structure is returned for now.
    liens_data = _empty_liens_inventory(
        reason="Acclaim Land Records scraper deferred to v1.1. Lookup manual disponible."
    )

    # Step 4: plaintiff classification and case number, both taken from the
    # first lis pendens match when there is one.
    plaintiff_info = None
    case_number = None
    if lp_matches:
        first_lp = lp_matches[0]
        plaintiff_name_raw = (
            first_lp.get("plaintiff")
            or (first_lp.get("all_columns_text") or [None])[0]
        )
        plaintiff_info = classify_plaintiff(plaintiff_name_raw)
        case_number = (
            first_lp.get("case_number")
            or first_lp.get("instrument_number")
        )

    # Status determination. CLEAN is only reported when the lis pendens search
    # actually ran and succeeded; a skipped or failed search must not be
    # presented as "no foreclosure".
    lis_pendens_degraded = (not lp_lookup_ok) or any(
        "Official Records v1" in e for e in errors
    )

    most_recent_date = None
    if lp_matches:
        status = "LIS_PENDENS_ACTIVE"
        filing_dates = [m["filing_date"] for m in lp_matches if m.get("filing_date")]
        if filing_dates:
            most_recent_date = max(filing_dates, key=_filing_date_key)
    elif owner_data:
        status = "OWNER_VERIFIED" if lis_pendens_degraded else "CLEAN"
    else:
        status = "UNKNOWN"

    owner = owner_data or {}
    return {
        "status": status,
        "county": "Duval",
        "address": address,
        # Property Appraiser data
        "owner_name": owner.get("owner_name"),
        "re_number": owner.get("re_number"),
        "tax_assessed_value": owner.get("tax_assessed_value"),
        "year_built_official": owner.get("year_built"),
        "last_sale_date": owner.get("last_sale_date"),
        # Lis pendens detail
        "lis_pendens": lp_matches,
        "lis_pendens_count": len(lp_matches),
        "most_recent_lis_pendens_date": most_recent_date,
        "case_number": case_number,
        # Wave 1.5A v1.2: plaintiff + liens structured fields
        "plaintiff": plaintiff_info,
        "loan_origin": None,  # to be populated from MTG records in v1.1
        "all_liens": liens_data["all_liens"],
        "lien_count": liens_data["lien_count"],
        "total_surviving_debt": liens_data["total_surviving_debt"],
        "investor_warning": liens_data["investor_warning"],
        "liens_detail_status": liens_data["detail_status"],
        # Meta
        "sources_used": sources_used,
        "errors": errors,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
    }
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_court_records(
    *,
    address: str,
    county_name: Optional[str] = None,
) -> dict:
    """Entry point: dispatch the court-records lookup by county, failing soft.

    The county name is matched case-insensitively and with or without a
    trailing "County" word, so "Duval", "duval county" and "DUVAL COUNTY" all
    reach the Duval pipeline.

    Args:
        address: Property address to look up.
        county_name: County name; may be None.

    Returns:
        dict that always contains ``status``, ``county``, ``address``,
        ``sources_used``, ``errors`` and ``fetched_at``.

        ``status`` is one of:
          - ``DISABLED``: ``_enable_court_records()`` returned a falsy value.
          - ``NOT_IMPLEMENTED``: the county has no scraper; the result also
            carries ``recommendation`` and ``clerk_url`` for a manual lookup.
          - ``CLEAN`` / ``LIS_PENDENS_ACTIVE`` / ``OWNER_VERIFIED`` / ``UNKNOWN``:
            produced by ``_fetch_duval``, together with owner, lis pendens and
            liens fields.
        ``CODE_VIOLATIONS`` and ``TAX_DELINQUENT`` were listed in the original
        contract but are not produced by this function or ``_fetch_duval``.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()

    if not _enable_court_records():
        return {
            "status": "DISABLED",
            "county": county_name,
            "address": address,
            "recommendation": (
                "Court records scraping deshabilitado. Activar ENABLE_COURT_RECORDS=true "
                "en .env para deteccion deterministica de foreclosure / lis pendens."
            ),
            "sources_used": [],
            "errors": [],
            "fetched_at": fetched_at,
        }

    # Normalize county: drop the word "County" regardless of its casing.
    cn = (county_name or "").strip()
    cn_normalized = re.sub(r"\s+county\b", "", cn, flags=re.IGNORECASE).strip()

    if cn_normalized.lower() == "duval":
        return _fetch_duval(address)

    # Soft-fail for counties without a scraper. Try the exact key first, then a
    # case-insensitive match, before falling back to the statewide clerk site.
    clerk_url = COUNTY_CLERK_URLS.get(cn_normalized)
    if clerk_url is None:
        wanted = cn_normalized.lower()
        clerk_url = next(
            (url for name, url in COUNTY_CLERK_URLS.items() if str(name).lower() == wanted),
            "https://www.flclerks.com/",
        )

    return {
        "status": "NOT_IMPLEMENTED",
        "county": cn_normalized,
        "address": address,
        "recommendation": (
            f"Court records scraper no implementado para {cn_normalized} todavia. "
            f"Lookup manual en {clerk_url}. Wave 1.5A v1 cubre solo Duval; "
            "Miami-Dade / Broward / Palm Beach / Hillsborough en versiones posteriores."
        ),
        "clerk_url": clerk_url,
        "sources_used": [],
        "errors": [],
        "fetched_at": fetched_at,
    }
|
||||
@@ -0,0 +1,87 @@
|
||||
"""FEMA NFHL flood zone lookup por lat/lng.
|
||||
|
||||
API publica: https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer
|
||||
Layer 28 = "S_FLD_HAZ_AR" (Special Flood Hazard Areas).
|
||||
|
||||
Sin key requerida. Sin rate limits estrictos.
|
||||
|
||||
Devuelve dict con:
|
||||
zone: "X" / "X (shaded)" / "A" / "AE" / "AH" / "AO" / "V" / "VE" / etc.
|
||||
bfe: Base Flood Elevation (ft) o None
|
||||
sfha: bool - True si esta en Special Flood Hazard Area
|
||||
subtype: subzone description o None
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import requests
|
||||
|
||||
from .base import FetcherError, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# Query endpoint for layer 28 of the public FEMA NFHL MapServer; fetch_flood
# sends its point-intersection query here.
FEMA_URL = "https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer/28/query"

# Zone codes treated as SFHA (Special Flood Hazard Area) per FEMA. fetch_flood
# checks the returned FLD_ZONE against this set to compute its "sfha" flag.
SFHA_ZONES = {"A", "AE", "AH", "AO", "AR", "A99", "V", "VE", "VO"}
|
||||
|
||||
|
||||
def fetch_flood(lat: float, lng: float) -> dict:
    """Query FEMA NFHL for the flood zone at the point (lat, lng).

    The point is sent as WGS84 (SR 4326) in x,y order, i.e. ``lng,lat``.
    When the service returns no features, the point is reported as zone "X"
    with ``sfha`` False.

    Args:
        lat: Latitude in decimal degrees.
        lng: Longitude in decimal degrees.

    Returns:
        dict with ``zone``, ``bfe`` (None when absent or -9999), ``sfha``,
        ``subtype`` and ``source``.

    Raises:
        FetcherError: if lat/lng is None, on HTTP failure, on an unparseable
            JSON body, or when the response carries an "error" entry.
    """
    if lat is None or lng is None:
        raise FetcherError("lat/lng requeridos")

    query = dict(
        geometry=f"{lng},{lat}",  # FEMA expects x,y, i.e. lng,lat
        geometryType="esriGeometryPoint",
        inSR="4326",  # WGS84
        spatialRel="esriSpatialRelIntersects",
        outFields="FLD_ZONE,STATIC_BFE,ZONE_SUBTY",
        returnGeometry="false",
        f="json",
    )

    try:
        response = requests.get(FEMA_URL, params=query, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        raise FetcherError(f"HTTP error: {exc}") from exc

    try:
        payload = response.json()
    except ValueError as exc:
        raise FetcherError(f"JSON parse error: {exc}") from exc

    # The service reports invalid queries through an "error" entry in the body.
    if "error" in payload:
        raise FetcherError(f"FEMA API error: {payload['error']}")

    hits = payload.get("features", [])
    if hits:
        attributes = hits[0].get("attributes", {}) or {}
        zone_code = (attributes.get("FLD_ZONE") or "unknown").strip()
        zone_subtype = attributes.get("ZONE_SUBTY")
        static_bfe = attributes.get("STATIC_BFE")
        # -9999 is FEMA's "not applicable" marker for the base flood elevation.
        bfe_missing = static_bfe is None or static_bfe == -9999
        return {
            "zone": zone_code,
            "bfe": None if bfe_missing else static_bfe,
            "sfha": zone_code in SFHA_ZONES,
            "subtype": zone_subtype,
            "source": "FEMA NFHL",
        }

    # No intersecting feature: reported as low-risk zone X.
    return {
        "zone": "X",
        "bfe": None,
        "sfha": False,
        "subtype": None,
        "source": "FEMA NFHL (outside SFHA)",
    }
|
||||
@@ -0,0 +1,134 @@
|
||||
"""HUD Fair Market Rent lookup.
|
||||
|
||||
API: https://www.huduser.gov/portal/dataset/fmr-api.html
|
||||
Requiere API key gratis: https://www.huduser.gov/hudapi/public/register
|
||||
|
||||
Flow:
|
||||
1. GET /fmr/listCounties/{state} -> match county_name -> fips_code
|
||||
2. GET /fmr/data/{fips_code}?year=YYYY -> Efficiency / 1BR / 2BR / 3BR / 4BR
|
||||
|
||||
Si HUD_API_KEY no esta en .env, raise FetcherError (caught por runner, fail-soft).
|
||||
|
||||
Devuelve dict con:
|
||||
year, county, state,
|
||||
fmr_efficiency, fmr_1br, fmr_2br, fmr_3br, fmr_4br (en USD/mes)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from .base import FetcherError, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# Base URL of the HUD User public API; fetch_fmr appends the /fmr/... paths to it.
HUD_BASE = "https://www.huduser.gov/hudapi/public"
|
||||
|
||||
|
||||
def _normalize_county_name(s: str) -> str:
    """Return a comparison key for a county name.

    The key is lowercased, has a trailing " county" suffix removed, and has
    every run of whitespace collapsed to a single space. Falsy input yields "".
    """
    if not s:
        return ""
    suffix = " county"
    lowered = s.lower().strip()
    if lowered.endswith(suffix):
        lowered = lowered[: -len(suffix)].strip()
    # split() with no argument drops empty pieces, collapsing repeated whitespace.
    return " ".join(lowered.split())
|
||||
|
||||
|
||||
def fetch_fmr(state: str, county_name: str, year: int | None = None) -> dict:
    """Fetch HUD Fair Market Rents (FMR) for a US county.

    Flow:
      1. ``GET {HUD_BASE}/fmr/listCounties/{state}`` and match ``county_name``
         against each entry's ``county_name`` (compared through
         ``_normalize_county_name``) to obtain its ``fips_code``.
      2. ``GET {HUD_BASE}/fmr/data/{fips_code}?year=YYYY`` and read the rent
         figures from ``basicdata`` (a dict, or the first item when it is a list).

    Args:
        state: Two-letter state code (e.g. "FL", "TX").
        county_name: County name, with or without the "County" suffix.
        year: FMR year; defaults to the current calendar year.

    Returns:
        dict with ``year``, ``county``, ``state``, ``entity_id``,
        ``fmr_efficiency``, ``fmr_1br`` .. ``fmr_4br`` and ``source``. A rent
        field is None when neither of its known payload keys is present.

    Raises:
        FetcherError: missing ``HUD_API_KEY``, missing arguments, HTTP or JSON
            failure, county not found, match without ``fips_code``, or a
            response whose shape is not the expected list/dict.
    """
    # .env is loaded by data_fetchers/__init__.py before this runs.
    api_key = os.getenv("HUD_API_KEY", "").strip()
    if not api_key:
        raise FetcherError("HUD_API_KEY no esta en .env. Registrate en https://www.huduser.gov/hudapi/public/register")

    if not state or not county_name:
        raise FetcherError(f"state y county_name son requeridos (got state={state!r}, county={county_name!r})")

    if year is None:
        year = datetime.now().year

    headers = {"Authorization": f"Bearer {api_key}"}

    # 1. listCounties to find the entity id (fips_code).
    list_url = f"{HUD_BASE}/fmr/listCounties/{state}"
    try:
        r = requests.get(list_url, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"listCounties HTTP error: {e}") from e

    try:
        counties = r.json()
    except ValueError as e:
        raise FetcherError(f"listCounties JSON error: {e}") from e

    if not isinstance(counties, list):
        raise FetcherError(f"listCounties unexpected format: {type(counties).__name__}")

    # Entries that are not dicts are skipped so a malformed item cannot surface
    # as an AttributeError instead of the FetcherError callers handle.
    target = _normalize_county_name(county_name)
    matched = next(
        (
            c for c in counties
            if isinstance(c, dict)
            and _normalize_county_name(c.get("county_name", "")) == target
        ),
        None,
    )

    if not matched:
        sample = [c.get("county_name") for c in counties[:5] if isinstance(c, dict)]
        raise FetcherError(f"County '{county_name}' not in HUD list for {state}. Primeros 5: {sample}")

    entity_id = matched.get("fips_code")
    if not entity_id:
        raise FetcherError(f"Match found but no fips_code: {matched}")

    # 2. FMR data.
    fmr_url = f"{HUD_BASE}/fmr/data/{entity_id}"
    try:
        r = requests.get(fmr_url, params={"year": year}, headers=headers, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        raise FetcherError(f"fmr/data HTTP error: {e}") from e

    try:
        payload = r.json()
    except ValueError as e:
        raise FetcherError(f"fmr/data JSON error: {e}") from e

    # The payload structure can vary; several known shapes are tried.
    data = payload.get("data", payload) if isinstance(payload, dict) else {}
    if not isinstance(data, dict):
        raise FetcherError(f"fmr/data unexpected format: {type(data).__name__}")

    # basicdata may be a dict (plain county) or a list (metro with sub-areas).
    bd = data.get("basicdata")
    if isinstance(bd, list):
        bd = bd[0] if bd else {}
    if not isinstance(bd, dict):
        bd = data

    def _g(*keys):
        """Return the first non-None value found under the given keys."""
        for k in keys:
            v = bd.get(k)
            if v is not None:
                return v
        return None

    return {
        "year": year,
        "county": matched.get("county_name"),
        "state": state,
        "entity_id": entity_id,
        "fmr_efficiency": _g("Efficiency", "fmr_0"),
        "fmr_1br": _g("One-Bedroom", "fmr_1"),
        "fmr_2br": _g("Two-Bedroom", "fmr_2"),
        "fmr_3br": _g("Three-Bedroom", "fmr_3"),
        "fmr_4br": _g("Four-Bedroom", "fmr_4"),
        "source": "HUD User FMR API",
    }
|
||||
@@ -0,0 +1,549 @@
|
||||
"""Neighborhood classifier (A/B/C/D) basado en indicadores objetivos.
|
||||
|
||||
CRITICO - COMPLIANCE LEGAL:
|
||||
La clasificacion se basa SOLO en indicadores economicos y datos publicos:
|
||||
income, owner-occupancy, education attainment, vacancy, crime, days on market.
|
||||
|
||||
NUNCA usa demografia racial o etnica. Fair Housing Act (federal) prohibe redlining.
|
||||
Esta es una clasificacion ECONOMICA, no demografica.
|
||||
|
||||
Indicadores y pesos (max 100):
|
||||
- median_household_income (Census ACS) 25%
|
||||
- owner_occupied_pct (Census ACS) 20%
|
||||
- education_attainment_pct_bachelor_plus (ACS) 20%
|
||||
- crime_vs_national (FBI UCR) 15%
|
||||
- vacancy_rate (Census ACS) 10%
|
||||
- days_on_market_median (Firecrawl, opt-in) 10%
|
||||
|
||||
Graceful degradation: si un indicador no esta disponible (API key missing,
|
||||
fetcher fallo), se redistribuye su peso entre los disponibles.
|
||||
|
||||
confidence_level (basado en CANTIDAD de indicadores disponibles):
|
||||
- "high": 5-6 indicadores
|
||||
- "medium": 3-4 indicadores
|
||||
- "low": 1-2 indicadores
|
||||
- "unclassified": 0 indicadores
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# ─── Classification algorithm weights ───────────────────────────────────────
# Maximum points per indicator. _classify sums the weights of the indicators
# that are available and scales the earned points against that total.
WEIGHTS = {
    "income": 25,
    "owner_occupied": 20,
    "education": 20,
    "crime": 15,
    "vacancy": 10,
    "dom": 10,
}
|
||||
|
||||
# ─── Census ACS variable codes ──────────────────────────────────────────────
# Short name -> ACS variable id. _fetch_census_acs requests all of them in a
# single call and derives its indicators from the returned counts.
ACS_VARS = {
    "income": "B19013_001E",  # Median household income (last 12 months)
    "oo_count": "B25003_002E",  # Owner-occupied housing units count
    "occupied_total": "B25003_001E",  # Total occupied housing units
    "vacant_count": "B25002_003E",  # Vacant housing units count
    "housing_total": "B25002_001E",  # Total housing units (occupied + vacant)
    "home_value": "B25077_001E",  # Median home value
    "edu_total": "B15003_001E",  # Total population 25+
    "edu_bachelor": "B15003_022E",  # Bachelor's degree
    "edu_master": "B15003_023E",  # Master's degree
    "edu_prof": "B15003_024E",  # Professional school degree
    "edu_doctorate": "B15003_025E",  # Doctorate degree
}
|
||||
|
||||
# ─── National crime rates (FBI UCR 2022, per 100K population) ──────────────
# Used as the denominators for crime_vs_national. Update annually.
NATIONAL_VIOLENT_CRIME_PER_100K = 380.7
NATIONAL_PROPERTY_CRIME_PER_100K = 1954.4
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Fetchers individuales (fail-soft)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _fetch_census_acs(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch Census ACS (2022 5-year) indicators for the geocode's tract.

    Produces up to five values, each only when its inputs are available:
    ``median_household_income``, ``owner_occupied_pct``, ``vacancy_rate``,
    ``median_home_value`` and ``education_attainment_pct_bachelor_plus``.

    Fail-soft: a missing API key, an incomplete geocode, an HTTP/JSON failure
    or an unexpected payload is reported through the errors list, not raised.

    Args:
        geocode: dict providing ``state_fips``, ``county_code_only`` and
            ``tract_code``.

    Returns:
        ``(indicators_dict, errors_list)``.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("CENSUS_API_KEY", "").strip()
    if not api_key:
        errors.append("CENSUS_API_KEY ausente en .env (registro: https://api.census.gov/data/key_signup.html)")
        return out, errors

    state_fips = geocode.get("state_fips")
    county_code = geocode.get("county_code_only")
    tract_code = geocode.get("tract_code")
    if not state_fips or not county_code or not tract_code:
        errors.append(f"geocode incompleto para Census ACS (state={state_fips}, county={county_code}, tract={tract_code})")
        return out, errors

    url = "https://api.census.gov/data/2022/acs/acs5"
    # Request every variable in a single call.
    var_keys = ["NAME"] + list(ACS_VARS.values())
    params = {
        "get": ",".join(var_keys),
        "for": f"tract:{tract_code}",
        "in": f"state:{state_fips} county:{county_code}",
        "key": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"Census ACS HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"Census ACS JSON: {e}")
        return out, errors

    # The code below indexes the payload as [header_row, data_row]; anything
    # else is reported instead of crashing on data[0].
    if data and not isinstance(data, list):
        errors.append(f"Census ACS formato inesperado: {type(data).__name__}")
        return out, errors
    if not data or len(data) < 2:
        errors.append("Census ACS devolvio respuesta vacia (tract sin datos?)")
        return out, errors

    header = data[0]
    row = data[1]
    idx = {col: i for i, col in enumerate(header)}

    def _f(col: str) -> Optional[float]:
        """Return the column as a non-negative float, or None when unusable."""
        try:
            v = row[idx[col]]
        except (KeyError, IndexError):
            return None
        if v is None or v == "" or v == "null":
            return None
        try:
            f = float(v)
        except (ValueError, TypeError):
            return None
        # Census uses negative values for "no data" / "suppressed".
        return f if f >= 0 else None

    # 1) Median household income
    income = _f(ACS_VARS["income"])
    if income is not None:
        out["median_household_income"] = round(income, 0)

    # 2) Owner-occupied percentage
    oo = _f(ACS_VARS["oo_count"])
    total = _f(ACS_VARS["occupied_total"])
    if oo is not None and total and total > 0:
        out["owner_occupied_pct"] = round(oo / total * 100, 1)

    # 3) Vacancy rate
    vacant = _f(ACS_VARS["vacant_count"])
    housing = _f(ACS_VARS["housing_total"])
    if vacant is not None and housing and housing > 0:
        out["vacancy_rate"] = round(vacant / housing * 100, 1)

    # 4) Median home value
    home_value = _f(ACS_VARS["home_value"])
    if home_value is not None:
        out["median_home_value"] = round(home_value, 0)

    # 5) Education attainment (% bachelor's or higher, age 25+).
    # If every degree count is missing the indicator is omitted: emitting 0%
    # would be scored as a real, very low value by the classifier.
    edu_total = _f(ACS_VARS["edu_total"])
    degree_counts = [
        _f(ACS_VARS[name])
        for name in ("edu_bachelor", "edu_master", "edu_prof", "edu_doctorate")
    ]
    known_counts = [c for c in degree_counts if c is not None]
    if edu_total and edu_total > 0 and known_counts:
        pct = sum(known_counts) / edu_total * 100
        out["education_attainment_pct_bachelor_plus"] = round(pct, 1)

    return out, errors
|
||||
|
||||
|
||||
def _fetch_fbi_crime(geocode: dict) -> tuple[dict, list[str]]:
    """Fetch a state-level crime ratio from the FBI Crime Data Explorer.

    Computes ``crime_vs_national`` from the first estimate record in the
    response: the violent-crime rate per 100K divided by the national rate,
    blended 2:1 with the same ratio for property crime when that figure is
    present. The value is state-level, not neighborhood-level, and the result
    carries a ``_crime_state_level_note`` saying so.

    Fail-soft: a missing API key, missing state, HTTP/JSON failure or unusable
    payload is reported through the errors list and the indicator is omitted.

    Args:
        geocode: dict providing ``state`` (two-letter abbreviation, e.g. "FL").

    Returns:
        ``(indicators_dict, errors_list)``.
    """
    errors: list[str] = []
    out: dict = {}

    api_key = os.getenv("API_DATA_GOV_KEY", "").strip()
    if not api_key:
        errors.append("API_DATA_GOV_KEY ausente en .env (registro: https://api.data.gov/signup/)")
        return out, errors

    state_abbr = geocode.get("state")  # e.g. "FL"
    if not state_abbr:
        errors.append("state abbreviation faltante en geocode")
        return out, errors

    # State-level estimate endpoint: coarse, but better than no crime signal.
    url = f"https://api.usa.gov/crime/fbi/cde/estimate/state/{state_abbr}"
    params = {
        "from": "2022",
        "to": "2022",
        "API_KEY": api_key,
    }

    try:
        r = requests.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        errors.append(f"FBI UCR HTTP: {e}")
        return out, errors
    except ValueError as e:
        errors.append(f"FBI UCR JSON: {e}")
        return out, errors

    # Accept either a dict wrapper ("estimates" / "data") or a bare list. The
    # type is checked first because a list has no .get().
    if isinstance(data, dict):
        estimates = data.get("estimates") or data.get("data") or []
    elif isinstance(data, list):
        estimates = data
    else:
        estimates = []
    if not estimates:
        errors.append("FBI UCR sin estimates en respuesta")
        return out, errors

    rec = estimates[0] if isinstance(estimates, list) else estimates
    if not isinstance(rec, dict):
        errors.append(f"FBI UCR record format inesperado: {type(rec).__name__}")
        return out, errors

    population = rec.get("population")
    violent = rec.get("violent_crime")
    property_c = rec.get("property_crime")  # optional; violent-only ratio otherwise

    if not population or not violent:
        errors.append("FBI UCR sin population o violent_crime en estimate")
        return out, errors

    try:
        violent_per_100k = float(violent) / float(population) * 100000
        ratio_violent = violent_per_100k / NATIONAL_VIOLENT_CRIME_PER_100K

        if property_c:
            property_per_100k = float(property_c) / float(population) * 100000
            ratio_property = property_per_100k / NATIONAL_PROPERTY_CRIME_PER_100K
            # Weighted average: violent crime counts 2/3, property crime 1/3.
            combined = (ratio_violent * 2 + ratio_property) / 3
        else:
            combined = ratio_violent

        out["crime_vs_national"] = round(combined, 2)
        out["_crime_state_level_note"] = "Crime ratio es state-level (no neighborhood-level), aproximacion gruesa."
    except (TypeError, ValueError, ZeroDivisionError) as e:
        # ZeroDivisionError covers a population such as "0", which passes the
        # truthiness check above but converts to 0.0.
        errors.append(f"FBI UCR calc error: {e}")

    return out, errors
|
||||
|
||||
|
||||
def _fetch_firecrawl_dom(geocode: dict) -> tuple[dict, list[str]]:
    """Placeholder for the median days-on-market lookup via Firecrawl.

    Not implemented yet: ``geocode`` is ignored and the call always returns an
    empty indicators dict plus a single "pending" notice. The lookup is meant
    to be opt-in (it will spend Firecrawl credits), so it should only be
    invoked when the caller explicitly asks for days-on-market.

    Returns:
        ``({}, [pending_notice])``.
    """
    pending_notice = "DOM Firecrawl: implementacion pendiente Phase 3B paso 6"
    return {}, [pending_notice]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Algoritmo de clasificacion
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _score_income(income: float) -> int:
    """Score median household income: 25, 18, 10 or 3 points (higher is better)."""
    # (inclusive lower bound, points), checked from the highest band down.
    for floor, pts in ((100000, 25), (60000, 18), (35000, 10)):
        if income >= floor:
            return pts
    return 3
|
||||
|
||||
|
||||
def _score_owner_occupied(pct: float) -> int:
    """Score the owner-occupied percentage: 20, 15, 8 or 3 points (higher is better)."""
    # (inclusive lower bound, points), checked from the highest band down.
    for floor, pts in ((80, 20), (60, 15), (40, 8)):
        if pct >= floor:
            return pts
    return 3
|
||||
|
||||
|
||||
def _score_education(pct_bach_plus: float) -> int:
    """Score the bachelor's-or-higher percentage: 20, 14, 7 or 2 points (higher is better)."""
    # (inclusive lower bound, points), checked from the highest band down.
    for floor, pts in ((50, 20), (30, 14), (15, 7)):
        if pct_bach_plus >= floor:
            return pts
    return 2
|
||||
|
||||
|
||||
def _score_crime(ratio_vs_national: float) -> int:
    """Score the crime ratio against the national rate: 15, 12, 7 or 2 points.

    A lower ratio means less crime than the national figure and scores higher.
    """
    # (exclusive upper bound, points), checked from the best band up.
    for ceiling, pts in ((0.7, 15), (1.0, 12), (1.5, 7)):
        if ratio_vs_national < ceiling:
            return pts
    return 2
|
||||
|
||||
|
||||
def _score_vacancy(pct: float) -> int:
    """Score the vacancy rate: 10, 7, 4 or 1 points (lower vacancy is better)."""
    # (exclusive upper bound, points), checked from the best band up.
    for ceiling, pts in ((3, 10), (6, 7), (10, 4)):
        if pct < ceiling:
            return pts
    return 1
|
||||
|
||||
|
||||
def _score_dom(days: float) -> int:
    """Score median days on market: 10, 7, 4 or 1 points.

    Fewer days on market indicates a hotter neighborhood and scores higher.
    """
    # (exclusive upper bound, points), checked from the best band up.
    for ceiling, pts in ((30, 10), (60, 7), (90, 4)):
        if days < ceiling:
            return pts
    return 1
|
||||
|
||||
|
||||
def _classify(indicators: dict) -> dict:
    """Score the available indicators and assign a class letter.

    Each indicator present (not None) is scored by its ``_score_*`` function.
    Graceful degradation: the earned points are scaled against the combined
    weight of only the indicators that were available, so missing indicators
    do not lower the 0-100 score.

    Letter bands on the scaled score: A >= 85, B >= 65, C >= 40, otherwise D.
    Confidence by indicator count: 1-2 "low", 3-4 "medium", 5+ "high".

    Returns:
        dict with ``neighborhood_class``, ``class_score``, ``confidence_level``,
        ``indicators_available`` and ``weight_coverage_pct``, plus
        ``raw_points`` when at least one indicator was scored. With no
        indicators, every field reports "unclassified" / zero.
    """
    # (weight key, indicator field, scoring function) in reporting order.
    scorers = (
        ("income", "median_household_income", _score_income),
        ("owner_occupied", "owner_occupied_pct", _score_owner_occupied),
        ("education", "education_attainment_pct_bachelor_plus", _score_education),
        ("crime", "crime_vs_national", _score_crime),
        ("vacancy", "vacancy_rate", _score_vacancy),
        ("dom", "days_on_market_median", _score_dom),
    )

    earned = {}
    for weight_key, field, scorer in scorers:
        value = indicators.get(field)
        if value is None:
            continue
        earned[weight_key] = scorer(value)
    available = list(earned)

    if not available:
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators_available": [],
            "weight_coverage_pct": 0,
        }

    count = len(available)
    if count <= 2:
        confidence = "low"
    elif count <= 4:
        confidence = "medium"
    else:
        confidence = "high"

    # Scale the earned points against the weight that was actually available.
    weight_in_play = sum(WEIGHTS[key] for key in available)
    scaled = (sum(earned.values()) / weight_in_play) * 100

    grade = "D"
    for floor, candidate in ((85, "A"), (65, "B"), (40, "C")):
        if scaled >= floor:
            grade = candidate
            break

    return {
        "neighborhood_class": grade,
        "class_score": round(scaled, 1),
        "confidence_level": confidence,
        "indicators_available": available,
        "weight_coverage_pct": weight_in_play,
        "raw_points": earned,
    }
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Investment implications por clase
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Static, human-readable investment guidance keyed by neighborhood class
# ("A".."D", plus "unclassified" for when no class could be assigned). Every
# entry carries the same five fields; the texts are user-facing strings.
INVESTMENT_IMPLICATIONS = {
    "A": {
        "buy_hold_viability": "Alta - retornos estables aunque cash flow menor por precios altos",
        "section_8_viability": "Muy baja - market rents muy superiores a Section 8 FMR",
        "appreciation_potential": "Alta - tipicamente supera inflacion",
        "tenant_quality_expected": "Profesional, familias, muy estable",
        "typical_strategies": ["Buy & Hold", "Apreciacion play", "Short-term rental premium"],
    },
    "B": {
        "buy_hold_viability": "Alta - balance entre cash flow y apreciacion",
        "section_8_viability": "Baja - market rents por encima de FMR pero no por mucho",
        "appreciation_potential": "Media-alta",
        "tenant_quality_expected": "Profesional, familias, estable",
        "typical_strategies": ["Buy & Hold", "Light BRRRR", "Section 8 si la matematica cierra"],
    },
    "C": {
        "buy_hold_viability": "Media - mas cash flow, menos apreciacion, mas management",
        "section_8_viability": "Alta - market rents cerca o por debajo de FMR",
        "appreciation_potential": "Baja-media",
        "tenant_quality_expected": "Working class, estabilidad mixta",
        "typical_strategies": ["Section 8", "BRRRR", "Buy & Hold con management activo"],
    },
    "D": {
        "buy_hold_viability": "Baja - cash flow alto pero riesgo alto, management intensivo",
        "section_8_viability": "Muy alta - Section 8 puede superar market rent",
        "appreciation_potential": "Baja - depende de trayectoria del vecindario",
        "tenant_quality_expected": "Bajos ingresos, screening diligente requerido",
        "typical_strategies": ["Section 8 (cash flow)", "BRRRR agresivo solo con exit a comprador de calidad"],
    },
    "unclassified": {
        "buy_hold_viability": "No determinado - sin datos suficientes",
        "section_8_viability": "No determinado",
        "appreciation_potential": "No determinado",
        "tenant_quality_expected": "No determinado",
        "typical_strategies": [],
    },
}
|
||||
|
||||
|
||||
def _build_reasoning(indicators: dict, classification: dict) -> str:
|
||||
"""Genera 1-2 lineas de justificacion del class letter."""
|
||||
letter = classification["neighborhood_class"]
|
||||
if letter == "unclassified":
|
||||
return "Sin datos suficientes para clasificar (todas las APIs sin keys o fallaron)."
|
||||
|
||||
parts = []
|
||||
if (v := indicators.get("median_household_income")) is not None:
|
||||
parts.append(f"median income ${v:,.0f}")
|
||||
if (v := indicators.get("owner_occupied_pct")) is not None:
|
||||
parts.append(f"owner-occupied {v:.0f}%")
|
||||
if (v := indicators.get("education_attainment_pct_bachelor_plus")) is not None:
|
||||
parts.append(f"bachelor+ {v:.0f}%")
|
||||
if (v := indicators.get("crime_vs_national")) is not None:
|
||||
parts.append(f"crime {v:.2f}x national")
|
||||
if (v := indicators.get("vacancy_rate")) is not None:
|
||||
parts.append(f"vacancy {v:.1f}%")
|
||||
if (v := indicators.get("days_on_market_median")) is not None:
|
||||
parts.append(f"DOM {v} dias")
|
||||
|
||||
indicator_str = ", ".join(parts)
|
||||
score = classification["class_score"]
|
||||
conf = classification["confidence_level"]
|
||||
coverage = classification["weight_coverage_pct"]
|
||||
return (
|
||||
f"Clase {letter} (score {score}/100, confianza {conf}, "
|
||||
f"cobertura {coverage}% del peso). Indicadores: {indicator_str}."
|
||||
)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# API publica
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_neighborhood(geocode: dict, include_dom: bool = False) -> dict:
    """Classify a neighborhood as A/B/C/D from objective indicators.

    Args:
        geocode: output of census_geocode.fetch_geocode. Only ``state_fips``
            is checked here; ``tract_geoid`` and ``tract_name`` are copied
            into the result when present.
        include_dom: when True, also look up Days-on-Market through Firecrawl
            (consumes credits). Defaults to False.

    Returns:
        dict with ``neighborhood_class``, ``class_score``,
        ``confidence_level``, ``indicators``, ``indicators_available``,
        ``weight_coverage_pct``, ``class_reasoning``,
        ``investment_implications``, ``warnings``, ``data_sources``,
        ``tract_geoid``, ``tract_name``, ``fetched_at`` and ``errors``.
        The same keys are returned when the geocode is unusable, with the
        class set to ``"unclassified"``.
    """
    # .env was already loaded by data_fetchers/__init__.py on first import of
    # the package. load_dotenv() is deliberately not called here to avoid
    # conflicts when the CWD differs from the project directory.

    fetched_at = datetime.now(timezone.utc).isoformat()
    all_errors: list[str] = []
    data_sources: list[str] = []

    if not geocode or not geocode.get("state_fips"):
        return {
            "neighborhood_class": "unclassified",
            "class_score": 0.0,
            "confidence_level": "unclassified",
            "indicators": {},
            "indicators_available": [],
            "weight_coverage_pct": 0,
            "class_reasoning": "Geocode fallo - no se puede clasificar sin tract.",
            "investment_implications": INVESTMENT_IMPLICATIONS["unclassified"],
            "warnings": ["Geocode invalido o incompleto"],
            "data_sources": [],
            "tract_geoid": None,
            # Present so both return paths expose the same set of keys.
            "tract_name": None,
            "fetched_at": fetched_at,
            "errors": ["geocode_failed"],
        }

    # ─── Census ACS ─────────────────────────────────────────────────────────
    indicators: dict = {}
    census_data, errs = _fetch_census_acs(geocode)
    indicators.update(census_data)
    all_errors.extend(errs)
    if census_data:
        data_sources.append("US Census ACS 2022 5-Year")

    # ─── FBI UCR ────────────────────────────────────────────────────────────
    crime_data, errs = _fetch_fbi_crime(geocode)
    # Keys prefixed with "_" are auxiliary notes, not indicators.
    indicators.update({k: v for k, v in crime_data.items() if not k.startswith("_")})
    all_errors.extend(errs)
    if crime_data:
        data_sources.append("FBI Crime Data Explorer (state-level)")

    # ─── Firecrawl DOM (opt-in) ─────────────────────────────────────────────
    if include_dom:
        dom_data, errs = _fetch_firecrawl_dom(geocode)
        indicators.update(dom_data)
        all_errors.extend(errs)
        if dom_data:
            data_sources.append("Firecrawl (Zillow DOM)")

    # ─── Classify ───────────────────────────────────────────────────────────
    classification = _classify(indicators)
    reasoning = _build_reasoning(indicators, classification)
    letter = classification["neighborhood_class"]

    # ─── Warnings ───────────────────────────────────────────────────────────
    warnings: list[str] = []
    if classification["confidence_level"] in ("low", "unclassified"):
        warnings.append(
            f"Confianza {classification['confidence_level']}: "
            f"solo {len(classification['indicators_available'])} indicadores disponibles."
        )
    # Surface the auxiliary note attached by the crime fetcher, if any.
    if "_crime_state_level_note" in crime_data:
        warnings.append(crime_data["_crime_state_level_note"])

    return {
        "neighborhood_class": letter,
        "class_score": classification["class_score"],
        "confidence_level": classification["confidence_level"],
        "indicators": indicators,
        "indicators_available": classification["indicators_available"],
        "weight_coverage_pct": classification["weight_coverage_pct"],
        "class_reasoning": reasoning,
        "investment_implications": INVESTMENT_IMPLICATIONS[letter],
        "warnings": warnings,
        "data_sources": data_sources,
        "tract_geoid": geocode.get("tract_geoid"),
        "tract_name": geocode.get("tract_name"),
        "fetched_at": fetched_at,
        "errors": all_errors,
    }
|
||||
@@ -0,0 +1,218 @@
|
||||
"""NOAA HURDAT2 - historial de huracanes Atlantico.
|
||||
|
||||
Dataset: https://www.nhc.noaa.gov/data/#hurdat
|
||||
Format spec: https://www.nhc.noaa.gov/data/hurdat/hurdat2-format.pdf
|
||||
|
||||
Descarga el archivo en data/hurdat2.txt en el primer uso (lazy).
|
||||
Re-descarga si tiene mas de 365 dias.
|
||||
|
||||
Para una direccion dada (lat/lng), devuelve huracanes que pasaron a menos
|
||||
de N millas (default 150) en los ultimos K anos (default 20).
|
||||
|
||||
Returns:
|
||||
{
|
||||
"lookback_years": 20,
|
||||
"max_distance_mi": 150,
|
||||
"total_hurricanes_nearby": <int>,
|
||||
"hurricanes": [
|
||||
{"name": "Ian", "year": 2022, "category": 4,
|
||||
"max_wind_mph": 155, "closest_pass_miles": 12},
|
||||
...
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# Candidate download URLs, tried in list order (NOAA renames the file every
# year). If every candidate fails, the download raises FetcherError.
HURDAT2_URL_CANDIDATES = [
    "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2024-040425.txt",
    "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2023-051124.txt",
    "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2022-050423.txt",
]

# Maximum age, in days, of the local copy before it is downloaded again.
HURDAT2_MAX_AGE_DAYS = 365
|
||||
|
||||
|
||||
def _saffir_simpson(max_wind_mph: float) -> int:
|
||||
"""Categoria Saffir-Simpson basada en max sustained wind (mph). 0 = TS (no huracan)."""
|
||||
if max_wind_mph >= 157:
|
||||
return 5
|
||||
if max_wind_mph >= 130:
|
||||
return 4
|
||||
if max_wind_mph >= 111:
|
||||
return 3
|
||||
if max_wind_mph >= 96:
|
||||
return 2
|
||||
if max_wind_mph >= 74:
|
||||
return 1
|
||||
return 0 # tropical storm or less
|
||||
|
||||
|
||||
def _haversine_mi(lat1: float, lng1: float, lat2: float, lng2: float) -> float:
|
||||
"""Distancia great-circle entre dos puntos en millas."""
|
||||
R_MI = 3958.8
|
||||
p1 = math.radians(lat1)
|
||||
p2 = math.radians(lat2)
|
||||
dp = math.radians(lat2 - lat1)
|
||||
dl = math.radians(lng2 - lng1)
|
||||
a = math.sin(dp/2)**2 + math.cos(p1) * math.cos(p2) * math.sin(dl/2)**2
|
||||
return 2 * R_MI * math.asin(math.sqrt(a))
|
||||
|
||||
|
||||
def _parse_coord(s: str) -> float | None:
|
||||
"""Parse '26.5N' o '80.3W' a float (West/South negativos)."""
|
||||
s = s.strip()
|
||||
if not s or len(s) < 2:
|
||||
return None
|
||||
try:
|
||||
val = float(s[:-1])
|
||||
d = s[-1].upper()
|
||||
if d in ("S", "W"):
|
||||
val = -val
|
||||
return val
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _download_hurdat2(dest_path: Path) -> None:
    """Download HURDAT2 into ``dest_path``, trying each candidate URL in order.

    The first response with HTTP 200 and a body longer than 10000 characters
    is accepted. The text is written to a sibling ``.part`` file and then
    renamed over ``dest_path``, so an interrupted write never leaves a
    truncated file under the final name.

    Raises:
        FetcherError: when no candidate URL yields a usable file; the message
            carries the last error seen.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    # Staged in the destination directory so the final step is a plain rename.
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    last_err = None
    for url in HURDAT2_URL_CANDIDATES:
        try:
            r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=30)
        except requests.RequestException as e:
            last_err = f"{url}: {e}"
            continue
        if r.status_code != 200:
            last_err = f"HTTP {r.status_code} from {url}"
            continue
        body = r.text
        # A very short body is treated as a failed download and skipped.
        if len(body) <= 10000:
            last_err = f"HTTP 200 from {url} but body too short ({len(body)} chars)"
            continue
        tmp_path.write_text(body, encoding="utf-8")
        tmp_path.replace(dest_path)
        return
    raise FetcherError(f"No pude descargar HURDAT2 desde ninguna URL. Ultimo error: {last_err}")
|
||||
|
||||
|
||||
def _ensure_hurdat2_local(local_path: str | Path) -> Path:
    """Return the local HURDAT2 path, downloading the file first if required.

    A download happens when the file does not exist or when its modification
    time is more than HURDAT2_MAX_AGE_DAYS days old.
    """
    target = Path(local_path)
    if target.exists():
        age_days = (time.time() - target.stat().st_mtime) / 86400
        stale = age_days > HURDAT2_MAX_AGE_DAYS
    else:
        stale = True
    if stale:
        _download_hurdat2(target)
    return target
|
||||
|
||||
|
||||
def fetch_hurricanes(
    lat: float,
    lng: float,
    years_back: int = 20,
    max_distance_mi: float = 150.0,
    hurdat2_path: str | Path = "data/hurdat2.txt",
) -> dict:
    """Find hurricanes that passed near (lat, lng) in the last ``years_back`` years.

    "Near" means at least one track point of the system lies within
    ``max_distance_mi`` miles. Only systems whose peak wind anywhere along the
    track reaches category 1 or higher are reported.

    Returns:
        dict with ``lookback_years``, ``max_distance_mi``,
        ``total_hurricanes_nearby``, ``hurricanes`` (newest first, then
        strongest first) and ``source``.

    Raises:
        FetcherError: when ``lat`` or ``lng`` is None, or when the data file
            cannot be obtained.
    """
    if lat is None or lng is None:
        raise FetcherError("lat/lng requeridos")

    data_file = _ensure_hurdat2_local(hurdat2_path)
    oldest_year = datetime.now().year - years_back

    rows = data_file.read_text(encoding="utf-8", errors="replace").splitlines()
    total_rows = len(rows)

    matches = []
    cursor = 0
    while cursor < total_rows:
        header = [field.strip() for field in rows[cursor].strip().split(",")]

        # Storm header looks like: AL112022, IAN, 65,
        # Anything else (blank or stray line) is skipped one row at a time.
        is_header = (
            len(header) >= 3
            and header[0].startswith("AL")
            and len(header[0]) >= 8
        )
        if not is_header:
            cursor += 1
            continue

        storm_id = header[0]
        storm_name = header[1]
        try:
            record_count = int(header[2])
            storm_year = int(storm_id[4:8])
        except (ValueError, IndexError):
            cursor += 1
            continue

        # Rows belonging to this storm: the header plus its track records.
        next_storm = cursor + 1 + record_count

        if storm_year < oldest_year:
            cursor = next_storm
            continue

        peak_kt = 0
        closest_mi = float("inf")
        for idx in range(cursor + 1, min(next_storm, total_rows)):
            fields = [field.strip() for field in rows[idx].split(",")]
            if len(fields) < 7:
                continue
            # 0:date 1:time 2:record_id 3:status 4:lat 5:lng 6:wind
            point_lat = _parse_coord(fields[4])
            point_lng = _parse_coord(fields[5])
            try:
                wind_kt = int(fields[6])
            except ValueError:
                wind_kt = 0
            if point_lat is None or point_lng is None:
                continue
            closest_mi = min(closest_mi, _haversine_mi(lat, lng, point_lat, point_lng))
            peak_kt = max(peak_kt, wind_kt)

        peak_mph = peak_kt * 1.15078  # kt -> mph
        category = _saffir_simpson(peak_mph)

        # Keep only hurricanes (category 1+) that came within range.
        if category >= 1 and closest_mi <= max_distance_mi:
            matches.append({
                "name": storm_name or "UNNAMED",
                "year": storm_year,
                "category": category,
                "max_wind_mph": int(round(peak_mph)),
                "closest_pass_miles": int(round(closest_mi)),
            })

        cursor = next_storm

    # Newest first; within a year, strongest first.
    matches.sort(key=lambda storm: (-storm["year"], -storm["category"]))

    return {
        "lookback_years": years_back,
        "max_distance_mi": max_distance_mi,
        "total_hurricanes_nearby": len(matches),
        "hurricanes": matches,
        "source": "NOAA HURDAT2",
    }
|
||||
@@ -0,0 +1,399 @@
|
||||
"""data_fetchers/owner_classifier.py — Clasifica owner_name del PA en tipos.
|
||||
|
||||
Estrategia de negocio: cuando el owner es un BANCO (BoA, Wells Fargo, etc.),
|
||||
hay oportunidad de REO direct outreach — comprar por debajo de market.
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.owner_classifier import classify_owner, build_reo_signal
|
||||
classification = classify_owner("BANK OF AMERICA NA TRSTEE")
|
||||
# → {type, category, confidence, normalized, evidence}
|
||||
|
||||
    reo = build_reo_signal(
        owner_classification=classification,
        just_value=322580,
        assessed_value=228560,
        taxes_paid_last=5256.59,
    )
    # → {is_reo_opportunity, suggested_offer_low, suggested_offer_high, justification_es, ...}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Owner-type patterns (priority order — first match wins)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Lender keyword tables. Categories are checked in the order written here and
# every keyword is a plain substring test, so the first hit decides the type.
# NOTE(review): BANK_NATIONAL is checked before MBS_TRUSTEE, so MBS_TRUSTEE
# entries that begin with a BANK_NATIONAL keyword ("U.S. BANK TRUST",
# "U.S. BANK NATIONAL ASSOCIATION TRUSTEE", "WELLS FARGO BANK NA TRSTEE") are
# never reached, and "BANK OF AMERICA NA TRSTEE" resolves to BANK_NATIONAL.
# NOTE(review): "US BANK" also matches inside longer names such as
# "SYNOVUS BANK", which therefore resolves to BANK_NATIONAL, not BANK_REGIONAL.
LENDER_PATTERNS = {
    # Big national banks
    "BANK_NATIONAL": [
        "BANK OF AMERICA", "WELLS FARGO", "JPMORGAN", "CHASE BANK", "CHASE MORTGAGE",
        "U.S. BANK", "US BANK", "USBANK", "PNC BANK", "TRUIST", "CITIBANK",
        "CITIMORTGAGE", "REGIONS BANK", "FIFTH THIRD", "KEY BANK",
        "HUNTINGTON NATIONAL", "SUNTRUST", "BMO HARRIS", "TD BANK",
    ],
    # GSE / Federal agencies (foreclosure → these guys hold inventory)
    "GSE_FEDERAL": [
        "FEDERAL HOME LOAN MORTGAGE", "FREDDIE MAC", "FREDDIEMAC",
        "FEDERAL NATIONAL MORTGAGE", "FANNIE MAE", "FANNIEMAE",
        "SECRETARY OF HUD", "SECRETARY OF HOUSING", "U.S. DEPARTMENT OF HOUSING",
        "VA SECRETARY", "VETERANS AFFAIRS",
        "SECRETARY OF VETERANS AFFAIRS",
    ],
    # MBS / Trustee banks (Mortgage-Backed Securities trustees)
    # NOTE(review): the bare "TRSTEE" keyword classifies ANY owner string that
    # contains it as a lender — confirm private/family trustees are not
    # recorded with this abbreviation in the PA data.
    "MBS_TRUSTEE": [
        "DEUTSCHE BANK NATIONAL TRUST", "BANK OF NEW YORK MELLON",
        "U.S. BANK TRUST", "U.S. BANK NATIONAL ASSOCIATION TRUSTEE",
        "WILMINGTON SAVINGS", "WILMINGTON TRUST",
        " AS TRUSTEE FOR", "NA TRSTEE", "TRSTEE",
        "CHRISTIANA TRUST", "WELLS FARGO BANK NA TRSTEE",
        "HSBC BANK USA NATIONAL", "HSBC BANK USA",
    ],
    # Loan servicers (often own REO too)
    "SERVICER": [
        "BAYVIEW LOAN SERVICING", "SHELLPOINT MORTGAGE", "NEWREZ",
        "MR. COOPER", "MR COOPER", "SPECIALIZED LOAN SERVICING", "PHH MORTGAGE",
        "OCWEN", "SELENE FINANCE", "RUSHMORE LOAN", "FAY SERVICING",
        "CARRINGTON MORTGAGE", "PENNYMAC", "FREEDOM MORTGAGE",
    ],
    # Fintech lenders
    "FINTECH_LENDER": [
        "ROCKET MORTGAGE", "QUICKEN LOANS", "BETTER MORTGAGE", "LOANDEPOT",
        "GUILD MORTGAGE", "AMERIHOME",
    ],
    # Community banks / regional (FL-specific common)
    "BANK_REGIONAL": [
        "SEACOAST BANK", "VALLEY NATIONAL", "FIRST HORIZON",
        "CENTENNIAL BANK", "BANKUNITED", "AMERIS BANK", "SYNOVUS",
        "ATLANTIC CAPITAL", "PROFESSIONAL BANK",
    ],
    # Tax certificate holders (acquired via tax deed)
    "TAX_CERTIFICATE_HOLDER": [
        "TAX CERTIFICATE", "TAX DEED HOLDER", "FLORIDA TAX CERTIFICATE",
    ],
    # Insurance / pension (sometimes own real estate)
    "INSURANCE_PENSION": [
        "STATE FARM", "ALLSTATE", "PRUDENTIAL", "MASS MUTUAL", "METLIFE",
    ],
}

# Government entities (NOT REO opportunities, usually held for public use)
GOVERNMENT_PATTERNS = [
    "STATE OF FLORIDA", "FLORIDA DEPT", "FLORIDA DEPARTMENT",
    "CITY OF", "COUNTY OF", "TOWN OF", "VILLAGE OF",
    "SCHOOL BOARD", "SCHOOL DISTRICT", "MUNICIPALITY OF",
    "UNITED STATES OF AMERICA", "U.S. POSTAL", "U.S. ARMY",
    "FLORIDA POWER", "FPL ", "WATER MANAGEMENT DISTRICT",
    "DEPARTMENT OF TRANSPORTATION", "DOT ",
]

# Non-profit / religious (rare REO scenarios)
NONPROFIT_PATTERNS = [
    "CHURCH OF", "CATHOLIC", "BAPTIST", "METHODIST",
    "FOUNDATION", "MINISTRIES", "RELIGIOUS",
    "HABITAT FOR HUMANITY", "REDEVELOPMENT",
    "NON-PROFIT", "NONPROFIT", "RED CROSS",
]

# LLC patterns (investor-owned, possible negotiation)
LLC_PATTERNS = [
    " LLC", "LLC ", "L.L.C.", "L.L.C ",
    " INC", " INC.", "INCORPORATED",
    " CORP", "CORPORATION", " LTD",
    " LP", "LIMITED PARTNERSHIP",
]

# Trust patterns (family trust vs MBS trust — context matters)
TRUST_PATTERNS = [
    " TRUST", "TRUSTEE", " TR ", "LIVING TRUST", "FAMILY TRUST",
    "REVOCABLE TRUST", "IRREVOCABLE TRUST",
]

# Individual indicators (LE = Life Estate, REM = Remainderman, etc.)
INDIVIDUAL_INDICATORS = [
    " LE", " REM", " H/W", "HUSBAND", "WIFE",
    "& W ", " &W ", "&H ", " EST", "ESTATE OF",
]


# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════

def classify_owner(owner_name: Optional[str], co_owners: Optional[list[str]] = None) -> dict:
    """Classify owner_name into business categories.

    Checks run in priority order and stop at the first hit: lender tables,
    government, non-profit, trust, LLC/corporation, then an individual-name
    heuristic. A name that matches both a trust and an LLC keyword keeps the
    trust type but is also flagged ``is_investor_entity``.

    Args:
        owner_name: primary owner name from PA; empty/None yields the default
            UNKNOWN result with confidence 0.0.
        co_owners: optional list of additional owners; falsy entries are
            ignored.

    Returns:
        {
            "type": str (category code),
            "category": str (business-level),
            "is_lender": bool,
            "is_government": bool,
            "is_individual": bool,
            "is_investor_entity": bool,
            "is_trust": bool,
            "confidence": float (0-1),
            "matched_keyword": str | None,
            "normalized": str (uppercase owner + co-owners joined by " | "),
            "evidence": [str],
        }
    """
    out = {
        "type": "UNKNOWN",
        "category": "unknown",
        "is_lender": False,
        "is_government": False,
        "is_individual": False,
        "is_investor_entity": False,
        "is_trust": False,
        "confidence": 0.0,
        "matched_keyword": None,
        "normalized": "",
        "evidence": [],
    }
    if not owner_name:
        return out

    # Combine owner + co-owners for full classification. Falsy co-owners are
    # dropped first so they cannot leave a dangling " | " separator behind.
    extra_names = [c.upper() for c in (co_owners or []) if c]
    full_text = " | ".join([owner_name.upper(), *extra_names])
    out["normalized"] = full_text

    # Several keywords end with a space as a cheap word boundary (" TR ",
    # "FPL ", "DOT ", "LLC "). A trailing pad lets them match when the keyword
    # is the last word of the name; "normalized" stays unpadded.
    haystack = full_text + " "

    # 1. Check lender categories (highest priority — REO opportunity)
    for category, patterns in LENDER_PATTERNS.items():
        for pat in patterns:
            if pat in haystack:
                out["type"] = category
                out["category"] = "lender"
                out["is_lender"] = True
                out["matched_keyword"] = pat
                out["confidence"] = 0.95
                out["evidence"].append(f"matched lender keyword '{pat}'")
                return out

    # 2. Government entities (not REO, but flagged)
    for pat in GOVERNMENT_PATTERNS:
        if pat in haystack:
            out["type"] = "GOVERNMENT"
            out["category"] = "government"
            out["is_government"] = True
            out["matched_keyword"] = pat
            out["confidence"] = 0.90
            out["evidence"].append(f"matched government keyword '{pat}'")
            return out

    # 3. Non-profit
    for pat in NONPROFIT_PATTERNS:
        if pat in haystack:
            out["type"] = "NONPROFIT"
            out["category"] = "nonprofit"
            out["matched_keyword"] = pat
            out["confidence"] = 0.85
            out["evidence"].append(f"matched nonprofit keyword '{pat}'")
            return out

    # 4. Trust (family trust vs MBS trust — usually family if not caught by MBS_TRUSTEE above)
    trust_keyword = next((p for p in TRUST_PATTERNS if p in haystack), None)
    is_trust = trust_keyword is not None
    if is_trust:
        out["is_trust"] = True
        out["type"] = "FAMILY_TRUST"
        out["category"] = "trust"
        out["matched_keyword"] = trust_keyword
        out["confidence"] = 0.80
        out["evidence"].append("Trust keyword detected (likely family/estate trust)")
        # Don't return — might also be LLC

    # 5. LLC / Corporation
    llc_keyword = next((p for p in LLC_PATTERNS if p in haystack), None)
    if llc_keyword is not None:
        out["is_investor_entity"] = True
        # A trust hit above keeps its type/keyword/confidence; only the
        # investor flag and the evidence line are added.
        if out["type"] == "UNKNOWN":
            out["type"] = "LLC_OR_CORP"
            out["category"] = "investor_entity"
            out["matched_keyword"] = llc_keyword
            out["confidence"] = 0.85
        out["evidence"].append("matched LLC/corp keyword")
        return out

    if is_trust:
        return out

    # 6. Individual heuristic — owner name has comma (LASTNAME, FIRSTNAME format)
    #    OR contains individual indicators
    #    OR has only 2-5 words and no numbers
    # NOTE: with co-owners, each " | " separator counts as one word here.
    has_comma = "," in full_text
    has_individual = any(ind in haystack for ind in INDIVIDUAL_INDICATORS)
    words = full_text.replace(",", " ").split()
    word_count = len(words)
    has_numbers = any(any(c.isdigit() for c in w) for w in words)

    if has_comma or has_individual or (2 <= word_count <= 5 and not has_numbers):
        out["type"] = "INDIVIDUAL"
        out["category"] = "individual"
        out["is_individual"] = True
        out["confidence"] = 0.70
        out["evidence"].append("Name pattern matches individual (comma format or 2-5 words)")
        return out

    # Default: unknown
    out["type"] = "UNKNOWN"
    out["category"] = "unknown"
    out["confidence"] = 0.10
    out["evidence"].append("No clear pattern matched")
    return out
|
||||
|
||||
|
||||
def build_reo_signal(
    *,
    owner_classification: dict,
    just_value: Optional[float] = None,
    assessed_value: Optional[float] = None,
    listing_price: Optional[float] = None,
    taxes_paid_last: Optional[float] = None,
    mailing_address: Optional[str] = None,
) -> dict:
    """Build the REO direct-outreach opportunity signal for a lender owner.

    Any classification with ``is_lender`` set is flagged as an opportunity.
    A strategy, justification and contact hint are written only for the
    BANK_NATIONAL, GSE_FEDERAL, MBS_TRUSTEE and SERVICER types; other lender
    types keep those fields as None.

    The offer range is 85%-92% of the first available value among
    ``assessed_value``, ``just_value`` and ``listing_price``. When none is
    given, the offer fields stay None and the justification says so.

    Args:
        owner_classification: result of ``classify_owner``; ``is_lender``,
            ``type`` and ``matched_keyword`` are read.
        just_value: just/market value; preferred reference for the discount.
        assessed_value: assessed value; preferred base for the offer range.
        listing_price: current listing price; fallback for both.
        taxes_paid_last: last taxes paid; quoted in the national-bank
            justification only when provided.
        mailing_address: owner mailing address for the contact hint.

    Returns:
        {
            "is_reo_opportunity": bool,
            "lender_type": str | None,
            "strategy": str | None,
            "suggested_offer_low": int | None,
            "suggested_offer_high": int | None,
            "discount_pct_vs_market": float | None,
            "justification_es": str | None,
            "outreach_contact_hint": str | None,
        }
    """
    out = {
        "is_reo_opportunity": False,
        "lender_type": None,
        "strategy": None,
        "suggested_offer_low": None,
        "suggested_offer_high": None,
        "discount_pct_vs_market": None,
        "justification_es": None,
        "outreach_contact_hint": None,
    }

    if not owner_classification.get("is_lender"):
        return out

    lender_type = owner_classification.get("type")
    out["lender_type"] = lender_type
    out["is_reo_opportunity"] = True

    # Math: suggest 85-92% of the reference value as offer range.
    offer_range = None  # stays None when no reference value is available
    base = assessed_value or just_value or listing_price or 0
    if base > 0:
        offer_low = int(base * 0.85)
        offer_high = int(base * 0.92)
        out["suggested_offer_low"] = offer_low
        out["suggested_offer_high"] = offer_high
        offer_range = f"${offer_low:,}-${offer_high:,}"
        market_ref = just_value or listing_price or base
        if market_ref > 0:
            mid = (offer_low + offer_high) / 2
            out["discount_pct_vs_market"] = round((1 - mid / market_ref) * 100, 1)

    # Text used wherever a justification quotes the suggested range.
    suggested = offer_range or "no calculado (falta assessed/just/listing value)"

    # Strategy + justification by lender type
    if lender_type == "BANK_NATIONAL":
        discount = out["discount_pct_vs_market"]
        if offer_range is None:
            offer_es = "Rango de oferta no calculado (falta assessed/just/listing value). "
        elif discount is None:
            offer_es = f"Ofrece {offer_range}. "
        else:
            offer_es = f"Ofrece {offer_range} ({discount}% bajo market). "

        # Quote a tax amount only when one was supplied; never invent one.
        if taxes_paid_last:
            taxes_es = f"taxes pendientes (~${taxes_paid_last:,.0f}/año)"
        else:
            taxes_es = "taxes pendientes"

        out["strategy"] = "Direct REO desk outreach"
        out["justification_es"] = (
            "Banco nacional como owner = REO post-foreclosure en su balance. "
            "REO desks tienen quota mensual de disposicion. Aceptan ofertas direct "
            "para evitar MLS + 6% commission. "
            f"{offer_es}"
            f"Si cubris {taxes_es} y cerras cash o "
            "conventional ready, alta probabilidad de aceptacion."
        )

        # Prefer the keyword that identified the bank; the type code alone
        # would render as the unsearchable "Bank National".
        bank_label = (
            owner_classification.get("matched_keyword")
            or lender_type.replace("_", " ")
        ).strip().title()
        contact_address = mailing_address or "ver PA record"
        out["outreach_contact_hint"] = (
            f"Contact: search '{bank_label} REO disposition' o 'REO asset manager' en LinkedIn. "
            f"Mail oficial: {contact_address} (REO department typically receives correspondence here)."
        )

    elif lender_type == "GSE_FEDERAL":
        out["strategy"] = "Fannie/Freddie HomePath / HUD HomeStore"
        out["justification_es"] = (
            "GSE (Fannie/Freddie/HUD) como owner. Suelen vender via canales "
            "oficiales: HomePath.com (Fannie), HomeSteps.com (Freddie), HUDHomeStore.gov. "
            "Periodo Owner-Occupied first ~15-30 dias, despues investors. "
            "Si la propiedad lleva >30 dias unsold, oferta bajo asking es aceptable. "
            f"Sugerido {suggested}."
        )
        out["outreach_contact_hint"] = (
            "Buscar la propiedad en HomePath.com (Fannie) / HomeSteps.com (Freddie) / "
            "HUDHomeStore.gov para ver listing oficial + plazos."
        )

    elif lender_type == "MBS_TRUSTEE":
        out["strategy"] = "Trustee-held REO (MBS securitization)"
        out["justification_es"] = (
            "Trustee bank de un MBS (mortgage-backed security). Propiedad fue "
            "foreclosed y entro al inventory del trust. El trustee delega a un "
            "servicer (BAYVIEW/SHELLPOINT/etc) para la liquidacion. "
            "Mas burocratico que un REO de bank-direct pero similar dinamica. "
            f"Oferta sugerida {suggested}."
        )
        out["outreach_contact_hint"] = (
            "Identificar el servicer (suele estar en el mailing address o documento de transferencia). "
            "Contactar al servicer's REO/loss mitigation department."
        )

    elif lender_type == "SERVICER":
        out["strategy"] = "Servicer-held REO outreach"
        out["justification_es"] = (
            "Loan servicer como owner = post-foreclosure inventory. Servicers tienen "
            "presion por compliance + balance sheet para liquidar. "
            f"Oferta sugerida {suggested}."
        )
        out["outreach_contact_hint"] = "Contact servicer's REO disposition department."

    return out
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    import argparse
    import json

    # Command-line entry point: classify one owner name and print both the
    # classification and the derived REO signal as JSON.
    cli = argparse.ArgumentParser(description="Owner classifier + REO signal")
    cli.add_argument("owner_name", help="Owner name from PA (e.g. 'BANK OF AMERICA NA TRSTEE')")
    cli.add_argument("--co-owners", nargs="*", help="Additional co-owners")
    # The monetary options share the same type; only flag and help text differ.
    for flag, help_text in (
        ("--just-value", "Just/market value"),
        ("--assessed-value", "Assessed value"),
        ("--listing-price", "Current listing price"),
        ("--taxes-paid", "Taxes paid last year"),
    ):
        cli.add_argument(flag, type=float, help=help_text)
    cli.add_argument("--mailing", help="Mailing address (for REO contact hint)")
    opts = cli.parse_args()

    owner_info = classify_owner(opts.owner_name, co_owners=opts.co_owners)
    print("=== CLASSIFICATION ===")
    print(json.dumps(owner_info, indent=2, default=str))

    signal = build_reo_signal(
        owner_classification=owner_info,
        just_value=opts.just_value,
        assessed_value=opts.assessed_value,
        listing_price=opts.listing_price,
        taxes_paid_last=opts.taxes_paid,
        mailing_address=opts.mailing,
    )
    print("\n=== REO SIGNAL ===")
    print(json.dumps(signal, indent=2, default=str))
|
||||
@@ -0,0 +1,461 @@
|
||||
"""data_fetchers/pa_broward.py — Full Broward County Property Appraiser extractor.
|
||||
|
||||
Extrae TODO lo publico de bcpa.net para construir un Property Snapshot Report ($15):
|
||||
- Owner + mailing address
|
||||
- Property address + neighborhood
|
||||
- Year built, sqft, use code, units
|
||||
- Just/Market value, Assessed/SOH value, by year (3 anios)
|
||||
- Taxes paid (3 anios)
|
||||
- Tax breakdown por district (County / School Board / Municipal / Independent)
|
||||
- Exemptions (homestead, senior, vet, disabled, etc.)
|
||||
- Photo URL
|
||||
- Legal description
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.pa_broward import fetch_broward_pa_record
|
||||
record = fetch_broward_pa_record(parcel_id="484226062150")
|
||||
# record["owner_name"], record["just_value"], record["sales_history"]...
|
||||
|
||||
TECHNICAL:
|
||||
- bcpa.net es Angular SPA — usar Playwright, NO requests/curl
|
||||
- wait_until="domcontentloaded" + 25s sleep (NO networkidle, nunca termina)
|
||||
- Element IDs son ESTABLES (data-bound by Angular, NO autogenerados como JSF)
|
||||
- Per-folio latency: ~28-32s
|
||||
- Free (Playwright local, no API cost)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Field ID mapping — confirmed via probe on folio 484226062150
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Single-value scalar fields
|
||||
_SCALAR_IDS = {
|
||||
"folio_number": "folioNumberId",
|
||||
"owner_name": "ownerNameId",
|
||||
"owner_name_2": "ownerName2Id",
|
||||
"mailing_address": "mailingAddressId",
|
||||
"situs_address": "situsAddressId",
|
||||
"neighborhood": "neighborhood",
|
||||
"use_code": "useCodeId",
|
||||
"millage_code": "millageCodeId",
|
||||
"adj_bldg_sqft": "bldgSqFTId",
|
||||
"under_air_sqft": "bldgUnderAirFootageId",
|
||||
"effective_year": "effectiveAgeId",
|
||||
"year_built": "actualAgeId",
|
||||
"units_beds_baths": "unitsBedsBathsId",
|
||||
"legal_description": "legalDescId",
|
||||
"homestead_flag": "homesteadFlagId",
|
||||
# Current year values (auto-current year, e.g. 2026)
|
||||
"current_tax_year": "currentTaxYearId",
|
||||
"land_value_current": "landCurrentYearId",
|
||||
"bldg_value_current": "bldgCurrentYearId",
|
||||
"just_value_current": "justCurrentYearId",
|
||||
"assessed_value_current": "sohCurrentYearId",
|
||||
# Last year (e.g. 2025)
|
||||
"last_tax_year": "lastTaxYearId",
|
||||
"land_value_last": "landLastYearId",
|
||||
"bldg_value_last": "bldgLastYearId",
|
||||
"just_value_last": "justLastYearId",
|
||||
"assessed_value_last": "sohLastYearId",
|
||||
"taxes_paid_last": "assessedLastYearId",
|
||||
# Two years ago (e.g. 2024)
|
||||
"two_years_ago_tax_year": "lastTwoTaxYearId",
|
||||
"land_value_2yr": "landLasttwoYearsId",
|
||||
"bldg_value_2yr": "bldgLasttwoYearsId",
|
||||
"just_value_2yr": "justLasttwoYearsId",
|
||||
"assessed_value_2yr": "sohLasttwoYearsId",
|
||||
"taxes_paid_2yr": "assessedLasttwoYearsId",
|
||||
}
|
||||
|
||||
# Tax breakdown by district (current year)
|
||||
_DISTRICT_IDS = {
|
||||
# district name: {field: id}
|
||||
"county": {
|
||||
"just_value": "justValueCounty",
|
||||
"portability": "portabilityValueCounty",
|
||||
"assessed_soh": "sohValueCounty",
|
||||
"homestead": "he1AmountCounty",
|
||||
"add_homestead": "he2AmountCounty",
|
||||
"widow_vet_dis": "wvdAmountCounty",
|
||||
"senior": "seniorExemptionCounty",
|
||||
"exemption_type": "mexAmountCounty",
|
||||
"affordable_housing": "ahAmountCounty",
|
||||
"taxable": "taxableAmountCounty",
|
||||
},
|
||||
"school_board": {
|
||||
"just_value": "justValueSchoolBoard",
|
||||
"portability": "portabilityValueSchoolBoard",
|
||||
"assessed_soh": "sohValueSchoolBoard",
|
||||
"homestead": "he1AmountSchoolBoard",
|
||||
"add_homestead": "he2AmountSchoolBoard",
|
||||
"widow_vet_dis": "wvdAmountSchoolBoard",
|
||||
"exemption_type": "mexAmountSchoolBoard",
|
||||
"affordable_housing": "ahAmountSchoolBoard",
|
||||
"taxable": "taxableAmountSchoolBoard",
|
||||
},
|
||||
"municipal": {
|
||||
"just_value": "justValueMunicipal",
|
||||
"portability": "portabilityValueMunicipal",
|
||||
"assessed_soh": "sohValueMunicipal",
|
||||
"homestead": "he1AmountMunicipal",
|
||||
"add_homestead": "he2AmountMunicipal",
|
||||
"widow_vet_dis": "wvdAmountMunicipal",
|
||||
"senior": "seniorExemptionMunicipal",
|
||||
"exemption_type": "mexAmountMunicipal",
|
||||
"affordable_housing": "ahAmountMunicipal",
|
||||
"taxable": "taxableAmountMunicipal",
|
||||
},
|
||||
"independent": {
|
||||
"just_value": "justValueIndependent",
|
||||
"portability": "portabilityValueIndependent",
|
||||
"assessed_soh": "sohValueIndependent",
|
||||
"homestead": "he1AmountIndependent",
|
||||
"add_homestead": "he2AmountIndependent",
|
||||
"widow_vet_dis": "wvdAmountIndependent",
|
||||
"exemption_type": "mexAmountIndependent",
|
||||
"affordable_housing": "ahAmountIndependent",
|
||||
"taxable": "taxableAmountIndependent",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_broward_pa_record(
    parcel_id: str,
    timeout_seconds: int = 45,
    wait_after_load: int = 25,
) -> dict:
    """Fetch the full Broward County Property Appraiser record for one folio.

    The function is fail-soft: it never raises for scraping problems. Every
    failure is reported as a string in ``result["errors"]`` and whatever was
    collected so far is still returned.

    Args:
        parcel_id: bcpa folio (e.g. "484226062150"). Surrounding whitespace is
            ignored; an empty/blank/None value yields an error record.
        timeout_seconds: max wait per Playwright operation.
        wait_after_load: fixed SPA settle time, in seconds, slept after
            ``domcontentloaded`` (default 25).

    Returns:
        A dict that always contains ``folio_number``, ``errors`` (list of str),
        ``source_url``, ``source_api_url`` and ``fetched_at`` (UTC ISO
        timestamp ending in "Z"). After a successful scrape it also contains:

        * one flat key per entry of ``_SCALAR_IDS`` (e.g. ``owner_name``,
          ``just_value_current``). Year/sqft fields are converted to int,
          value fields to int dollars and ``taxes_paid_*`` to float; a field
          whose page element was empty is left as ``""``.
        * ``current_year`` / ``last_year`` / ``two_years_ago``: the same values
          regrouped as ``{tax_year, land_value, bldg_value, just_value,
          assessed_value[, taxes_paid]}``.
        * ``tax_breakdown``: ``{district: {field: int | str}}`` for the
          districts and fields listed in ``_DISTRICT_IDS``.
        * ``sales_history``: list of ``{date, type, qualified_disqualified,
          price, book_page_or_cin}``.
        * ``homestead_active``: bool derived from ``homestead_flag``.
        * ``photo_url``: first property photo URL, or None.

        When an unexpected exception is caught, ``_trace`` holds the first
        600 characters of the traceback.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    folio = str(parcel_id).strip() if parcel_id else ""
    source_url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={folio}"
    result = {
        "folio_number": folio,
        "errors": [],
        "source_url": source_url,
        "source_api_url": f"https://web.bcpa.net/bcpaclient/search.aspx?Folio={folio}",
        # Same "naive ISO + Z" format as before, without the deprecated utcnow().
        "fetched_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    }

    if not folio:
        result["errors"].append("no parcel_id provided")
        return result

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
                    viewport={"width": 1400, "height": 1000},
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)
                raw = _scrape_broward_page(page, source_url, wait_after_load)
            finally:
                # Close the browser on the failure path too, not only on success.
                browser.close()

        _normalize_broward_record(result, raw)

    except PWTimeout as e:
        result["errors"].append(f"timeout: {e}")
    except Exception as e:
        # Fail-soft boundary: report instead of aborting the caller's pipeline.
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]

    return result


def _scrape_broward_page(page, url: str, wait_after_load: int) -> dict:
    """Load the record page in *page* and return the raw text read from its DOM.

    Returns a dict with four entries, all holding untouched page strings:
    ``scalars`` (keyed like ``_SCALAR_IDS``), ``districts`` (shaped like
    ``_DISTRICT_IDS``), ``sales`` (one dict per sales-table row, keyed by the
    lower-cased header text) and ``photos`` (list of image URLs).
    """
    page.goto(url, wait_until="domcontentloaded")
    # The SPA keeps populating after domcontentloaded; wait a fixed settle time.
    time.sleep(wait_after_load)

    # Wait until at least one scalar populates (sentinel: actualAgeId = year built).
    try:
        page.wait_for_function(
            "() => { const el = document.getElementById('actualAgeId'); return el && el.textContent.trim().length > 0; }",
            timeout=10000,
        )
    except Exception:
        # Best effort: give the page a little more time and extract anyway.
        # An entirely empty page is reported later by _normalize_broward_record.
        time.sleep(5)

    # Read every scalar field in one JS round trip (faster than per-locator reads).
    scalars = page.evaluate(
        """(ids) => {
            const out = {};
            for (const [key, id] of Object.entries(ids)) {
                const el = document.getElementById(id);
                out[key] = el ? (el.textContent || '').trim() : '';
            }
            return out;
        }""",
        _SCALAR_IDS,
    )

    districts = page.evaluate(
        """(districts) => {
            const out = {};
            for (const [name, fields] of Object.entries(districts)) {
                out[name] = {};
                for (const [field, id] of Object.entries(fields)) {
                    const el = document.getElementById(id);
                    out[name][field] = el ? (el.textContent || '').trim() : '';
                }
            }
            return out;
        }""",
        _DISTRICT_IDS,
    )

    # Sales history: the first table whose header row mentions date, type,
    # price and qualified. Rows are keyed by the lower-cased header text.
    sales = page.evaluate("""
        () => {
            const out = [];
            const tables = document.querySelectorAll('table');
            for (const tbl of tables) {
                const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
                if (!hdrCells || hdrCells.length < 4) continue;
                const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
                const isSalesHdr = hdrText.some(h => h.includes('date')) &&
                                   hdrText.some(h => h.includes('type')) &&
                                   hdrText.some(h => h.includes('price')) &&
                                   hdrText.some(h => h.includes('qualified'));
                if (!isSalesHdr) continue;
                // Parse data rows
                const rows = tbl.querySelectorAll('tr');
                for (let i = 1; i < rows.length; i++) {
                    const cells = rows[i].querySelectorAll('td');
                    if (cells.length < 4) continue;
                    const r = {};
                    cells.forEach((c, idx) => {
                        const h = hdrText[idx] || `col${idx}`;
                        r[h] = (c.textContent || '').trim();
                    });
                    // Skip empty rows
                    if (Object.values(r).some(v => v && v.length > 0)) {
                        out.push(r);
                    }
                }
                if (out.length > 0) break;
            }
            return out;
        }
    """)

    # Property photos: images served from /Photographs/ that are wider than 200px.
    photos = page.evaluate("""
        () => Array.from(document.querySelectorAll('img'))
            .filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)
            .map(i => i.src)
    """)

    return {"scalars": scalars, "districts": districts, "sales": sales, "photos": photos}


def _normalize_broward_record(result: dict, raw: dict) -> None:
    """Clean, type-convert and group the raw page strings into *result* (in place)."""

    def sale_cell(row: dict, *names: str) -> str:
        # Exact header match first; otherwise fall back to the same substring
        # matching the page-side table detection uses.
        for name in names:
            if name in row:
                return row[name]
        for name in names:
            for header, text in row.items():
                if name in header:
                    return text
        return ""

    scalars = {key: _clean(text) for key, text in raw["scalars"].items()}
    if not any(scalars.values()):
        result["errors"].append(
            "record page didn't render any fields (invalid folio, or site slow/unavailable)"
        )
    # Keep the caller-supplied folio when the page did not show one.
    if not scalars.get("folio_number"):
        scalars.pop("folio_number", None)
    result.update(scalars)

    # Coerce numeric fields; fields with no text are left as "".
    for key in ("year_built", "effective_year", "current_tax_year", "last_tax_year",
                "two_years_ago_tax_year", "adj_bldg_sqft", "under_air_sqft"):
        if result.get(key, ""):
            result[key] = _to_int(result[key])

    for key in ("land_value_current", "bldg_value_current", "just_value_current",
                "assessed_value_current", "land_value_last", "bldg_value_last",
                "just_value_last", "assessed_value_last",
                "land_value_2yr", "bldg_value_2yr", "just_value_2yr", "assessed_value_2yr"):
        if result.get(key, ""):
            result[key] = _money_to_int(result[key])

    for key in ("taxes_paid_last", "taxes_paid_2yr"):
        if result.get(key, ""):
            result[key] = _money_to_float(result[key])

    # Structured per-year groupings for downstream consumers.
    result["current_year"] = {
        "tax_year": result.get("current_tax_year"),
        "land_value": result.get("land_value_current"),
        "bldg_value": result.get("bldg_value_current"),
        "just_value": result.get("just_value_current"),
        "assessed_value": result.get("assessed_value_current"),
    }
    result["last_year"] = {
        "tax_year": result.get("last_tax_year"),
        "land_value": result.get("land_value_last"),
        "bldg_value": result.get("bldg_value_last"),
        "just_value": result.get("just_value_last"),
        "assessed_value": result.get("assessed_value_last"),
        "taxes_paid": result.get("taxes_paid_last"),
    }
    result["two_years_ago"] = {
        "tax_year": result.get("two_years_ago_tax_year"),
        "land_value": result.get("land_value_2yr"),
        "bldg_value": result.get("bldg_value_2yr"),
        "just_value": result.get("just_value_2yr"),
        "assessed_value": result.get("assessed_value_2yr"),
        "taxes_paid": result.get("taxes_paid_2yr"),
    }

    # Tax breakdown: money-looking cells become ints, everything else is cleaned text.
    result["tax_breakdown"] = {}
    for district, fields in raw["districts"].items():
        result["tax_breakdown"][district] = {
            k: _money_to_int(v) if "$" in v or v.replace(",", "").replace(".", "").isdigit() else _clean(v)
            for k, v in fields.items()
        }

    # Sales history: normalize header-derived keys; drop rows with nothing in them.
    result["sales_history"] = []
    for row in raw["sales"]:
        price_text = sale_cell(row, "price")
        norm = {
            "date": _clean(sale_cell(row, "date")),
            "type": _clean(sale_cell(row, "type")),
            "qualified_disqualified": _clean(sale_cell(row, "qualified/disqualified", "qualified")),
            "price": _money_to_int(price_text) if price_text else None,
            "book_page_or_cin": _clean(sale_cell(row, "book/page or cin", "book/page")),
        }
        if any(norm.values()):
            result["sales_history"].append(norm)

    # Homestead boolean (flag text is " , N" or " , Y").
    hf = result.get("homestead_flag", "")
    result["homestead_active"] = "Y" in hf.upper() and "N" not in hf.upper()

    result["photo_url"] = raw["photos"][0] if raw["photos"] else None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _clean(s: str) -> str:
|
||||
"""Collapse whitespace and strip."""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
|
||||
def _to_int(s: str) -> Optional[int]:
|
||||
"""Parse '1969' or '1,199' → int. Returns None if unparseable."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", s)
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s: str) -> Optional[int]:
|
||||
"""Parse '$322,580' → 322580. Returns None if unparseable."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", s)
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_float(s: str) -> Optional[float]:
|
||||
"""Parse '$5,256.59' → 5256.59."""
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", s)
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI for manual testing
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: fetch one folio and dump the record as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Broward PA full record fetcher")
    cli.add_argument("parcel_id", help="Folio number (e.g. 484226062150)")
    cli.add_argument("--wait", type=int, default=25, help="SPA settle seconds (default 25)")
    opts = cli.parse_args()

    fetched = fetch_broward_pa_record(opts.parcel_id, wait_after_load=opts.wait)
    # default=str keeps the dump from failing on any non-JSON-native value.
    print(json.dumps(fetched, indent=2, default=str))
|
||||
@@ -0,0 +1,820 @@
|
||||
"""data_fetchers/pa_duval.py — Full Duval County Property Appraiser extractor.
|
||||
|
||||
Sitio: https://paopropertysearch.coj.net (ASP.NET WebForms)
|
||||
Flow: Search.aspx → Results.aspx → Detail.aspx?ParcelNumber=XXX
|
||||
|
||||
Extrae todo lo publico del Duval PA para construir un Property Snapshot Report:
|
||||
- Owner name(s)
|
||||
- Property address + subdivision + legal description
|
||||
- Building: type, year_built, sqft heated/total, bedrooms, bathrooms,
|
||||
exterior wall, roof type, interior flooring
|
||||
- Values: just/market, assessed, exemptions (3-year history)
|
||||
- Tax breakdown por taxing district
|
||||
- Sales history completa (book/page, date, price, deed type, qualified status)
|
||||
- Homestead exemption (key signal: owner-occupant vs investor)
|
||||
- Land details (zoning, lot size, use code)
|
||||
- Extra features (fireplace, pool, etc.)
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.pa_duval import fetch_duval_pa_record
|
||||
rec = fetch_duval_pa_record(address="2352 SCENIC VIEW CT", zip_code="32218")
|
||||
# rec["year_built"], rec["sales_history"], rec["homestead_active"]...
|
||||
|
||||
TECHNICAL:
|
||||
- ASP.NET WebForms con WebForm_DoPostBackWithOptions (compat IE8)
|
||||
- Element IDs ESTABLES (no autogenerados)
|
||||
- Per-search latency: ~10-15s (entry → search → results → detail)
|
||||
- Free (Playwright local, no API cost)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
|
||||
|
||||
_BASE_URL = "https://paopropertysearch.coj.net"
|
||||
_SEARCH_URL = f"{_BASE_URL}/Basic/Search.aspx"
|
||||
|
||||
# Address parser para Duval ASP.NET form fields.
|
||||
# Acepta long form (COURT, STREET) y abbreviation (CT, ST). Strip city/state/zip
|
||||
# antes de parsear (split por primera coma).
|
||||
_ADDRESS_RE = re.compile(
|
||||
r"^\s*(?P<num>\d+)\s+"
|
||||
r"(?:(?P<prefix>N|S|E|W|NE|NW|SE|SW)\s+)?"
|
||||
r"(?P<name>[A-Z][A-Z\s\d\-']*?)"
|
||||
r"\s+(?P<suffix>"
|
||||
r"ST|STREET|AVE|AV|AVENUE|RD|ROAD|DR|DRIVE|CT|COURT|CIR|CIRCLE|"
|
||||
r"LN|LANE|BLVD|BOULEVARD|HWY|HIGHWAY|WAY|PL|PLACE|PKY|PKWY|PARKWAY|"
|
||||
r"TRL|TRAIL|TER|TERRACE|LOOP|RUN|ALY|ALLEY|XING|CROSSING"
|
||||
r")\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Map long form → ASP.NET ddStreetSuffix value
|
||||
_SUFFIX_NORMALIZE = {
|
||||
"STREET": "ST", "AVENUE": "AVE", "AV": "AVE", "ROAD": "RD",
|
||||
"DRIVE": "DR", "COURT": "CT", "CIRCLE": "CIR", "LANE": "LN",
|
||||
"BOULEVARD": "BLVD", "HIGHWAY": "HWY", "PLACE": "PL",
|
||||
"PARKWAY": "PKWY", "PKY": "PKWY", "TRAIL": "TRL", "TERRACE": "TER",
|
||||
"ALLEY": "ALY", "CROSSING": "XING",
|
||||
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Field ID mapping (confirmed via probe on 2352 SCENIC VIEW CT)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
_SCALAR_IDS = {
|
||||
"owner_name": "ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
|
||||
"site_address_line1": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine1",
|
||||
"site_address_line2": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine2",
|
||||
"building_type": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingType",
|
||||
"year_built": "ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
|
||||
"building_value": "ctl00_cphBody_repeaterBuilding_ctl00_lblBldgValue",
|
||||
# Tax values current + last year
|
||||
"tax_last_year_just": "ctl00_cphBody_lblTaxLastYearJustValue",
|
||||
"tax_last_year_assessed": "ctl00_cphBody_lblTaxLastYearAssessedValue",
|
||||
"tax_last_year_exemptions": "ctl00_cphBody_lblTaxLastYearExemptions",
|
||||
"tax_last_year_taxable": "ctl00_cphBody_lblTaxLastYearTaxableValue",
|
||||
"tax_current_year_just": "ctl00_cphBody_lblTaxCurrentYearJustValue",
|
||||
"tax_current_year_assessed": "ctl00_cphBody_lblTaxCurrentYearAssessedValue",
|
||||
"tax_current_year_exemptions": "ctl00_cphBody_lblTaxCurrentYearExemptions",
|
||||
"tax_current_year_taxable": "ctl00_cphBody_lblTaxCurrentYearTaxableValue",
|
||||
# Values from main values table (no current "in progress" year)
|
||||
"assessed_value_3": "ctl00_cphBody_lblAssessedValue3",
|
||||
"taxable_value_school": "ctl00_cphBody_lblTaxableValueSchool",
|
||||
}
|
||||
|
||||
|
||||
def _parse_address(address: str) -> Optional[dict]:
|
||||
"""Parse address into Duval form fields.
|
||||
|
||||
Acepta:
|
||||
'2352 SCENIC VIEW CT' → simple
|
||||
'2352 SCENIC VIEW Court, Jacksonville, FL 32218' → con city/state/zip
|
||||
'123 N MAIN ST, Anytown, FL' → con prefix
|
||||
"""
|
||||
if not address:
|
||||
return None
|
||||
# Strip city/state/zip — toma solo lo antes del primer comma
|
||||
street_only = address.split(",")[0].strip().upper()
|
||||
m = _ADDRESS_RE.search(street_only)
|
||||
if not m:
|
||||
return None
|
||||
suffix_raw = (m.group("suffix") or "").strip().upper()
|
||||
suffix_normalized = _SUFFIX_NORMALIZE.get(suffix_raw, suffix_raw) if suffix_raw else None
|
||||
return {
|
||||
"street_num": m.group("num"),
|
||||
"prefix": (m.group("prefix") or "").strip().upper() or None,
|
||||
"name": m.group("name").strip(),
|
||||
"suffix": suffix_normalized,
|
||||
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_duval_pa_record(
|
||||
address: Optional[str] = None,
|
||||
parcel_id: Optional[str] = None,
|
||||
zip_code: Optional[str] = None,
|
||||
timeout_seconds: int = 30,
|
||||
listing_price: Optional[float] = None,
|
||||
) -> dict:
|
||||
"""Fetch full Duval PA record by address OR parcel_id (RE#).
|
||||
|
||||
Args:
|
||||
address: street address (e.g. "2352 SCENIC VIEW CT")
|
||||
parcel_id: Duval RE# (e.g. "044273-0370") — preferred si lo tenes
|
||||
zip_code: optional zip filter
|
||||
timeout_seconds: max wait per Playwright op
|
||||
|
||||
Returns:
|
||||
Dict con TODOS los campos publicos. Si fallo, dict tiene 'errors'.
|
||||
Key fields:
|
||||
- owner_name, owner_full_address (mailing)
|
||||
- site_address, parcel_id (RE#), subdivision
|
||||
- year_built (ENTERO), building_type, sqft_heated, sqft_gross,
|
||||
sqft_garage, bedrooms, baths, stories
|
||||
- exterior_wall, roof_struct, roofing_cover, interior_wall, int_flooring
|
||||
- just_value_current, assessed_value_current, taxable_current,
|
||||
exemption_current
|
||||
- just_value_last, assessed_value_last, taxes_billed_last
|
||||
- homestead_active (bool — exemptions >= $25,000 = homestead)
|
||||
- sales_history: [{date, price, deed_type, qualified, book_page,
|
||||
vacant_improved}, ...]
|
||||
- extra_features: [{code, description, value}, ...]
|
||||
- land: {zoning, use_code, lot_acres, lot_total_sqft}
|
||||
- legal_description: str
|
||||
- tax_breakdown: [{district, assessed, exempt, taxable, tax_amt}, ...]
|
||||
- source_url: str (detail page URL)
|
||||
- fetched_at: ISO timestamp
|
||||
"""
|
||||
fetched_at = datetime.now(timezone.utc).isoformat()
|
||||
result = {
|
||||
"county": "Duval",
|
||||
"source": "Duval Property Appraiser (paopropertysearch.coj.net)",
|
||||
"fetched_at": fetched_at,
|
||||
"errors": [],
|
||||
}
|
||||
|
||||
if not address and not parcel_id:
|
||||
result["errors"].append("no address or parcel_id provided")
|
||||
return result
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
except ImportError:
|
||||
result["errors"].append("playwright not installed")
|
||||
return result
|
||||
|
||||
parsed_addr = _parse_address(address) if address else None
|
||||
if address and not parsed_addr:
|
||||
result["errors"].append(f"could not parse address '{address}' (need format: 'NUM [PREFIX] NAME SUFFIX')")
|
||||
return result
|
||||
|
||||
try:
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=True)
|
||||
ctx = browser.new_context(user_agent=USER_AGENT)
|
||||
page = ctx.new_page()
|
||||
page.set_default_timeout(timeout_seconds * 1000)
|
||||
|
||||
page.goto(_SEARCH_URL, wait_until="load", timeout=timeout_seconds * 1000)
|
||||
time.sleep(2)
|
||||
|
||||
if parcel_id:
|
||||
pid_clean = parcel_id.replace("-", "").strip()
|
||||
detail_url = f"{_BASE_URL}/Basic/Detail.aspx?RE={pid_clean}"
|
||||
try:
|
||||
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
|
||||
except Exception:
|
||||
# If 'load' times out, fall back to 'commit' (page has navigated)
|
||||
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
|
||||
time.sleep(5)
|
||||
else:
|
||||
# Search by address fields
|
||||
page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed_addr["street_num"])
|
||||
if parsed_addr["prefix"]:
|
||||
try:
|
||||
page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(value=parsed_addr["prefix"])
|
||||
except Exception:
|
||||
pass
|
||||
page.locator("#ctl00_cphBody_tbStreetName").fill(parsed_addr["name"])
|
||||
if parsed_addr["suffix"]:
|
||||
try:
|
||||
page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(value=parsed_addr["suffix"])
|
||||
except Exception:
|
||||
pass
|
||||
if zip_code:
|
||||
try:
|
||||
page.locator("#ctl00_cphBody_tbZipCode").fill(zip_code)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
page.locator("#ctl00_cphBody_bSearch").click()
|
||||
page.wait_for_timeout(4000)
|
||||
|
||||
# If results table → extract href from first row link and
|
||||
# navigate directly (Playwright click + navigation hangs on ASP.NET)
|
||||
rs_url = page.url
|
||||
if "Results.aspx" in rs_url:
|
||||
detail_href = _extract_detail_href_with_retry(page, max_retries=2)
|
||||
if not detail_href:
|
||||
body_preview = page.inner_text("body")[:300].replace("\n", " ")
|
||||
result["errors"].append(
|
||||
f"results page returned but no Detail.aspx link found "
|
||||
f"(url={page.url}, body_preview={body_preview!r})"
|
||||
)
|
||||
browser.close()
|
||||
return result
|
||||
# Build absolute URL and navigate directly (no click)
|
||||
if detail_href.startswith("/"):
|
||||
detail_url = f"{_BASE_URL}{detail_href}"
|
||||
elif detail_href.startswith("http"):
|
||||
detail_url = detail_href
|
||||
else:
|
||||
detail_url = f"{_BASE_URL}/Basic/{detail_href}"
|
||||
try:
|
||||
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
|
||||
except Exception:
|
||||
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
|
||||
time.sleep(5)
|
||||
elif "Detail.aspx" not in page.url:
|
||||
result["errors"].append(f"unexpected URL after search: {page.url}")
|
||||
browser.close()
|
||||
return result
|
||||
|
||||
# We should now be on Detail.aspx
|
||||
if "Detail.aspx" not in page.url:
|
||||
result["errors"].append(f"failed to reach Detail page, URL: {page.url}")
|
||||
browser.close()
|
||||
return result
|
||||
|
||||
result["source_url"] = page.url
|
||||
|
||||
# Wait for KEY element to confirm full render before extracting.
|
||||
# Retry once on server error (Duval intermittent 500s).
|
||||
render_ok = False
|
||||
for attempt in range(3):
|
||||
try:
|
||||
page.wait_for_selector(
|
||||
"#ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
|
||||
state="attached",
|
||||
timeout=20000,
|
||||
)
|
||||
render_ok = True
|
||||
break
|
||||
except Exception:
|
||||
# Try owner sentinel
|
||||
try:
|
||||
page.wait_for_selector(
|
||||
"#ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
|
||||
state="attached",
|
||||
timeout=10000,
|
||||
)
|
||||
render_ok = True
|
||||
break
|
||||
except Exception:
|
||||
# Check if server error → retry
|
||||
body = page.inner_text("body")[:500]
|
||||
if "Server Error" in body or "Runtime Error" in body:
|
||||
if attempt < 2:
|
||||
time.sleep(8 * (attempt + 1))
|
||||
try:
|
||||
page.reload(wait_until="load", timeout=30000)
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
break
|
||||
if not render_ok:
|
||||
result["errors"].append("detail page didn't render expected elements (server slow or error)")
|
||||
|
||||
# Extract all scalar fields in one JS call
|
||||
scalars = page.evaluate(
|
||||
"""(ids) => {
|
||||
const out = {};
|
||||
for (const [k, id] of Object.entries(ids)) {
|
||||
const el = document.getElementById(id);
|
||||
out[k] = el ? (el.textContent || '').trim() : '';
|
||||
}
|
||||
return out;
|
||||
}""",
|
||||
_SCALAR_IDS,
|
||||
)
|
||||
|
||||
# Extract sales history
|
||||
sales_history = page.evaluate("""
|
||||
() => {
|
||||
const out = [];
|
||||
document.querySelectorAll('table').forEach((tbl) => {
|
||||
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
||||
if (!hdrCells || hdrCells.length < 4) return;
|
||||
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
|
||||
const isSalesHdr = hdrText.some(h => h.includes('sale date') || h.includes('book/page') || h.includes('deed instrument'));
|
||||
if (!isSalesHdr) return;
|
||||
const rows = tbl.querySelectorAll('tr');
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 4) continue;
|
||||
const r = {};
|
||||
cells.forEach((c, idx) => {
|
||||
const h = hdrText[idx] || `col${idx}`;
|
||||
r[h] = (c.textContent || '').trim();
|
||||
});
|
||||
if (Object.values(r).some(v => v && v.length > 0)) out.push(r);
|
||||
}
|
||||
});
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Extract building area
|
||||
building_area = page.evaluate("""
|
||||
() => {
|
||||
const out = {};
|
||||
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingArea');
|
||||
if (!grid) return out;
|
||||
const rows = grid.querySelectorAll('tr');
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 4) continue;
|
||||
const type = (cells[0].textContent || '').trim();
|
||||
const gross = (cells[1].textContent || '').trim();
|
||||
const heated = (cells[2].textContent || '').trim();
|
||||
const effective = (cells[3].textContent || '').trim();
|
||||
if (type) out[type] = { gross, heated, effective };
|
||||
}
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Extract building attributes (beds, baths, stories)
|
||||
attrs = page.evaluate("""
|
||||
() => {
|
||||
const out = {};
|
||||
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingAttributes');
|
||||
if (!grid) return out;
|
||||
const rows = grid.querySelectorAll('tr');
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 2) continue;
|
||||
const element = (cells[0].textContent || '').trim();
|
||||
const code = (cells[1].textContent || '').trim();
|
||||
if (element) out[element] = code;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Extract building structural elements (roof, walls, flooring)
|
||||
# NOTE: these come from the same building section, different grid
|
||||
structural = page.evaluate("""
|
||||
() => {
|
||||
const out = {};
|
||||
// Find any grid in building section with Element/Code/Detail headers
|
||||
document.querySelectorAll('table').forEach((tbl) => {
|
||||
const hdrs = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
||||
if (!hdrs) return;
|
||||
const ht = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
|
||||
if (!(ht.includes('element') && ht.includes('code') && ht.includes('detail'))) return;
|
||||
// Skip the simpler attributes table (only 3 fields)
|
||||
const rows = tbl.querySelectorAll('tr');
|
||||
if (rows.length < 4) return;
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 3) continue;
|
||||
const element = (cells[0].textContent || '').trim();
|
||||
const detail = (cells[2].textContent || '').trim();
|
||||
if (element && detail) {
|
||||
if (out[element]) {
|
||||
out[element] += '; ' + detail;
|
||||
} else {
|
||||
out[element] = detail;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Extract main property identity (RE#, subdivision, etc.) from top table
|
||||
top_props = page.evaluate("""
|
||||
() => {
|
||||
const out = {};
|
||||
document.querySelectorAll('table').forEach((tbl) => {
|
||||
const rows = tbl.querySelectorAll('tr');
|
||||
if (rows.length < 3) return;
|
||||
// Top table has key:value rows (2 cells per row)
|
||||
// Heuristic: first cell ends with ':' or matches known labels
|
||||
const knownLabels = ['re #','re#','tax district','property use',
|
||||
'# of buildings','legal desc','subdivision','total area'];
|
||||
let matchCount = 0;
|
||||
const candidate = {};
|
||||
for (const tr of rows) {
|
||||
const cells = tr.querySelectorAll('td, th');
|
||||
if (cells.length !== 2) continue;
|
||||
const k = (cells[0].textContent || '').trim().toLowerCase().replace(/:$/, '');
|
||||
const v = (cells[1].textContent || '').trim();
|
||||
if (k && v && knownLabels.some(kw => k.startsWith(kw))) {
|
||||
matchCount++;
|
||||
candidate[k] = v;
|
||||
}
|
||||
}
|
||||
if (matchCount >= 3) {
|
||||
Object.assign(out, candidate);
|
||||
}
|
||||
});
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Land details
|
||||
land = page.evaluate("""
|
||||
() => {
|
||||
const out = {};
|
||||
const grid = document.getElementById('ctl00_cphBody_gridLand');
|
||||
if (!grid) return out;
|
||||
const rows = grid.querySelectorAll('tr');
|
||||
if (rows.length < 2) return out;
|
||||
const hdrs = rows[0].querySelectorAll('th, td');
|
||||
const hdrText = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
|
||||
const dataRow = rows[1].querySelectorAll('td');
|
||||
hdrText.forEach((h, i) => {
|
||||
if (dataRow[i]) out[h] = (dataRow[i].textContent || '').trim();
|
||||
});
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
# Extra features (fireplace, pool, deck, etc.)
|
||||
features = page.evaluate("""
|
||||
() => {
|
||||
const out = [];
|
||||
const grid = document.getElementById('ctl00_cphBody_gridExtraFeatures');
|
||||
if (!grid) return out;
|
||||
const rows = grid.querySelectorAll('tr');
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const cells = rows[i].querySelectorAll('td');
|
||||
if (cells.length < 5) continue;
|
||||
out.push({
|
||||
code: (cells[1]?.textContent || '').trim(),
|
||||
description: (cells[2]?.textContent || '').trim(),
|
||||
units: (cells[6]?.textContent || '').trim(),
|
||||
value: (cells[7]?.textContent || '').trim(),
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
|
||||
browser.close()
|
||||
|
||||
# ─── Post-process ─────────────────────────────────────────────
|
||||
result.update({k: _clean(v) for k, v in scalars.items()})
|
||||
|
||||
# Numeric conversions
|
||||
for k in ("year_built",):
|
||||
v = result.get(k, "")
|
||||
if v:
|
||||
result[k] = _to_int(v)
|
||||
for k in ("building_value", "tax_last_year_just", "tax_last_year_assessed",
|
||||
"tax_last_year_exemptions", "tax_last_year_taxable",
|
||||
"tax_current_year_just", "tax_current_year_assessed",
|
||||
"tax_current_year_exemptions", "tax_current_year_taxable",
|
||||
"assessed_value_3", "taxable_value_school"):
|
||||
v = result.get(k, "")
|
||||
if v:
|
||||
result[k] = _money_to_int(v)
|
||||
|
||||
# Parcel id / subdivision / etc from top props
|
||||
result["parcel_id"] = top_props.get("re #", "") or top_props.get("re#", "")
|
||||
result["tax_district"] = top_props.get("tax district", "")
|
||||
result["property_use"] = top_props.get("property use", "")
|
||||
result["num_buildings"] = top_props.get("# of buildings", "")
|
||||
result["subdivision"] = top_props.get("subdivision", "")
|
||||
result["lot_total_sqft"] = _to_int(top_props.get("total area", "") or "0")
|
||||
|
||||
# Building area summary
|
||||
result["building_area_grid"] = building_area
|
||||
result["sqft_heated"] = _to_int(
|
||||
(building_area.get("Base Area") or {}).get("heated", "0") or
|
||||
(building_area.get("Total") or {}).get("heated", "0") or "0"
|
||||
)
|
||||
result["sqft_gross"] = _to_int(
|
||||
(building_area.get("Total") or {}).get("gross", "0") or "0"
|
||||
)
|
||||
result["sqft_garage"] = _to_int(
|
||||
(building_area.get("Finished Garage") or {}).get("gross", "0") or "0"
|
||||
)
|
||||
|
||||
# Attributes: beds/baths/stories
|
||||
def _attr_to_num(s):
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
return float(s.split(".")[0]) if "." in s else float(s)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
result["bedrooms"] = _attr_to_num(attrs.get("Bedrooms", ""))
|
||||
result["baths"] = _attr_to_num(attrs.get("Baths", ""))
|
||||
result["stories"] = _attr_to_num(attrs.get("Stories", ""))
|
||||
result["units"] = _attr_to_num(attrs.get("Rooms / Units", ""))
|
||||
|
||||
# Structural elements
|
||||
result["exterior_wall"] = structural.get("Exterior Wall", "")
|
||||
result["roof_struct"] = structural.get("Roof Struct", "")
|
||||
result["roofing_cover"] = structural.get("Roofing Cover", "")
|
||||
result["interior_wall"] = structural.get("Interior Wall", "")
|
||||
result["int_flooring"] = structural.get("Int Flooring", "")
|
||||
|
||||
# Sales history normalized
|
||||
result["sales_history"] = []
|
||||
for s in sales_history:
|
||||
record = {
|
||||
"book_page": _clean(s.get("book/page", "")),
|
||||
"date": _clean(s.get("sale date", "")),
|
||||
"price": _money_to_int(s.get("sale price", "") or "0"),
|
||||
"deed_type": _clean(s.get("deed instrument type code", "") or s.get("deed type", "")),
|
||||
"qualified": _clean(s.get("qualified/unqualified", "") or s.get("qualified", "")),
|
||||
"vacant_improved": _clean(s.get("vacant/improved", "")),
|
||||
}
|
||||
if any(record.values()):
|
||||
result["sales_history"].append(record)
|
||||
|
||||
# Land details
|
||||
result["land"] = {
|
||||
"use_code": land.get("code", ""),
|
||||
"use_description": land.get("use description", ""),
|
||||
"zoning": land.get("zoning assessment", ""),
|
||||
"front": land.get("front", ""),
|
||||
"depth": land.get("depth", ""),
|
||||
"land_units": land.get("land units", ""),
|
||||
"land_type": land.get("land type", ""),
|
||||
"land_value": _money_to_int(land.get("land value", "") or "0"),
|
||||
}
|
||||
|
||||
# Extra features (fireplace, pool, etc.)
|
||||
result["extra_features"] = features
|
||||
|
||||
# Homestead detection: exemptions >= $25K = primary residence with HX
|
||||
ex_last = result.get("tax_last_year_exemptions") or 0
|
||||
ex_curr = result.get("tax_current_year_exemptions") or 0
|
||||
result["homestead_active"] = (ex_last >= 25000) or (ex_curr >= 25000)
|
||||
result["homestead_amount_current"] = ex_curr
|
||||
result["homestead_amount_last"] = ex_last
|
||||
|
||||
# Convenience: most recent qualified sale price
|
||||
qualified_sales = [s for s in result["sales_history"]
|
||||
if s.get("qualified", "").lower().startswith("qualified")
|
||||
and s.get("price", 0) and s["price"] >= 1000]
|
||||
result["most_recent_qualified_sale"] = qualified_sales[0] if qualified_sales else None
|
||||
|
||||
# Effective renovation signal:
|
||||
# If most recent qualified sale price >> previous qualified sale price by
|
||||
# >30% within 24 months → likely renovated/flipped.
|
||||
renov_signal = _detect_renovation_pattern(
|
||||
result["sales_history"], listing_price=listing_price,
|
||||
)
|
||||
result["renovation_signal"] = renov_signal
|
||||
|
||||
except PWTimeout as e:
|
||||
result["errors"].append(f"timeout: {e}")
|
||||
except Exception as e:
|
||||
import traceback
|
||||
result["errors"].append(f"{type(e).__name__}: {e}")
|
||||
result["_trace"] = traceback.format_exc()[:600]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers — server retry / detail link extraction
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _extract_detail_href_with_retry(page, max_retries: int = 2) -> Optional[str]:
|
||||
"""Wait for Detail.aspx link on Results page, retry on server errors.
|
||||
|
||||
Duval PA returns intermittent 500 errors ("wait operation timed out")
|
||||
when rate-limited. Retry with backoff handles that.
|
||||
"""
|
||||
for attempt in range(max_retries + 1):
|
||||
# Wait for results to render
|
||||
time.sleep(3)
|
||||
try:
|
||||
page.wait_for_selector("a[href*='Detail.aspx']", state="attached", timeout=15000)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
href = page.evaluate("""
|
||||
() => {
|
||||
const links = document.querySelectorAll("a[href*='Detail.aspx']");
|
||||
return links.length > 0 ? links[0].getAttribute('href') : null;
|
||||
}
|
||||
""")
|
||||
if href:
|
||||
return href
|
||||
|
||||
# Check if this is a server error page
|
||||
body = page.inner_text("body")[:500]
|
||||
is_server_error = (
|
||||
"Server Error" in body or
|
||||
"wait operation timed out" in body or
|
||||
"Runtime Error" in body
|
||||
)
|
||||
if is_server_error and attempt < max_retries:
|
||||
# Backoff and retry — reload the search
|
||||
backoff = 5 * (attempt + 1)
|
||||
time.sleep(backoff)
|
||||
try:
|
||||
page.reload(wait_until="load", timeout=30000)
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
|
||||
# If not server error, the link just isn't there — return None
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _clean(s) -> str:
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
|
||||
def _to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", str(s))
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", str(s))
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _detect_renovation_pattern(sales: list[dict], listing_price: Optional[float] = None) -> dict:
|
||||
"""Heuristic: detect flip / renovation / flip-in-progress patterns.
|
||||
|
||||
Args:
|
||||
sales: sales_history (recent first)
|
||||
listing_price: optional current listing price — habilita flip-in-progress detection
|
||||
|
||||
Returns:
|
||||
{
|
||||
"is_flip_pattern": bool, # qualified sales historical flip detected
|
||||
"is_flip_in_progress": bool, # NEW: recent qualified << current listing
|
||||
"evidence": str,
|
||||
"most_recent_qualified": dict | None,
|
||||
"prior_qualified": dict | None,
|
||||
"value_increase_pct": float | None,
|
||||
"months_between": int | None,
|
||||
"listing_premium_pct": float | None, # NEW: (listing - recent_qualified) / recent_qualified * 100
|
||||
"months_since_recent_sale": int | None,
|
||||
"interpretation_es": str | None,
|
||||
}
|
||||
"""
|
||||
out = {
|
||||
"is_flip_pattern": False,
|
||||
"is_flip_in_progress": False,
|
||||
"evidence": "",
|
||||
"most_recent_qualified": None,
|
||||
"prior_qualified": None,
|
||||
"value_increase_pct": None,
|
||||
"months_between": None,
|
||||
"listing_premium_pct": None,
|
||||
"months_since_recent_sale": None,
|
||||
"interpretation_es": None,
|
||||
}
|
||||
qualified = [s for s in sales
|
||||
if s.get("qualified", "").lower().startswith("qualified")
|
||||
and s.get("price", 0) and s["price"] >= 1000]
|
||||
if not qualified:
|
||||
return out
|
||||
|
||||
recent = qualified[0]
|
||||
out["most_recent_qualified"] = recent
|
||||
|
||||
# ─── Pattern A: historical flip (prior qualified → recent qualified) ─────
|
||||
if len(qualified) >= 2:
|
||||
prior = qualified[1]
|
||||
out["prior_qualified"] = prior
|
||||
try:
|
||||
increase = (recent["price"] - prior["price"]) / prior["price"] * 100
|
||||
out["value_increase_pct"] = round(increase, 1)
|
||||
except (TypeError, ZeroDivisionError):
|
||||
pass
|
||||
try:
|
||||
d1 = datetime.strptime(recent["date"], "%m/%d/%Y")
|
||||
d2 = datetime.strptime(prior["date"], "%m/%d/%Y")
|
||||
months = abs((d1 - d2).days) // 30
|
||||
out["months_between"] = months
|
||||
except (ValueError, TypeError, KeyError):
|
||||
pass
|
||||
|
||||
if out["value_increase_pct"] and out["months_between"]:
|
||||
if out["value_increase_pct"] >= 25 and out["months_between"] <= 30:
|
||||
out["is_flip_pattern"] = True
|
||||
out["evidence"] = (
|
||||
f"+{out['value_increase_pct']}% in {out['months_between']} months "
|
||||
f"({prior['date']} ${prior['price']:,} -> {recent['date']} ${recent['price']:,})"
|
||||
)
|
||||
|
||||
# ─── Pattern B: FLIP-IN-PROGRESS (recent qualified << current listing) ──
|
||||
if listing_price and listing_price > 0 and recent.get("price", 0) > 0:
|
||||
try:
|
||||
premium = (listing_price - recent["price"]) / recent["price"] * 100
|
||||
out["listing_premium_pct"] = round(premium, 1)
|
||||
except (TypeError, ZeroDivisionError):
|
||||
pass
|
||||
try:
|
||||
d_recent = datetime.strptime(recent["date"], "%m/%d/%Y")
|
||||
today = datetime.now()
|
||||
months_since = abs((today - d_recent).days) // 30
|
||||
out["months_since_recent_sale"] = months_since
|
||||
except (ValueError, TypeError, KeyError):
|
||||
pass
|
||||
|
||||
# Flip-in-progress: recent qualified sale is 15%+ below listing AND
|
||||
# the sale was within last 18 months (typical flip turnaround)
|
||||
if (out["listing_premium_pct"] and out["listing_premium_pct"] >= 15
|
||||
and out["months_since_recent_sale"] is not None
|
||||
and out["months_since_recent_sale"] <= 18):
|
||||
out["is_flip_in_progress"] = True
|
||||
if out["evidence"]:
|
||||
out["evidence"] += " | "
|
||||
out["evidence"] += (
|
||||
f"FLIP-IN-PROGRESS: owner bought ${recent['price']:,} on {recent['date']} "
|
||||
f"({out['months_since_recent_sale']}mo ago), listing ${listing_price:,.0f} "
|
||||
f"(+{out['listing_premium_pct']}%)"
|
||||
)
|
||||
|
||||
# ─── Spanish interpretation ─────────────────────────────────────────────
|
||||
if out["is_flip_in_progress"] and out["is_flip_pattern"]:
|
||||
out["interpretation_es"] = (
|
||||
"PATRON DE FLIP REPETIDO: la propiedad ya fue flipped una vez en el "
|
||||
"historial. El owner actual la compro reciente y la lista mucho mas "
|
||||
"alto. Probable renovacion reciente -> precio refleja inversion. "
|
||||
"Si comprador final, esperate negociacion dura del owner (necesita "
|
||||
"recuperar costos de rehab + margen)."
|
||||
)
|
||||
elif out["is_flip_in_progress"]:
|
||||
out["interpretation_es"] = (
|
||||
f"FLIP-IN-PROGRESS: el owner compro hace {out['months_since_recent_sale']}mo "
|
||||
f"a ${recent['price']:,} y lista a ${listing_price:,.0f} (+{out['listing_premium_pct']:.0f}%). "
|
||||
"Probable renovacion en el medio. Precio incluye trabajo. Negociar dificil — "
|
||||
"owner tiene 'sunk cost' del rehab. Validar condicion real con inspeccion."
|
||||
)
|
||||
elif out["is_flip_pattern"]:
|
||||
out["interpretation_es"] = (
|
||||
f"HISTORIAL DE FLIP: la propiedad subio +{out['value_increase_pct']}% en "
|
||||
f"{out['months_between']}mo (sale prior). Indica renovacion previa."
|
||||
)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: look up one Duval parcel and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Duval PA full record fetcher")
    for flag, help_text in (
        ("--address", "Street address (e.g. '2352 SCENIC VIEW CT')"),
        ("--parcel", "RE# (e.g. '044273-0370')"),
        ("--zip", "Optional ZIP filter"),
    ):
        cli.add_argument(flag, help=help_text)
    opts = cli.parse_args()

    # At least one lookup key is mandatory.
    if not (opts.address or opts.parcel):
        cli.error("--address or --parcel required")

    record = fetch_duval_pa_record(
        address=opts.address,
        parcel_id=opts.parcel,
        zip_code=opts.zip,
    )
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(record, indent=2, default=str))
|
||||
@@ -0,0 +1,404 @@
|
||||
"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor.
|
||||
|
||||
Sitio: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI)
|
||||
Deep link: /PropertySearch/#/?folio={folio_no_dashes}
|
||||
|
||||
Extrae todo lo publico del Miami-Dade PA via los components Angular:
|
||||
- pa-propertyinformation: folio, sub-division, address, owner, mailing,
|
||||
PA primary zone, primary land use, beds/baths/half, floors, living units,
|
||||
living area, adjusted area, lot size, year built
|
||||
- pa-salesinformation: sales history (date, price, OR book-page, qualification,
|
||||
previous owner)
|
||||
- pa-assessmentinformation: land/building/extra/market/assessed 3 anios
|
||||
- pa-taxablevalueinformation: COUNTY/SCHOOL/etc exemption + taxable
|
||||
- pa-benefitsinformation: homestead + other exemptions
|
||||
- pa-legaldescription: legal description completa
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
|
||||
rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470")
|
||||
# rec["owner_name"], rec["year_built"], rec["sales_history"]...
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
|
||||
_BASE_URL = "https://apps.miamidadepa.gov/PropertySearch"
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Text parsing helpers — labels are on left, values on right (newline separated)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _grab_after_label(text: str, label: str) -> Optional[str]:
|
||||
"""Find 'label' and return text immediately after (until next label/newline)."""
|
||||
if not text or not label:
|
||||
return None
|
||||
# Match "Label:value" or "Label\nvalue" or "Label\tvalue"
|
||||
pattern = re.compile(
|
||||
rf"{re.escape(label)}\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
m = pattern.search(text)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return None
|
||||
|
||||
|
||||
def _to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", str(s))
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", str(s))
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_miami_dade_pa_record(
    parcel_id: Optional[str] = None,
    address: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch the full Miami-Dade Property Appraiser record for one parcel.

    Opens the PA portal in headless Chromium (Playwright), reads the text of
    the ``pa-*`` components plus the sales and assessment tables, and parses
    them into a flat dict. The function is fail-soft: problems are appended
    to ``result["errors"]`` instead of being raised.

    Args:
        parcel_id: folio number (e.g. "31-2202-034-2470" or "3122020342470").
            Dashes are removed before building the deep link.
        address: alternative search by address, used only when no folio is
            given (less reliable in this portal).
        timeout_seconds: default timeout applied to every Playwright operation.
        listing_price: forwarded to the renovation detector to enable
            flip-in-progress detection.

    Returns:
        A dict that always contains "county", "source", "fetched_at" and
        "errors". On success it also carries the parsed property fields,
        "sales_history", "assessment_table", "renovation_signal" and the raw
        section text. NOTE(review): the schema is intended to mirror
        pa_duval / pa_broward — confirm against those modules.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Miami-Dade",
        "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)",
        "fetched_at": fetched_at,
        "errors": [],
    }

    if not parcel_id and not address:
        result["errors"].append("no parcel_id or address provided")
        return result

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    # Normalize folio (no dashes for URL)
    folio_clean = (parcel_id or "").replace("-", "").strip()

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                if folio_clean:
                    # Deep link by folio
                    url = f"{_BASE_URL}/#/?folio={folio_clean}"
                    page.goto(url, wait_until="domcontentloaded")
                else:
                    # Search by address — landing page + fill form
                    page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded")
                    time.sleep(5)
                    # Address tab is default. Fill kendo-textbox[formcontrolname='address']
                    addr_input = page.locator("kendo-textbox[formcontrolname='address'] input").first
                    addr_input.fill(address or "")
                    page.locator("button[aria-label='Search button']").first.click()

                # Wait for property info to render
                try:
                    page.wait_for_function(
                        "() => document.querySelector('pa-propertyinformation') "
                        "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')",
                        timeout=20000,
                    )
                except Exception as e:
                    result["errors"].append(f"detail page didn't render: {e}")
                    return result  # browser is closed by the finally below

                time.sleep(2)
                result["source_url"] = page.url

                # Extract text from each pa-component
                sections = page.evaluate("""
                    () => {
                        const out = {};
                        const components = [
                            'pa-propertyinformation','pa-salesinformation',
                            'pa-assessmentinformation','pa-taxablevalueinformation',
                            'pa-benefitsinformation','pa-legaldescription',
                            'pa-additionalinformation',
                        ];
                        for (const tag of components) {
                            const el = document.querySelector(tag);
                            out[tag] = el ? (el.innerText || '').trim() : '';
                        }
                        return out;
                    }
                """)

                # Also extract sales history table rows
                sales_rows = page.evaluate("""
                    () => {
                        const out = [];
                        const sec = document.querySelector('pa-salesinformation');
                        if (!sec) return out;
                        const tbl = sec.querySelector('table');
                        if (!tbl) return out;
                        const rows = tbl.querySelectorAll('tr');
                        for (let i = 1; i < rows.length; i++) {
                            const cells = rows[i].querySelectorAll('td');
                            if (cells.length < 4) continue;
                            out.push({
                                date: (cells[0]?.textContent || '').trim(),
                                price: (cells[1]?.textContent || '').trim(),
                                book_page: (cells[2]?.textContent || '').trim(),
                                qualification: (cells[3]?.textContent || '').trim(),
                                previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '',
                            });
                        }
                        return out;
                    }
                """)

                # Extract assessment table (3 years)
                # Header row: find the row whose first cell text is "Year".
                assessment_rows = page.evaluate("""
                    () => {
                        const out = {};
                        const sec = document.querySelector('pa-assessmentinformation');
                        if (!sec) return out;
                        const tables = sec.querySelectorAll('table');
                        if (tables.length === 0) return out;
                        // Find header row in any table
                        let years = [];
                        let headerRowIdx = -1;
                        let chosenTbl = null;
                        for (const tbl of tables) {
                            const rows = tbl.querySelectorAll('tr');
                            for (let i = 0; i < rows.length; i++) {
                                const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase();
                                if (firstCell === 'year') {
                                    const headerCells = rows[i].querySelectorAll('th, td');
                                    years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1);
                                    headerRowIdx = i;
                                    chosenTbl = tbl;
                                    break;
                                }
                            }
                            if (chosenTbl) break;
                        }
                        if (!chosenTbl || years.length === 0) return out;
                        const rows = chosenTbl.querySelectorAll('tr');
                        for (let i = headerRowIdx + 1; i < rows.length; i++) {
                            const cells = rows[i].querySelectorAll('td, th');
                            if (cells.length < 2) continue;
                            const label = (cells[0]?.textContent || '').trim();
                            const values = {};
                            for (let j = 1; j < cells.length && j-1 < years.length; j++) {
                                values[years[j-1]] = (cells[j].textContent || '').trim();
                            }
                            if (label) out[label] = values;
                        }
                        return out;
                    }
                """)

                # Extract taxable value table (by district) — kept as raw text only
                taxable_rows = page.evaluate("""
                    () => {
                        const out = {};
                        const sec = document.querySelector('pa-taxablevalueinformation');
                        if (!sec) return out;
                        out._text = (sec.innerText || '').trim().substring(0, 2000);
                        return out;
                    }
                """)
            finally:
                # Close the browser on every path, including the early return
                # above and any exception raised while scraping.
                browser.close()

        # ─── Post-process — parse via text labels ─────────────────────
        prop_text = sections.get("pa-propertyinformation", "")
        result["parcel_id"] = _grab_after_label(prop_text, "Folio")
        result["subdivision"] = _grab_after_label(prop_text, "Sub-Division")
        # Address: "Property Address\n{addr}"
        addr_block_match = re.search(
            r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE,
        )
        if addr_block_match:
            result["site_address"] = addr_block_match.group(1).strip()
        # Owner: "Owner\n{name(s)}" — captures at most two lines.
        owner_match = re.search(
            r"Owner\s*\n([^\n]+(?:\n[^\n]+)?)", prop_text, re.IGNORECASE,
        )
        if owner_match:
            owner_text = owner_match.group(1).strip()
            # Split on newline for multiple owners
            owner_lines = [ln.strip() for ln in owner_text.split("\n") if ln.strip()]
            result["owner_name"] = owner_lines[0] if owner_lines else None
            result["co_owners"] = owner_lines[1:] if len(owner_lines) > 1 else []

        mailing_match = re.search(
            r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE,
        )
        if mailing_match:
            result["mailing_address"] = re.sub(
                r"\s+", " ", mailing_match.group(1).strip(),
            )

        result["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone")
        result["use_code"] = _grab_after_label(prop_text, "Primary Land Use")
        result["use_description"] = result.get("use_code")
        beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half")
        if beds_baths:
            parts = [part.strip() for part in beds_baths.split("/")]
            try:
                result["bedrooms"] = int(parts[0]) if parts[0] else None
            except (ValueError, IndexError):
                result["bedrooms"] = None
            try:
                result["baths"] = float(parts[1]) if len(parts) > 1 and parts[1] else None
            except (ValueError, IndexError):
                result["baths"] = None
        result["floors"] = _to_int(_grab_after_label(prop_text, "Floors"))
        result["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units"))
        living_area = _grab_after_label(prop_text, "Living Area")
        result["sqft_heated"] = _to_int(living_area) if living_area else None
        adj_area = _grab_after_label(prop_text, "Adjusted Area")
        result["sqft_total"] = _to_int(adj_area) if adj_area else None
        lot_size = _grab_after_label(prop_text, "Lot Size")
        result["lot_total_sqft"] = _to_int(lot_size) if lot_size else None
        result["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built"))

        # Sales history — clean each row
        result["sales_history"] = []
        for r in sales_rows:
            date_str = r.get("date", "")
            price_str = r.get("price", "")
            # Skip header rows / non-data
            if not date_str or "Sale" in date_str or date_str.lower() == "previous sale":
                continue
            # Approximate Duval-compatible 'qualified' flag. "unqual" and
            # "disqual" both contain "qual", so they must be excluded
            # explicitly or those sales would be labelled Qualified.
            qual_text = (r.get("qualification") or "").lower()
            is_qualified = "qual" in qual_text and not any(
                negative in qual_text for negative in ("disqual", "unqual")
            )
            result["sales_history"].append({
                "date": date_str,
                "price": _money_to_int(price_str),
                "book_page": r.get("book_page", ""),
                "qualification": r.get("qualification", ""),
                "previous_owner": r.get("previous_owner", ""),
                "qualified": "Qualified" if is_qualified else "Unqualified",
            })

        # Most recent qualified sale (rows are taken in page order)
        qualified = [s for s in result["sales_history"]
                     if s.get("qualified", "").startswith("Qualified")
                     and s.get("price", 0) and s["price"] >= 1000]
        result["most_recent_qualified_sale"] = qualified[0] if qualified else None

        # Assessment 3-year values (Year column → Land, Building, Market, Assessed)
        # assessment_rows = {"Land Value": {"2025": "$0", ...}, "Market Value": {...}}
        result["assessment_table"] = assessment_rows
        # Resolve current/last year: collect every numeric year key, newest first.
        years_present_sorted = sorted(
            {
                y
                for label_dict in assessment_rows.values()
                if isinstance(label_dict, dict)
                for y in label_dict
                if y and y.isdigit()
            },
            reverse=True,
        )
        current_year = years_present_sorted[0] if years_present_sorted else None
        last_year = years_present_sorted[1] if len(years_present_sorted) > 1 else None

        def _val(label, year):
            # Money value for one assessment row/year, or None when absent.
            if year and assessment_rows.get(label):
                return _money_to_int(assessment_rows[label].get(year, "0"))
            return None

        result["just_value_current"] = _val("Market Value", current_year)
        result["assessed_value_current"] = _val("Assessed Value", current_year)
        result["just_value_last"] = _val("Market Value", last_year)
        result["assessed_value_last"] = _val("Assessed Value", last_year)
        result["tax_year_current"] = int(current_year) if current_year else None
        result["tax_year_last"] = int(last_year) if last_year else None

        # Homestead detection from benefits section text
        benefits_text = sections.get("pa-benefitsinformation", "") or ""
        result["homestead_active"] = "homestead" in benefits_text.lower() and "$" in benefits_text

        # Legal description (heading stripped, capped at 500 chars)
        legal_text = sections.get("pa-legaldescription", "") or ""
        result["legal_description"] = re.sub(
            r"^Legal Description\s*\n",
            "",
            legal_text.strip(),
        )[:500] if legal_text else None

        # Renovation signal — reuses the Duval heuristic
        from data_fetchers.pa_duval import _detect_renovation_pattern
        result["renovation_signal"] = _detect_renovation_pattern(
            result["sales_history"], listing_price=listing_price,
        )

        # Raw sections for advanced consumers
        result["_raw_sections"] = sections
        result["_raw_taxable_text"] = taxable_rows.get("_text", "")

    except Exception as e:
        # Fail-soft boundary: record the failure and return what we have.
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]

    return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: fetch one Miami-Dade PA record and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher")
    cli.add_argument("--parcel", help="Folio number (e.g. '31-2202-034-2470')")
    cli.add_argument("--address", help="Alternative address search")
    opts = cli.parse_args()

    # At least one lookup key must be supplied.
    if not (opts.parcel or opts.address):
        cli.error("--parcel or --address required")

    record = fetch_miami_dade_pa_record(parcel_id=opts.parcel, address=opts.address)
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(record, indent=2, default=str))
|
||||
@@ -0,0 +1,409 @@
|
||||
"""data_fetchers/pa_palm_beach.py — Full Palm Beach PA extractor.
|
||||
|
||||
Sitio: https://pbcpao.gov (server-rendered HTML + jQuery, no SPA)
|
||||
Deep link: /Property/Details?parcelId={parcelId}
|
||||
|
||||
VENTAJA: NO necesita Playwright. urllib + HTMLParser stdlib = rapidisimo.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from html.parser import HTMLParser
|
||||
from typing import Optional
|
||||
|
||||
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
|
||||
_BASE_URL = "https://pbcpao.gov"
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# HTML text extractor (skip script/style)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
class _TextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_skip = False
|
||||
self.parts: list[str] = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in ("script", "style", "noscript"):
|
||||
self.in_skip = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in ("script", "style", "noscript"):
|
||||
self.in_skip = False
|
||||
|
||||
def handle_data(self, d):
|
||||
if not self.in_skip:
|
||||
t = d.strip()
|
||||
if t:
|
||||
self.parts.append(t)
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Tables extractor (table → list of rows)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
class _TableExtractor(HTMLParser):
|
||||
"""Extracts all tables as list of {idx, rows: [[cells]]} dicts."""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_table = False
|
||||
self.in_tr = False
|
||||
self.in_cell = False
|
||||
self.in_skip = False
|
||||
self.current_row: list[str] = []
|
||||
self.current_cell = ""
|
||||
self.current_table: list[list[str]] = []
|
||||
self.tables: list[list[list[str]]] = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in ("script", "style"):
|
||||
self.in_skip = True
|
||||
elif tag == "table":
|
||||
self.in_table = True
|
||||
self.current_table = []
|
||||
elif tag == "tr" and self.in_table:
|
||||
self.in_tr = True
|
||||
self.current_row = []
|
||||
elif tag in ("td", "th") and self.in_tr:
|
||||
self.in_cell = True
|
||||
self.current_cell = ""
|
||||
elif tag == "br" and self.in_cell:
|
||||
self.current_cell += " "
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in ("script", "style"):
|
||||
self.in_skip = False
|
||||
elif tag == "table":
|
||||
if self.current_table:
|
||||
self.tables.append(self.current_table)
|
||||
self.in_table = False
|
||||
elif tag == "tr" and self.in_tr:
|
||||
if self.current_row:
|
||||
self.current_table.append(self.current_row)
|
||||
self.in_tr = False
|
||||
elif tag in ("td", "th") and self.in_cell:
|
||||
self.current_row.append(re.sub(r"\s+", " ", self.current_cell).strip())
|
||||
self.in_cell = False
|
||||
|
||||
def handle_data(self, d):
|
||||
if self.in_cell and not self.in_skip:
|
||||
self.current_cell += d
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _grab_after(text: str, label: str, max_len: int = 80) -> Optional[str]:
|
||||
"""Find label in flat text, return the next non-empty token cluster."""
|
||||
if not text or not label:
|
||||
return None
|
||||
idx = text.find(label)
|
||||
if idx < 0:
|
||||
return None
|
||||
after = text[idx + len(label): idx + len(label) + max_len].strip()
|
||||
# Take up to next " ", " ", end-of-line, or "Property" / "Address" etc.
|
||||
# First word/phrase = value until next CAPITALIZED label pattern
|
||||
m = re.match(r"\s*([^\n]+?)(?:\s{2,}|\s+[A-Z][A-Z\s]+\s+[A-Za-z]+|$)", after)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return after.split("\n")[0].strip()
|
||||
|
||||
|
||||
def _to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", str(s))
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", str(s))
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _pb_parse_scalars(flat: str, parcel_id: str, result: dict) -> None:
    """Copy the single-valued fields found in the page's flat text into ``result``."""
    # Owner Name DERMYSHI IRFAN Property Control Number ...
    m = re.search(
        r"Owner Name\s+([A-Z][A-Z\s,'.\-&]+?)"
        r"(?=\s+(?:Property Control|Mailing|Current|Tax|Subdivision|Total))",
        flat,
    )
    if m:
        result["owner_name"] = m.group(1).strip()

    # Property Control Number, formatted as XX-XX-XX-XX-XX-XXX-XXXX.
    # Falls back to the caller-supplied id when the page does not show one.
    m = re.search(r"Property Control Number\s+([\d\-]+)", flat)
    result["parcel_id"] = m.group(1).strip() if m else parcel_id

    m = re.search(r"Year Built\s+(\d{4})", flat)
    if m:
        result["year_built"] = int(m.group(1))

    m = re.search(r"Bed\s*Rooms\s+(\d+)", flat, re.IGNORECASE)
    if m:
        result["bedrooms"] = int(m.group(1))
    m = re.search(r"Full Baths\s+(\d+)", flat, re.IGNORECASE)
    full_b = int(m.group(1)) if m else 0
    m = re.search(r"Half Baths\s+(\d+)", flat, re.IGNORECASE)
    half_b = int(m.group(1)) if m else 0
    if full_b or half_b:
        result["baths"] = float(full_b) + (0.5 * half_b)
        result["baths_full"] = full_b
        result["baths_half"] = half_b

    m = (re.search(r"Total Square Footage\s+(\d[\d,]*)", flat)
         or re.search(r"Square Footage\s+(\d[\d,]*)", flat))
    if m:
        result["sqft_total"] = _to_int(m.group(1))
    m = re.search(r"Area Under Air\s+(\d[\d,]*)", flat)
    if m:
        result["sqft_heated"] = _to_int(m.group(1))

    m = re.search(r"Acres\s+([\d.]+)", flat)
    if m:
        try:
            result["lot_acres"] = float(m.group(1))
        except ValueError:
            pass  # e.g. a lone "." matched; leave lot_acres unset

    # Property Use Code + Zoning. A "?" in the value is rendered as " - ".
    m = re.search(r"Property Use Code\s+([\w\d\?\.\-\s]+?)(?:\s+Zoning)", flat)
    if m:
        result["use_code"] = m.group(1).replace("?", " - ").strip()
    # Greedy on purpose: a lazy quantifier followed only by an optional group
    # stops after the first character, so "RS" would be captured as "R".
    m = re.search(r"Zoning\s+([\w\?\-]+(?:\s+\([^)]+\))?)", flat)
    if m:
        result["zoning"] = m.group(1).replace("?", " - ").strip()

    # In the three lookaheads below the terminators are grouped so that the
    # leading \s+ applies to every alternative, not just the first one.
    m = re.search(
        r"Subdivision\s+([A-Z0-9 ,'.\-]+?)"
        r"(?=\s+(?:Official Records|Sale Date|Legal Description)|$)",
        flat,
    )
    if m:
        sub = m.group(1).strip()
        result["subdivision"] = sub if sub else None

    m = re.search(
        r"Legal Description\s+([^\n]+?)"
        r"(?=\s+(?:Show Full Map|Show More|Nearby|Owner INFORMATION)|$)",
        flat,
    )
    if m:
        result["legal_description"] = m.group(1).strip()[:300]

    # Structural details: an upper-case value terminated by the next Title-case label.
    for label, key in [
        ("Air Condition Desc.", "ac_description"),
        ("Heat Type", "heat_type"),
        ("Heat Fuel", "heat_fuel"),
        ("Roof Structure", "roof_struct"),
        ("Roof Cover", "roof_cover"),
        ("Interior Wall 1", "interior_wall"),
    ]:
        m = re.search(rf"{re.escape(label)}\s+([A-Z][A-Z &/\-]+?)(?=\s+[A-Z][a-z])", flat)
        if m:
            result[key] = m.group(1).strip()

    m = re.search(
        r"Location Address\s+([^\n]+?)(?=\s+(?:Subdivision|Owner|Property Use)|$)",
        flat,
    )
    if m:
        result["site_address"] = m.group(1).strip()

    # Homestead: either an exemption line with a dollar amount or an explicit "Yes".
    result["homestead_active"] = bool(
        re.search(r"Homestead Exemption\s+\$[\d,]+|Current Homestead\s*Yes",
                  flat, re.IGNORECASE)
    )


def _pb_parse_assessments(tables: list, result: dict) -> None:
    """Read the per-tax-year value rows out of ``tables`` into ``result``.

    Expects rows shaped like ``["Tax Year", "2025", "2024", ...]`` followed by
    rows such as ``["Total Market Value", "$758,298", ...]``; values are paired
    with the most recently seen "Tax Year" header by position. Unparseable
    amounts are stored as 0.
    """
    tax_years: list[str] = []
    market_vals: dict[str, int] = {}
    assessed_vals: dict[str, int] = {}
    improvement_vals: dict[str, int] = {}

    for tbl in tables:
        for row in tbl:
            if not row:
                continue
            first = row[0].lower()
            if first == "tax year":
                tax_years = [c for c in row[1:] if c]
                continue
            if "market value" in first or "total market" in first:
                target = market_vals
            elif first == "assessed value" or "total assessed" in first:
                target = assessed_vals
            elif "improvement value" in first:
                target = improvement_vals
            else:
                continue
            for year, cell in zip(tax_years, row[1:]):
                target[year] = _money_to_int(cell) or 0

    # Most recent numeric year is "current", the one before it is "last".
    valid_years = sorted([y for y in tax_years if y.isdigit()], reverse=True)
    current_year = valid_years[0] if valid_years else None
    last_year = valid_years[1] if len(valid_years) > 1 else None

    result["just_value_current"] = market_vals.get(current_year) if current_year else None
    result["assessed_value_current"] = assessed_vals.get(current_year) if current_year else None
    result["just_value_last"] = market_vals.get(last_year) if last_year else None
    result["assessed_value_last"] = assessed_vals.get(last_year) if last_year else None
    result["tax_year_current"] = int(current_year) if current_year else None
    result["tax_year_last"] = int(last_year) if last_year else None
    result["assessment_history"] = {
        "market": market_vals,
        "assessed": assessed_vals,
        "improvement": improvement_vals,
    }


def _pb_parse_sales(tables: list) -> list:
    """Return the sales rows found in ``tables`` as dicts, in page order.

    A table counts as a sales table when its header row has a "Sale Date" /
    "Sales Date" column and a "Price" column.
    """
    def is_date_header(h: str) -> bool:
        return "sale date" in h or "sales date" in h

    def cell(row: list, idx: int) -> str:
        return row[idx] if 0 <= idx < len(row) else ""

    sales: list[dict] = []
    for tbl in tables:
        if not tbl or len(tbl) < 2:
            continue
        hdr = [c.lower() for c in tbl[0]]
        if not (any(is_date_header(h) for h in hdr) and any("price" in h for h in hdr)):
            continue
        idx_date = next((i for i, h in enumerate(hdr) if is_date_header(h)), -1)
        idx_price = next((i for i, h in enumerate(hdr) if "price" in h), -1)
        idx_book = next((i for i, h in enumerate(hdr)
                         if "book" in h or h.startswith("or")), -1)
        idx_qual = next((i for i, h in enumerate(hdr)
                         if "qualified" in h or h == "sale type" or h == "type"), -1)
        for row in tbl[1:]:
            if len(row) < 2:
                continue
            d = cell(row, idx_date)
            p = cell(row, idx_price)
            if not d and not p:
                continue
            qual_raw = cell(row, idx_qual)
            price = _money_to_int(p)
            # Palm Beach publishes a "Sale Type" (deed type), not a
            # qualified/disqualified flag, so the flag is derived:
            #   WARRANTY DEED with price >= 50K          -> Qualified
            #   explicit "qualified" (not "disqualified") -> Qualified
            #   anything else (CERT OF TITLE, cheap QUIT CLAIM, ...) -> Unqualified
            q_low = qual_raw.lower()
            if "warranty deed" in q_low and (price or 0) >= 50000:
                qualified_flag = "Qualified"
            elif "qualified" in q_low and "disqualified" not in q_low:
                qualified_flag = "Qualified"
            else:
                qualified_flag = "Unqualified"
            sales.append({
                "date": d,
                "price": price,
                "book_page": cell(row, idx_book),
                "qualification": qual_raw,
                "deed_type": qual_raw,
                "qualified": qualified_flag,
            })
    return sales


def fetch_palm_beach_pa_record(
    parcel_id: str,
    timeout_seconds: int = 30,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch and parse the Palm Beach Property Appraiser page for one parcel.

    Args:
        parcel_id: 17-digit PCN (e.g. "00414232000003080"), with or without dashes.
        timeout_seconds: HTTP timeout.
        listing_price: forwarded to the renovation-pattern detector
            (enables flip-in-progress detection).

    Returns:
        A dict that always contains ``county``, ``source``, ``fetched_at`` and
        ``errors``. On success it also carries the parsed fields (owner,
        parcel id, building details, assessment values/history, sales history,
        most recent qualified sale, renovation signal). The function is
        fail-soft: a missing parcel id, an HTTP failure, a "not found" page or
        a parsing failure is reported through ``errors`` (plus ``_trace`` for
        parsing failures) instead of raising.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Palm Beach",
        "source": "Palm Beach County Property Appraiser (pbcpao.gov)",
        "fetched_at": fetched_at,
        "errors": [],
    }

    if not parcel_id:
        result["errors"].append("no parcel_id provided")
        return result

    from urllib.parse import quote

    pcn_clean = parcel_id.replace("-", "").strip()
    # Percent-encode so an unexpected character cannot corrupt the query string;
    # a plain digit PCN is unchanged.
    url = f"{_BASE_URL}/Property/Details?parcelId={quote(pcn_clean, safe='')}"
    result["source_url"] = url

    # HTTP fetch (no Playwright)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=timeout_seconds) as resp:
            html = resp.read().decode("utf-8", errors="ignore")
    except Exception as e:
        result["errors"].append(f"HTTP fetch failed: {type(e).__name__}: {e}")
        return result

    # Detect "no property found" near the top of the page.
    head = html.lower()[:5000]
    if "no property" in head or "not found" in head:
        result["errors"].append("parcel not found in PA records")
        return result

    try:
        text_extractor = _TextExtractor()
        text_extractor.feed(html)
        flat = " ".join(text_extractor.parts)

        tbl_extractor = _TableExtractor()
        tbl_extractor.feed(html)

        _pb_parse_scalars(flat, parcel_id, result)
        _pb_parse_assessments(tbl_extractor.tables, result)

        sales = _pb_parse_sales(tbl_extractor.tables)
        result["sales_history"] = sales

        # First qualifying row in page order.
        # NOTE(review): assumes the PA lists sales newest-first — confirm.
        qualified = [s for s in sales
                     if s.get("qualified", "").startswith("Qualified")
                     and (s.get("price") or 0) >= 1000]
        result["most_recent_qualified_sale"] = qualified[0] if qualified else None

        # Renovation signal
        from data_fetchers.pa_duval import _detect_renovation_pattern
        result["renovation_signal"] = _detect_renovation_pattern(
            sales, listing_price=listing_price,
        )
    except Exception as e:
        # Same fail-soft contract as the Miami-Dade fetcher: report, don't raise.
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]

    return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: fetch one Palm Beach PA record and print it as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Palm Beach PA full record fetcher")
    cli.add_argument("--parcel", required=True, help="PCN (e.g. '00414232000003080')")
    opts = cli.parse_args()

    record = fetch_palm_beach_pa_record(parcel_id=opts.parcel)
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(record, indent=2, default=str))
|
||||
@@ -0,0 +1,145 @@
|
||||
"""data_fetchers/pa_photo_lookup.py — Buscar fotos de propiedad en sitios PA (gratis).
|
||||
|
||||
PROPOSITO:
|
||||
Los County Property Appraisers (PA) de Florida tienen fotos de las propiedades.
|
||||
Acceso público vía Playwright (cero costo Firecrawl).
|
||||
|
||||
Es la alternativa GRATIS a `zillow_photo_lookup` (que cuesta 1 credit por property).
|
||||
|
||||
COVERAGE actual:
|
||||
- Broward (bcpa.net): ✓ tested, 100% hit rate en 3-sample
|
||||
- Miami-Dade (miamidadepa.gov): ✗ solo aerial, no street photo
|
||||
- Duval (paopropertysearch.coj.net): pendiente investigar URL correcta
|
||||
- Otros counties: stub para Phase 3.5.B
|
||||
|
||||
USO:
|
||||
from data_fetchers.pa_photo_lookup import fetch_pa_photo
|
||||
url, meta = fetch_pa_photo(county="Broward", parcel_id="484226062150")
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def fetch_pa_photo(
    county: str,
    parcel_id: str,
    timeout_seconds: int = 25,
) -> tuple[Optional[str], dict]:
    """Look up a property photo URL on the county Property Appraiser site.

    Args:
        county: county name (e.g. "Broward", "Miami-Dade", "Duval"); matched
            case-insensitively, ignoring a trailing " county" and with spaces
            turned into underscores.
        parcel_id: county-specific parcel/folio number.
        timeout_seconds: passed through to the per-county fetcher.

    Returns:
        ``(photo_url, metadata)`` where ``photo_url`` is a string or ``None``
        and ``metadata`` is ``{county, parcel_id, source, error}``. Never
        raises for a missing parcel id, an unsupported county or a fetcher
        failure; the reason is placed in ``metadata["error"]``.
    """
    meta = {"county": county, "parcel_id": parcel_id, "source": None, "error": None}

    if not parcel_id:
        meta["error"] = "no parcel_id"
        return None, meta

    county_key = (county or "").lower().replace(" county", "").strip().replace(" ", "_")
    handler = _FETCHERS.get(county_key)
    if not handler:
        meta["error"] = f"no PA fetcher for county {county!r} (supported: {sorted(_FETCHERS.keys())})"
        return None, meta

    try:
        photo_url, source_name = handler(parcel_id, timeout_seconds)
    except Exception as exc:
        meta["error"] = f"{type(exc).__name__}: {exc}"
        return None, meta

    meta["source"] = source_name
    return photo_url, meta
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────────────
|
||||
# Per-county implementations
|
||||
# ────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
_CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
def _fetch_broward(parcel_id: str, timeout_seconds: int) -> tuple[Optional[str], str]:
    """Fetch the first property photo URL from the Broward (bcpa.net) SPA.

    URL pattern: https://web.bcpa.net/bcpaclient/#/Record-Search?folio=XXX
    The photo is an <img> whose src contains "/Photographs/"; images narrower
    than 200px (naturalWidth) are ignored.

    Args:
        parcel_id: Broward folio number.
        timeout_seconds: applied both as the page default timeout and to navigation.

    Returns:
        ``(photo_url_or_None, "bcpa.net")``.
    """
    from playwright.sync_api import sync_playwright

    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
    photo = None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
            page = ctx.new_page()
            page.set_default_timeout(timeout_seconds * 1000)
            page.goto(url, wait_until="networkidle", timeout=timeout_seconds * 1000)
            # SPA render delay. wait_for_timeout keeps Playwright's own event
            # processing running, which a blocking time.sleep() does not.
            page.wait_for_timeout(7000)
            photos = page.evaluate(
                "Array.from(document.querySelectorAll('img'))"
                ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
                ".map(i => i.src)"
            )
            if photos:
                photo = photos[0]
        finally:
            browser.close()
    return photo, "bcpa.net"
|
||||
|
||||
|
||||
def _fetch_broward_batch(parcel_ids: list[str], timeout_seconds: int = 20) -> dict[str, Optional[str]]:
    """Fetch Broward photo URLs for many folios with a single browser session.

    One browser/context is reused across all folios; each folio gets its own
    page, which is closed before moving on.

    Args:
        parcel_ids: Broward folio numbers.
        timeout_seconds: per-folio page default and navigation timeout.

    Returns:
        ``{parcel_id: photo_url or None}``. A folio whose lookup fails for any
        reason maps to ``None``; the batch continues with the next folio.
    """
    from playwright.sync_api import sync_playwright

    out: dict[str, Optional[str]] = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=_CHROME_UA, viewport={"width": 1280, "height": 900})
            for parcel_id in parcel_ids:
                page = ctx.new_page()
                try:
                    page.set_default_timeout(timeout_seconds * 1000)
                    url = f"https://web.bcpa.net/bcpaclient/#/Record-Search?folio={parcel_id}"
                    page.goto(url, wait_until="networkidle", timeout=timeout_seconds * 1000)
                    # SPA render delay; wait_for_timeout instead of a blocking
                    # time.sleep() so Playwright keeps processing events.
                    page.wait_for_timeout(7000)
                    photos = page.evaluate(
                        "Array.from(document.querySelectorAll('img'))"
                        ".filter(i => i.src.includes('/Photographs/') && i.naturalWidth > 200)"
                        ".map(i => i.src)"
                    )
                    out[parcel_id] = photos[0] if photos else None
                except Exception:
                    # Deliberate best-effort: one bad folio must not abort the batch.
                    out[parcel_id] = None
                finally:
                    page.close()
        finally:
            # Always release the browser, even if setup or a page open fails.
            browser.close()
    return out
|
||||
|
||||
|
||||
# Registry of per-county photo fetchers, keyed by the normalised county name
# (lower-case, " county" removed, spaces replaced by underscores).
# Each value is called as fetcher(parcel_id, timeout_seconds).
#
# TODO Phase 3.5.B:
#   "miami-dade": _fetch_miami_dade   (only aerial, no street photo — skip)
#   "duval": _fetch_duval
#   "hillsborough": _fetch_hillsborough
#   "orange": _fetch_orange
_FETCHERS = {"broward": _fetch_broward}
|
||||
|
||||
|
||||
def list_supported_counties() -> list[str]:
    """Return the registry keys of all implemented PA photo fetchers, sorted."""
    return sorted(_FETCHERS)
|
||||
@@ -0,0 +1,465 @@
|
||||
"""price_validator.py - detecta discrepancia listing vs market estimates.
|
||||
|
||||
PROBLEMA QUE RESUELVE:
|
||||
Deal con listing $70K mientras Zillow Zestimate dice $280K. Cap rate sale 18%.
|
||||
Sistema procede a calcular como deal normal sin detectar el RED FLAG mas obvio:
|
||||
ese precio bajo casi siempre indica problema oculto heredable (liens, foreclosure,
|
||||
damage, code violations, title issues, etc.).
|
||||
|
||||
OBJETIVO:
|
||||
Detectar discrepancia >30% entre listing price y market value estimates.
|
||||
Alertar al usuario MASIVAMENTE (CRITICAL_RED_FLAG) con lista de razones probables
|
||||
y due diligence obligatoria.
|
||||
|
||||
LOGICA:
|
||||
- discrepancy < 10% → NORMAL (listing dentro de rango razonable)
|
||||
- 10-30% → WARNING (listing fuera de rango pero no escandalo)
|
||||
- ≥30% → CRITICAL_RED_FLAG (algo huele mal — investigar antes de seguir)
|
||||
|
||||
FUENTES de market value:
|
||||
1. Zillow Zestimate (Firecrawl ~3 credits) — opt-in con ENABLE_FIRECRAWL_PRICE_CHECK
|
||||
2. Redfin estimate (Firecrawl ~3 credits) — opt-in
|
||||
3. Tax Assessed Value (county scraper, gratis cuando funcione)
|
||||
4. Existing comps via property_value.py si ya estan computados
|
||||
|
||||
FAIL-SOFT: si no hay ninguna fuente disponible, retorna status='unknown' con warning.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# Thresholds
|
||||
NORMAL_THRESHOLD_PCT = 10.0
|
||||
WARNING_THRESHOLD_PCT = 30.0
|
||||
|
||||
# Tax assessed → market value typical ratio in FL
|
||||
TAX_TO_MARKET_RATIO = 0.85 # tax assessed suele ser 85% del market value
|
||||
|
||||
|
||||
def _firecrawl_price_check_enabled() -> bool:
|
||||
"""Flag separado de comps. Default OFF para no quemar credits."""
|
||||
flag = os.getenv("ENABLE_FIRECRAWL_PRICE_CHECK", "false").lower() == "true"
|
||||
has_key = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
|
||||
return flag and has_key
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Fetchers de market value
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_zillow_zestimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Zillow Zestimate for ``address`` by scraping the property page via Firecrawl.

    Consumes Firecrawl credits (~3 per call), so it only runs when
    ``_firecrawl_price_check_enabled()`` is true.

    Args:
        address: full street address; punctuation is dropped and spaces become
            dashes to build ``https://www.zillow.com/homes/{slug}_rb/``.

    Returns:
        ``(zestimate, errors)``. ``zestimate`` is ``None`` whenever no value in
        the range $20,000-$50,000,000 could be extracted; ``errors`` then
        explains why. Never raises.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append(
            "Firecrawl price check deshabilitado. "
            "Setear ENABLE_FIRECRAWL_PRICE_CHECK=true en .env para activar."
        )
        return None, errors

    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente en .env")
        return None, errors

    # Guard before building the slug: a None/blank address would otherwise
    # raise here (outside any try) or spend credits on a meaningless URL.
    if not address or not str(address).strip():
        errors.append("Zillow: address vacio")
        return None, errors

    # Zillow address search url
    # Format: https://www.zillow.com/homes/{address-with-dashes}_rb/
    addr_slug = re.sub(r"[^\w\s]", "", str(address)).replace(" ", "-")
    url = f"https://www.zillow.com/homes/{addr_slug}_rb/"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        if not result or not hasattr(result, "markdown"):
            errors.append("Firecrawl Zillow: respuesta vacia")
            return None, errors
        md = result.markdown or ""
    except Exception as e:
        errors.append(f"Firecrawl Zillow error: {e}")
        return None, errors

    # Parser: the word "Zestimate" followed by the nearest dollar amount.
    # The amount is either comma-grouped ("$12,500,000") or 4-9 bare digits.
    # A fixed-width class such as [\d,]{4,9} would cut "$12,500,000" short and
    # report a tenth of the real value.
    m = re.search(
        r"zestimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,9})",
        md,
        re.IGNORECASE,
    )
    if not m:
        errors.append("Zillow markdown OK pero patron 'Zestimate $XXX' no encontrado")
        return None, errors

    zestimate = int(m.group(1).replace(",", ""))
    if 20_000 <= zestimate <= 50_000_000:
        return zestimate, errors
    errors.append(f"Zestimate fuera de rango razonable: ${zestimate}")
    return None, errors
|
||||
|
||||
|
||||
def fetch_redfin_estimate(address: str) -> tuple[Optional[int], list[str]]:
    """Fetch the Redfin Estimate for ``address`` via Firecrawl (~3 credits per call).

    Mirrors ``fetch_zillow_zestimate``: only runs when
    ``_firecrawl_price_check_enabled()`` is true, never raises, and explains
    every ``None`` result through the returned error list.

    Args:
        address: full street address; punctuation is dropped and spaces become
            dashes to build the Redfin ``?location=`` URL.

    Returns:
        ``(estimate, errors)`` where ``estimate`` is an int in the range
        $20,000-$50,000,000 or ``None``.
    """
    errors: list[str] = []
    if not _firecrawl_price_check_enabled():
        errors.append("Firecrawl price check deshabilitado (ENABLE_FIRECRAWL_PRICE_CHECK=true)")
        return None, errors

    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        errors.append(f"firecrawl-py no importable: {e}")
        return None, errors

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        errors.append("FIRECRAWL_API_KEY ausente")
        return None, errors

    # Guard before building the slug: a None/blank address would otherwise
    # raise here (outside any try) or spend credits on a meaningless URL.
    if not address or not str(address).strip():
        errors.append("Redfin: address vacio")
        return None, errors

    # Redfin search by address
    addr_slug = re.sub(r"[^\w\s]", "", str(address)).replace(" ", "-")
    url = f"https://www.redfin.com/?location={addr_slug}"

    try:
        app = FirecrawlApp(api_key=api_key)
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url())
        result = app.scrape(url, formats=["markdown"])
        # "or ''" covers a response object whose markdown attribute is None,
        # which would otherwise make re.search() raise TypeError below.
        md = (result.markdown if result and hasattr(result, "markdown") else "") or ""
    except Exception as e:
        errors.append(f"Firecrawl Redfin error: {e}")
        return None, errors

    # The amount is either comma-grouped ("$12,500,000") or 4-9 bare digits.
    # A fixed-width class such as [\d,]{4,9} would cut "$12,500,000" short and
    # report a tenth of the real value.
    m = re.search(
        r"redfin estimate[^\$]*\$(\d{1,3}(?:,\d{3})+|\d{4,9})",
        md,
        re.IGNORECASE,
    )
    if not m:
        errors.append("Redfin: patron 'Redfin Estimate' no encontrado")
        return None, errors

    est = int(m.group(1).replace(",", ""))
    if 20_000 <= est <= 50_000_000:
        return est, errors
    errors.append(f"Redfin estimate fuera de rango razonable: ${est}")
    return None, errors
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Posibles razones de discrepancia (educacion al usuario)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Bug 6: Hipotesis distressed ordenadas por likelihood en Florida.
|
||||
# Cuando listing es <$150K SFR + status=UNKNOWN, estas son las CAUSAS PRIMARIAS
|
||||
# que el sistema debe surface ANTES de las 12 razones generales.
|
||||
# Frecuencia derivada de datos publicos: ~60% de listings <$150K SFR en Florida
|
||||
# son foreclosure-related (auction, REO, pre-foreclosure short sale) o tax deed.
|
||||
DISTRESSED_HYPOTHESIS_REASONS = [
|
||||
"🥇 FORECLOSURE AUCTION — listing puede ser el opening bid en la subasta judicial. "
|
||||
"Lookup obligatorio: lis pendens en CCIS del condado (clerk online).",
|
||||
"🥈 REO (Real Estate Owned) — el banco recupero la propiedad post-foreclosure y la "
|
||||
"lista as-is cash-quick-close. Comun en bancos chicos / credit unions.",
|
||||
"🥉 TAX DEED — el condado vendio el certificado por tax delinquency severa. "
|
||||
"1-year redemption period donde el ex-owner puede recomprar.",
|
||||
"Pre-foreclosure short sale — owner intenta vender antes de la subasta. "
|
||||
"Requiere aprobacion del lender (puede llevar 3-6 meses).",
|
||||
"Wholesale assignment — el wholesaler tiene el deal bajo contrato y vende el contrato. "
|
||||
"Puede haber issues con marketable title.",
|
||||
"Probate / estate sale — heirs liquidando rapido. Requiere certificado del juez.",
|
||||
]
|
||||
|
||||
POSSIBLE_RED_FLAG_REASONS = [
|
||||
"Tax delinquency severa (property tax + interes acumulado puede ser >20% del valor)",
|
||||
"IRS lien sobre el owner (federal tax lien, 120-day right of redemption)",
|
||||
"Code enforcement violations grandes (municipalidad puede tener liens de $50K+)",
|
||||
"Foreclosure en curso (lis pendens publico) — el seller intenta vender antes de subasta",
|
||||
"Damage severo no fotografiado (fire, water, structural) que requiere $50K-$200K rehab",
|
||||
"Title issues (clouds en el chain — heirs no identificados, divorce sin completar, fraud)",
|
||||
"Bankruptcy quick-sale (trustee debe liquidar rapido, precio bajo para cerrar)",
|
||||
"Wholesaler problem deal (el wholesaler le bajo el precio porque tuvo issues con buyers anteriores)",
|
||||
"HOA litigation pendiente — lender no presta hasta resolver",
|
||||
"Open insurance claims que el buyer hereda",
|
||||
"Polybutylene plumbing + electrical Federal Pacific (re-pipe + repanel costoso)",
|
||||
"Inhabitable / no certificate of occupancy (puede ser ilegal alquilar tal como esta)",
|
||||
]
|
||||
|
||||
MANDATORY_INVESTIGATION_LIST = [
|
||||
"Court records search (county clerk: lis pendens, foreclosure docket, civil suits)",
|
||||
"Tax collector / appraiser: verificar pagos al dia + assessed value",
|
||||
"Code enforcement check con la municipalidad: violations + liens",
|
||||
"Property records: chain of title del county recorder",
|
||||
"Title search profesional ($300-$500) ANTES de hacer oferta",
|
||||
"Drive-by inspection (sin entrar): nivel de mantenimiento exterior, signos de damage",
|
||||
"PACER bankruptcy search (federal): auto-stay del owner puede invalidar transferencia",
|
||||
"Permits buscador: openings sin cerrar pueden tener implicaciones legales",
|
||||
"Open insurance claims: pedir disclosure al seller",
|
||||
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# API publica
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def validate_price(
    *,
    address: str,
    listing_price: float,
    tax_assessed_value: Optional[float] = None,
    existing_comps_estimate: Optional[float] = None,
    existing_comps_confidence: Optional[str] = None,  # "high" | "medium" | "low" | None
    existing_comps_sources: Optional[list] = None,  # list of source labels
    neighborhood_class: Optional[str] = None,  # "A" | "B" | "C" | "D" | None
    use_firecrawl: Optional[bool] = None,
) -> dict:
    """Validate ``listing_price`` against the market-value estimates available.

    Up to four estimates are collected (tax-implied market value, comps mid,
    Zillow Zestimate, Redfin estimate). The listing is compared against each
    one and the largest absolute discrepancy drives the status.

    Args:
        address: Full property address; passed to the Zillow/Redfin lookups.
        listing_price: Listed price.
        tax_assessed_value: Optional assessed value. Used only when > 1000; it
            is divided by ``TAX_TO_MARKET_RATIO`` to imply a market value.
        existing_comps_estimate: Optional mid estimate produced upstream
            (per the original notes, by property_value.py). Used only when
            > 1000 and not rejected by the quality checks below.
        existing_comps_confidence: Confidence label of that estimate. When it
            is ``"low"`` the estimate is NOT used as a baseline.
        existing_comps_sources: Source labels behind the estimate. When they
            mention deductions/heuristics but no comp/tax/zillow/redfin source,
            the estimate is treated as heuristic-only and rejected.
        neighborhood_class: "A"/"B"/"C"/"D" (case-insensitive) or None. Only
            used to word the recommendation when no estimate is available and
            the listing is below $150K.
        use_firecrawl: True/False forces the Zillow + Redfin lookups on/off;
            None defers to ``_firecrawl_price_check_enabled()``.

    Returns:
        dict with ``status`` (NORMAL | WARNING | CRITICAL_RED_FLAG | UNKNOWN),
        ``listing_price``, ``market_estimates``, ``max_discrepancy_pct``,
        ``possible_reasons``, ``mandatory_investigation``, ``recommendation``,
        ``sources_used``, ``rejected_sources``, ``errors`` and ``fetched_at``.
        The UNKNOWN result additionally carries ``min_discrepancy_pct`` (None)
        and ``suspicious_low_listing``; every other result additionally carries
        ``discrepancies_pct``, ``signed_max_discrepancy_pct`` and ``direction``.
        List values are fresh copies, so callers may mutate them freely.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    errors: list[str] = []
    sources_used: list[str] = []
    rejected_sources: list[str] = []  # estimates discarded by the quality checks
    estimates: dict[str, Optional[int]] = {
        "zillow_zestimate": None,
        "redfin_estimate": None,
        "tax_implied_market": None,
        "comps_mid": None,
    }

    # 1. Tax assessed value -> implied market value.
    if tax_assessed_value and tax_assessed_value > 1000:
        estimates["tax_implied_market"] = int(tax_assessed_value / TAX_TO_MARKET_RATIO)
        sources_used.append(
            f"Tax assessed → market implied (${tax_assessed_value:,.0f} / {TAX_TO_MARKET_RATIO})"
        )

    # 2. Upstream comps estimate, accepted only if it passes the quality checks.
    #    A "low" confidence or heuristic-only estimate is a fallback derived from
    #    the listing itself; using it as a baseline inverts the comparison.
    if existing_comps_estimate and existing_comps_estimate > 1000:
        is_heuristic_only = False
        if existing_comps_sources:
            srcs_str = " | ".join(str(s) for s in existing_comps_sources).lower()
            mentions_heuristic = (
                "deduction" in srcs_str
                or "heurística" in srcs_str
                or "heuristica" in srcs_str
            )
            mentions_real_source = any(
                marker in srcs_str for marker in ("comp", "tax", "zillow", "redfin")
            )
            is_heuristic_only = mentions_heuristic and not mentions_real_source

        if existing_comps_confidence == "low" or is_heuristic_only:
            rejected_sources.append(
                f"property_value comps_mid descartado: confidence={existing_comps_confidence}, "
                f"sources={existing_comps_sources} — fallback heuristico no es baseline valido"
            )
            errors.append(
                "property_value estimate descartado por baja calidad (heuristic-only). "
                "Para validacion confiable: activar ENABLE_FIRECRAWL_COMPS o esperar tax_assessed scraper."
            )
        else:
            estimates["comps_mid"] = int(existing_comps_estimate)
            sources_used.append(
                f"Comps mid (confidence={existing_comps_confidence or 'unknown'}, ${existing_comps_estimate:,.0f})"
            )

    # 3. Zillow / Redfin lookups (opt-in; an explicit argument overrides the flag).
    do_firecrawl = _firecrawl_price_check_enabled() if use_firecrawl is None else use_firecrawl
    if do_firecrawl:
        z, z_errors = fetch_zillow_zestimate(address)
        if z:
            estimates["zillow_zestimate"] = z
            sources_used.append(f"Zillow Zestimate (${z:,.0f})")
        errors.extend(z_errors)

        r, r_errors = fetch_redfin_estimate(address)
        if r:
            estimates["redfin_estimate"] = r
            sources_used.append(f"Redfin Estimate (${r:,.0f})")
        errors.extend(r_errors)

    # 4a. Nothing to compare against -> UNKNOWN. If the listing is also below
    #     $150K, surface the distressed-deal hypotheses instead of staying silent.
    if not any(estimates.values()):
        suspicious_low_listing = listing_price < 150_000
        # Normalise once so "c" / " C " behave like "C" in every comparison below.
        class_label = (neighborhood_class or "").strip().upper()

        possible: list = []
        investigation: list = []
        recommendation_text = (
            "No se pudo validar el precio contra fuentes de mercado confiables. "
            "Activar ENABLE_FIRECRAWL_PRICE_CHECK + ENABLE_FIRECRAWL_COMPS en .env "
            "o esperar el tax_assessed scraper para validacion automatica. "
            "Considera lookup manual en Zillow/Redfin antes de proceder."
        )

        if suspicious_low_listing:
            # Distressed hypotheses come first: they are the primary explanation.
            possible = [*DISTRESSED_HYPOTHESIS_REASONS, *POSSIBLE_RED_FLAG_REASONS]
            # Copy so callers cannot mutate the module-level constant.
            investigation = list(MANDATORY_INVESTIGATION_LIST)
            if class_label in ("C", "D"):
                class_note = (
                    f"Vecindario Class {class_label} (income bajo) — listing en este rango "
                    "puede ser market-rate. Pero foreclosure tampoco esta descartado: en Class D "
                    "FL, el porcentaje de foreclosures es ~3x el promedio nacional."
                )
            elif class_label in ("A", "B"):
                class_note = (
                    f"Vecindario Class {class_label} (income medio/alto) — listing tan bajo "
                    "es PROBABLEMENTE deal distressed. Investigar lis pendens en CCIS antes de proceder."
                )
            else:
                # Neighborhood class unknown.
                class_note = (
                    "Neighborhood class no disponible — no se puede inferir si el listing es "
                    "market-rate-para-la-zona o distressed."
                )
            recommendation_text = (
                f"⚠️ Listing ${listing_price:,.0f} es estadisticamente raro para SFR en Florida "
                f"(<$150K). {class_note} "
                "HIPOTESIS PRIMARIA: deal distressed (foreclosure, tax_deed, REO, short sale, "
                "pre-foreclosure). Re-verificar deal_type del usuario, hacer court records lookup "
                "(lis pendens en CCIS del condado), y tratar este analisis como PRELIMINAR hasta "
                "confirmar el status real."
            )

        return {
            "status": "UNKNOWN",
            "listing_price": int(listing_price),
            "market_estimates": estimates,
            "max_discrepancy_pct": None,
            "min_discrepancy_pct": None,
            "possible_reasons": possible,
            "mandatory_investigation": investigation,
            "recommendation": recommendation_text,
            "sources_used": sources_used,
            "rejected_sources": rejected_sources,
            "suspicious_low_listing": suspicious_low_listing,
            "errors": errors,
            "fetched_at": fetched_at,
        }

    # 4b. Discrepancy (%) against each estimate.
    #     Negative = listing below market, positive = listing above market.
    discrepancies = {
        src: round((listing_price - val) / val * 100, 1)
        for src, val in estimates.items()
        if val
    }

    # The largest absolute discrepancy is the most alarming one, in either direction.
    max_abs_disc = max(abs(d) for d in discrepancies.values())
    # Same discrepancy with its sign, to report the direction.
    signed_max = max(discrepancies.values(), key=abs)

    possible_reasons: list = []
    investigation = []
    if max_abs_disc < NORMAL_THRESHOLD_PCT:
        status = "NORMAL"
        recommendation = (
            f"Listing dentro de ±{NORMAL_THRESHOLD_PCT}% de market estimates. "
            "Procede con analisis financiero estandar."
        )
    elif max_abs_disc < WARNING_THRESHOLD_PCT:
        status = "WARNING"
        direction = "sobre" if signed_max > 0 else "bajo"
        recommendation = (
            f"Listing {abs(signed_max):.0f}% {direction} el market estimate. "
            "Verifica condiciones del deal antes de proceder. "
            "Si listing > market: probable inflación del seller. "
            "Si listing < market: investigar razon (motivacion legitima vs problema oculto)."
        )
    else:
        status = "CRITICAL_RED_FLAG"
        if signed_max < 0:
            # Listing below market: the dangerous case (hidden, inheritable problem).
            recommendation = (
                f"🚨 LISTING ${listing_price:,.0f} esta {abs(signed_max):.0f}% BAJO el market estimate. "
                "Esto NO es un 'gran deal' por default — es una RED FLAG masiva. "
                "El precio bajo casi siempre indica problema oculto heredable. "
                "NO procedas con analisis financiero estandar hasta entender el POR QUE del precio bajo. "
                "Cap rate alto en este contexto puede ser ilusion — los costos heredables pueden destruir el deal."
            )
            # Copies, so callers cannot mutate the module-level constants.
            possible_reasons = list(POSSIBLE_RED_FLAG_REASONS)
            investigation = list(MANDATORY_INVESTIGATION_LIST)
        else:
            # Listing above market: an inflated ask, but not dangerous.
            recommendation = (
                f"Listing ${listing_price:,.0f} esta {abs(signed_max):.0f}% SOBRE el market estimate. "
                "Probable inflacion del seller. Oferta agresiva justificada. "
                "Si declinan, walk away — hay deals mejores."
            )

    if signed_max < 0:
        direction_label = "listing_BELOW_market"
    elif signed_max > 0:
        direction_label = "listing_ABOVE_market"
    else:
        direction_label = "match"

    return {
        "status": status,
        "listing_price": int(listing_price),
        "market_estimates": estimates,
        "discrepancies_pct": discrepancies,
        "max_discrepancy_pct": round(max_abs_disc, 1),
        "signed_max_discrepancy_pct": round(signed_max, 1),
        "direction": direction_label,
        "possible_reasons": possible_reasons,
        "mandatory_investigation": investigation,
        "recommendation": recommendation,
        "sources_used": sources_used,
        # Previously only the UNKNOWN result exposed the discarded estimates.
        "rejected_sources": rejected_sources,
        "errors": errors,
        "fetched_at": fetched_at,
    }
|
||||
@@ -0,0 +1,545 @@
|
||||
"""data_fetchers/property_appraiser.py — Unified PA router.
|
||||
|
||||
Intended source of truth for ANY property in the USA; currently only the counties
registered in _SUPPORTED_COUNTIES have adapters. Pre-screening calls fetch_pa_record
as its FIRST step to avoid wrong inferences from listing data (Zillow can be wrong,
stale, or incomplete).
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.property_appraiser import fetch_pa_record, is_pa_supported
|
||||
|
||||
if is_pa_supported(county_name, state):
|
||||
record = fetch_pa_record(
|
||||
address="2352 SCENIC VIEW CT",
|
||||
parcel_id=None,
|
||||
county_name="Duval",
|
||||
state="FL",
|
||||
zip_code="32218",
|
||||
)
|
||||
|
||||
UNIFIED RETURN SCHEMA (cada adapter llena lo que pueda; campos faltantes = None):
|
||||
{
|
||||
# Identity
|
||||
"parcel_id": str, # County-specific RE#/folio
|
||||
"owner_name": str, # Primary owner
|
||||
"co_owners": [str], # If multiple
|
||||
"mailing_address": str,
|
||||
"site_address": str, # Property address
|
||||
"owner_address_mismatch": bool, # mailing != site (absentee owner)
|
||||
# Building
|
||||
"year_built": int,
|
||||
"effective_year_built": int | None, # If county exposes renovations
|
||||
"sqft_heated": int,
|
||||
"sqft_total": int,
|
||||
"lot_acres": float,
|
||||
"lot_total_sqft": int,
|
||||
"bedrooms": int,
|
||||
"baths": float,
|
||||
"stories": float,
|
||||
"building_type": str,
|
||||
"roof_type": str,
|
||||
"exterior_wall": str,
|
||||
"interior_wall": str,
|
||||
"int_flooring": str,
|
||||
# Values
|
||||
"just_value_current": int,
|
||||
"assessed_value_current": int,
|
||||
"taxable_value_current": int,
|
||||
"exemption_current": int,
|
||||
"just_value_last": int,
|
||||
"assessed_value_last": int,
|
||||
"taxes_paid_last": float,
|
||||
"tax_year_current": int,
|
||||
"tax_year_last": int,
|
||||
# Owner signals
|
||||
"homestead_active": bool, # primary residence flag
|
||||
"homestead_amount": int,
|
||||
# Sales history (most recent first)
|
||||
"sales_history": [
|
||||
{
|
||||
"date": "MM/DD/YYYY",
|
||||
"price": int,
|
||||
"deed_type": str, # "Warranty Deed", "Quit Claim", etc.
|
||||
"qualified": str, # "Qualified" | "Unqualified"
|
||||
"vacant_improved": str, # "Vacant" | "Improved"
|
||||
"book_page": str,
|
||||
}, ...
|
||||
],
|
||||
# Improvements / permits (when county exposes)
|
||||
"improvements": [
|
||||
{"year": int, "type": str, "permit": str | None}, ...
|
||||
],
|
||||
# Computed signals
|
||||
"most_recent_qualified_sale": dict | None,
|
||||
"renovation_signal": { # Heuristic flip/renov detection
|
||||
"is_flip_pattern": bool,
|
||||
"evidence": str,
|
||||
"value_increase_pct": float,
|
||||
"months_between": int,
|
||||
},
|
||||
# Land
|
||||
"zoning": str,
|
||||
"use_code": str,
|
||||
"use_description": str,
|
||||
# Metadata
|
||||
"county": str,
|
||||
"state": str,
|
||||
"source": str, # "Duval PA (paopropertysearch.coj.net)" etc.
|
||||
"source_url": str,
|
||||
"fetched_at": ISO timestamp,
|
||||
"errors": [str],
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# County → adapter mapping
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Counties with FULL extractors (returning the rich unified schema).
# When adapters for further counties (e.g. Orange, Hillsborough) are written,
# register them here so is_pa_supported / fetch_pa_record can see them.
_SUPPORTED_COUNTIES: dict[str, str] = {
    # county name (lowercase, without the " county" suffix): state code (uppercase)
    "duval": "FL",
    "broward": "FL",
    "miami-dade": "FL",
    "palm beach": "FL",
}
|
||||
|
||||
|
||||
def is_pa_supported(county_name: Optional[str], state: Optional[str]) -> bool:
    """Return True when a full PA extractor is registered for this county.

    Args:
        county_name: County name, any casing, with or without a " county"
            suffix (e.g. "Duval", "Duval County").
        state: State code, any casing (e.g. "FL").

    Returns:
        True if the normalised county is registered in ``_SUPPORTED_COUNTIES``
        under the given state; False otherwise, including when either
        argument is empty or None.
    """
    if not county_name or not state:
        return False

    key = county_name.lower().replace(" county", "").strip()
    wanted_state = state.strip().upper()

    # The registry stores hyphenated names ("miami-dade"), yet fetch_pa_record
    # also dispatches on the spaced spelling ("miami dade"). Try both spellings
    # so that branch is actually reachable.
    return any(
        _SUPPORTED_COUNTIES.get(candidate) == wanted_state
        for candidate in (key, key.replace(" ", "-"))
    )
|
||||
|
||||
|
||||
def list_supported_counties() -> list[tuple[str, str]]:
    """Return the registered counties as ``(county_lowercase, state)`` pairs.

    The pairs come straight from ``_SUPPORTED_COUNTIES``, in registry order.
    """
    return list(_SUPPORTED_COUNTIES.items())
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Main entry point
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_pa_record(
    *,
    county_name: Optional[str],
    state: Optional[str] = "FL",
    address: Optional[str] = None,
    parcel_id: Optional[str] = None,
    zip_code: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> Optional[dict]:
    """Fetch the Property Appraiser record through the county-specific adapter.

    Args:
        county_name: County name, with or without a " county" suffix.
        state: State code checked by ``is_pa_supported``.
        address: Property address; forwarded to the Duval and Miami-Dade adapters.
        parcel_id: County parcel identifier; forwarded to every adapter.
        zip_code: ZIP code; forwarded to the Duval adapter only.
        timeout_seconds: Timeout forwarded to every adapter.
        listing_price: Optional listing price forwarded to the Duval, Miami-Dade
            and Palm Beach adapters (per the original note it enables
            flip-in-progress detection: recent qualified sale << listing).

    Returns:
        The adapter's unified dict, or None when the county is not supported.
    """
    if not is_pa_supported(county_name, state):
        return None

    county_key = county_name.lower().replace(" county", "").strip()

    def _miami_dade() -> dict:
        return _fetch_miami_dade(
            parcel_id=parcel_id,
            address=address,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        )

    # Zero-argument callables, so only the selected adapter is ever invoked.
    adapters = {
        "duval": lambda: _fetch_duval(
            address=address,
            parcel_id=parcel_id,
            zip_code=zip_code,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        ),
        "broward": lambda: _fetch_broward(
            parcel_id=parcel_id,
            timeout_seconds=timeout_seconds,
        ),
        "miami-dade": _miami_dade,
        "miami dade": _miami_dade,
        "palm beach": lambda: _fetch_palm_beach(
            parcel_id=parcel_id,
            timeout_seconds=timeout_seconds,
            listing_price=listing_price,
        ),
    }

    adapter = adapters.get(county_key)
    if adapter is None:
        return None
    return adapter()
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Adapter wrappers (normalize per-county output to unified schema)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _fetch_duval(
|
||||
*,
|
||||
address: Optional[str],
|
||||
parcel_id: Optional[str],
|
||||
zip_code: Optional[str],
|
||||
timeout_seconds: int,
|
||||
listing_price: Optional[float] = None,
|
||||
) -> dict:
|
||||
"""Duval adapter wrapper: pa_duval.fetch_duval_pa_record → unified schema."""
|
||||
try:
|
||||
from data_fetchers.pa_duval import fetch_duval_pa_record
|
||||
except ImportError as e:
|
||||
return {
|
||||
"county": "Duval",
|
||||
"state": "FL",
|
||||
"errors": [f"pa_duval module import failed: {e}"],
|
||||
}
|
||||
|
||||
raw = fetch_duval_pa_record(
|
||||
address=address,
|
||||
parcel_id=parcel_id,
|
||||
zip_code=zip_code,
|
||||
timeout_seconds=timeout_seconds,
|
||||
listing_price=listing_price,
|
||||
)
|
||||
|
||||
# Normalize raw → unified schema
|
||||
site_addr = " ".join(filter(None, [raw.get("site_address_line1"), raw.get("site_address_line2")]))
|
||||
return {
|
||||
# Identity
|
||||
"parcel_id": raw.get("parcel_id"),
|
||||
"owner_name": raw.get("owner_name"),
|
||||
"co_owners": [], # Duval shows one owner; multi-owner detection pending
|
||||
"mailing_address": None, # not in current detail extraction
|
||||
"site_address": site_addr.strip(),
|
||||
"owner_address_mismatch": None,
|
||||
# Building
|
||||
"year_built": raw.get("year_built"),
|
||||
"effective_year_built": None, # Duval doesn't expose explicitly
|
||||
"sqft_heated": raw.get("sqft_heated"),
|
||||
"sqft_total": raw.get("sqft_gross"),
|
||||
"lot_acres": None,
|
||||
"lot_total_sqft": raw.get("lot_total_sqft"),
|
||||
"bedrooms": int(raw.get("bedrooms") or 0) or None,
|
||||
"baths": raw.get("baths"),
|
||||
"stories": raw.get("stories"),
|
||||
"building_type": raw.get("building_type"),
|
||||
"roof_type": raw.get("roof_struct"),
|
||||
"roofing_cover": raw.get("roofing_cover"),
|
||||
"exterior_wall": raw.get("exterior_wall"),
|
||||
"interior_wall": raw.get("interior_wall"),
|
||||
"int_flooring": raw.get("int_flooring"),
|
||||
# Values
|
||||
"just_value_current": raw.get("tax_current_year_just"),
|
||||
"assessed_value_current": raw.get("tax_current_year_assessed"),
|
||||
"taxable_value_current": raw.get("tax_current_year_taxable"),
|
||||
"exemption_current": raw.get("tax_current_year_exemptions"),
|
||||
"just_value_last": raw.get("tax_last_year_just"),
|
||||
"assessed_value_last": raw.get("tax_last_year_assessed"),
|
||||
"taxable_value_last": raw.get("tax_last_year_taxable"),
|
||||
"taxes_paid_last": None, # Duval doesn't show direct tax amount here
|
||||
"tax_year_current": None,
|
||||
"tax_year_last": None,
|
||||
# Owner signals
|
||||
"homestead_active": raw.get("homestead_active"),
|
||||
"homestead_amount": raw.get("homestead_amount_current"),
|
||||
# Sales history
|
||||
"sales_history": raw.get("sales_history", []),
|
||||
"most_recent_qualified_sale": raw.get("most_recent_qualified_sale"),
|
||||
# Computed signals
|
||||
"renovation_signal": raw.get("renovation_signal"),
|
||||
# Improvements (Duval no expone formales; backlog para Acclaim integration)
|
||||
"improvements": [],
|
||||
# Land
|
||||
"zoning": (raw.get("land") or {}).get("zoning"),
|
||||
"use_code": raw.get("property_use") or "",
|
||||
"use_description": (raw.get("land") or {}).get("use_description"),
|
||||
"subdivision": raw.get("subdivision"),
|
||||
"legal_description": None, # raw has gridLegal but not parsed to flat string here
|
||||
# Metadata
|
||||
"county": "Duval",
|
||||
"state": "FL",
|
||||
"source": raw.get("source"),
|
||||
"source_url": raw.get("source_url"),
|
||||
"fetched_at": raw.get("fetched_at"),
|
||||
"errors": raw.get("errors", []),
|
||||
# Raw passthrough for advanced consumers
|
||||
"_raw": raw,
|
||||
}
|
||||
|
||||
|
||||
def _fetch_broward(*, parcel_id: Optional[str], timeout_seconds: int) -> dict:
|
||||
"""Broward adapter wrapper: pa_broward.fetch_broward_pa_record → unified schema."""
|
||||
if not parcel_id:
|
||||
return {
|
||||
"county": "Broward",
|
||||
"state": "FL",
|
||||
"errors": ["Broward PA needs parcel_id (folio); address search not yet supported"],
|
||||
}
|
||||
|
||||
try:
|
||||
from data_fetchers.pa_broward import fetch_broward_pa_record
|
||||
except ImportError as e:
|
||||
return {
|
||||
"county": "Broward",
|
||||
"state": "FL",
|
||||
"errors": [f"pa_broward module import failed: {e}"],
|
||||
}
|
||||
|
||||
raw = fetch_broward_pa_record(parcel_id, timeout_seconds=timeout_seconds)
|
||||
cy = raw.get("current_year") or {}
|
||||
ly = raw.get("last_year") or {}
|
||||
|
||||
# Concatenate owner names if continuation
|
||||
owner_full = (raw.get("owner_name") or "")
|
||||
if raw.get("owner_name_2"):
|
||||
owner_full = f"{owner_full} {raw['owner_name_2']}".strip()
|
||||
|
||||
# Detect address mismatch
|
||||
mailing = (raw.get("mailing_address") or "").upper()
|
||||
site = (raw.get("situs_address") or "").upper()
|
||||
owner_addr_mismatch = bool(mailing and site and mailing.split()[0] != site.split()[0])
|
||||
|
||||
# Parse beds/baths from units_beds_baths
|
||||
beds = baths = None
|
||||
ubb = (raw.get("units_beds_baths") or "").split("/")
|
||||
if len(ubb) >= 3:
|
||||
for raw_v, key in [(ubb[1], "beds"), (ubb[2], "baths")]:
|
||||
v = raw_v.strip()
|
||||
try:
|
||||
if key == "beds":
|
||||
beds = int(v) if v.replace(".", "").isdigit() else None
|
||||
else:
|
||||
baths = float(v) if v.replace(".", "").isdigit() else None
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
return {
|
||||
"parcel_id": raw.get("folio_number"),
|
||||
"owner_name": owner_full,
|
||||
"co_owners": [],
|
||||
"mailing_address": raw.get("mailing_address"),
|
||||
"site_address": raw.get("situs_address"),
|
||||
"owner_address_mismatch": owner_addr_mismatch,
|
||||
# Building
|
||||
"year_built": raw.get("year_built"),
|
||||
"effective_year_built": raw.get("effective_year"),
|
||||
"sqft_heated": raw.get("under_air_sqft"),
|
||||
"sqft_total": raw.get("adj_bldg_sqft"),
|
||||
"lot_acres": None,
|
||||
"lot_total_sqft": None,
|
||||
"bedrooms": beds,
|
||||
"baths": baths,
|
||||
"stories": None,
|
||||
"building_type": raw.get("use_code"),
|
||||
"roof_type": None,
|
||||
"roofing_cover": None,
|
||||
"exterior_wall": None,
|
||||
"interior_wall": None,
|
||||
"int_flooring": None,
|
||||
# Values
|
||||
"just_value_current": cy.get("just_value"),
|
||||
"assessed_value_current": cy.get("assessed_value"),
|
||||
"taxable_value_current": (raw.get("tax_breakdown") or {}).get("county", {}).get("taxable"),
|
||||
"exemption_current": (raw.get("tax_breakdown") or {}).get("county", {}).get("homestead", 0),
|
||||
"just_value_last": ly.get("just_value"),
|
||||
"assessed_value_last": ly.get("assessed_value"),
|
||||
"taxable_value_last": None,
|
||||
"taxes_paid_last": ly.get("taxes_paid"),
|
||||
"tax_year_current": cy.get("tax_year"),
|
||||
"tax_year_last": ly.get("tax_year"),
|
||||
# Owner signals
|
||||
"homestead_active": raw.get("homestead_active"),
|
||||
"homestead_amount": (raw.get("tax_breakdown") or {}).get("county", {}).get("homestead", 0),
|
||||
# Sales history
|
||||
"sales_history": raw.get("sales_history", []),
|
||||
"most_recent_qualified_sale": None, # not separately calculated in pa_broward
|
||||
# Computed
|
||||
"renovation_signal": None, # pa_broward doesn't compute this yet
|
||||
"improvements": [],
|
||||
# Land
|
||||
"zoning": None,
|
||||
"use_code": raw.get("use_code"),
|
||||
"use_description": raw.get("use_code"),
|
||||
"subdivision": raw.get("neighborhood"),
|
||||
"legal_description": raw.get("legal_description"),
|
||||
# Metadata
|
||||
"county": "Broward",
|
||||
"state": "FL",
|
||||
"source": "Broward Property Appraiser (bcpa.net)",
|
||||
"source_url": raw.get("source_url"),
|
||||
"fetched_at": raw.get("fetched_at"),
|
||||
"errors": raw.get("errors", []),
|
||||
"_raw": raw,
|
||||
}
|
||||
|
||||
|
||||
def _fetch_palm_beach(
|
||||
*,
|
||||
parcel_id: Optional[str],
|
||||
timeout_seconds: int,
|
||||
listing_price: Optional[float] = None,
|
||||
) -> dict:
|
||||
"""Palm Beach wrapper: pa_palm_beach.fetch_palm_beach_pa_record → unified."""
|
||||
if not parcel_id:
|
||||
return {
|
||||
"county": "Palm Beach",
|
||||
"state": "FL",
|
||||
"errors": ["Palm Beach PA needs parcel_id (PCN); address search not yet supported"],
|
||||
}
|
||||
try:
|
||||
from data_fetchers.pa_palm_beach import fetch_palm_beach_pa_record
|
||||
except ImportError as e:
|
||||
return {
|
||||
"county": "Palm Beach",
|
||||
"state": "FL",
|
||||
"errors": [f"pa_palm_beach module import failed: {e}"],
|
||||
}
|
||||
|
||||
raw = fetch_palm_beach_pa_record(
|
||||
parcel_id=parcel_id,
|
||||
timeout_seconds=timeout_seconds,
|
||||
listing_price=listing_price,
|
||||
)
|
||||
|
||||
# Mailing/site address mismatch heuristic: PB doesn't expose mailing
|
||||
# separately in flat parser; site_address may include "Municipality" noise.
|
||||
return {
|
||||
"parcel_id": raw.get("parcel_id"),
|
||||
"owner_name": raw.get("owner_name"),
|
||||
"co_owners": [],
|
||||
"mailing_address": None,
|
||||
"site_address": raw.get("site_address"),
|
||||
"owner_address_mismatch": None,
|
||||
# Building
|
||||
"year_built": raw.get("year_built"),
|
||||
"effective_year_built": None,
|
||||
"sqft_heated": raw.get("sqft_heated"),
|
||||
"sqft_total": raw.get("sqft_total"),
|
||||
"lot_acres": raw.get("lot_acres"),
|
||||
"lot_total_sqft": None,
|
||||
"bedrooms": raw.get("bedrooms"),
|
||||
"baths": raw.get("baths"),
|
||||
"stories": None,
|
||||
"building_type": raw.get("use_code"),
|
||||
"roof_type": raw.get("roof_struct"),
|
||||
"roofing_cover": raw.get("roof_cover"),
|
||||
"exterior_wall": None,
|
||||
"interior_wall": raw.get("interior_wall"),
|
||||
"int_flooring": None,
|
||||
# Values
|
||||
"just_value_current": raw.get("just_value_current"),
|
||||
"assessed_value_current": raw.get("assessed_value_current"),
|
||||
"taxable_value_current": None,
|
||||
"exemption_current": None,
|
||||
"just_value_last": raw.get("just_value_last"),
|
||||
"assessed_value_last": raw.get("assessed_value_last"),
|
||||
"taxable_value_last": None,
|
||||
"taxes_paid_last": None,
|
||||
"tax_year_current": raw.get("tax_year_current"),
|
||||
"tax_year_last": raw.get("tax_year_last"),
|
||||
# Owner signals
|
||||
"homestead_active": raw.get("homestead_active"),
|
||||
"homestead_amount": None,
|
||||
# Sales
|
||||
"sales_history": raw.get("sales_history", []),
|
||||
"most_recent_qualified_sale": raw.get("most_recent_qualified_sale"),
|
||||
"renovation_signal": raw.get("renovation_signal"),
|
||||
"improvements": [],
|
||||
# Land
|
||||
"zoning": raw.get("zoning"),
|
||||
"use_code": raw.get("use_code"),
|
||||
"use_description": raw.get("use_code"),
|
||||
"subdivision": raw.get("subdivision"),
|
||||
"legal_description": raw.get("legal_description"),
|
||||
# Metadata
|
||||
"county": "Palm Beach",
|
||||
"state": "FL",
|
||||
"source": raw.get("source"),
|
||||
"source_url": raw.get("source_url"),
|
||||
"fetched_at": raw.get("fetched_at"),
|
||||
"errors": raw.get("errors", []),
|
||||
"_raw": raw,
|
||||
}
|
||||
|
||||
|
||||
def _fetch_miami_dade(
|
||||
*,
|
||||
parcel_id: Optional[str],
|
||||
address: Optional[str],
|
||||
timeout_seconds: int,
|
||||
listing_price: Optional[float] = None,
|
||||
) -> dict:
|
||||
"""Miami-Dade adapter wrapper: pa_miami_dade.fetch_miami_dade_pa_record → unified schema."""
|
||||
try:
|
||||
from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
|
||||
except ImportError as e:
|
||||
return {
|
||||
"county": "Miami-Dade",
|
||||
"state": "FL",
|
||||
"errors": [f"pa_miami_dade module import failed: {e}"],
|
||||
}
|
||||
|
||||
raw = fetch_miami_dade_pa_record(
|
||||
parcel_id=parcel_id,
|
||||
address=address,
|
||||
timeout_seconds=timeout_seconds,
|
||||
listing_price=listing_price,
|
||||
)
|
||||
|
||||
# Normalize to unified schema
|
||||
return {
|
||||
# Identity
|
||||
"parcel_id": raw.get("parcel_id"),
|
||||
"owner_name": raw.get("owner_name"),
|
||||
"co_owners": raw.get("co_owners", []),
|
||||
"mailing_address": raw.get("mailing_address"),
|
||||
"site_address": raw.get("site_address"),
|
||||
# Owner address mismatch — primitive heuristic (Miami-Dade doesn't expose
|
||||
# separate site/mailing comparison cleanly; can compute later if needed)
|
||||
"owner_address_mismatch": None,
|
||||
# Building
|
||||
"year_built": raw.get("year_built"),
|
||||
"effective_year_built": None, # Miami-Dade doesn't expose explicitly
|
||||
"sqft_heated": raw.get("sqft_heated"),
|
||||
"sqft_total": raw.get("sqft_total"),
|
||||
"lot_acres": None,
|
||||
"lot_total_sqft": raw.get("lot_total_sqft"),
|
||||
"bedrooms": raw.get("bedrooms"),
|
||||
"baths": raw.get("baths"),
|
||||
"stories": raw.get("floors"),
|
||||
"building_type": raw.get("use_code"),
|
||||
"roof_type": None,
|
||||
"roofing_cover": None,
|
||||
"exterior_wall": None,
|
||||
"interior_wall": None,
|
||||
"int_flooring": None,
|
||||
# Values
|
||||
"just_value_current": raw.get("just_value_current"),
|
||||
"assessed_value_current": raw.get("assessed_value_current"),
|
||||
"taxable_value_current": None, # in pa-taxablevalueinformation, not parsed yet
|
||||
"exemption_current": None,
|
||||
"just_value_last": raw.get("just_value_last"),
|
||||
"assessed_value_last": raw.get("assessed_value_last"),
|
||||
"taxable_value_last": None,
|
||||
"taxes_paid_last": None,
|
||||
"tax_year_current": raw.get("tax_year_current"),
|
||||
"tax_year_last": raw.get("tax_year_last"),
|
||||
# Owner signals
|
||||
"homestead_active": raw.get("homestead_active"),
|
||||
"homestead_amount": None,
|
||||
# Sales history
|
||||
"sales_history": raw.get("sales_history", []),
|
||||
"most_recent_qualified_sale": raw.get("most_recent_qualified_sale"),
|
||||
# Renovation
|
||||
"renovation_signal": raw.get("renovation_signal"),
|
||||
"improvements": [],
|
||||
# Land
|
||||
"zoning": raw.get("pa_primary_zone"),
|
||||
"use_code": raw.get("use_code"),
|
||||
"use_description": raw.get("use_description"),
|
||||
"subdivision": raw.get("subdivision"),
|
||||
"legal_description": raw.get("legal_description"),
|
||||
# Metadata
|
||||
"county": "Miami-Dade",
|
||||
"state": "FL",
|
||||
"source": raw.get("source"),
|
||||
"source_url": raw.get("source_url"),
|
||||
"fetched_at": raw.get("fetched_at"),
|
||||
"errors": raw.get("errors", []),
|
||||
"_raw": raw,
|
||||
}
|
||||
@@ -0,0 +1,682 @@
|
||||
"""Property value fetcher para Wave 2 (ValueEstimator).
|
||||
|
||||
Objetivo: estimar valor real de una propiedad combinando:
|
||||
1. Tax assessed value (gratis, county-specific scraping)
|
||||
2. Comparables recently sold (Firecrawl, MAX 5 — OPT-IN para no quemar credits)
|
||||
3. Deductions automaticas por edad del inmueble (AC, roof, plumbing, panel)
|
||||
|
||||
OUTPUT consolidado:
|
||||
{
|
||||
"listing_price": int,
|
||||
"tax_assessed_value": int | None,
|
||||
"comps_used": [{address, sold_price, sold_date, sqft, $/sqft}, ...],
|
||||
"estimated_value": {"low": int, "mid": int, "high": int, "confidence": str},
|
||||
"price_per_sqft_comps_avg": float | None,
|
||||
"price_per_sqft_subject": float,
|
||||
"overpriced_pct": float | None,
|
||||
"inflation_score": float, # 0-10
|
||||
"deductions": {"ac": int, "roof": int, "plumbing": int, "panel": int, "total": int},
|
||||
"market_trend": {"direction": str, "evidence": str},
|
||||
"sources_used": [...],
|
||||
"fetch_errors": [...],
|
||||
}
|
||||
|
||||
FAIL-SOFT:
|
||||
- Sin Firecrawl key o ENABLE_FIRECRAWL_COMPS=false → comps_used=[], confidence baja
|
||||
- Sin tax assessed → tax_assessed_value=None, fallback a comps
|
||||
- Sin nada → estimacion basada SOLO en deductions vs listing
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .base import FetcherError, USER_AGENT, DEFAULT_TIMEOUT
|
||||
|
||||
|
||||
# ─── Age-based deductions (Florida real estate norms) ──────────────────────
DEDUCTION_AC = 6_000  # old central AC (<2010) when there is no evidence of a newer unit
DEDUCTION_ROOF = 10_000  # old shingle roof (<2005)
DEDUCTION_PLUMBING_POLYBUTYLENE = 12_000  # polybutylene plumbing risk (1978-1995 FL)
DEDUCTION_ELECTRICAL_PANEL = 5_000  # Federal Pacific / Zinsco panels (<1990)

# ─── Keywords that flag a renovated item (suppress the matching deduction) ──
# Bug fix 2026-05-15: the system used to check only "new ac"/"ac nuevo".
# It now detects the variants that actually appear in listings: "BRAND NEW ROOF",
# "Updated/Remodeled", "Fully updated throughout", "AC replaced 2023", etc.
NEW_ITEM_KEYWORDS = {
    "ac": [
        "new ac", "new a/c", "new hvac", "new a.c.", "new air conditioning",
        "ac replaced", "a/c replaced", "hvac replaced", "ac unit replaced",
        "new air handler", "ac nuevo", "a/c nuevo", "hvac nuevo",
        "ac (20", "ac 20", "a/c (20",  # "AC (2023)" or "AC 2023"
        "newer ac", "newer a/c", "newer hvac",
        "recently replaced ac", "recently replaced a/c", "recently replaced hvac",
    ],
    "roof": [
        "new roof", "brand new roof", "brand-new roof", "newer roof",
        "roof replaced", "roof recently replaced", "recent roof",
        "roof 20", "roof (20", "new shingles", "metal roof installed",
        "roof installed 20", "roof nuevo", "techo nuevo",
        "replaced roof", "roof was replaced", "roof replacement",
    ],
    "plumbing": [
        "re-piped", "repiped", "re piped", "pex plumbing", "new plumbing",
        "plumbing replaced", "plumbing nuevo", "fully repiped",
        "copper plumbing", "plumbing updated", "new pipes",
    ],
    "panel": [
        "new panel", "panel upgraded", "panel replaced", "new electrical",
        "electrical updated", "200 amp", "new wiring", "rewired",
        "panel nuevo", "panel electrico nuevo", "upgraded electrical",
    ],
}

# Global keywords meaning "fully renovated" → ZERO deductions
RENOVATED_GLOBAL_KEYWORDS = [
    "updated/remodeled",  # Zillow explicit condition tag
    "fully updated", "fully renovated", "completely renovated", "completely remodeled",
    "move-in ready", "move in ready", "turnkey", "turn key", "turn-key",
    "totally renovated", "totally updated",
    "renovacion completa", "completamente renovada", "lista para mudarse",
    "newly renovated", "newly remodeled",
    "fully remodeled",
]

# condition_status values that mean the property is renovated
RENOVATED_CONDITION_VALUES = {
    "updated/remodeled", "remodeled", "renovated", "updated",
    "new construction", "newly built",
}


def _matches_any_keyword(text: str, keywords: list[str]) -> Optional[str]:
    """Return the first keyword found in ``text`` (case-insensitive), or None.

    A keyword that starts with a letter or digit must begin at a word start,
    so "rewired" does not fire on "prewired" and "ac 20" does not fire on
    "cul-de-sac 2020". The END of the keyword is deliberately left open:
    prefix keywords such as "ac (20" or "200 amp" must keep matching
    "AC (2023)" and "200 amps".

    Args:
        text: free text to scan; empty/None yields None.
        keywords: candidates, checked in list order.

    Returns:
        The matching keyword exactly as spelled in ``keywords``, or None.
    """
    if not text:
        return None
    haystack = text.lower()
    for kw in keywords:
        needle = kw.lower()
        # Only anchor when the keyword itself starts with a word character;
        # a lookbehind before punctuation would reject legitimate matches.
        anchor = r"(?<!\w)" if needle[:1].isalnum() else ""
        if re.search(anchor + re.escape(needle), haystack):
            return kw
    return None


# ─── Comps estimation ──────────────────────────────────────────────────────
COMPS_MAX_COUNT = 5
COMPS_LOOKBACK_DAYS = 180
COMPS_SQFT_TOLERANCE_PCT = 0.15
PRICE_LOW_PCT = 0.92  # estimated_value['low'] = mid * 0.92
PRICE_HIGH_PCT = 1.08  # estimated_value['high'] = mid * 1.08


def _renovated_skip(skip_reason: str, item_reason: str) -> dict:
    """Build the all-zero result returned when the whole property is renovated."""
    items = ("ac", "roof", "plumbing", "panel")
    return {
        **dict.fromkeys(items, 0),
        "total": 0,
        "_skipped_global": True,
        "_skip_reason": skip_reason,
        "_suppressed_items": list(items),
        "_reasons": {item: item_reason for item in items},
    }


def calculate_age_deductions(
    year_built: int,
    photo_findings_text: str = "",
    listing_description: str = "",
    condition_status: str = "",
    features_special: Optional[list] = None,
) -> dict:
    """Compute automatic deductions from the age of the property.

    Evaluation order:
      1. ``condition_status`` in RENOVATED_CONDITION_VALUES → everything 0.
      2. Any RENOVATED_GLOBAL_KEYWORDS hit in the combined text → everything 0.
      3. Missing or non-positive ``year_built`` → everything 0.
      4. Per item: the deduction applies when the age rule triggers AND no
         item-specific keyword from NEW_ITEM_KEYWORDS is present.

    Args:
        year_built: construction year; falsy or <= 0 means no deductions.
        photo_findings_text: PhotoInspector output (may mention a new AC, etc.).
        listing_description: full description of the listing.
        condition_status: listing "condition" tag (e.g. "Updated/Remodeled").
        features_special: list of highlight tags (e.g. ["BRAND NEW ROOF"]).
            A bare string is accepted as one tag; empty/None entries are
            skipped and non-string entries are stringified.

    Returns:
        Dict with ``ac``, ``roof``, ``plumbing``, ``panel``, ``total``,
        ``_skipped_global`` (bool), ``_suppressed_items`` (list of item names)
        and ``_reasons`` ({item: what suppressed it}). Global skips also carry
        ``_skip_reason``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(features_special, str):
        feature_tags = [features_special]
    else:
        feature_tags = [str(tag) for tag in (features_special or []) if tag]

    # One blob for all keyword matching.
    combined_text = " ".join([
        photo_findings_text or "",
        listing_description or "",
        " ".join(feature_tags),
    ])

    # 1. Global check: the condition tag says the property is renovated.
    cs_lower = (condition_status or "").lower().strip()
    if cs_lower in RENOVATED_CONDITION_VALUES:
        return _renovated_skip(
            f"condition_status='{condition_status}' (Zillow tag)",
            f"condition={condition_status}",
        )

    # 2. Global check: description/features mention "fully updated", "move-in ready", ...
    global_kw = _matches_any_keyword(combined_text, RENOVATED_GLOBAL_KEYWORDS)
    if global_kw:
        return _renovated_skip(
            f"keyword '{global_kw}' detected in listing",
            f"keyword:{global_kw}",
        )

    deductions: dict = {"ac": 0, "roof": 0, "plumbing": 0, "panel": 0}
    suppressed: list[str] = []
    reasons: dict[str, str] = {}

    # 3./4. Per-item rules only make sense with a usable construction year.
    if year_built and year_built > 0:
        rules = (
            ("ac", year_built < 2010, DEDUCTION_AC),
            ("roof", year_built < 2005, DEDUCTION_ROOF),
            ("plumbing", 1978 <= year_built <= 1995, DEDUCTION_PLUMBING_POLYBUTYLENE),
            ("panel", year_built < 1990, DEDUCTION_ELECTRICAL_PANEL),
        )
        for item, age_triggers, amount in rules:
            if not age_triggers:
                continue
            matched = _matches_any_keyword(combined_text, NEW_ITEM_KEYWORDS[item])
            if matched:
                suppressed.append(item)
                reasons[item] = f"keyword:{matched}"
            else:
                deductions[item] = amount

    deductions["total"] = sum(
        deductions[item] for item in ("ac", "roof", "plumbing", "panel")
    )
    deductions["_skipped_global"] = False
    deductions["_suppressed_items"] = suppressed
    deductions["_reasons"] = reasons
    return deductions
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Tax Assessed Value (county-specific scrapers)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_tax_assessed_miami_dade(address: str) -> Optional[dict]:
    """Placeholder for the Miami-Dade property appraiser scraper.

    Always returns None for now, so the caller works from comps and
    deductions without a tax-assessed value.

    TODO Wave 2 follow-up: implement Playwright scraping of
        https://www.miamidade.gov/pa/property_search.asp
      1. Search by address.
      2. Parse the results and open the property card.
      3. Extract assessed_value, market_value, sale_history.
    Estimated time: ~30s per lookup. Cache for 30 days.

    Args:
        address: street address of the subject property (currently unused).
    """
    return None
|
||||
|
||||
|
||||
# Explicit set of counties that HAVE a real scraper (not a stub).
# Today: Broward works via pa_broward.py (full PA record extraction).
# Pending: Miami-Dade, Palm Beach, Orange, Hillsborough, Pinellas... (custom adapters)
# qPublic (~30 rural counties) — blocked by Cloudflare
_TAX_ASSESSED_IMPLEMENTED_COUNTIES: set[str] = {
    "broward",
}


def is_tax_assessed_supported(county_name: Optional[str], state: Optional[str]) -> bool:
    """Return True only when a real (non-stub) scraper exists for the county.

    Callers use this to tell "not implemented" (our decision) apart from
    "not found" (we looked and there was no record), so the user is never
    told a lookup happened when it did not.

    Args:
        county_name: e.g. "Broward" or "Broward County"; case-insensitive.
        state: two-letter state code; only Florida is supported. Case and
            surrounding whitespace are ignored ("fl", " FL ").

    Returns:
        True if the normalized county is in _TAX_ASSESSED_IMPLEMENTED_COUNTIES
        and the state is FL; False otherwise, including missing inputs.
    """
    if not county_name or not state:
        return False
    if state.strip().upper() != "FL":
        return False
    cname = county_name.lower().replace(" county", "").strip()
    return cname in _TAX_ASSESSED_IMPLEMENTED_COUNTIES
|
||||
|
||||
|
||||
def fetch_tax_assessed(
    address: str,
    county_name: Optional[str],
    state: Optional[str],
    parcel_id: Optional[str] = None,
) -> Optional[dict]:
    """Dispatch a Property Appraiser lookup to the adapter for the county.

    Args:
        address: street address (used by adapters that support address search).
        county_name: e.g. "Broward", "Miami-Dade".
        state: state code, checked by is_tax_assessed_supported().
        parcel_id: county-specific folio number. Required for Broward, and
            preferred everywhere because address matching on PA sites is fuzzy.

    Returns:
        The adapter's dict (legacy keys such as assessed_value, market_value,
        year_built, sqft, owner_name, source, plus any extended fields the
        adapter provides), or None when the county is not implemented, the
        required parcel_id is missing, or the adapter returned nothing.

    Use is_tax_assessed_supported() to distinguish "not implemented" from
    "not found".
    """
    if not is_tax_assessed_supported(county_name, state):
        return None

    county_key = (county_name or "").lower().replace(" county", "").strip()

    if county_key == "broward":
        # Broward needs the folio: there is no reliable address search yet.
        return _fetch_broward(parcel_id) if parcel_id else None

    if any(tag in county_key for tag in ("miami-dade", "miami dade")):
        return fetch_tax_assessed_miami_dade(address)

    # Defensive fallback for a supported county without a branch above.
    return None
|
||||
|
||||
|
||||
def _parse_beds_baths(units_beds_baths: Optional[str]) -> tuple[Optional[int], Optional[float]]:
    """Parse a "units / beds / baths" string such as "1 / 3 / 2.50" into (beds, baths)."""
    parts = (units_beds_baths or "").split("/")
    if len(parts) < 3:
        return None, None

    def _number(raw: str) -> Optional[float]:
        raw = raw.strip()
        # Accept plain non-negative decimals only ("3", "2.50").
        if not raw or not raw.replace(".", "", 1).isdigit():
            return None
        try:
            return float(raw)
        except ValueError:  # e.g. unicode digits that float() rejects
            return None

    beds_value = _number(parts[1])
    baths = _number(parts[2])
    # "3.0" is a valid whole bedroom count; fractional counts are rejected.
    beds = int(beds_value) if beds_value is not None and beds_value.is_integer() else None
    return beds, baths


def _fetch_broward(parcel_id: str) -> Optional[dict]:
    """Broward adapter: pa_broward.py rich record → legacy contract + extensions.

    Args:
        parcel_id: Broward folio number passed to fetch_broward_pa_record().

    Returns:
        Dict with the legacy keys (assessed_value, market_value, just_value,
        year_built, sqft, beds, baths, owner_name, source) plus extended
        Property Snapshot fields, or None when the adapter module cannot be
        imported, returns nothing, or reports errors without any just value.
    """
    try:
        from data_fetchers.pa_broward import fetch_broward_pa_record
    except ImportError:
        return None

    rec = fetch_broward_pa_record(parcel_id)
    if not rec:
        return None

    # `or {}` also covers keys that are present but null.
    cy = rec.get("current_year") or {}
    ly = rec.get("last_year") or {}

    # Adapter reported errors AND left no usable value → treat as not found.
    # The nested year dicts are checked too, because they are what the
    # returned values are actually read from.
    has_value = bool(
        rec.get("just_value_current") or cy.get("just_value") or ly.get("just_value")
    )
    if rec.get("errors") and not has_value:
        return None

    # Join owner_name with its continuation line when present
    # (e.g. "BANK OF AMERICA NA TRSTEE" + "% CORP REAL ESTATE ASSMT").
    owner_full = rec.get("owner_name", "") or ""
    if rec.get("owner_name_2"):
        owner_full = f"{owner_full} {rec['owner_name_2']}".strip()

    beds, baths = _parse_beds_baths(rec.get("units_beds_baths"))

    # ─── Legacy contract (back-compat with existing orchestrator/LLM payload) ──
    return {
        # Required legacy keys
        "assessed_value": cy.get("assessed_value") or ly.get("assessed_value"),
        "market_value": cy.get("just_value") or ly.get("just_value"),
        "just_value": cy.get("just_value") or ly.get("just_value"),
        "year_built": rec.get("year_built"),
        "sqft": rec.get("under_air_sqft") or rec.get("adj_bldg_sqft"),
        "beds": beds,
        "baths": baths,
        "owner_name": owner_full,
        "source": "bcpa.net",
        # ─── Extended fields (Property Snapshot Report inputs) ────────────────
        "folio_number": rec.get("folio_number"),
        "mailing_address": rec.get("mailing_address"),
        "situs_address": rec.get("situs_address"),
        "neighborhood": rec.get("neighborhood"),
        "use_code": rec.get("use_code"),
        "millage_code": rec.get("millage_code"),
        "legal_description": rec.get("legal_description"),
        "adj_bldg_sqft": rec.get("adj_bldg_sqft"),
        "under_air_sqft": rec.get("under_air_sqft"),
        "effective_year": rec.get("effective_year"),
        "homestead_active": rec.get("homestead_active", False),
        "taxes_paid_last_year": ly.get("taxes_paid"),
        "tax_year_last": ly.get("tax_year"),
        "tax_year_current": cy.get("tax_year"),
        "current_year_values": cy,
        "last_year_values": ly,
        "two_years_ago_values": rec.get("two_years_ago", {}),
        "tax_breakdown": rec.get("tax_breakdown", {}),
        "sales_history": rec.get("sales_history", []),
        "photo_url": rec.get("photo_url"),
        "source_url": rec.get("source_url"),
        "source_api_url": rec.get("source_api_url"),
        "fetched_at": rec.get("fetched_at"),
    }
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Firecrawl comps (OPT-IN para no quemar credits)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _firecrawl_enabled() -> bool:
    """Tell whether Firecrawl comps are switched on and usable.

    True only when ENABLE_FIRECRAWL_COMPS equals "true" (any letter case) and
    FIRECRAWL_API_KEY is set to something other than whitespace.
    """
    if os.getenv("ENABLE_FIRECRAWL_COMPS", "false").lower() != "true":
        return False
    return os.getenv("FIRECRAWL_API_KEY", "").strip() != ""
|
||||
|
||||
|
||||
def fetch_zillow_comps(
    zip_code: str,
    beds: int,
    baths: float,
    sqft: int,
    max_count: int = COMPS_MAX_COUNT,
) -> tuple[list[dict], list[str]]:
    """Scrape Zillow's recently-sold page for a ZIP through Firecrawl.

    OPT-IN: nothing is fetched unless _firecrawl_enabled() is true
    (ENABLE_FIRECRAWL_COMPS=true plus an API key in .env).

    Args:
        zip_code: ZIP code interpolated into the Zillow recently-sold URL.
        beds, baths: forwarded to _parse_zillow_markdown.
        sqft: subject living area, forwarded as the parser's sqft target.
        max_count: upper bound on the number of comps returned.

    Returns:
        (comps, errors). ``comps`` holds at most ``max_count`` dicts produced
        by _parse_zillow_markdown; ``errors`` explains why comps are missing.
        Failures are reported through ``errors`` instead of being raised.
    """
    if not _firecrawl_enabled():
        return [], [
            "Firecrawl comps deshabilitado. Setear ENABLE_FIRECRAWL_COMPS=true en .env para activar."
        ]

    try:
        from firecrawl import FirecrawlApp
    except ImportError as exc:
        return [], [f"firecrawl-py no importable: {exc}"]

    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    if not api_key:
        return [], ["FIRECRAWL_API_KEY ausente en .env"]

    # Zillow recently-sold URL for the ZIP.
    url = f"https://www.zillow.com/homes/recently_sold/{zip_code}_rb/"

    try:
        # Firecrawl SDK v2+: .scrape() (renamed from legacy .scrape_url()).
        # On success it returns a Document object with a .markdown attribute.
        document = FirecrawlApp(api_key=api_key).scrape(url, formats=["markdown"])
        if not document or not hasattr(document, "markdown"):
            return [], ["Firecrawl devolvio resultado vacio"]
        markdown = document.markdown or ""
    except Exception as exc:
        return [], [f"Firecrawl scrape error: {exc}"]

    # Best-effort parse: look for blocks holding price + sqft + address + sold
    # date, e.g. "$485,000" ... "1,450 sqft" ... "123 Main St" ... "Sold X/Y/Z".
    parsed = _parse_zillow_markdown(markdown, beds=beds, baths=baths, sqft_target=sqft)
    problems: list[str] = []
    if not parsed:
        problems.append("Firecrawl OK pero parser no extrajo comps (Zillow cambio formato?)")

    return parsed[:max_count], problems
|
||||
|
||||
|
||||
# Compiled once at import time instead of on every parser call.
# The price must not be followed by more digits/commas, otherwise
# "$12,500,000" would be truncated to "$12,500,00" (= 1,250,000).
_ZILLOW_PRICE_RE = re.compile(r"\$([\d,]{3,12})(?![\d,])")
# The area must not be preceded by more digits/commas, otherwise
# "12,500 sqft" would be read as "2,500 sqft".
_ZILLOW_SQFT_RE = re.compile(
    r"(?<![\d,])([\d,]{3,7})\s*sq(?:\.|uare)?\s*(?:ft|feet)", re.IGNORECASE
)
_ZILLOW_SOLD_RE = re.compile(r"(sold|vendido)[\s:]+([0-9/.-]+)", re.IGNORECASE)
_ZILLOW_BED_RE = re.compile(r"([\d.]+)\s*(?:bd|bed)", re.IGNORECASE)
# Link text containing a street suffix as a WHOLE word, so "[Save this home]"
# or "[Best price]" are not mistaken for addresses.
_ZILLOW_ADDRESS_RE = re.compile(
    r"\[([^\]]+\b(?:St|Street|Ave|Avenue|Rd|Road|Dr|Drive|Ln|Lane|Way|Blvd|Boulevard"
    r"|Ct|Court|Ter|Terrace|Pl|Place)\b[^\]]*)\]",
    re.IGNORECASE,
)


def _parse_zillow_markdown(md: str, beds: int, baths: float, sqft_target: int) -> list[dict]:
    """Best-effort extraction of sold comps from Zillow markdown.

    The markdown is split on blank lines and horizontal rules; each segment
    of 50-2000 characters is treated as one candidate listing. A segment
    yields a comp when its first price is within 30,000-5,000,000, its first
    living area is within 400-8,000 sqft and, if ``sqft_target`` > 0, that
    area is within +/- COMPS_SQFT_TOLERANCE_PCT of the target.

    Args:
        md: markdown text of the recently-sold page.
        beds: subject bedrooms (accepted for interface symmetry; not used as a filter).
        baths: subject bathrooms (accepted for interface symmetry; not used as a filter).
        sqft_target: subject living area; 0 or less disables the size filter.

    Returns:
        List of dicts with address, sold_price, sold_date, sqft, beds_text and
        price_per_sqft. Unparsed fields fall back to "?" or
        "(direccion no parseada)".
    """
    comps: list[dict] = []

    for seg in re.split(r"\n\n+|---+", md or ""):
        if len(seg) < 50 or len(seg) > 2000:
            continue
        prices = _ZILLOW_PRICE_RE.findall(seg)
        sqfts = _ZILLOW_SQFT_RE.findall(seg)
        if not prices or not sqfts:
            continue
        try:
            price = int(prices[0].replace(",", ""))
            sqft = int(sqfts[0].replace(",", ""))
        except ValueError:  # e.g. a run made only of commas
            continue

        # Sanity ranges for a single-family home.
        if price < 30_000 or price > 5_000_000:
            continue
        if sqft < 400 or sqft > 8_000:
            continue

        # Keep only comps whose size is close to the subject.
        if sqft_target > 0:
            ratio = sqft / sqft_target
            if ratio < (1 - COMPS_SQFT_TOLERANCE_PCT) or ratio > (1 + COMPS_SQFT_TOLERANCE_PCT):
                continue

        sold_match = _ZILLOW_SOLD_RE.search(seg)
        bed_match = _ZILLOW_BED_RE.search(seg)
        addr_match = _ZILLOW_ADDRESS_RE.search(seg)

        comps.append({
            "address": addr_match.group(1) if addr_match else "(direccion no parseada)",
            "sold_price": price,
            "sold_date": sold_match.group(2) if sold_match else "?",
            "sqft": sqft,
            "beds_text": bed_match.group(1) if bed_match else "?",
            # sqft >= 400 is guaranteed by the range filter above.
            "price_per_sqft": round(price / sqft, 2),
        })

    return comps
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Estimacion de valor y inflation score
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def estimate_value_from_comps(comps: list[dict], subject_sqft: int) -> tuple[Optional[int], Optional[float]]:
    """Derive a mid value estimate from the comps' average price per sqft.

    Args:
        comps: comp dicts; only entries with a positive "price_per_sqft" count.
        subject_sqft: living area of the subject property.

    Returns:
        (estimated_mid, avg_price_per_sqft): the average $/sqft multiplied by
        ``subject_sqft`` and truncated to int, plus the average rounded to two
        decimals. (None, None) when there are no comps, ``subject_sqft`` is
        not positive, or no comp has a positive price per sqft.
    """
    if not comps or subject_sqft <= 0:
        return None, None

    usable_rates = []
    for comp in comps:
        if comp.get("price_per_sqft", 0) > 0:
            usable_rates.append(comp["price_per_sqft"])
    if not usable_rates:
        return None, None

    mean_rate = sum(usable_rates) / len(usable_rates)
    return int(mean_rate * subject_sqft), round(mean_rate, 2)
|
||||
|
||||
|
||||
def calculate_inflation_score(listing_price: float, estimated_mid: float) -> float:
    """Score from 0 to 10 for how far the listing sits above the estimated value.

    The score grows linearly with the overpricing ratio
    (listing_price - estimated_mid) / estimated_mid:

        0   listing at or below the estimate
        5   listing 15% above the estimate
        10  listing 30% or more above the estimate

    Args:
        listing_price: asking price.
        estimated_mid: estimated mid value; a non-positive value gives the
            neutral default 5.0.

    Returns:
        The score rounded to one decimal.
    """
    if estimated_mid <= 0:
        return 5.0  # no usable estimate: neutral score

    overpriced = (listing_price - estimated_mid) / estimated_mid
    return (
        0.0 if overpriced <= 0
        else 10.0 if overpriced >= 0.30
        else round(overpriced / 0.30 * 10, 1)
    )
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# API publica
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_property_value(
    *,
    address: str,
    listing_price: float,
    sqft: int,
    beds: int,
    baths: float,
    year_built: int,
    zip_code: Optional[str] = None,
    county_name: Optional[str] = None,
    state: Optional[str] = None,
    photo_findings_text: str = "",
    listing_description: str = "",
    condition_status: str = "",
    features_special: Optional[list] = None,
    include_firecrawl_comps: Optional[bool] = None,
    parcel_id: Optional[str] = None,
) -> dict:
    """Entry point: merge tax assessed value, comps and age deductions.

    Args:
        address: street address of the subject property.
        listing_price: asking price.
        sqft: living area of the subject; None/0 disables $/sqft figures.
        beds, baths: forwarded to the comps fetcher.
        year_built: construction year, drives the age deductions.
        zip_code: ZIP used for the comps lookup; without it comps are skipped.
        county_name, state: select the Property Appraiser adapter.
        photo_findings_text, listing_description, condition_status,
        features_special: renovation evidence forwarded to
            calculate_age_deductions so renovated listings are not penalized.
        include_firecrawl_comps:
            None  → follow _firecrawl_enabled() (ENABLE_FIRECRAWL_COMPS flag + key)
            True  → request comps (consumes credits); fetch_zillow_comps still
                    refuses, with an explanatory error, when the flag or key
                    is missing
            False → skip Firecrawl
        parcel_id: county folio number, forwarded to fetch_tax_assessed.
            Broward lookups cannot run without it.

    Returns:
        Consolidated dict: listing_price, tax_assessed_value, comps_used,
        estimated_value {low, mid, high, confidence}, price_per_sqft_comps_avg,
        price_per_sqft_subject, overpriced_pct, inflation_score, deductions,
        market_trend, sources_used, fetch_errors, firecrawl_used, fetched_at.

    Fail-soft: a failing tax-assessed adapter is recorded in ``fetch_errors``
    instead of aborting the estimate.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    sources_used: list[str] = []
    errors: list[str] = []
    subject_sqft = sqft or 0  # tolerate a missing living area

    # 1. Age deductions (always computed, free) — honors condition + keywords.
    deductions = calculate_age_deductions(
        year_built=year_built,
        photo_findings_text=photo_findings_text,
        listing_description=listing_description,
        condition_status=condition_status,
        features_special=features_special,
    )
    if deductions["total"] > 0:
        sources_used.append("Deductions por edad (heuristica FL)")
    elif deductions.get("_skipped_global"):
        sources_used.append(
            f"Deductions SKIPPED (renovated: {deductions.get('_skip_reason', '?')})"
        )

    # 2. Tax assessed value from the county Property Appraiser.
    tax_assessed_data = None
    tax_assessed_value = None
    tax_fetch_failed = False
    try:
        tax_assessed_data = fetch_tax_assessed(
            address, county_name, state, parcel_id=parcel_id
        )
    except Exception as exc:  # fail-soft boundary: record and keep going
        tax_fetch_failed = True
        errors.append(
            f"Tax assessed ({county_name or '?'}) error: {type(exc).__name__}: {exc}"
        )

    if tax_assessed_data:
        tax_assessed_value = tax_assessed_data.get("assessed_value")
        sources_used.append(f"Tax assessed ({county_name})")
    elif not tax_fetch_failed:
        # Report the real reason: "not implemented" and "not found" are
        # different situations and must not be confused for the user.
        if not is_tax_assessed_supported(county_name, state):
            errors.append(f"Tax assessed no disponible para {county_name or '?'} (scraper pendiente Wave 2 follow-up)")
        elif not parcel_id:
            errors.append(
                f"Tax assessed ({county_name}): no consultado, falta parcel_id/folio"
            )
        else:
            errors.append(
                f"Tax assessed ({county_name}): sin registro para parcel_id={parcel_id}"
            )

    # 3. Firecrawl comps (opt-in).
    if include_firecrawl_comps is None:
        do_firecrawl = _firecrawl_enabled()
    else:
        do_firecrawl = include_firecrawl_comps

    comps: list[dict] = []
    if do_firecrawl and zip_code:
        comps, comp_errors = fetch_zillow_comps(zip_code, beds, baths, subject_sqft)
        errors.extend(comp_errors)
        if comps:
            sources_used.append(f"Zillow recently sold ({len(comps)} comps via Firecrawl)")

    # 4. Estimated value.
    estimated_mid_from_comps, avg_ppsqft = estimate_value_from_comps(comps, subject_sqft)
    price_per_sqft_subject = (
        round(listing_price / subject_sqft, 2) if subject_sqft > 0 else 0
    )

    # Average the available market-value candidates, then subtract deductions.
    candidates_mid = []
    if estimated_mid_from_comps:
        candidates_mid.append(estimated_mid_from_comps)
    if tax_assessed_value:
        # Tax assessed in FL is usually ~85% of market value.
        candidates_mid.append(int(tax_assessed_value / 0.85))

    if candidates_mid:
        estimated_mid = int(sum(candidates_mid) / len(candidates_mid)) - deductions["total"]
    else:
        # Fallback: listing minus deductions, reported with low confidence.
        estimated_mid = int(listing_price) - deductions["total"]

    estimated_mid = max(estimated_mid, 1)  # never zero or negative

    estimated_low = int(estimated_mid * PRICE_LOW_PCT)
    estimated_high = int(estimated_mid * PRICE_HIGH_PCT)

    # Confidence depends on how many independent sources contributed.
    if comps and tax_assessed_value:
        confidence = "high"
    elif comps or tax_assessed_value:
        confidence = "medium"
    else:
        confidence = "low"

    # estimated_mid >= 1 here, so the ratio is always defined.
    overpriced_pct = round((listing_price - estimated_mid) / estimated_mid * 100, 1)
    inflation_score = calculate_inflation_score(listing_price, estimated_mid)

    return {
        "listing_price": int(listing_price),
        "tax_assessed_value": tax_assessed_value,
        "comps_used": comps,
        "estimated_value": {
            "low": estimated_low,
            "mid": estimated_mid,
            "high": estimated_high,
            "confidence": confidence,
        },
        "price_per_sqft_comps_avg": avg_ppsqft,
        "price_per_sqft_subject": price_per_sqft_subject,
        "overpriced_pct": overpriced_pct,
        "inflation_score": inflation_score,
        "deductions": deductions,
        "market_trend": {
            "direction": "unknown",
            "evidence": "Para detectar tendencia requiere historial de comps (no implementado en MVP)",
        },
        "sources_used": sources_used,
        "fetch_errors": errors,
        "firecrawl_used": bool(do_firecrawl and comps),
        "fetched_at": fetched_at,
    }
|
||||
@@ -0,0 +1,233 @@
|
||||
"""Runner para data fetchers.
|
||||
|
||||
Flujo:
|
||||
1. Geocode (sequential) - sin esto no podemos hacer FEMA ni NOAA
|
||||
2. FEMA + HUD + NOAA en paralelo (ThreadPoolExecutor, max 3 workers)
|
||||
|
||||
Fail-soft en cada fetcher: si uno falla, el campo queda {} y se anota en fetch_errors.
|
||||
El pipeline nunca aborta.
|
||||
|
||||
Output schema:
|
||||
{
|
||||
"geocode": {matched_address, lat, lng, city, state, zip, county_name, county_fips, state_fips} | {}
|
||||
"flood": {zone, bfe, sfha, subtype, source} | {}
|
||||
"fmr": {year, county, state, fmr_efficiency, fmr_1br..fmr_4br, source} | {}
|
||||
"hurricanes": [{name, year, category, max_wind_mph, closest_pass_miles}, ...]
|
||||
"hurricanes_summary": {lookback_years, max_distance_mi, total_hurricanes_nearby, source}
|
||||
"fetch_errors": ["geocode: ...", "hud: ...", ...] # strings con explicacion
|
||||
"duration_seconds": float
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional, TYPE_CHECKING
|
||||
|
||||
from .base import FetcherError
|
||||
from .cache import FileCache
|
||||
from .census_geocode import fetch_geocode
|
||||
from .fema_flood import fetch_flood
|
||||
from .hud_fmr import fetch_fmr
|
||||
from .noaa_hurricanes import fetch_hurricanes
|
||||
from .neighborhood_class import fetch_neighborhood
|
||||
from .court_records import fetch_court_records, _enable_court_records # Wave 1.5A
|
||||
|
||||
|
||||
# ABSOLUTE paths anchored to the project (not relative to the caller's CWD).
# The cache and data files therefore always live under
# D:\Proyectos Software\AR-House\ no matter where fetch_all() is called from.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "data_fetchers")
DEFAULT_HURDAT2_PATH = _PROJECT_ROOT.joinpath("data", "hurdat2.txt")
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from orchestrator import DealInputs
|
||||
|
||||
|
||||
# Cache time-to-live per namespace, in days.
TTL = {
    "geocode": 30,
    "fema": 30,
    # HUD FMR changes once a year.
    "hud_fmr": 365,
    "hurricanes": 30,
    # ACS and crime figures move slowly.
    "neighborhood": 90,
    # Wave 1.5A: court proceedings move slowly.
    "court_records": 7,
}
|
||||
|
||||
|
||||
def _emit(cb: Optional[Callable[[str], None]], msg: str) -> None:
    """Forward ``msg`` to the status callback; do nothing when ``cb`` is falsy."""
    if not cb:
        return
    cb(msg)
|
||||
|
||||
|
||||
def _safe(
    cache: FileCache,
    namespace: str,
    cache_key: str,
    ttl: float,
    func: Callable[[], dict],
    errors: list,
    error_prefix: str,
) -> dict:
    """Fail-soft fetch wrapper: serve from cache, else call ``func`` and store.

    Args:
        cache: cache queried with (namespace, cache_key, ttl) and updated on success.
        namespace: cache namespace of the fetcher.
        cache_key: key identifying this request inside the namespace.
        ttl: freshness limit passed to ``cache.get``.
        func: zero-argument callable that performs the real fetch.
        errors: list that receives one message when ``func`` (or the cache
            write) raises.
        error_prefix: label placed in front of the recorded message.

    Returns:
        The cached or freshly fetched data, or ``{}`` when the fetch failed.
        No exception escapes from the fetch step.
    """
    hit = cache.get(namespace, cache_key, ttl)
    if hit is not None:
        return hit

    try:
        fresh = func()
        cache.set(namespace, cache_key, fresh)
    except Exception as exc:
        # Expected fetcher failures are reported as-is; anything else is
        # tagged as unexpected together with its exception type.
        if isinstance(exc, FetcherError):
            detail = f"{exc}"
        else:
            detail = f"unexpected {type(exc).__name__}: {exc}"
        errors.append(f"{error_prefix}: {detail}")
        return {}
    return fresh
|
||||
|
||||
|
||||
def fetch_all(
    deal: "DealInputs",
    status_cb: Optional[Callable[[str], None]] = None,
    cache_dir: str | Path | None = None,
    hurdat2_path: str | Path | None = None,
    include_neighborhood_dom: bool = False,
) -> dict:
    """Fetch every verified data set for a deal.

    The address is geocoded first because the FEMA and NOAA lookups take the
    resulting coordinates and the HUD lookup takes the resulting state and
    county. The remaining fetchers are then submitted to a thread pool.

    Args:
        deal: Deal whose ``address`` attribute is geocoded.
        status_cb: Optional callback receiving progress messages.
        cache_dir: Cache directory; ``None`` selects ``DEFAULT_CACHE_DIR``.
        hurdat2_path: Path handed to the hurricane fetcher; ``None`` selects
            ``DEFAULT_HURDAT2_PATH``.
        include_neighborhood_dom: Forwarded to ``fetch_neighborhood`` as
            ``include_dom`` and included in its cache key.

    Returns:
        A dict with the keys ``geocode``, ``flood``, ``fmr``, ``hurricanes``,
        ``hurricanes_summary``, ``neighborhood``, ``court_records``,
        ``fetch_errors`` and ``duration_seconds``. The same keys are present
        when geocoding fails; the per-fetcher values are then empty.
    """
    # Default to absolute project paths (not relative to the caller's CWD).
    if cache_dir is None:
        cache_dir = DEFAULT_CACHE_DIR
    if hurdat2_path is None:
        hurdat2_path = DEFAULT_HURDAT2_PATH

    t0 = time.perf_counter()
    cache = FileCache(cache_dir)
    errors: list[str] = []

    # --- 1. Geocode (sequential; everything else depends on it) -------------
    _emit(status_cb, "Geocodificando direccion (Census)...")
    geocode = _safe(
        cache, "geocode", deal.address, TTL["geocode"],
        lambda: fetch_geocode(deal.address),
        errors, "geocode",
    )

    if not geocode or not geocode.get("lat") or not geocode.get("lng"):
        _emit(status_cb, " Geocodificacion fallo - omitiendo FEMA/NOAA/HUD/neighborhood")
        return {
            "geocode": geocode,
            "flood": {},
            "fmr": {},
            "hurricanes": [],
            "hurricanes_summary": {},
            "neighborhood": {},
            # Keep the result shape identical to the success path so callers
            # can index "court_records" unconditionally.
            "court_records": {},
            # Guarantee at least one entry explaining why nothing was fetched.
            "fetch_errors": errors + (["geocode_failed_no_coords"] if not errors else []),
            "duration_seconds": round(time.perf_counter() - t0, 2),
        }

    lat = float(geocode["lat"])
    lng = float(geocode["lng"])
    state = geocode.get("state", "")
    county_name = geocode.get("county_name", "")

    _emit(
        status_cb,
        f" OK: {geocode.get('matched_address', '?')} | "
        f"{county_name}, {state} | ({lat:.4f}, {lng:.4f})"
    )

    # --- 2. FEMA + HUD + NOAA + Neighborhood + court records in parallel ----
    _emit(status_cb, "Fetching FEMA / HUD / NOAA / Neighborhood en paralelo...")

    def task_fema():
        return _safe(
            cache, "fema", f"{lat:.5f},{lng:.5f}", TTL["fema"],
            lambda: fetch_flood(lat, lng),
            errors, "fema",
        )

    def task_hud():
        # The HUD lookup is keyed by state and county; skip it without them.
        if not state or not county_name:
            errors.append("hud: state o county_name faltantes en geocode")
            return {}
        return _safe(
            cache, "hud_fmr", f"{state}|{county_name}", TTL["hud_fmr"],
            lambda: fetch_fmr(state, county_name),
            errors, "hud",
        )

    def task_noaa():
        return _safe(
            cache, "hurricanes", f"{lat:.4f},{lng:.4f}", TTL["hurricanes"],
            lambda: fetch_hurricanes(lat, lng, years_back=20, hurdat2_path=hurdat2_path),
            errors, "noaa",
        )

    def task_neighborhood():
        tract = geocode.get("tract_geoid") or "no_tract"
        return _safe(
            cache, "neighborhood", f"{tract}|dom={include_neighborhood_dom}", TTL["neighborhood"],
            lambda: fetch_neighborhood(geocode, include_dom=include_neighborhood_dom),
            errors, "neighborhood",
        )

    # Wave 1.5A: court records (opt-in via ENABLE_COURT_RECORDS=true).
    # NOTE(review): the original comment says only Duval county is supported
    # in v1 and other counties soft-fail; that check is not visible here and
    # presumably lives in fetch_court_records - confirm.
    def task_court_records():
        if not _enable_court_records():
            return {"status": "DISABLED",
                    "recommendation": "Activar ENABLE_COURT_RECORDS=true en .env para "
                                      "deteccion deterministica de foreclosure/lis pendens."}
        return _safe(
            cache, "court_records", f"{deal.address}|{county_name}", TTL["court_records"],
            lambda: fetch_court_records(address=deal.address, county_name=county_name),
            errors, "court_records",
        )

    with ThreadPoolExecutor(max_workers=5) as ex:
        f_fema = ex.submit(task_fema)
        f_hud = ex.submit(task_hud)
        f_noaa = ex.submit(task_noaa)
        f_nbh = ex.submit(task_neighborhood)
        f_court = ex.submit(task_court_records)
        flood = f_fema.result()
        fmr = f_hud.result()
        noaa_data = f_noaa.result()
        neighborhood = f_nbh.result()
        court_records = f_court.result()

    # Normalise once so both derived values tolerate a non-dict payload
    # (for example a stale cache entry with an unexpected shape).
    noaa = noaa_data if isinstance(noaa_data, dict) else {}
    hurricanes = noaa.get("hurricanes", [])
    hurricanes_summary = {k: v for k, v in noaa.items() if k != "hurricanes"}

    # Summary log line.
    f_zone = flood.get("zone", "N/A") if flood else "N/A"
    h3 = fmr.get("fmr_3br", "N/A") if fmr else "N/A"
    n_hur = len(hurricanes)
    nbh_class = neighborhood.get("neighborhood_class", "?") if neighborhood else "?"
    nbh_conf = neighborhood.get("confidence_level", "?") if neighborhood else "?"
    _emit(status_cb, f" Datos: FEMA={f_zone}, HUD 3BR=${h3}, {n_hur} huracanes, Nbh={nbh_class}({nbh_conf})")

    if errors:
        _emit(status_cb, f" Fetcher errors: {len(errors)} (continuamos con datos parciales)")

    return {
        "geocode": geocode,
        "flood": flood,
        "fmr": fmr,
        "hurricanes": hurricanes,
        "hurricanes_summary": hurricanes_summary,
        "neighborhood": neighborhood,
        "court_records": court_records,  # Wave 1.5A
        "fetch_errors": errors,
        "duration_seconds": round(time.perf_counter() - t0, 2),
    }
|
||||
@@ -0,0 +1,127 @@
|
||||
"""data_fetchers/zillow_photo_lookup.py — Buscar fotos de Zillow por address.
|
||||
|
||||
PROPOSITO:
|
||||
Los scrapers de county clerks (Miami-Dade, Duval, Broward, etc.) no exponen
|
||||
fotos de la propiedad. Sin embargo, Zillow tiene fotos para casi cualquier
|
||||
address en USA (incluso para foreclosures off-market).
|
||||
|
||||
Estrategia:
|
||||
1. Construir URL de Zillow address search: https://www.zillow.com/homes/{slug}_rb/
|
||||
2. Firecrawl scrape → markdown
|
||||
3. Regex sobre markdown para extraer photos.zillowstatic.com URLs
|
||||
4. Return (photo_urls, metadata); photo_urls is capped at max_photos (default 1)
|
||||
|
||||
COSTO: 1 Firecrawl credit por address lookup.
|
||||
|
||||
USO:
|
||||
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
|
||||
urls, meta = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL")
# urls → ["https://photos.zillowstatic.com/fp/X.jpg"]; meta → {"url_attempted": ..., "error": None, ...}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Photo URL pattern (Zillow CDN)
|
||||
_PHOTO_PAT = re.compile(
|
||||
r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _build_address_search_url(address: str) -> str:
|
||||
"""Build Zillow address search URL.
|
||||
|
||||
Format: https://www.zillow.com/homes/{slug}_rb/
|
||||
Slug = uppercased address with dashes, no commas/extras.
|
||||
|
||||
e.g., "2837 BLACK BUCK CIR, JACKSONVILLE, FL" →
|
||||
https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/
|
||||
"""
|
||||
s = address.upper().replace(",", "").replace(".", "")
|
||||
s = re.sub(r"\s+", "-", s.strip())
|
||||
s = re.sub(r"-+", "-", s)
|
||||
return f"https://www.zillow.com/homes/{s}_rb/"
|
||||
|
||||
|
||||
def fetch_zillow_photos_by_address(
    address: str,
    max_photos: int = 1,  # Only the main photo by default; the rest can be viewed on Zillow directly.
    debug: bool = False,
) -> tuple[list[str], dict]:
    """Fetch photo URLs from the Zillow address-search page for ``address``.

    Args:
        address: Free-form street address; rejected when empty or shorter
            than 5 characters after stripping.
        max_photos: Maximum number of photo URLs to return (negative values
            are treated as 0).
        debug: When true, photos are returned even if the address could not
            be confirmed in the scraped markdown.

    Returns:
        ``(photo_urls, metadata)`` where ``metadata`` has the keys
        ``url_attempted``, ``address_matched_in_md``, ``credits_used``,
        ``error`` and ``markdown_size``.

    Caveats:
        - If the scraped page does not confirm the address, returns
          ``([], meta)`` with ``meta["error"]`` set (unless ``debug``).
        - If Firecrawl fails, returns ``([], {"error": ...})``.
        - Callers should persist the result and NOT retry on an empty list:
          a retry spends another credit without gaining anything.
    """
    meta = {
        "url_attempted": None,
        "address_matched_in_md": False,
        "credits_used": 0,
        "error": None,
        "markdown_size": 0,
    }

    if not address or len(address.strip()) < 5:
        meta["error"] = "address too short / empty"
        return [], meta

    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        meta["error"] = "FIRECRAWL_API_KEY not configured"
        return [], meta

    url = _build_address_search_url(address)
    meta["url_attempted"] = url

    try:
        # Imported lazily so a missing firecrawl package is reported through
        # meta["error"] instead of breaking the module import.
        from firecrawl import FirecrawlApp
        app = FirecrawlApp(api_key=api_key)
        resp = app.scrape(url, formats=["markdown"])
    except Exception as e:
        meta["error"] = f"firecrawl error: {type(e).__name__}: {e}"
        return [], meta

    # The scrape call returned, so count the credit whatever the payload
    # looks like (the module docstring budgets 1 credit per lookup).
    meta["credits_used"] = 1

    # The response may be an object with a .markdown attribute or a mapping;
    # a missing or None payload is treated as an empty page.
    md = getattr(resp, "markdown", None)
    if md is None and isinstance(resp, dict):
        md = resp.get("markdown")
    if not isinstance(md, str):
        md = ""
    meta["markdown_size"] = len(md)

    # Sanity check: Zillow sometimes returns a "no results" page or a
    # different property. Require the street number plus one distinctive
    # word from the address to appear in the markdown.
    addr_upper = address.upper()
    street_num_match = re.match(r"(\d+)", addr_upper.strip())
    street_num = street_num_match.group(1) if street_num_match else ""

    addr_in_md = False
    # Digit look-arounds keep "2837" from matching inside "12837".
    if street_num and re.search(rf"(?<!\d){re.escape(street_num)}(?!\d)", md):
        md_upper = md.upper()
        # Only the first three words of 4+ letters are considered.
        # NOTE(review): an address with no such word can never match.
        words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4]
        addr_in_md = any(w in md_upper for w in words[:3])
    meta["address_matched_in_md"] = addr_in_md

    # Only return photos if the address matched (defensive).
    if not addr_in_md and not debug:
        meta["error"] = "address not matched in Zillow markdown (no result page)"
        return [], meta

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(_PHOTO_PAT.findall(md)))
    return unique[:max(0, max_photos)], meta
|
||||
Reference in New Issue
Block a user