feat: AR-House initial commit

This commit is contained in:
2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
+404
View File
@@ -0,0 +1,404 @@
"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor.
Site: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI)
Deep link: /PropertySearch/#/?folio={folio_no_dashes}
Extracts everything public from the Miami-Dade PA via the Angular components:
- pa-propertyinformation: folio, sub-division, address, owner, mailing,
PA primary zone, primary land use, beds/baths/half, floors, living units,
living area, adjusted area, lot size, year built
- pa-salesinformation: sales history (date, price, OR book-page, qualification,
previous owner)
- pa-assessmentinformation: land/building/extra/market/assessed for 3 years
- pa-taxablevalueinformation: COUNTY/SCHOOL/etc exemption + taxable
- pa-benefitsinformation: homestead + other exemptions
- pa-legaldescription: full legal description
USAGE:
from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470")
# rec["owner_name"], rec["year_built"], rec["sales_history"]...
"""
from __future__ import annotations
import re
import time
from datetime import datetime, timezone
from typing import Optional
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
_BASE_URL = "https://apps.miamidadepa.gov/PropertySearch"
# ════════════════════════════════════════════════════════════════════════════
# Text parsing helpers — labels are on left, values on right (newline separated)
# ════════════════════════════════════════════════════════════════════════════
def _grab_after_label(text: str, label: str) -> Optional[str]:
    """Return the value that follows ``label`` in ``text``, or None.

    The label is matched case-insensitively. The value is the first
    non-empty run of characters after the label, optionally separated from
    it by a colon, tab, whitespace or a line break, and it ends at the next
    newline (or at the end of the text). Surrounding whitespace is stripped.
    None is returned when either argument is empty or the label is absent.
    """
    if not (text and label):
        return None
    # Accepts "Label:value", "Label\nvalue" and "Label\tvalue" layouts.
    found = re.search(
        rf"{re.escape(label)}\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)",
        text,
        flags=re.IGNORECASE,
    )
    return found.group(1).strip() if found else None
def _to_int(s) -> Optional[int]:
    """Parse an integer out of ``s`` after discarding non-digit characters.

    Everything except digits and ``-`` is removed from ``str(s)`` before
    conversion, so "1,234 Sq.Ft" becomes 1234. Returns None when ``s`` is
    falsy, when nothing numeric remains, or when the remaining characters do
    not form a valid integer (for example "12-34").
    """
    if not s:
        return None
    digits = re.sub(r"[^\d-]", "", str(s))
    if not digits:
        return None
    try:
        value = int(digits)
    except ValueError:
        return None
    return value
def _money_to_int(s) -> Optional[int]:
    """Convert a currency-formatted value such as "$350,000.75" to an int.

    Every character other than digits, ``.`` and ``-`` is removed from
    ``str(s)``; the remainder is parsed as a float and truncated toward zero.
    Returns None when ``s`` is falsy, when only a dash (or nothing) remains,
    or when the remainder is not a valid number.
    """
    if not s:
        return None
    numeric = re.sub(r"[^\d.-]", "", str(s))
    if numeric in ("", "-"):
        return None
    try:
        amount = int(float(numeric))
    except ValueError:
        return None
    return amount
# ════════════════════════════════════════════════════════════════════════════
# Public API
# ════════════════════════════════════════════════════════════════════════════
def fetch_miami_dade_pa_record(
    parcel_id: Optional[str] = None,
    address: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch the full Miami-Dade Property Appraiser record for one parcel.

    Args:
        parcel_id: folio number, with or without dashes
            (e.g. "31-2202-034-2470" or "3122020342470"). When given, the
            portal is opened through its folio deep link.
        address: alternative search by address (less reliable in this
            portal); only used when no folio is available.
        timeout_seconds: default timeout applied to each Playwright
            operation. The wait for the detail page to render keeps its own
            fixed 20-second limit.
        listing_price: forwarded to the renovation-pattern detector
            (enables flip-in-progress detection).

    Returns:
        A dict (same schema as pa_duval/pa_broward) that always contains
        "county", "source", "fetched_at" and "errors". Scraping and parsing
        failures are not raised: they are appended to "errors", and an
        unexpected exception additionally stores a truncated traceback
        under "_trace". Fields parsed before a failure are kept.
    """
    result = {
        "county": "Miami-Dade",
        "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)",
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "errors": [],
    }
    if not parcel_id and not address:
        result["errors"].append("no parcel_id or address provided")
        return result
    # Imported lazily so the module can be imported without playwright.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    # Normalize folio (the deep-link URL takes it without dashes).
    folio_clean = (parcel_id or "").replace("-", "").strip()
    try:
        with sync_playwright() as p:
            scraped = _scrape_pa_page(
                p, folio_clean, address, timeout_seconds, result,
            )
        # None means the detail page never rendered; the error is already
        # recorded in result["errors"].
        if scraped is not None:
            sections, sales_rows, assessment_rows, taxable_rows = scraped
            _parse_pa_sections(
                result, sections, sales_rows, assessment_rows,
                taxable_rows, listing_price,
            )
    except Exception as e:
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]
    return result


# ────────────────────────────────────────────────────────────────────────────
# Private helpers for fetch_miami_dade_pa_record
# ────────────────────────────────────────────────────────────────────────────

# JS: visible text of each pa-* Angular component ('' when it is absent).
_JS_SECTION_TEXT = """
() => {
    const out = {};
    const components = [
        'pa-propertyinformation','pa-salesinformation',
        'pa-assessmentinformation','pa-taxablevalueinformation',
        'pa-benefitsinformation','pa-legaldescription',
        'pa-additionalinformation',
    ];
    for (const tag of components) {
        const el = document.querySelector(tag);
        out[tag] = el ? (el.innerText || '').trim() : '';
    }
    return out;
}
"""

# JS: rows of the first table inside pa-salesinformation (first row skipped).
_JS_SALES_ROWS = """
() => {
    const out = [];
    const sec = document.querySelector('pa-salesinformation');
    if (!sec) return out;
    const tbl = sec.querySelector('table');
    if (!tbl) return out;
    const rows = tbl.querySelectorAll('tr');
    for (let i = 1; i < rows.length; i++) {
        const cells = rows[i].querySelectorAll('td');
        if (cells.length < 4) continue;
        out.push({
            date: (cells[0]?.textContent || '').trim(),
            price: (cells[1]?.textContent || '').trim(),
            book_page: (cells[2]?.textContent || '').trim(),
            qualification: (cells[3]?.textContent || '').trim(),
            previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '',
        });
    }
    return out;
}
"""

# JS: assessment table as {row label: {year: cell text}}. The header row is
# the row whose first cell text is "Year".
_JS_ASSESSMENT_ROWS = """
() => {
    const out = {};
    const sec = document.querySelector('pa-assessmentinformation');
    if (!sec) return out;
    const tables = sec.querySelectorAll('table');
    if (tables.length === 0) return out;
    // Find header row in any table
    let years = [];
    let headerRowIdx = -1;
    let chosenTbl = null;
    for (const tbl of tables) {
        const rows = tbl.querySelectorAll('tr');
        for (let i = 0; i < rows.length; i++) {
            const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase();
            if (firstCell === 'year') {
                const headerCells = rows[i].querySelectorAll('th, td');
                years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1);
                headerRowIdx = i;
                chosenTbl = tbl;
                break;
            }
        }
        if (chosenTbl) break;
    }
    if (!chosenTbl || years.length === 0) return out;
    const rows = chosenTbl.querySelectorAll('tr');
    for (let i = headerRowIdx + 1; i < rows.length; i++) {
        const cells = rows[i].querySelectorAll('td, th');
        if (cells.length < 2) continue;
        const label = (cells[0]?.textContent || '').trim();
        const values = {};
        for (let j = 1; j < cells.length && j-1 < years.length; j++) {
            values[years[j-1]] = (cells[j].textContent || '').trim();
        }
        if (label) out[label] = values;
    }
    return out;
}
"""

# JS: first 2000 characters of the taxable-value section text (by district).
_JS_TAXABLE_TEXT = """
() => {
    const out = {};
    const sec = document.querySelector('pa-taxablevalueinformation');
    if (!sec) return out;
    out._text = (sec.innerText || '').trim().substring(0, 2000);
    return out;
}
"""

# Substrings that mark a qualification text as NOT qualified even though it
# contains "qual" (e.g. "Unqualified", "Sales which are disqualified ...").
_NOT_QUALIFIED_MARKERS = ("disqual", "unqual", "not qual", "non-qual", "nonqual")


def _sale_qualified_flag(qualification: str) -> str:
    """Map a PA qualification text to the Duval-compatible flag string."""
    text = (qualification or "").lower()
    if "qual" in text and not any(m in text for m in _NOT_QUALIFIED_MARKERS):
        return "Qualified"
    return "Unqualified"


def _lines_before_label(block: str, stop_label: str) -> list[str]:
    """Return stripped non-empty lines of ``block`` preceding ``stop_label``.

    Multi-line captures may run into the next field's label; everything from
    the first line that starts with ``stop_label`` (case-insensitive) on is
    discarded.
    """
    stop = stop_label.lower()
    kept = []
    for raw_line in block.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if line.lower().startswith(stop):
            break
        kept.append(line)
    return kept


def _scrape_pa_page(playwright, folio_clean, address, timeout_seconds, result):
    """Load the parcel page in headless Chromium and pull the raw data.

    Returns ``(sections, sales_rows, assessment_rows, taxable_rows)`` or
    None when the detail page did not render (the reason is appended to
    ``result["errors"]``). Sets ``result["source_url"]`` on success. The
    browser is closed on every exit path.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        ctx = browser.new_context(user_agent=USER_AGENT)
        page = ctx.new_page()
        page.set_default_timeout(timeout_seconds * 1000)
        if folio_clean:
            # Deep link by folio.
            page.goto(f"{_BASE_URL}/#/?folio={folio_clean}", wait_until="domcontentloaded")
        else:
            # Search by address: landing page + fill form.
            page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded")
            # NOTE(review): fixed pause, presumably to let the Angular app
            # finish bootstrapping before the form is touched — confirm.
            time.sleep(5)
            # Address tab is default. Fill kendo-textbox[formcontrolname='address'].
            addr_input = page.locator("kendo-textbox[formcontrolname='address'] input").first
            addr_input.fill(address or "")
            page.locator("button[aria-label='Search button']").first.click()
        # Wait for property info to render.
        try:
            page.wait_for_function(
                "() => document.querySelector('pa-propertyinformation') "
                "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')",
                timeout=20000,
            )
        except Exception as e:
            result["errors"].append(f"detail page didn't render: {e}")
            return None
        time.sleep(2)
        result["source_url"] = page.url
        return (
            page.evaluate(_JS_SECTION_TEXT),
            page.evaluate(_JS_SALES_ROWS),
            page.evaluate(_JS_ASSESSMENT_ROWS),
            page.evaluate(_JS_TAXABLE_TEXT),
        )
    finally:
        # Previously skipped whenever an exception escaped the scraping code.
        browser.close()


def _parse_pa_sections(result, sections, sales_rows, assessment_rows,
                       taxable_rows, listing_price):
    """Fill ``result`` in place from the scraped section texts and tables."""
    # ─── Property information — parsed via text labels ───────────────────
    prop_text = sections.get("pa-propertyinformation", "")
    result["parcel_id"] = _grab_after_label(prop_text, "Folio")
    result["subdivision"] = _grab_after_label(prop_text, "Sub-Division")
    # Address: "Property Address\n{addr}"
    addr_match = re.search(
        r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE,
    )
    if addr_match:
        result["site_address"] = addr_match.group(1).strip()
    # Owner: "Owner\n{name(s)}". The capture spans up to two lines, so drop
    # the next field's label if it was swept in as a second "owner".
    owner_match = re.search(
        r"Owner\s*\n([^\n]+(?:\n[^\n]+)?)", prop_text, re.IGNORECASE,
    )
    if owner_match:
        owner_lines = _lines_before_label(owner_match.group(1), "Mailing Address")
        result["owner_name"] = owner_lines[0] if owner_lines else None
        result["co_owners"] = owner_lines[1:]
    # Mailing address: up to three lines, trimmed at the next field's label.
    mailing_match = re.search(
        r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE,
    )
    if mailing_match:
        mailing_lines = _lines_before_label(mailing_match.group(1), "PA Primary Zone")
        result["mailing_address"] = (
            re.sub(r"\s+", " ", " ".join(mailing_lines)).strip() or None
        )
    result["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone")
    result["use_code"] = _grab_after_label(prop_text, "Primary Land Use")
    result["use_description"] = result.get("use_code")
    beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half")
    if beds_baths:
        parts = [part.strip() for part in beds_baths.split("/")]
        try:
            result["bedrooms"] = int(parts[0]) if parts[0] else None
        except ValueError:
            result["bedrooms"] = None
        try:
            result["baths"] = float(parts[1]) if len(parts) > 1 and parts[1] else None
        except ValueError:
            result["baths"] = None
    result["floors"] = _to_int(_grab_after_label(prop_text, "Floors"))
    result["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units"))
    result["sqft_heated"] = _to_int(_grab_after_label(prop_text, "Living Area"))
    result["sqft_total"] = _to_int(_grab_after_label(prop_text, "Adjusted Area"))
    result["lot_total_sqft"] = _to_int(_grab_after_label(prop_text, "Lot Size"))
    result["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built"))

    # ─── Sales history — clean each row ──────────────────────────────────
    result["sales_history"] = []
    for row in sales_rows:
        date_str = row.get("date", "")
        # Skip header rows / non-data.
        if not date_str or "Sale" in date_str or date_str.lower() == "previous sale":
            continue
        result["sales_history"].append({
            "date": date_str,
            "price": _money_to_int(row.get("price", "")),
            "book_page": row.get("book_page", ""),
            "qualification": row.get("qualification", ""),
            "previous_owner": row.get("previous_owner", ""),
            # Approximate Duval-compatible 'qualified' flag.
            "qualified": _sale_qualified_flag(row.get("qualification", "")),
        })
    # Most recent qualified sale: first qualifying row in table order, which
    # assumes the portal lists the newest sale first.
    qualified = [
        sale for sale in result["sales_history"]
        if sale.get("qualified", "").startswith("Qualified")
        and (sale.get("price") or 0) >= 1000
    ]
    result["most_recent_qualified_sale"] = qualified[0] if qualified else None

    # ─── Assessment values ───────────────────────────────────────────────
    # assessment_rows = {"Land Value": {"2025": "$0", ...}, "Market Value": {...}}
    result["assessment_table"] = assessment_rows
    years_present = []
    for year_values in assessment_rows.values():
        if isinstance(year_values, dict):
            for year in year_values:
                if year and year not in years_present:
                    years_present.append(year)
    # Pick most recent year as current, the one before it as last.
    years_sorted = sorted((y for y in years_present if y.isdigit()), reverse=True)
    current_year = years_sorted[0] if years_sorted else None
    last_year = years_sorted[1] if len(years_sorted) > 1 else None

    def _val(label, year):
        if year and assessment_rows.get(label):
            return _money_to_int(assessment_rows[label].get(year, "0"))
        return None

    result["just_value_current"] = _val("Market Value", current_year)
    result["assessed_value_current"] = _val("Assessed Value", current_year)
    result["just_value_last"] = _val("Market Value", last_year)
    result["assessed_value_last"] = _val("Assessed Value", last_year)
    result["tax_year_current"] = int(current_year) if current_year else None
    result["tax_year_last"] = int(last_year) if last_year else None

    # ─── Homestead detection from benefits section text ──────────────────
    benefits_text = sections.get("pa-benefitsinformation", "") or ""
    result["homestead_active"] = "homestead" in benefits_text.lower() and "$" in benefits_text

    # ─── Legal description (heading stripped, capped at 500 chars) ───────
    legal_text = sections.get("pa-legaldescription", "") or ""
    result["legal_description"] = re.sub(
        r"^Legal Description\s*\n",
        "",
        legal_text.strip(),
    )[:500] if legal_text else None

    # ─── Renovation signal ───────────────────────────────────────────────
    from data_fetchers.pa_duval import _detect_renovation_pattern
    result["renovation_signal"] = _detect_renovation_pattern(
        result["sales_history"], listing_price=listing_price,
    )

    # Raw sections for advanced consumers.
    result["_raw_sections"] = sections
    result["_raw_taxable_text"] = taxable_rows.get("_text", "")
# ════════════════════════════════════════════════════════════════════════════
# CLI
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Command-line entry point: fetch one record and print it as JSON.
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher")
    parser.add_argument("--parcel", help="Folio number (e.g. '31-2202-034-2470')")
    parser.add_argument("--address", help="Alternative address search")
    args = parser.parse_args()

    # At least one lookup key is needed; parser.error() prints usage and exits.
    if not (args.parcel or args.address):
        parser.error("--parcel or --address required")

    rec = fetch_miami_dade_pa_record(
        parcel_id=args.parcel,
        address=args.address,
    )
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(rec, indent=2, default=str))