feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,404 @@
|
||||
"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor.
|
||||
|
||||
Site: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI)
Deep link: /PropertySearch/#/?folio={folio_no_dashes}

Extracts all public data from the Miami-Dade PA via its Angular components:
- pa-propertyinformation: folio, sub-division, address, owner, mailing,
  PA primary zone, primary land use, beds/baths/half, floors, living units,
  living area, adjusted area, lot size, year built
- pa-salesinformation: sales history (date, price, OR book-page, qualification,
  previous owner)
- pa-assessmentinformation: land/building/extra/market/assessed values for 3 years
- pa-taxablevalueinformation: COUNTY/SCHOOL/etc. exemption + taxable values
- pa-benefitsinformation: homestead + other exemptions
- pa-legaldescription: full legal description
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
|
||||
rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470")
|
||||
# rec["owner_name"], rec["year_built"], rec["sales_history"]...
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# User-Agent string handed to the Playwright browser context when scraping.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
|
||||
# Root of the Property Search app; the folio deep link and the landing page URL are built from it.
_BASE_URL = "https://apps.miamidadepa.gov/PropertySearch"
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Text parsing helpers — labels are on left, values on right (newline separated)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _grab_after_label(text: str, label: str) -> Optional[str]:
|
||||
"""Find 'label' and return text immediately after (until next label/newline)."""
|
||||
if not text or not label:
|
||||
return None
|
||||
# Match "Label:value" or "Label\nvalue" or "Label\tvalue"
|
||||
pattern = re.compile(
|
||||
rf"{re.escape(label)}\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
m = pattern.search(text)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return None
|
||||
|
||||
|
||||
def _to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d-]", "", str(s))
|
||||
try:
|
||||
return int(cleaned) if cleaned else None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _money_to_int(s) -> Optional[int]:
|
||||
if not s:
|
||||
return None
|
||||
cleaned = re.sub(r"[^\d.-]", "", str(s))
|
||||
if not cleaned or cleaned == "-":
|
||||
return None
|
||||
try:
|
||||
return int(float(cleaned))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Field labels this module reads from the property-information text. They are
# used to recognise where one field's value ends and the next label begins.
_PROPERTY_LABELS = (
    "Folio", "Sub-Division", "Property Address", "Owner", "Mailing Address",
    "PA Primary Zone", "Primary Land Use", "Beds / Baths /Half", "Floors",
    "Living Units", "Living Area", "Adjusted Area", "Lot Size", "Year Built",
)
# A line counts as a label when it is exactly a label or "Label: ...".
_PROPERTY_LABEL_RE = re.compile(
    r"^\s*(?:" + "|".join(re.escape(lbl) for lbl in _PROPERTY_LABELS) + r")\s*(?::|$)",
    re.IGNORECASE,
)
# Qualification wording that negates "qual" (disqualified, unqualified, ...).
_NOT_QUALIFIED_RE = re.compile(r"(?:dis|un|non[\s-]?|not\s+)qual", re.IGNORECASE)

# Page script: innerText of each pa-* component, keyed by tag name.
_JS_SECTION_TEXT = """
() => {
    const out = {};
    const components = [
        'pa-propertyinformation','pa-salesinformation',
        'pa-assessmentinformation','pa-taxablevalueinformation',
        'pa-benefitsinformation','pa-legaldescription',
        'pa-additionalinformation',
    ];
    for (const tag of components) {
        const el = document.querySelector(tag);
        out[tag] = el ? (el.innerText || '').trim() : '';
    }
    return out;
}
"""

# Page script: rows of the first table inside pa-salesinformation (row 0 skipped).
_JS_SALES_ROWS = """
() => {
    const out = [];
    const sec = document.querySelector('pa-salesinformation');
    if (!sec) return out;
    const tbl = sec.querySelector('table');
    if (!tbl) return out;
    const rows = tbl.querySelectorAll('tr');
    for (let i = 1; i < rows.length; i++) {
        const cells = rows[i].querySelectorAll('td');
        if (cells.length < 4) continue;
        out.push({
            date: (cells[0]?.textContent || '').trim(),
            price: (cells[1]?.textContent || '').trim(),
            book_page: (cells[2]?.textContent || '').trim(),
            qualification: (cells[3]?.textContent || '').trim(),
            previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '',
        });
    }
    return out;
}
"""

# Page script: assessment table as {row label: {year: cell text}}. The header
# row is the one whose first cell text is "Year".
_JS_ASSESSMENT_ROWS = """
() => {
    const out = {};
    const sec = document.querySelector('pa-assessmentinformation');
    if (!sec) return out;
    const tables = sec.querySelectorAll('table');
    if (tables.length === 0) return out;
    // Find header row in any table
    let years = [];
    let headerRowIdx = -1;
    let chosenTbl = null;
    for (const tbl of tables) {
        const rows = tbl.querySelectorAll('tr');
        for (let i = 0; i < rows.length; i++) {
            const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase();
            if (firstCell === 'year') {
                const headerCells = rows[i].querySelectorAll('th, td');
                years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1);
                headerRowIdx = i;
                chosenTbl = tbl;
                break;
            }
        }
        if (chosenTbl) break;
    }
    if (!chosenTbl || years.length === 0) return out;
    const rows = chosenTbl.querySelectorAll('tr');
    for (let i = headerRowIdx + 1; i < rows.length; i++) {
        const cells = rows[i].querySelectorAll('td, th');
        if (cells.length < 2) continue;
        const label = (cells[0]?.textContent || '').trim();
        const values = {};
        for (let j = 1; j < cells.length && j-1 < years.length; j++) {
            values[years[j-1]] = (cells[j].textContent || '').trim();
        }
        if (label) out[label] = values;
    }
    return out;
}
"""

# Page script: first 2000 characters of the taxable-value section text.
_JS_TAXABLE_TEXT = """
() => {
    const out = {};
    const sec = document.querySelector('pa-taxablevalueinformation');
    if (!sec) return out;
    out._text = (sec.innerText || '').trim().substring(0, 2000);
    return out;
}
"""


def _value_lines(block: str) -> list:
    """Return the non-empty lines of ``block`` that precede the next field label."""
    kept = []
    for raw in block.split("\n"):
        line = raw.strip()
        if not line:
            continue
        if _PROPERTY_LABEL_RE.match(line):
            # The capture ran into the following field; stop before its label.
            break
        kept.append(line)
    return kept


def _cast_part(parts: list, idx: int, cast):
    """Return ``cast(parts[idx])``, or None when the part is missing, empty or invalid."""
    try:
        return cast(parts[idx]) if parts[idx] else None
    except (ValueError, IndexError):
        return None


def _parse_property_info(prop_text: str) -> dict:
    """Parse the pa-propertyinformation text into flat record fields."""
    info = {
        "parcel_id": _grab_after_label(prop_text, "Folio"),
        "subdivision": _grab_after_label(prop_text, "Sub-Division"),
    }

    # Address: "Property Address\n{addr}"
    addr_match = re.search(r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE)
    if addr_match:
        info["site_address"] = addr_match.group(1).strip()

    # Owner: "Owner\n{name(s)}". The capture spans up to two lines; with a
    # single owner the second line is the next label, which must be dropped.
    owner_match = re.search(
        r"Owner\s*\n([^\n]+(?:\n[^\n]+)?)", prop_text, re.IGNORECASE,
    )
    if owner_match:
        names = _value_lines(owner_match.group(1))
        info["owner_name"] = names[0] if names else None
        info["co_owners"] = names[1:]

    # Mailing address: up to three lines, cut at the next label as above.
    mailing_match = re.search(
        r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE,
    )
    if mailing_match:
        joined = " ".join(_value_lines(mailing_match.group(1)))
        info["mailing_address"] = re.sub(r"\s+", " ", joined) or None

    info["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone")
    info["use_code"] = _grab_after_label(prop_text, "Primary Land Use")
    info["use_description"] = info["use_code"]

    beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half")
    if beds_baths:
        parts = [part.strip() for part in beds_baths.split("/")]
        info["bedrooms"] = _cast_part(parts, 0, int)
        info["baths"] = _cast_part(parts, 1, float)
        info["half_baths"] = _cast_part(parts, 2, int)

    info["floors"] = _to_int(_grab_after_label(prop_text, "Floors"))
    info["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units"))
    info["sqft_heated"] = _to_int(_grab_after_label(prop_text, "Living Area"))
    info["sqft_total"] = _to_int(_grab_after_label(prop_text, "Adjusted Area"))
    info["lot_total_sqft"] = _to_int(_grab_after_label(prop_text, "Lot Size"))
    info["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built"))
    return info


def _parse_sales_history(sales_rows) -> list:
    """Clean scraped sales rows, skipping header-like rows, and flag qualification."""
    history = []
    for row in sales_rows or []:
        date_str = row.get("date", "")
        # Skip header rows / non-data
        if not date_str or "Sale" in date_str or date_str.lower() == "previous sale":
            continue
        qualification = row.get("qualification", "")
        # "Unqualified"/"Disqualified" also contain "qual", so negated
        # wording has to be excluded explicitly.
        is_qualified = (
            "qual" in qualification.lower()
            and not _NOT_QUALIFIED_RE.search(qualification)
        )
        history.append({
            "date": date_str,
            "price": _money_to_int(row.get("price", "")),
            "book_page": row.get("book_page", ""),
            "qualification": qualification,
            "previous_owner": row.get("previous_owner", ""),
            # Approximate Duval-compatible 'qualified' flag
            "qualified": "Qualified" if is_qualified else "Unqualified",
        })
    return history


def _summarize_assessment(assessment_rows: dict) -> dict:
    """Derive current/last-year market and assessed values from the assessment table."""
    years = {
        year
        for by_year in assessment_rows.values()
        if isinstance(by_year, dict)
        for year in by_year
        if year
    }
    # Most recent year first; non-numeric column headers are ignored.
    ordered = sorted((y for y in years if y.isdigit()), reverse=True)
    current_year = ordered[0] if ordered else None
    last_year = ordered[1] if len(ordered) > 1 else None

    def _val(label, year):
        if year and assessment_rows.get(label):
            return _money_to_int(assessment_rows[label].get(year, "0"))
        return None

    return {
        "just_value_current": _val("Market Value", current_year),
        "assessed_value_current": _val("Assessed Value", current_year),
        "just_value_last": _val("Market Value", last_year),
        "assessed_value_last": _val("Assessed Value", last_year),
        "tax_year_current": int(current_year) if current_year else None,
        "tax_year_last": int(last_year) if last_year else None,
    }


def fetch_miami_dade_pa_record(
    parcel_id: Optional[str] = None,
    address: Optional[str] = None,
    timeout_seconds: int = 45,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch the full Miami-Dade PA record for one property.

    Opens the Property Search site in headless Chromium via Playwright, waits
    for the detail page to render, reads the pa-* components and parses their
    text into a flat dict.

    Args:
        parcel_id: folio number (e.g. "31-2202-034-2470" or "3122020342470");
            dashes and whitespace are removed before building the deep link.
        address: alternative search by address, used only when no folio is
            given (less reliable in this portal).
        timeout_seconds: default timeout applied to Playwright operations.
        listing_price: forwarded to the renovation-pattern detector.

    Returns:
        A dict that always contains "county", "source", "fetched_at" and
        "errors". On success it also carries the parsed property, sales and
        assessment fields plus "_raw_sections" and "_raw_taxable_text". This
        function does not raise for scraping failures: they are appended to
        "errors", and unexpected exceptions also set "_trace".
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Miami-Dade",
        "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)",
        "fetched_at": fetched_at,
        "errors": [],
    }

    # Normalize before validating so a blank or dash-only parcel_id cannot
    # slip through and trigger an address search with an empty address.
    folio_clean = re.sub(r"[\s-]+", "", str(parcel_id or ""))
    address_clean = str(address or "").strip()
    if not folio_clean and not address_clean:
        result["errors"].append("no parcel_id or address provided")
        return result

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                if folio_clean:
                    # Deep link by folio
                    url = f"{_BASE_URL}/#/?folio={folio_clean}"
                    page.goto(url, wait_until="domcontentloaded")
                else:
                    # Search by address — landing page + fill form
                    page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded")
                    time.sleep(5)
                    # Address tab is default. Fill kendo-textbox[formcontrolname='address']
                    addr_input = page.locator(
                        "kendo-textbox[formcontrolname='address'] input"
                    ).first
                    addr_input.fill(address_clean)
                    page.locator("button[aria-label='Search button']").first.click()

                # Wait for property info to render
                try:
                    page.wait_for_function(
                        "() => document.querySelector('pa-propertyinformation') "
                        "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')",
                        timeout=20000,
                    )
                except Exception as e:
                    result["errors"].append(f"detail page didn't render: {e}")
                    return result

                # Short pause after the folio text appears, before reading the components.
                time.sleep(2)
                result["source_url"] = page.url

                sections = page.evaluate(_JS_SECTION_TEXT) or {}
                sales_rows = page.evaluate(_JS_SALES_ROWS) or []
                assessment_rows = page.evaluate(_JS_ASSESSMENT_ROWS) or {}
                taxable_rows = page.evaluate(_JS_TAXABLE_TEXT) or {}
            finally:
                # Release the browser on every exit path, including errors.
                browser.close()

        # ─── Post-process — parse via text labels ─────────────────────
        result.update(_parse_property_info(sections.get("pa-propertyinformation", "") or ""))

        result["sales_history"] = _parse_sales_history(sales_rows)

        # First qualifying row in table order (presumably newest first —
        # the table's sort order is not verified here).
        qualified = [
            s for s in result["sales_history"]
            if s.get("qualified", "").startswith("Qualified")
            and s.get("price", 0) and s["price"] >= 1000
        ]
        result["most_recent_qualified_sale"] = qualified[0] if qualified else None

        # assessment_rows = {"Land Value": {"2025": "$0", ...}, "Market Value": {...}}
        result["assessment_table"] = assessment_rows
        result.update(_summarize_assessment(assessment_rows))

        # Homestead detection from benefits section text
        benefits_text = sections.get("pa-benefitsinformation", "") or ""
        result["homestead_active"] = (
            "homestead" in benefits_text.lower() and "$" in benefits_text
        )

        # Legal description (heading removed, capped at 500 characters)
        legal_text = sections.get("pa-legaldescription", "") or ""
        result["legal_description"] = re.sub(
            r"^Legal Description\s*\n",
            "",
            legal_text.strip(),
        )[:500] if legal_text else None

        # Raw sections for advanced consumers. Stored before the project-local
        # import below so they survive if that import or call fails.
        result["_raw_sections"] = sections
        result["_raw_taxable_text"] = taxable_rows.get("_text", "")

        # Renovation signal
        from data_fetchers.pa_duval import _detect_renovation_pattern
        result["renovation_signal"] = _detect_renovation_pattern(
            result["sales_history"], listing_price=listing_price,
        )

    except Exception as e:
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]

    return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: fetch one record and print it as indented JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher")
    cli.add_argument("--parcel", help="Folio number (e.g. '31-2202-034-2470')")
    cli.add_argument("--address", help="Alternative address search")
    opts = cli.parse_args()

    # At least one lookup key is required; argparse prints usage and exits.
    if not (opts.parcel or opts.address):
        cli.error("--parcel or --address required")

    record = fetch_miami_dade_pa_record(parcel_id=opts.parcel, address=opts.address)
    print(json.dumps(record, indent=2, default=str))
|
||||
Reference in New Issue
Block a user