"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor. Sitio: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI) Deep link: /PropertySearch/#/?folio={folio_no_dashes} Extrae todo lo publico del Miami-Dade PA via los components Angular: - pa-propertyinformation: folio, sub-division, address, owner, mailing, PA primary zone, primary land use, beds/baths/half, floors, living units, living area, adjusted area, lot size, year built - pa-salesinformation: sales history (date, price, OR book-page, qualification, previous owner) - pa-assessmentinformation: land/building/extra/market/assessed 3 anios - pa-taxablevalueinformation: COUNTY/SCHOOL/etc exemption + taxable - pa-benefitsinformation: homestead + other exemptions - pa-legaldescription: legal description completa USAGE: from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470") # rec["owner_name"], rec["year_built"], rec["sales_history"]... """ from __future__ import annotations import re import time from datetime import datetime, timezone from typing import Optional USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131" _BASE_URL = "https://apps.miamidadepa.gov/PropertySearch" # ════════════════════════════════════════════════════════════════════════════ # Text parsing helpers — labels are on left, values on right (newline separated) # ════════════════════════════════════════════════════════════════════════════ def _grab_after_label(text: str, label: str) -> Optional[str]: """Find 'label' and return text immediately after (until next label/newline).""" if not text or not label: return None # Match "Label:value" or "Label\nvalue" or "Label\tvalue" pattern = re.compile( rf"{re.escape(label)}\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)", re.IGNORECASE, ) m = pattern.search(text) if m: return m.group(1).strip() return None def _to_int(s) -> Optional[int]: if not s: return None cleaned = re.sub(r"[^\d-]", "", str(s)) try: return int(cleaned) if cleaned else None except ValueError: return None def _money_to_int(s) -> Optional[int]: if not s: return None cleaned = re.sub(r"[^\d.-]", "", str(s)) if not cleaned or cleaned == "-": return None try: return int(float(cleaned)) except ValueError: return None # ════════════════════════════════════════════════════════════════════════════ # Public API # ════════════════════════════════════════════════════════════════════════════ def fetch_miami_dade_pa_record( parcel_id: Optional[str] = None, address: Optional[str] = None, timeout_seconds: int = 45, listing_price: Optional[float] = None, ) -> dict: """Fetch full Miami-Dade PA record. Args: parcel_id: folio number (e.g. "31-2202-034-2470" or "3122020342470") address: alternative search by address (less reliable in this portal) timeout_seconds: max wait per playwright op listing_price: enables flip-in-progress detection Returns: rich dict (same schema as pa_duval/pa_broward) with errors list. """ fetched_at = datetime.now(timezone.utc).isoformat() result = { "county": "Miami-Dade", "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)", "fetched_at": fetched_at, "errors": [], } if not parcel_id and not address: result["errors"].append("no parcel_id or address provided") return result try: from playwright.sync_api import sync_playwright except ImportError: result["errors"].append("playwright not installed") return result # Normalize folio (no dashes for URL) folio_clean = (parcel_id or "").replace("-", "").strip() try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) ctx = browser.new_context(user_agent=USER_AGENT) page = ctx.new_page() page.set_default_timeout(timeout_seconds * 1000) if folio_clean: # Deep link by folio url = f"{_BASE_URL}/#/?folio={folio_clean}" page.goto(url, wait_until="domcontentloaded") else: # Search by address — landing page + fill form page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded") time.sleep(5) # Address tab is default. Fill kendo-textbox[formcontrolname='address'] addr_input = page.locator("kendo-textbox[formcontrolname='address'] input").first addr_input.fill(address or "") page.locator("button[aria-label='Search button']").first.click() # Wait for property info to render try: page.wait_for_function( "() => document.querySelector('pa-propertyinformation') " "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')", timeout=20000, ) except Exception as e: result["errors"].append(f"detail page didn't render: {e}") browser.close() return result time.sleep(2) result["source_url"] = page.url # Extract text from each pa-component sections = page.evaluate(""" () => { const out = {}; const components = [ 'pa-propertyinformation','pa-salesinformation', 'pa-assessmentinformation','pa-taxablevalueinformation', 'pa-benefitsinformation','pa-legaldescription', 'pa-additionalinformation', ]; for (const tag of components) { const el = document.querySelector(tag); out[tag] = el ? (el.innerText || '').trim() : ''; } return out; } """) # Also extract sales history table rows sales_rows = page.evaluate(""" () => { const out = []; const sec = document.querySelector('pa-salesinformation'); if (!sec) return out; const tbl = sec.querySelector('table'); if (!tbl) return out; const rows = tbl.querySelectorAll('tr'); for (let i = 1; i < rows.length; i++) { const cells = rows[i].querySelectorAll('td'); if (cells.length < 4) continue; out.push({ date: (cells[0]?.textContent || '').trim(), price: (cells[1]?.textContent || '').trim(), book_page: (cells[2]?.textContent || '').trim(), qualification: (cells[3]?.textContent || '').trim(), previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '', }); } return out; } """) # Extract assessment table (3 years) # Header row: find the row whose first cell text is "Year". assessment_rows = page.evaluate(""" () => { const out = {}; const sec = document.querySelector('pa-assessmentinformation'); if (!sec) return out; const tables = sec.querySelectorAll('table'); if (tables.length === 0) return out; // Find header row in any table let years = []; let headerRowIdx = -1; let chosenTbl = null; for (const tbl of tables) { const rows = tbl.querySelectorAll('tr'); for (let i = 0; i < rows.length; i++) { const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase(); if (firstCell === 'year') { const headerCells = rows[i].querySelectorAll('th, td'); years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1); headerRowIdx = i; chosenTbl = tbl; break; } } if (chosenTbl) break; } if (!chosenTbl || years.length === 0) return out; const rows = chosenTbl.querySelectorAll('tr'); for (let i = headerRowIdx + 1; i < rows.length; i++) { const cells = rows[i].querySelectorAll('td, th'); if (cells.length < 2) continue; const label = (cells[0]?.textContent || '').trim(); const values = {}; for (let j = 1; j < cells.length && j-1 < years.length; j++) { values[years[j-1]] = (cells[j].textContent || '').trim(); } if (label) out[label] = values; } return out; } """) # Extract taxable value table (by district) taxable_rows = page.evaluate(""" () => { const out = {}; const sec = document.querySelector('pa-taxablevalueinformation'); if (!sec) return out; out._text = (sec.innerText || '').trim().substring(0, 2000); return out; } """) browser.close() # ─── Post-process — parse via text labels ───────────────────── prop_text = sections.get("pa-propertyinformation", "") result["parcel_id"] = _grab_after_label(prop_text, "Folio") result["subdivision"] = _grab_after_label(prop_text, "Sub-Division") # Address: "Property Address\n{addr}" addr_block_match = re.search( r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE, ) if addr_block_match: result["site_address"] = addr_block_match.group(1).strip() # Owner: "Owner\n{name(s)}" owner_match = re.search( r"Owner\s*\n([^\n]+(?:\n[^\n]+)?)", prop_text, re.IGNORECASE, ) if owner_match: owner_text = owner_match.group(1).strip() # Split on newline for multiple owners lines = [l.strip() for l in owner_text.split("\n") if l.strip()] result["owner_name"] = lines[0] if lines else None result["co_owners"] = lines[1:] if len(lines) > 1 else [] mailing_match = re.search( r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE, ) if mailing_match: result["mailing_address"] = re.sub( r"\s+", " ", mailing_match.group(1).strip(), ) result["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone") result["use_code"] = _grab_after_label(prop_text, "Primary Land Use") result["use_description"] = result.get("use_code") beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half") if beds_baths: parts = [p.strip() for p in beds_baths.split("/")] try: result["bedrooms"] = int(parts[0]) if parts[0] else None except (ValueError, IndexError): result["bedrooms"] = None try: result["baths"] = float(parts[1]) if len(parts) > 1 and parts[1] else None except (ValueError, IndexError): result["baths"] = None result["floors"] = _to_int(_grab_after_label(prop_text, "Floors")) result["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units")) living_area = _grab_after_label(prop_text, "Living Area") result["sqft_heated"] = _to_int(living_area) if living_area else None adj_area = _grab_after_label(prop_text, "Adjusted Area") result["sqft_total"] = _to_int(adj_area) if adj_area else None lot_size = _grab_after_label(prop_text, "Lot Size") result["lot_total_sqft"] = _to_int(lot_size) if lot_size else None result["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built")) # Sales history — clean each row result["sales_history"] = [] for r in sales_rows: date_str = r.get("date", "") price_str = r.get("price", "") # Skip header rows / non-data if not date_str or "Sale" in date_str or date_str.lower() == "previous sale": continue rec = { "date": date_str, "price": _money_to_int(price_str), "book_page": r.get("book_page", ""), "qualification": r.get("qualification", ""), "previous_owner": r.get("previous_owner", ""), # Approximate Duval-compatible 'qualified' flag "qualified": "Qualified" if "qual" in r.get("qualification", "").lower() and "disqual" not in r.get("qualification", "").lower() else "Unqualified", } if rec["date"]: result["sales_history"].append(rec) # Most recent qualified sale qualified = [s for s in result["sales_history"] if s.get("qualified", "").startswith("Qualified") and s.get("price", 0) and s["price"] >= 1000] result["most_recent_qualified_sale"] = qualified[0] if qualified else None # Assessment 3-year values (Year column → Land, Building, Market, Assessed) # assessment_rows = {"Land Value": {"2025": "$0", ...}, "Market Value": {...}} result["assessment_table"] = assessment_rows # Resolve current/last/two-years years_present = [] for label_dict in assessment_rows.values(): if isinstance(label_dict, dict): for y in label_dict.keys(): if y and y not in years_present: years_present.append(y) # Pick most recent year as current years_present_sorted = sorted([y for y in years_present if y.isdigit()], reverse=True) current_year = years_present_sorted[0] if years_present_sorted else None last_year = years_present_sorted[1] if len(years_present_sorted) > 1 else None def _val(label, year): if year and assessment_rows.get(label): return _money_to_int(assessment_rows[label].get(year, "0")) return None result["just_value_current"] = _val("Market Value", current_year) result["assessed_value_current"] = _val("Assessed Value", current_year) result["just_value_last"] = _val("Market Value", last_year) result["assessed_value_last"] = _val("Assessed Value", last_year) result["tax_year_current"] = int(current_year) if current_year else None result["tax_year_last"] = int(last_year) if last_year else None # Homestead detection from benefits section text benefits_text = sections.get("pa-benefitsinformation", "") or "" result["homestead_active"] = "homestead" in benefits_text.lower() and "$" in benefits_text # Legal description legal_text = sections.get("pa-legaldescription", "") or "" result["legal_description"] = re.sub( r"^Legal Description\s*\n", "", legal_text.strip(), )[:500] if legal_text else None # Renovation signal from data_fetchers.pa_duval import _detect_renovation_pattern result["renovation_signal"] = _detect_renovation_pattern( result["sales_history"], listing_price=listing_price, ) # Raw sections for advanced consumers result["_raw_sections"] = sections result["_raw_taxable_text"] = taxable_rows.get("_text", "") except Exception as e: import traceback result["errors"].append(f"{type(e).__name__}: {e}") result["_trace"] = traceback.format_exc()[:600] return result # ════════════════════════════════════════════════════════════════════════════ # CLI # ════════════════════════════════════════════════════════════════════════════ if __name__ == "__main__": import argparse import json parser = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher") parser.add_argument("--parcel", help="Folio number (e.g. '31-2202-034-2470')") parser.add_argument("--address", help="Alternative address search") args = parser.parse_args() if not args.parcel and not args.address: parser.error("--parcel or --address required") rec = fetch_miami_dade_pa_record(parcel_id=args.parcel, address=args.address) print(json.dumps(rec, indent=2, default=str))