feat: AR-House initial commit

2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
@@ -0,0 +1,404 @@
+"""data_fetchers/pa_miami_dade.py — Full Miami-Dade PA extractor.
+
+Site: https://apps.miamidadepa.gov/PropertySearch/ (Angular 14 + Kendo UI)
+Deep link: /PropertySearch/#/?folio={folio_no_dashes}
+
+Extracts all public data from the Miami-Dade PA via the Angular components:
+- pa-propertyinformation: folio, sub-division, address, owner, mailing,
+  PA primary zone, primary land use, beds/baths/half, floors, living units,
+  living area, adjusted area, lot size, year built
+- pa-salesinformation: sales history (date, price, OR book-page, qualification,
+  previous owner)
+- pa-assessmentinformation: land/building/extra/market/assessed for 3 years
+- pa-taxablevalueinformation: COUNTY/SCHOOL/etc exemption + taxable
+- pa-benefitsinformation: homestead + other exemptions
+- pa-legaldescription: full legal description
+
+USAGE:
+    from data_fetchers.pa_miami_dade import fetch_miami_dade_pa_record
+    rec = fetch_miami_dade_pa_record(parcel_id="31-2202-034-2470")
+    # rec["owner_name"], rec["year_built"], rec["sales_history"]...
+"""
+from __future__ import annotations
+
+import re
+import time
+from datetime import datetime, timezone
+from typing import Optional
+
+
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
+_BASE_URL = "https://apps.miamidadepa.gov/PropertySearch"
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# Text parsing helpers — labels are on left, values on right (newline separated)
+# ════════════════════════════════════════════════════════════════════════════
+
+def _grab_after_label(text: str, label: str) -> Optional[str]:
+    """Return the line of text that follows ``label`` in ``text``.
+
+    The label is matched case-insensitively and literally (it is regex-escaped).
+    Optional colons/tabs and surrounding whitespace, including a line break,
+    may sit between the label and its value. The value runs to the end of its
+    line and is returned stripped.
+
+    Returns None when either argument is empty or the label is not followed by
+    any value.
+    """
+    if not (text and label):
+        return None
+    # Accepts "Label:value", "Label\nvalue" and "Label\tvalue" layouts.
+    found = re.search(
+        re.escape(label) + r"\s*[:\t]*\s*\n?\s*([^\n]+?)(?:\n|$)",
+        text,
+        flags=re.IGNORECASE,
+    )
+    return found.group(1).strip() if found else None
+
+
+def _to_int(s) -> Optional[int]:
+    """Parse the first integer-like number found in ``s``.
+
+    Accepts any value; it is converted with ``str()`` before scanning.
+    Thousands separators are removed and a fractional part is truncated, so
+    "7,500 Sq.Ft" -> 7500 and "1,234.5" -> 1234. Only the first numeric token
+    is used, so unrelated digits later in the text ("3 / 2") are not glued
+    onto the result.
+
+    Returns None when ``s`` is None or contains no digits.
+    """
+    # Test against None explicitly: a numeric 0 is a valid value, not "missing".
+    if s is None:
+        return None
+    token = re.search(r"(-?\d[\d,]*)(?:\.\d+)?", str(s))
+    if token is None:
+        return None
+    try:
+        # group(1) is the whole-number part; dropping the fraction truncates.
+        return int(token.group(1).replace(",", ""))
+    except ValueError:
+        return None
+
+
+def _money_to_int(s) -> Optional[int]:
+    """Convert a money-like value such as "$1,234.56" to a whole number.
+
+    Everything except digits, "." and "-" is discarded; the remainder is read
+    as a float and truncated to int. Returns None for falsy input, for input
+    with no numeric characters (or only "-"), and for text that does not form
+    a valid number once cleaned.
+    """
+    if not s:
+        return None
+    kept = "".join(re.findall(r"[\d.-]", str(s)))
+    if kept in ("", "-"):
+        return None
+    try:
+        amount = float(kept)
+    except ValueError:
+        return None
+    return int(amount)
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# Public API
+# ════════════════════════════════════════════════════════════════════════════
+
+def _qualification_flag(qualification: str) -> str:
+    """Map the PA's free-text sale qualification to a Duval-style flag."""
+    text = (qualification or "").lower()
+    # "disqualified" and "unqualified" both contain "qual", so they must be
+    # ruled out explicitly before the sale is treated as qualified.
+    if "qual" in text and "disqual" not in text and "unqual" not in text:
+        return "Qualified"
+    return "Unqualified"
+
+
+def _clean_sales_rows(sales_rows) -> list:
+    """Turn raw sales-table rows (dicts of cell text) into sales_history records."""
+    cleaned = []
+    for row in sales_rows or []:
+        date_str = row.get("date", "")
+        # Skip header rows / non-data
+        if not date_str or "Sale" in date_str or date_str.lower() == "previous sale":
+            continue
+        qualification = row.get("qualification", "")
+        cleaned.append({
+            "date": date_str,
+            "price": _money_to_int(row.get("price", "")),
+            "book_page": row.get("book_page", ""),
+            "qualification": qualification,
+            "previous_owner": row.get("previous_owner", ""),
+            # Approximate Duval-compatible 'qualified' flag
+            "qualified": _qualification_flag(qualification),
+        })
+    return cleaned
+
+
+def _parse_property_info(prop_text: str) -> dict:
+    """Parse the pa-propertyinformation text block into result fields."""
+    info = {
+        "parcel_id": _grab_after_label(prop_text, "Folio"),
+        "subdivision": _grab_after_label(prop_text, "Sub-Division"),
+    }
+
+    # Address: "Property Address\n{addr}"
+    addr_match = re.search(
+        r"Property Address\s*\n([^\n]+)", prop_text, re.IGNORECASE,
+    )
+    if addr_match:
+        info["site_address"] = addr_match.group(1).strip()
+
+    # Owner: "Owner\n{name}" with an optional second owner line. The lookaheads
+    # stop the capture at the "Mailing Address" label so that a single-owner
+    # record does not report that label as a co-owner.
+    owner_match = re.search(
+        r"Owner\s*\n((?![ \t]*Mailing Address)[^\n]+"
+        r"(?:\n(?![ \t]*Mailing Address)[^\n]+)?)",
+        prop_text,
+        re.IGNORECASE,
+    )
+    if owner_match:
+        owners = [ln.strip() for ln in owner_match.group(1).split("\n") if ln.strip()]
+        info["owner_name"] = owners[0] if owners else None
+        info["co_owners"] = owners[1:]
+
+    # Mailing address: up to three lines after the label, collapsed to one line.
+    mailing_match = re.search(
+        r"Mailing Address\s*\n((?:[^\n]+\n?){1,3})", prop_text, re.IGNORECASE,
+    )
+    if mailing_match:
+        info["mailing_address"] = re.sub(
+            r"\s+", " ", mailing_match.group(1).strip(),
+        )
+
+    info["pa_primary_zone"] = _grab_after_label(prop_text, "PA Primary Zone")
+    info["use_code"] = _grab_after_label(prop_text, "Primary Land Use")
+    info["use_description"] = info.get("use_code")
+
+    # "Beds / Baths /Half" is rendered as one slash-separated value; only the
+    # first two parts are extracted.
+    beds_baths = _grab_after_label(prop_text, "Beds / Baths /Half")
+    if beds_baths:
+        parts = [part.strip() for part in beds_baths.split("/")]
+        try:
+            info["bedrooms"] = int(parts[0]) if parts[0] else None
+        except (ValueError, IndexError):
+            info["bedrooms"] = None
+        try:
+            info["baths"] = float(parts[1]) if len(parts) > 1 and parts[1] else None
+        except (ValueError, IndexError):
+            info["baths"] = None
+
+    info["floors"] = _to_int(_grab_after_label(prop_text, "Floors"))
+    info["living_units"] = _to_int(_grab_after_label(prop_text, "Living Units"))
+    living_area = _grab_after_label(prop_text, "Living Area")
+    info["sqft_heated"] = _to_int(living_area) if living_area else None
+    adj_area = _grab_after_label(prop_text, "Adjusted Area")
+    info["sqft_total"] = _to_int(adj_area) if adj_area else None
+    lot_size = _grab_after_label(prop_text, "Lot Size")
+    info["lot_total_sqft"] = _to_int(lot_size) if lot_size else None
+    info["year_built"] = _to_int(_grab_after_label(prop_text, "Year Built"))
+    return info
+
+
+def _summarize_assessment(assessment_rows: dict) -> dict:
+    """Derive current/previous-year market and assessed values.
+
+    ``assessment_rows`` looks like
+    {"Land Value": {"2025": "$0", ...}, "Market Value": {...}}.
+    """
+    years_present = []
+    for per_year in assessment_rows.values():
+        if isinstance(per_year, dict):
+            for year in per_year:
+                if year and year not in years_present:
+                    years_present.append(year)
+    # Pick most recent year as current
+    ranked = sorted((y for y in years_present if y.isdigit()), reverse=True)
+    current_year = ranked[0] if ranked else None
+    last_year = ranked[1] if len(ranked) > 1 else None
+
+    def value_for(label, year):
+        per_year = assessment_rows.get(label)
+        if not year or not per_year:
+            return None
+        # A year missing from this row yields None rather than a made-up $0.
+        return _money_to_int(per_year.get(year))
+
+    return {
+        "just_value_current": value_for("Market Value", current_year),
+        "assessed_value_current": value_for("Assessed Value", current_year),
+        "just_value_last": value_for("Market Value", last_year),
+        "assessed_value_last": value_for("Assessed Value", last_year),
+        "tax_year_current": int(current_year) if current_year else None,
+        "tax_year_last": int(last_year) if last_year else None,
+    }
+
+
+def fetch_miami_dade_pa_record(
+    parcel_id: Optional[str] = None,
+    address: Optional[str] = None,
+    timeout_seconds: int = 45,
+    listing_price: Optional[float] = None,
+) -> dict:
+    """Fetch full Miami-Dade PA record.
+
+    Drives a headless Chromium via Playwright, reads the text of each
+    ``pa-*`` component on the property detail page, and parses it into a
+    flat dict.
+
+    Args:
+        parcel_id: folio number (e.g. "31-2202-034-2470" or "3122020342470");
+            dashes are removed before it is placed in the deep-link URL.
+        address: alternative search by address (less reliable in this portal);
+            only used when no folio is given.
+        timeout_seconds: default timeout applied to Playwright page operations.
+            The wait for the detail page to render is a fixed 20 s regardless.
+        listing_price: enables flip-in-progress detection (passed through to
+            ``_detect_renovation_pattern``).
+
+    Returns: rich dict (same schema as pa_duval/pa_broward) with errors list.
+        Failures are reported in ``result["errors"]`` (plus a truncated
+        ``_trace``) instead of being raised; in that case only the fields
+        filled before the failure are present.
+    """
+    fetched_at = datetime.now(timezone.utc).isoformat()
+    result = {
+        "county": "Miami-Dade",
+        "source": "Miami-Dade Property Appraiser (apps.miamidadepa.gov)",
+        "fetched_at": fetched_at,
+        "errors": [],
+    }
+
+    if not parcel_id and not address:
+        result["errors"].append("no parcel_id or address provided")
+        return result
+
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError:
+        result["errors"].append("playwright not installed")
+        return result
+
+    # Normalize folio (no dashes for URL)
+    folio_clean = (parcel_id or "").replace("-", "").strip()
+
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            # try/finally guarantees the browser is closed on every exit path,
+            # including navigation timeouts and the early return below.
+            try:
+                ctx = browser.new_context(user_agent=USER_AGENT)
+                page = ctx.new_page()
+                page.set_default_timeout(timeout_seconds * 1000)
+
+                if folio_clean:
+                    # Deep link by folio
+                    url = f"{_BASE_URL}/#/?folio={folio_clean}"
+                    page.goto(url, wait_until="domcontentloaded")
+                else:
+                    # Search by address — landing page + fill form
+                    page.goto(f"{_BASE_URL}/", wait_until="domcontentloaded")
+                    time.sleep(5)
+                    # Address tab is default. Fill kendo-textbox[formcontrolname='address']
+                    addr_input = page.locator("kendo-textbox[formcontrolname='address'] input").first
+                    addr_input.fill(address or "")
+                    page.locator("button[aria-label='Search button']").first.click()
+
+                # Wait for property info to render
+                try:
+                    page.wait_for_function(
+                        "() => document.querySelector('pa-propertyinformation') "
+                        "&& document.querySelector('pa-propertyinformation').innerText.includes('Folio')",
+                        timeout=20000,
+                    )
+                except Exception as e:
+                    result["errors"].append(f"detail page didn't render: {e}")
+                    return result
+
+                # Short fixed pause after the property block appears, before
+                # the remaining components are read.
+                time.sleep(2)
+                result["source_url"] = page.url
+
+                # Extract text from each pa-component
+                sections = page.evaluate("""
+                    () => {
+                        const out = {};
+                        const components = [
+                            'pa-propertyinformation','pa-salesinformation',
+                            'pa-assessmentinformation','pa-taxablevalueinformation',
+                            'pa-benefitsinformation','pa-legaldescription',
+                            'pa-additionalinformation',
+                        ];
+                        for (const tag of components) {
+                            const el = document.querySelector(tag);
+                            out[tag] = el ? (el.innerText || '').trim() : '';
+                        }
+                        return out;
+                    }
+                """)
+
+                # Also extract sales history table rows
+                sales_rows = page.evaluate("""
+                    () => {
+                        const out = [];
+                        const sec = document.querySelector('pa-salesinformation');
+                        if (!sec) return out;
+                        const tbl = sec.querySelector('table');
+                        if (!tbl) return out;
+                        const rows = tbl.querySelectorAll('tr');
+                        for (let i = 1; i < rows.length; i++) {
+                            const cells = rows[i].querySelectorAll('td');
+                            if (cells.length < 4) continue;
+                            out.push({
+                                date: (cells[0]?.textContent || '').trim(),
+                                price: (cells[1]?.textContent || '').trim(),
+                                book_page: (cells[2]?.textContent || '').trim(),
+                                qualification: (cells[3]?.textContent || '').trim(),
+                                previous_owner: cells.length > 4 ? (cells[4]?.textContent || '').trim() : '',
+                            });
+                        }
+                        return out;
+                    }
+                """)
+
+                # Extract assessment table (3 years)
+                # Header row: find the row whose first cell text is "Year".
+                assessment_rows = page.evaluate("""
+                    () => {
+                        const out = {};
+                        const sec = document.querySelector('pa-assessmentinformation');
+                        if (!sec) return out;
+                        const tables = sec.querySelectorAll('table');
+                        if (tables.length === 0) return out;
+                        // Find header row in any table
+                        let years = [];
+                        let headerRowIdx = -1;
+                        let chosenTbl = null;
+                        for (const tbl of tables) {
+                            const rows = tbl.querySelectorAll('tr');
+                            for (let i = 0; i < rows.length; i++) {
+                                const firstCell = (rows[i].querySelector('th, td')?.textContent || '').trim().toLowerCase();
+                                if (firstCell === 'year') {
+                                    const headerCells = rows[i].querySelectorAll('th, td');
+                                    years = Array.from(headerCells).map(c => (c.textContent || '').trim()).slice(1);
+                                    headerRowIdx = i;
+                                    chosenTbl = tbl;
+                                    break;
+                                }
+                            }
+                            if (chosenTbl) break;
+                        }
+                        if (!chosenTbl || years.length === 0) return out;
+                        const rows = chosenTbl.querySelectorAll('tr');
+                        for (let i = headerRowIdx + 1; i < rows.length; i++) {
+                            const cells = rows[i].querySelectorAll('td, th');
+                            if (cells.length < 2) continue;
+                            const label = (cells[0]?.textContent || '').trim();
+                            const values = {};
+                            for (let j = 1; j < cells.length && j-1 < years.length; j++) {
+                                values[years[j-1]] = (cells[j].textContent || '').trim();
+                            }
+                            if (label) out[label] = values;
+                        }
+                        return out;
+                    }
+                """)
+
+                # Extract taxable value table (by district)
+                taxable_rows = page.evaluate("""
+                    () => {
+                        const out = {};
+                        const sec = document.querySelector('pa-taxablevalueinformation');
+                        if (!sec) return out;
+                        out._text = (sec.innerText || '').trim().substring(0, 2000);
+                        return out;
+                    }
+                """)
+            finally:
+                browser.close()
+
+        # ─── Post-process — parse via text labels ─────────────────────────
+        sections = sections or {}
+        assessment_rows = assessment_rows or {}
+        taxable_rows = taxable_rows or {}
+
+        result.update(
+            _parse_property_info(sections.get("pa-propertyinformation", "") or "")
+        )
+
+        # Sales history — clean each row
+        result["sales_history"] = _clean_sales_rows(sales_rows)
+
+        # Most recent qualified sale.
+        # NOTE(review): takes the first qualifying row, i.e. assumes the portal
+        # lists sales newest-first — confirm against the live page.
+        qualified = [
+            sale for sale in result["sales_history"]
+            if sale.get("qualified", "").startswith("Qualified")
+            and (sale.get("price") or 0) >= 1000
+        ]
+        result["most_recent_qualified_sale"] = qualified[0] if qualified else None
+
+        # Assessment 3-year values (Year column → Land, Building, Market, Assessed)
+        result["assessment_table"] = assessment_rows
+        result.update(_summarize_assessment(assessment_rows))
+
+        # Homestead detection from benefits section text
+        benefits_text = sections.get("pa-benefitsinformation", "") or ""
+        result["homestead_active"] = "homestead" in benefits_text.lower() and "$" in benefits_text
+
+        # Legal description: drop the heading line, cap at 500 characters.
+        legal_text = sections.get("pa-legaldescription", "") or ""
+        result["legal_description"] = re.sub(
+            r"^Legal Description\s*\n",
+            "",
+            legal_text.strip(),
+        )[:500] if legal_text else None
+
+        # Renovation signal
+        from data_fetchers.pa_duval import _detect_renovation_pattern
+        result["renovation_signal"] = _detect_renovation_pattern(
+            result["sales_history"], listing_price=listing_price,
+        )
+
+        # Raw sections for advanced consumers
+        result["_raw_sections"] = sections
+        result["_raw_taxable_text"] = taxable_rows.get("_text", "")
+
+    except Exception as e:
+        # Top-level boundary: report the failure in the result, do not raise.
+        import traceback
+        result["errors"].append(f"{type(e).__name__}: {e}")
+        result["_trace"] = traceback.format_exc()[:600]
+
+    return result
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# CLI
+# ════════════════════════════════════════════════════════════════════════════
+
+if __name__ == "__main__":
+    import argparse
+    import json
+
+    # Command-line entry point: fetch one record and print it as indented JSON.
+    cli = argparse.ArgumentParser(description="Miami-Dade PA full record fetcher")
+    for flag, help_text in (
+        ("--parcel", "Folio number (e.g. '31-2202-034-2470')"),
+        ("--address", "Alternative address search"),
+    ):
+        cli.add_argument(flag, help=help_text)
+    opts = cli.parse_args()
+
+    # At least one lookup key is required; argparse prints usage and exits.
+    if not (opts.parcel or opts.address):
+        cli.error("--parcel or --address required")
+
+    record = fetch_miami_dade_pa_record(parcel_id=opts.parcel, address=opts.address)
+    # default=str stringifies any value json cannot encode natively.
+    print(json.dumps(record, indent=2, default=str))