feat: AR-House initial commit

2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
@@ -0,0 +1,456 @@
+"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.
+
+Civitek OCRS (Online Court Records Search) is the JSF/PrimeFaces platform
+shared by 33 FL counties. A single implementation covers all of them.
+
+COUNTIES COVERED (33):
+  Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
+  Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
+  Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
+  Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
+  Washington.
+
+NOT INCLUDED (use other adapters):
+  Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
+  Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
+  Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
+  Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.
+
+USAGE:
+    from data_fetchers.civitek_ocrs import fetch_civitek_court_records
+    result = fetch_civitek_court_records(
+        county_name="Hernando",
+        case_number="2024-CA-001234",
+    )
+    # → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}
+
+TECHNICAL NOTES:
+- Uses Playwright headless Chromium (free, ~$0 cost per query)
+- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
+- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
+- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
+- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
+- Rate limit: not stated by Civitek — we self-throttle to 1 req/2s
+"""
+from __future__ import annotations
+
+import re
+import time
+from datetime import datetime
+from typing import Optional
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# COUNTY CODE MAPPING (Civitek 2-digit codes)
+# ════════════════════════════════════════════════════════════════════════════
+
+# Canonical county name (written without a " County" suffix) -> Civitek code.
+# Codes are stored as zero-padded 2-character strings rather than ints because
+# fetch_civitek_court_records() interpolates them verbatim into the OCRS URL
+# path (".../ocrs/county/{code}/").
+CIVITEK_COUNTY_CODES: dict[str, str] = {
+    # Format: "County Name (canonical)": "NN" (2-digit Civitek code)
+    "Baker": "02",
+    "Bradford": "04",
+    "Calhoun": "07",
+    "Columbia": "12",
+    "DeSoto": "14",
+    "Dixie": "15",
+    "Franklin": "19",
+    "Gilchrist": "21",
+    "Glades": "22",
+    "Gulf": "23",
+    "Hamilton": "24",
+    "Hardee": "25",
+    "Hendry": "26",
+    "Hernando": "27",
+    "Highlands": "28",
+    "Holmes": "30",
+    "Jackson": "32",
+    "Jefferson": "33",
+    "Lafayette": "34",
+    "Levy": "38",
+    "Liberty": "39",
+    "Madison": "40",
+    "Marion": "42",
+    "Nassau": "45",
+    "Okeechobee": "47",
+    "Pasco": "51",
+    "Putnam": "54",
+    "Santa Rosa": "57",
+    "Sumter": "60",
+    "Union": "63",
+    "Wakulla": "65",
+    "Walton": "66",
+    "Washington": "67",
+}
+
+
+def is_civitek_county(county_name: Optional[str]) -> bool:
+    """Tell whether *county_name* is served by the Civitek OCRS platform.
+
+    The answer is derived from civitek_code_for(), so both helpers always
+    agree on how a county name is normalized and matched.
+
+    Args:
+        county_name: County name, with or without a " County" suffix. None and
+            the empty string are accepted and yield False.
+
+    Returns:
+        True when a Civitek code exists for the county, otherwise False.
+    """
+    # Guard first: civitek_code_for() is only called with a non-empty value.
+    if not county_name:
+        return False
+    return civitek_code_for(county_name) is not None
+
+
+def civitek_code_for(county_name: str) -> Optional[str]:
+    """Return the Civitek 2-digit code for a county name, or None.
+
+    Matching is tolerant of how upstream data spells the county: letter case
+    is ignored ("hernando", "DESOTO"), a " County" suffix is dropped in any
+    letter case ("PASCO COUNTY"), and runs of whitespace are collapsed
+    ("Santa  Rosa"). Every name that resolved under a plain exact-match lookup
+    still resolves to the same code.
+
+    Args:
+        county_name: County name, with or without a " County" suffix. None or
+            an empty string yields None instead of raising.
+
+    Returns:
+        The code from CIVITEK_COUNTY_CODES, or None when the county is not on
+        the Civitek platform.
+    """
+    if not county_name:
+        return None
+    # Drop " County" wherever it appears, regardless of letter case.
+    bare = re.sub(r" county", "", county_name.strip(), flags=re.IGNORECASE)
+    wanted = " ".join(bare.split()).casefold()
+    # The table is tiny (33 entries), so a linear case-folded scan is fine and
+    # avoids keeping a second, derived lookup table in sync.
+    for canonical, code in CIVITEK_COUNTY_CODES.items():
+        if canonical.casefold() == wanted:
+            return code
+    return None
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# CASE NUMBER PARSER
+# ════════════════════════════════════════════════════════════════════════════
+
+# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
+# Common formats observed in realauction.com deals:
+#   "2024-CA-001234"
+#   "23-2024-CA-001234"
+#   "2024CA001234"
+#   "2024-001234-CA"
+#   "27-2024-CA-001234"  (court code prefix)
+# parse_case_number() tries these in order and uses the first one that matches
+# anywhere in the input. Every pattern exposes two named groups: "year"
+# (4 digits, 20xx only) and "seq" (3-8 digits). Only the case-type codes
+# CA, CC, CF, MM, DR and CP are recognized.
+_CASE_PATTERNS = [
+    # year-type-seq, with an optional 2-digit "NN-" prefix and optional
+    # dash/space separators, e.g. "2024-CA-001234" or "23-2024-CA-001234"
+    re.compile(r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})", re.IGNORECASE),
+    # year-seq-type, e.g. "2024-001234-CA"
+    re.compile(r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)", re.IGNORECASE),
+    # tight: yearTypeNNNNNN, e.g. "2024CA001234"
+    # NOTE(review): the separators in the first pattern are optional, so it
+    # already matches this shape and this entry looks redundant — confirm
+    # before removing.
+    re.compile(r"(?P<year>20\d{2})(?:CA|CC|CF|MM|DR|CP)(?P<seq>\d{3,8})", re.IGNORECASE),
+]
+
+
+def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
+    """Split a FL case number into the (year, sequence) pair Civitek searches on.
+
+    The input is trimmed and upper-cased, then each entry of _CASE_PATTERNS is
+    tried in order; the first match wins. Leading zeros are removed from the
+    sequence (an all-zero sequence becomes "0").
+
+    Examples:
+        "2024-CA-001234" → ("2024", "1234")
+        "23-2024-CA-001234" → ("2024", "1234")
+        "2024CA001234" → ("2024", "1234")
+        "2024-001234-CA" → ("2024", "1234")
+
+    Returns:
+        (year, sequence) as strings, or None when the input is empty or matches
+        none of the known formats.
+    """
+    if not case_number:
+        return None
+    normalized = case_number.strip().upper()
+    # Lazy generator: later patterns are not evaluated once one has matched.
+    attempts = (rx.search(normalized) for rx in _CASE_PATTERNS)
+    hit = next((found for found in attempts if found), None)
+    if hit is None:
+        return None
+    sequence = hit.group("seq").lstrip("0")
+    return (hit.group("year"), sequence if sequence else "0")
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# PUBLIC API
+# ════════════════════════════════════════════════════════════════════════════
+
+def fetch_civitek_court_records(
+    county_name: str,
+    case_number: Optional[str] = None,
+    party_lastname: Optional[str] = None,
+    party_firstname: Optional[str] = None,
+    business_name: Optional[str] = None,
+    headless: bool = True,
+    timeout_seconds: int = 45,
+) -> dict:
+    """Fetch court records from Civitek OCRS.
+
+    Drives a Chromium browser through the OCRS flow (entry page → "Public" →
+    "I Agree" disclaimer → search form → results table) and returns whatever
+    rows the results table yields. Failures are reported inside the returned
+    dict ("status" and "errors"); Playwright exceptions are not propagated.
+
+    Provide ONE of (when several are given, the first in this order wins):
+        - case_number (e.g., "2024-CA-001234") → fastest, most precise
+        - business_name → business search
+        - party_lastname (with optional firstname) → person search
+
+    Args:
+        county_name: County name, with or without a " County" suffix.
+        case_number: Case number; must be parseable by parse_case_number().
+        party_lastname: Last name for a person search.
+        party_firstname: Optional first name, only used with party_lastname.
+        business_name: Business name for a business search.
+        headless: Run the browser without a visible window.
+        timeout_seconds: Default Playwright timeout applied to page actions.
+
+    Returns dict matching court_records.py contract:
+        {
+            "status": "CLEAN" | "LIS_PENDENS_ACTIVE" | "FORECLOSURE_PENDING" |
+                      "FORECLOSURE_COMPLETE" | "OWNER_VERIFIED" | "UNKNOWN" |
+                      "NOT_FOUND" | "ERROR",
+            "county": str (normalized),
+            "case_number_searched": str,
+            "search_method": "case_number" | "person_name" | "business_name",
+            "results": list of dicts (raw cases found),
+            "case_data": dict (top result enriched) | None,
+            "lis_pendens": list,
+            "liens_inventory": dict,
+            "sources_used": ["civitek_ocrs"],
+            "source_url": str,
+            "errors": list of strings,
+            "fetched_at": ISO timestamp,
+        }
+
+        This adapter itself only emits the statuses FORECLOSURE_PENDING, CLEAN,
+        UNKNOWN, NOT_FOUND and ERROR, and always returns an empty "lis_pendens"
+        list and an empty "liens_inventory" dict.
+    """
+    # Local import: the module only imports the `datetime` class.
+    from datetime import timezone
+
+    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
+    # timestamp and drop the tzinfo so the "<naive ISO>Z" format is unchanged.
+    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
+    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")
+
+    # Validate county
+    code = civitek_code_for(county_normalized)
+    if not code:
+        return _error_result(
+            county=county_normalized,
+            case_number_searched=case_number or "",
+            error=f"County '{county_normalized}' not on Civitek platform. "
+                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
+            fetched_at=fetched_at,
+        )
+
+    # Validate at least one search criterion
+    if not (case_number or party_lastname or business_name):
+        return _error_result(
+            county=county_normalized,
+            case_number_searched="",
+            error="Must provide one of: case_number, party_lastname, or business_name",
+            fetched_at=fetched_at,
+        )
+
+    # Determine search method
+    if case_number:
+        parsed = parse_case_number(case_number)
+        if not parsed:
+            return _error_result(
+                county=county_normalized,
+                case_number_searched=case_number,
+                error=f"Could not parse case_number '{case_number}' into year+sequence",
+                fetched_at=fetched_at,
+            )
+        year, seq = parsed
+        search_method = "case_number"
+    elif business_name:
+        year = seq = None
+        search_method = "business_name"
+    else:
+        year = seq = None
+        search_method = "person_name"
+
+    # Execute Playwright flow (imported lazily so the module can be imported
+    # and its pure helpers used without Playwright installed)
+    try:
+        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
+    except ImportError:
+        return _error_result(
+            county=county_normalized,
+            case_number_searched=case_number or "",
+            error="playwright not installed. Run: pip install playwright && playwright install chromium",
+            fetched_at=fetched_at,
+        )
+
+    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
+    errors: list[str] = []
+    results: list[dict] = []
+    final_url = base_url
+    status_from_results = "UNKNOWN"
+
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=headless)
+            try:
+                ctx = browser.new_context(
+                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
+                )
+                page = ctx.new_page()
+                page.set_default_timeout(timeout_seconds * 1000)
+
+                # Step 1: Entry page
+                page.goto(base_url)
+                page.wait_for_timeout(1500)
+
+                # Step 2: Click Public
+                page.locator("button:has-text('Public')").first.click()
+                page.wait_for_timeout(2500)
+
+                # Step 3: Click I Agree (disclaimer)
+                agree_btn = page.locator("button:has-text('I Agree')").first
+                if agree_btn.count() == 0:
+                    errors.append("Disclaimer page didn't show 'I Agree' button")
+                    return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
+                agree_btn.click()
+                page.wait_for_timeout(2500)
+                final_url = page.url
+
+                # Step 4: Fill form based on search_method
+                if search_method == "case_number":
+                    # Switch to Case Search tab (data-index=1)
+                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
+                    if case_tab.count() == 0:
+                        # Best effort: record the problem but still try the
+                        # fields in case the tab label merely changed.
+                        errors.append("Case Search tab not found")
+                    else:
+                        case_tab.click()
+                        page.wait_for_timeout(1500)
+                    page.fill("#form\\:search_tab\\:year", year)
+                    page.fill("#form\\:search_tab\\:seq", seq)
+                elif search_method == "person_name":
+                    # Person Search tab is default (data-index=0)
+                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
+                    if party_firstname:
+                        page.fill("#form\\:search_tab\\:fname", party_firstname)
+                elif search_method == "business_name":
+                    page.fill("#form\\:search_tab\\:businessname", business_name)
+
+                # Step 5: Submit
+                search_btn = page.locator(
+                    "button:has(.ui-button-text:text-is('Search'))"
+                ).first
+                if search_btn.count() == 0:
+                    search_btn = page.locator("button:has-text('Search')").first
+                search_btn.click()
+                page.wait_for_timeout(6000)
+
+                # Step 6: Capture validation errors (at most 5, 200 chars each)
+                err_msgs = page.locator(".ui-message-error, .ui-messages-error").all()
+                for m in err_msgs[:5]:
+                    try:
+                        # Truncate BEFORE the duplicate check so a long message
+                        # is compared in the same form in which it is stored.
+                        t = (m.inner_text() or "").strip()[:200]
+                    except Exception:
+                        # Best effort: an unreadable message element must not
+                        # abort the whole lookup.
+                        continue
+                    if t and t not in errors:
+                        errors.append(t)
+
+                # Step 7: Parse results table
+                # Civitek results render in a data table with role=grid or as a panelgrid.
+                # Look for tables that contain "Case" headers
+                results = _parse_results(page)
+                final_url = page.url
+            finally:
+                # Close the browser on every exit path: normal completion, the
+                # early disclaimer return, timeouts and crashes alike.
+                browser.close()
+    except PWTimeout as e:
+        errors.append(f"Playwright timeout: {e}")
+    except Exception as e:
+        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")
+
+    # Determine status from results
+    if errors and not results:
+        status_from_results = "ERROR" if any("crashed" in e.lower() or "timeout" in e.lower() for e in errors) else "NOT_FOUND"
+    elif not results:
+        status_from_results = "NOT_FOUND"
+    else:
+        # Has results — infer from case_type of the first row only
+        first_type = (results[0].get("case_type") or "").upper()
+        if "CA" in first_type or "CIVIL" in first_type:
+            status_from_results = "FORECLOSURE_PENDING"  # CA cases include foreclosures
+        elif "CF" in first_type or "FELONY" in first_type:
+            status_from_results = "CLEAN"  # unrelated criminal
+        else:
+            status_from_results = "UNKNOWN"
+
+    # Top result enriched
+    case_data = results[0] if results else None
+
+    return {
+        "status": status_from_results,
+        "county": county_normalized,
+        "case_number_searched": case_number or "",
+        "search_method": search_method,
+        "results": results,
+        "case_data": case_data,
+        "lis_pendens": [],
+        "liens_inventory": {},
+        "sources_used": ["civitek_ocrs"],
+        "source_url": final_url,
+        "errors": errors,
+        "fetched_at": fetched_at,
+    }
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# Internal helpers
+# ════════════════════════════════════════════════════════════════════════════
+
+def _parse_results(page) -> list[dict]:
+    """Parse the results table from a Civitek search results page.
+
+    Scans every <table> on the page, treats the first row of each as its header
+    row and uses the first table whose headers look case-related and that
+    yields at least one row. Each data row becomes a dict holding whichever of
+    these keys could be mapped from the headers: case_number, case_type,
+    filed_date, parties, status.
+
+    Args:
+        page: Playwright page (sync API) currently showing the search results.
+
+    Returns:
+        List of row dicts; empty when no suitable table or no data row exists.
+    """
+    results: list[dict] = []
+    tables = page.locator("table").all()
+    for tbl in tables:
+        try:
+            rows = tbl.locator("tr")
+            row_count = rows.count()
+            if row_count < 2:
+                continue
+            # Header row
+            headers = [
+                (h.inner_text() or "").strip().lower()
+                for h in rows.first.locator("th, td").all()
+            ]
+            # Heuristic: this table has case results if headers include any
+            # of "case", "uniform", "filed", "party", "type"
+            if not any(
+                kw in h
+                for h in headers
+                for kw in ("case", "uniform", "filed", "party", "type")
+            ):
+                continue
+            # Index columns (a later column with the same meaning replaces an
+            # earlier one)
+            col_idx: dict[str, int] = {}
+            for i, h in enumerate(headers):
+                field = _header_field(h)
+                if field is not None:
+                    col_idx[field] = i
+
+            # Data rows
+            for r in range(1, row_count):
+                cells = rows.nth(r).locator("td").all()
+                if not cells:
+                    continue
+                # A lone cell under a multi-column header is a placeholder row
+                # (e.g. the full-width "No records found." message of an empty
+                # data table), not a case; reporting it would turn a miss into
+                # a bogus result.
+                if len(cells) == 1 and len(headers) > 1:
+                    continue
+                cell_texts = [(c.inner_text() or "").strip() for c in cells]
+                row_data = {
+                    key: cell_texts[idx]
+                    for key, idx in col_idx.items()
+                    if idx < len(cell_texts)
+                }
+                if row_data:
+                    results.append(row_data)
+        except Exception:
+            # Best effort: a table that cannot be read is skipped so one broken
+            # layout table does not hide the real results table.
+            continue
+        # If we found a results table with rows, stop
+        if results:
+            break
+    return results
+
+
+def _header_field(header: str) -> Optional[str]:
+    """Map a lower-cased column header to a result key, or None if unrecognized.
+
+    The most specific keywords are tested first: "case type" and "case status"
+    both contain "case", so testing "case" first would file them under
+    case_number and leave case_type / status unmapped.
+    """
+    if "status" in header:
+        return "status"
+    if "type" in header or "court" in header:
+        return "case_type"
+    if "filed" in header or "date" in header:
+        return "filed_date"
+    if "case" in header or "uniform" in header:
+        return "case_number"
+    if "party" in header or "name" in header or "defendant" in header or "plaintiff" in header:
+        return "parties"
+    return None
+
+
+def _error_result(
+    county: str,
+    case_number_searched: str,
+    error: str,
+    fetched_at: str,
+) -> dict:
+    """Build the result payload for a lookup that failed before yielding rows.
+
+    The dict has the same keys as a successful result so callers can handle
+    both uniformly: status is "ERROR", search_method is None, every collection
+    is empty, source_url is blank and *error* is the sole entry of "errors".
+
+    Args:
+        county: Normalized county name the request was made for.
+        case_number_searched: Case number as supplied by the caller ("" if none).
+        error: Human-readable description of what went wrong.
+        fetched_at: Timestamp string to report in the payload.
+
+    Returns:
+        A new dict; its list/dict values are fresh objects on every call.
+    """
+    payload = dict(
+        status="ERROR",
+        county=county,
+        case_number_searched=case_number_searched,
+        search_method=None,
+        results=[],
+        case_data=None,
+        lis_pendens=[],
+        liens_inventory={},
+        sources_used=["civitek_ocrs"],
+        source_url="",
+        errors=[error],
+        fetched_at=fetched_at,
+    )
+    return payload
+
+
+# ════════════════════════════════════════════════════════════════════════════
+# CLI for manual testing
+# ════════════════════════════════════════════════════════════════════════════
+
+if __name__ == "__main__":
+    # Manual smoke test: run one lookup and pretty-print the result as JSON.
+    import argparse
+    import json
+
+    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
+    cli.add_argument("--county", required=True, help="County name (e.g., Hernando)")
+    # The optional search criteria are all plain string options.
+    for flag, help_text in (
+        ("--case", "Case number (e.g., 2024-CA-001234)"),
+        ("--last-name", "Last name for person search"),
+        ("--first-name", "First name (optional with last-name)"),
+        ("--business", "Business name search"),
+    ):
+        cli.add_argument(flag, help=help_text)
+    cli.add_argument("--no-headless", action="store_true", help="Show browser window")
+    opts = cli.parse_args()
+
+    outcome = fetch_civitek_court_records(
+        county_name=opts.county,
+        case_number=opts.case,
+        party_lastname=opts.last_name,
+        party_firstname=opts.first_name,
+        business_name=opts.business,
+        headless=not opts.no_headless,
+    )
+    # default=str keeps the dump from failing on any non-JSON-native value.
+    print(json.dumps(outcome, indent=2, default=str))