AR-House/data_fetchers/civitek_ocrs.py

"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.

Civitek OCRS (Online Court Records Search) is the JSF/PrimeFaces platform
shared by 33 FL counties. A single implementation covers all of them.

COUNTIES COVERED (33):
  Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
  Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
  Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
  Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
  Washington.

NOT INCLUDED (use other adapters):
  Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
  Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
  Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
  Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.

USAGE:
    from data_fetchers.civitek_ocrs import fetch_civitek_court_records
    result = fetch_civitek_court_records(
        county_name="Hernando",
        case_number="2024-CA-001234",
    )
    # → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}

TECHNICAL NOTES:
- Uses Playwright headless Chromium (free, ~$0 cost per query)
- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
- Rate limit: not stated by Civitek — intended budget is 1 req/2s. NOTE: this
  module does not enforce that throttle itself; callers must pace their requests.
"""
from __future__ import annotations

import re
import time
from datetime import datetime
from typing import Optional


# ════════════════════════════════════════════════════════════════════════════
# COUNTY CODE MAPPING (Civitek 2-digit codes)
# ════════════════════════════════════════════════════════════════════════════

# Lookup table: canonical county name (title case, no "County" suffix) mapped
# to its zero-padded, two-digit Civitek county code. The code is kept as a
# string because it is interpolated verbatim into the per-county OCRS URL
# built in fetch_civitek_court_records.
# NOTE(review): the numbers line up with Florida's alphabetical 01-67 county
# numbering — confirm against Civitek before adding new entries.
CIVITEK_COUNTY_CODES: dict[str, str] = {
    # Format: "County Name (canonical)": "NN" (2-digit Civitek code)
    "Baker": "02",
    "Bradford": "04",
    "Calhoun": "07",
    "Columbia": "12",
    "DeSoto": "14",
    "Dixie": "15",
    "Franklin": "19",
    "Gilchrist": "21",
    "Glades": "22",
    "Gulf": "23",
    "Hamilton": "24",
    "Hardee": "25",
    "Hendry": "26",
    "Hernando": "27",
    "Highlands": "28",
    "Holmes": "30",
    "Jackson": "32",
    "Jefferson": "33",
    "Lafayette": "34",
    "Levy": "38",
    "Liberty": "39",
    "Madison": "40",
    "Marion": "42",
    "Nassau": "45",
    "Okeechobee": "47",
    "Pasco": "51",
    "Putnam": "54",
    "Santa Rosa": "57",
    "Sumter": "60",
    "Union": "63",
    "Wakulla": "65",
    "Walton": "66",
    "Washington": "67",
}


def _county_lookup_key(county_name: str) -> str:
    """Reduce a county name to a case- and whitespace-insensitive lookup key.

    Drops the word "County" (any capitalisation), removes all whitespace and
    case-folds, so "Santa Rosa County", "santa rosa" and "SANTA  ROSA" all
    produce the same key.
    """
    without_suffix = re.sub(r"\s+county\b", "", county_name.strip(), flags=re.IGNORECASE)
    return "".join(without_suffix.split()).casefold()


def is_civitek_county(county_name: Optional[str]) -> bool:
    """Return True if the county is one of the 33 FL counties served by Civitek.

    Accepts the same spellings as ``civitek_code_for``; ``None`` and empty
    strings yield False.
    """
    # Delegate so both public helpers always agree on what counts as a match.
    return civitek_code_for(county_name) is not None


def civitek_code_for(county_name: str) -> Optional[str]:
    """Return the Civitek 2-digit code for a county name, or None.

    Matching ignores letter case, surrounding/internal whitespace and an
    optional "County" suffix, e.g. "Hernando", "hernando county" and
    "HERNANDO COUNTY" all resolve to "27".

    Args:
        county_name: County name as supplied by the caller. ``None``, empty
            and non-string values are treated as "unknown".

    Returns:
        The code from ``CIVITEK_COUNTY_CODES`` as a string, or None when the
        county is not in that table.
    """
    if not county_name or not isinstance(county_name, str):
        return None
    wanted = _county_lookup_key(county_name)
    # The table has only 33 entries, so a linear scan is cheap and avoids
    # maintaining a second, derived lookup structure.
    for canonical_name, code in CIVITEK_COUNTY_CODES.items():
        if _county_lookup_key(canonical_name) == wanted:
            return code
    return None


# ════════════════════════════════════════════════════════════════════════════
# CASE NUMBER PARSER
# ════════════════════════════════════════════════════════════════════════════

# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
# Common formats observed in realauction.com deals:
#   "2024-CA-001234"
#   "23-2024-CA-001234"
#   "2024CA001234"
#   "2024-001234-CA"
#   "27-2024-CA-001234"  (court code prefix)
# Ordered list of recognisers; the first pattern that matches wins.
_CASE_PATTERNS = [
    # [NN-]year[-]type[-]seq. Every separator is optional, so this single
    # pattern also covers the compact "2024CA001234" form (a dedicated
    # "tight" pattern would never be reached).
    re.compile(r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})", re.IGNORECASE),
    # year[-]seq[-]type
    re.compile(r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)", re.IGNORECASE),
]


def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
    """Parse a FL case number into ``(year, sequence)``.

    The sequence is returned WITHOUT leading zeros (an all-zero sequence
    becomes "0"). Matching is case-insensitive and only recognises 20xx years
    with one of the court types CA, CC, CF, MM, DR or CP.

    Args:
        case_number: Raw case number text in any of the supported layouts.

    Returns:
        ``(year, sequence)`` as strings, or None when the input is empty, is
        not a string, or matches none of the known layouts.

    Examples:
        "2024-CA-001234"    → ("2024", "1234")
        "23-2024-CA-001234" → ("2024", "1234")
        "2024CA001234"      → ("2024", "1234")
        "2024-001234-CA"    → ("2024", "1234")
    """
    # Guard against None / numeric input instead of failing on .strip().
    if not isinstance(case_number, str):
        return None
    text = case_number.strip()
    if not text:
        return None
    for pattern in _CASE_PATTERNS:
        match = pattern.search(text)
        if match:
            # "000000" would strip to "", so fall back to a single "0".
            sequence = match.group("seq").lstrip("0") or "0"
            return (match.group("year"), sequence)
    return None


# ════════════════════════════════════════════════════════════════════════════
# PUBLIC API
# ════════════════════════════════════════════════════════════════════════════

def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS by driving a Chromium browser.

    Provide ONE of:
        - case_number (e.g., "2024-CA-001234") → fastest, most precise
        - party_lastname (with optional firstname) → person search
        - business_name → business search
    If several are supplied, precedence is case_number > business_name >
    party_lastname; only one search is executed.

    Args:
        county_name: County to search; a trailing " County" is stripped.
        case_number: Case number, parsed into year + sequence for the form.
        party_lastname: Last name for a person search.
        party_firstname: Optional first name, used only in a person search.
        business_name: Business name for a business search.
        headless: Launch the browser without a visible window when True.
        timeout_seconds: Default Playwright timeout for each page action.

    Returns:
        dict matching court_records.py contract:
        {
            "status": "CLEAN" | "LIS_PENDENS_ACTIVE" | "FORECLOSURE_PENDING" |
                      "FORECLOSURE_COMPLETE" | "OWNER_VERIFIED" | "UNKNOWN" |
                      "NOT_FOUND" | "ERROR",
            "county": str (normalized),
            "case_number_searched": str,
            "search_method": "case_number" | "person_name" | "business_name",
            "results": list of dicts (raw cases found),
            "case_data": dict (top result) | None,
            "lis_pendens": list,
            "liens_inventory": dict,
            "sources_used": ["civitek_ocrs"],
            "source_url": str,
            "errors": list of strings,
            "fetched_at": ISO timestamp,
        }
        This adapter itself only emits the statuses ERROR, NOT_FOUND,
        FORECLOSURE_PENDING, CLEAN and UNKNOWN, and always returns empty
        "lis_pendens" / "liens_inventory". Failures never raise; they are
        reported through "status" and "errors".
    """
    # Function-scope import keeps this block self-contained: datetime.utcnow()
    # is deprecated since Python 3.12, so build the same naive-UTC string from
    # an aware "now" instead.
    from datetime import timezone

    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")

    # Validate county
    code = civitek_code_for(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method (case_number > business_name > person_name)
    year = seq = None
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        search_method = "business_name"
    else:
        search_method = "person_name"

    # Playwright is imported lazily so the module can be imported (and the
    # pure helpers used) on machines where it is not installed.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    # Nothing can be searched without accepting the disclaimer;
                    # the finally-clause below still closes the browser.
                    return _error_result(
                        county=county_normalized,
                        case_number_searched=case_number or "",
                        error="Disclaimer page didn't show 'I Agree' button",
                        fetched_at=fetched_at,
                    )
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                if search_method == "case_number":
                    # Switch to Case Search tab (data-index=1)
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Best effort: record the problem but still try the
                        # fields in case the tab is already active.
                        errors.append("Case Search tab not found")
                    else:
                        case_tab.click()
                        page.wait_for_timeout(1500)
                    page.fill("#form\\:search_tab\\:year", year)
                    page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # Person Search tab is default (data-index=0)
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                # Step 5: Submit (prefer the exact-text button, fall back to
                # any button that merely contains "Search")
                search_btn = page.locator(
                    "button:has(.ui-button-text:text-is('Search'))"
                ).first
                if search_btn.count() == 0:
                    search_btn = page.locator("button:has-text('Search')").first
                search_btn.click()
                page.wait_for_timeout(6000)

                # Step 6: Capture up to five validation messages shown by the form
                for msg in page.locator(".ui-message-error, .ui-messages-error").all()[:5]:
                    try:
                        text = (msg.inner_text() or "").strip()[:200]
                    except Exception:
                        # Reading a message is best effort; an element that
                        # vanished must not abort the whole search.
                        continue
                    # Compare the truncated text so long messages de-duplicate.
                    if text and text not in errors:
                        errors.append(text)

                # Step 7: Parse results table
                results = _parse_results(page)
                final_url = page.url
            finally:
                # Close the browser on every path (success, early return, error).
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
    except Exception as e:
        # Boundary handler: this function reports failures in the result dict
        # instead of raising.
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")

    # Determine status from results
    if not results:
        browser_failed = any(
            "crashed" in e.lower() or "timeout" in e.lower() for e in errors
        )
        status_from_results = "ERROR" if browser_failed else "NOT_FOUND"
    else:
        # Has results — coarse heuristic based on the first row's case type.
        first_type = (results[0].get("case_type") or "").upper()
        if "CA" in first_type or "CIVIL" in first_type:
            status_from_results = "FORECLOSURE_PENDING"  # CA cases include foreclosures
        elif "CF" in first_type or "FELONY" in first_type:
            status_from_results = "CLEAN"  # unrelated criminal
        else:
            status_from_results = "UNKNOWN"

    return {
        "status": status_from_results,
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        # Top result, or None when nothing was found
        "case_data": results[0] if results else None,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }


# ════════════════════════════════════════════════════════════════════════════
# Internal helpers
# ════════════════════════════════════════════════════════════════════════════

def _map_result_columns(headers: list[str]) -> dict[str, int]:
    """Map lower-cased header texts to result-field names → column index.

    A header mentioning "case"/"uniform" is the case-number column only when
    it does not also name a more specific field; otherwise "Case Type" or
    "Case Status" would overwrite the real case-number column. When several
    headers map to the same field, the right-most one wins.
    """
    more_specific = ("type", "status", "filed", "date", "party")
    col_idx: dict[str, int] = {}
    for i, h in enumerate(headers):
        mentions_case = "case" in h or "uniform" in h
        if mentions_case and not any(kw in h for kw in more_specific):
            col_idx["case_number"] = i
        elif "type" in h or "court" in h:
            col_idx["case_type"] = i
        elif "filed" in h or "date" in h:
            col_idx["filed_date"] = i
        elif "party" in h or "name" in h or "defendant" in h or "plaintiff" in h:
            col_idx["parties"] = i
        elif "status" in h:
            col_idx["status"] = i
    return col_idx


def _parse_results(page) -> list[dict]:
    """Parse the results table from a Civitek search results page.

    Scans every <table> on the page and uses the first one whose header row
    looks case-related and that yields at least one data row.

    Args:
        page: Playwright (sync API) page showing the search results.

    Returns:
        One dict per data row, with whichever of the keys ``case_number``,
        ``case_type``, ``filed_date``, ``parties`` and ``status`` could be
        matched to a column. Empty list when no usable table is found.
    """
    header_keywords = ("case", "uniform", "filed", "party", "type")
    results: list[dict] = []
    for tbl in page.locator("table").all():
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            # Need a header row plus at least one data row.
            if row_count < 2:
                continue
            headers = [
                (cell.inner_text() or "").strip().lower()
                for cell in rows.first.locator("th, td").all()
            ]
            # Heuristic: this table has case results if any header contains
            # one of "case", "uniform", "filed", "party", "type".
            if not any(kw in h for h in headers for kw in header_keywords):
                continue
            col_idx = _map_result_columns(headers)

            # Data rows
            for r in range(1, row_count):
                cell_texts = [
                    (c.inner_text() or "").strip()
                    for c in rows.nth(r).locator("td").all()
                ]
                if not cell_texts:
                    continue
                # Short rows (e.g. colspan placeholders) simply omit the
                # fields whose column is missing.
                row_data = {
                    field: cell_texts[i]
                    for field, i in col_idx.items()
                    if i < len(cell_texts)
                }
                if row_data:
                    results.append(row_data)
        except Exception:
            # Best effort: a table that cannot be read is skipped, not fatal.
            continue
        # If we found a results table with rows, stop
        if results:
            break
    return results


def _error_result(
    county: str,
    case_number_searched: str,
    error: str,
    fetched_at: str,
) -> dict:
    return {
        "status": "ERROR",
        "county": county,
        "case_number_searched": case_number_searched,
        "search_method": None,
        "results": [],
        "case_data": None,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": "",
        "errors": [error],
        "fetched_at": fetched_at,
    }


# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Manual smoke test: run a single search from the command line and dump
    # the resulting dict as indented JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    cli.add_argument("--county", required=True, help="County name (e.g., Hernando)")
    # The optional search criteria are all plain string options.
    for flag, help_text in (
        ("--case", "Case number (e.g., 2024-CA-001234)"),
        ("--last-name", "Last name for person search"),
        ("--first-name", "First name (optional with last-name)"),
        ("--business", "Business name search"),
    ):
        cli.add_argument(flag, help=help_text)
    cli.add_argument("--no-headless", action="store_true", help="Show browser window")
    opts = cli.parse_args()

    outcome = fetch_civitek_court_records(
        county_name=opts.county,
        case_number=opts.case,
        party_lastname=opts.last_name,
        party_firstname=opts.first_name,
        business_name=opts.business,
        headless=not opts.no_headless,
    )
    # default=str keeps the dump from failing on non-JSON-native values.
    print(json.dumps(outcome, indent=2, default=str))