"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.

Civitek OCRS (Online Court Records Search) is the JSF/PrimeFaces platform shared
by 33 Florida counties. A single implementation covers all of them.

COUNTIES COVERED (33):
    Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
    Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
    Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
    Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
    Washington.

NOT INCLUDED (use other adapters):
    Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
    Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
    Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
    Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.

USAGE:
    from data_fetchers.civitek_ocrs import fetch_civitek_court_records

    result = fetch_civitek_court_records(
        county_name="Hernando",
        case_number="2024-CA-001234",
    )
    # → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}

TECHNICAL NOTES:
    - Uses Playwright headless Chromium (free, ~$0 cost per query)
    - Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
    - Auto-generated DOM ids (j_idt*) change per session — we use text selectors
    - Field ids bound to managed beans (search_tab:lastname, search_tab:year)
      ARE stable
    - Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
    - Rate limit: not stated by Civitek — the intended pace is 1 req/2s.
      NOTE: this module does not enforce that throttle itself; callers that
      issue many queries must pace them.
"""
from __future__ import annotations

import re
import time
from datetime import datetime, timezone
from typing import Optional

# ════════════════════════════════════════════════════════════════════════════
# COUNTY CODE MAPPING (Civitek 2-digit codes)
# ════════════════════════════════════════════════════════════════════════════

CIVITEK_COUNTY_CODES: dict[str, str] = {
    # Format: "County Name (canonical)": "NN" (2-digit Civitek code)
    "Baker": "02",
    "Bradford": "04",
    "Calhoun": "07",
    "Columbia": "12",
    "DeSoto": "14",
    "Dixie": "15",
    "Franklin": "19",
    "Gilchrist": "21",
    "Glades": "22",
    "Gulf": "23",
    "Hamilton": "24",
    "Hardee": "25",
    "Hendry": "26",
    "Hernando": "27",
    "Highlands": "28",
    "Holmes": "30",
    "Jackson": "32",
    "Jefferson": "33",
    "Lafayette": "34",
    "Levy": "38",
    "Liberty": "39",
    "Madison": "40",
    "Marion": "42",
    "Nassau": "45",
    "Okeechobee": "47",
    "Pasco": "51",
    "Putnam": "54",
    "Santa Rosa": "57",
    "Sumter": "60",
    "Union": "63",
    "Wakulla": "65",
    "Walton": "66",
    "Washington": "67",
}

# Trailing " County" suffix, matched case-insensitively.
_COUNTY_SUFFIX_RE = re.compile(r"\s+county$", re.IGNORECASE)


def _county_key(name: str) -> str:
    """Return a case- and whitespace-insensitive lookup key for a county name."""
    return "".join(name.split()).lower()


# Lookup key → canonical county name, so "hernando", "SANTA ROSA" and
# "De Soto" all resolve to the spelling used in CIVITEK_COUNTY_CODES.
_CANONICAL_COUNTY: dict[str, str] = {
    _county_key(name): name for name in CIVITEK_COUNTY_CODES
}


def _normalize_county(county_name: Optional[str]) -> str:
    """Strip whitespace and a trailing " County"; canonicalize known counties.

    Returns the canonical key of CIVITEK_COUNTY_CODES when the name matches a
    covered county (ignoring case and internal spacing); otherwise returns the
    cleaned input unchanged so it can be echoed back in error messages.
    ``None`` and empty input yield ``""``.
    """
    cleaned = _COUNTY_SUFFIX_RE.sub("", (county_name or "").strip())
    return _CANONICAL_COUNTY.get(_county_key(cleaned), cleaned)


def is_civitek_county(county_name: Optional[str]) -> bool:
    """True if county is in Civitek (33 FL counties).

    Matching ignores case, surrounding whitespace and a trailing " County".
    """
    if not county_name:
        return False
    return _normalize_county(county_name) in CIVITEK_COUNTY_CODES


def civitek_code_for(county_name: str) -> Optional[str]:
    """Return Civitek 2-digit code for a county name, or None if not covered."""
    return CIVITEK_COUNTY_CODES.get(_normalize_county(county_name))


# ════════════════════════════════════════════════════════════════════════════
# CASE NUMBER PARSER
# ════════════════════════════════════════════════════════════════════════════

# Real FL case numbers come in many shapes. Civitek wants (year, sequence)
# separately.
# Common formats observed in realauction.com deals:
#   "2024-CA-001234"
#   "23-2024-CA-001234"
#   "2024CA001234"
#   "2024-001234-CA"
#   "27-2024-CA-001234"   (court code prefix)
_CASE_PATTERNS = [
    # year-type-seq (optionally preceded by a 2-digit court code)
    re.compile(
        r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})",
        re.IGNORECASE,
    ),
    # year-seq-type
    re.compile(
        r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)",
        re.IGNORECASE,
    ),
    # tight: yearTypeNNNNNN
    re.compile(
        r"(?P<year>20\d{2})(?:CA|CC|CF|MM|DR|CP)(?P<seq>\d{3,8})",
        re.IGNORECASE,
    ),
]


def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
    """Parse a FL case_number into (year, sequence).

    The sequence is returned with leading zeros stripped (an all-zero
    sequence becomes "0"). Returns None if unparseable or empty.

    Examples:
        "2024-CA-001234"    → ("2024", "1234")
        "23-2024-CA-001234" → ("2024", "1234")
        "2024CA001234"      → ("2024", "1234")
        "2024-001234-CA"    → ("2024", "1234")
    """
    if not case_number:
        return None
    cn = case_number.strip().upper()
    for pat in _CASE_PATTERNS:
        m = pat.search(cn)
        if m:
            return (m.group("year"), m.group("seq").lstrip("0") or "0")
    return None


# ════════════════════════════════════════════════════════════════════════════
# PUBLIC API
# ════════════════════════════════════════════════════════════════════════════

def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS.

    Provide ONE of:
      - case_number (e.g., "2024-CA-001234") → fastest, most precise
      - party_lastname (with optional firstname) → person search
      - business_name → business search

    When several are given, case_number takes precedence, then business_name,
    then party_lastname.

    Args:
        county_name: County to search; case, spacing and a trailing " County"
            are ignored.
        case_number: Case number in any format parse_case_number() accepts.
        party_lastname: Last name for a person search.
        party_firstname: Optional first name, used only with party_lastname.
        business_name: Business name for a business search.
        headless: Run Chromium without a visible window.
        timeout_seconds: Default Playwright timeout for each page action.

    Returns dict matching court_records.py contract:
      {
        "status": "CLEAN" | "LIS_PENDENS_ACTIVE" | "FORECLOSURE_PENDING"
                  | "FORECLOSURE_COMPLETE" | "OWNER_VERIFIED" | "UNKNOWN"
                  | "NOT_FOUND" | "ERROR",
        "county": str (normalized),
        "case_number_searched": str,
        "search_method": "case_number" | "person_name" | "business_name"
                         (None on early validation/setup errors),
        "results": list of dicts (raw cases found),
        "case_data": dict (top result enriched) | None,
        "lis_pendens": list,
        "liens_inventory": dict,
        "sources_used": ["civitek_ocrs"],
        "source_url": str,
        "errors": list of strings,
        "fetched_at": ISO timestamp,
      }

    Never raises for lookup failures: validation problems, a missing
    playwright install, timeouts and browser crashes are all reported through
    "status" and "errors".
    """
    # Aware UTC clock (datetime.utcnow() is deprecated); tzinfo is dropped so
    # the emitted string keeps the "<naive ISO>Z" shape.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    county_normalized = _normalize_county(county_name)

    # Validate county
    code = CIVITEK_COUNTY_CODES.get(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method
    year = seq = None
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        search_method = "business_name"
    else:
        search_method = "person_name"

    # Imported lazily so the pure helpers in this module stay usable without
    # playwright installed.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    return _error_result(
                        county_normalized,
                        case_number or "",
                        "Disclaimer page didn't show 'I Agree' button",
                        fetched_at,
                    )
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                if search_method == "case_number":
                    # Switch to Case Search tab (data-index=1)
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Without the tab the year/sequence fields cannot be
                        # filled; submitting an empty form would misreport
                        # the lookup as NOT_FOUND.
                        return _error_result(
                            county_normalized,
                            case_number or "",
                            "Case Search tab not found",
                            fetched_at,
                        )
                    case_tab.click()
                    page.wait_for_timeout(1500)
                    page.fill("#form\\:search_tab\\:year", year)
                    page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # Person Search tab is default (data-index=0)
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                # Step 5: Submit (exact-text match first, looser match as fallback)
                search_btn = page.locator(
                    "button:has(.ui-button-text:text-is('Search'))"
                ).first
                if search_btn.count() == 0:
                    search_btn = page.locator("button:has-text('Search')").first
                search_btn.click()
                page.wait_for_timeout(6000)

                # Step 6: Capture validation errors (at most 5, 200 chars each)
                err_msgs = page.locator(".ui-message-error, .ui-messages-error").all()
                for msg_el in err_msgs[:5]:
                    try:
                        text = (msg_el.inner_text() or "").strip()[:200]
                    except Exception:
                        # Best-effort: a message node that detached while
                        # being read must not abort the whole lookup.
                        continue
                    if text and text not in errors:
                        errors.append(text)

                # Step 7: Parse results table
                # Civitek results render in a data table with role=grid or as
                # a panelgrid. Look for tables that contain "Case" headers.
                results = _parse_results(page)
                final_url = page.url
            finally:
                # Always release the browser, including on early returns and
                # on exceptions raised by any step above.
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
    except Exception as e:
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")

    return {
        "status": _infer_status(results, errors),
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        # Top result enriched
        "case_data": results[0] if results else None,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }


# ════════════════════════════════════════════════════════════════════════════
# Internal helpers
# ════════════════════════════════════════════════════════════════════════════

# A table is treated as the results table when any header contains one of these.
_RESULT_HEADER_KEYWORDS = ("case", "uniform", "filed", "party", "type")


def _infer_status(results: list[dict], errors: list[str]) -> str:
    """Derive the contract "status" value from parsed results and errors.

    With no results: "ERROR" if any error mentions a crash or timeout,
    otherwise "NOT_FOUND". With results, the status is inferred from the
    first result's case_type.
    """
    if not results:
        fatal = any(
            "crashed" in msg.lower() or "timeout" in msg.lower() for msg in errors
        )
        return "ERROR" if fatal else "NOT_FOUND"

    first_type = (results[0].get("case_type") or "").upper()
    if "CA" in first_type or "CIVIL" in first_type:
        return "FORECLOSURE_PENDING"  # CA cases include foreclosures
    if "CF" in first_type or "FELONY" in first_type:
        return "CLEAN"  # unrelated criminal
    return "UNKNOWN"


def _index_columns(headers: list[str]) -> dict[str, int]:
    """Map result field names to column indexes from lower-cased header texts.

    Specific keywords are tested before the generic "case"/"uniform" ones so
    that headers such as "Case Type" or "Case Status" land in case_type and
    status instead of overwriting case_number. When several headers map to
    the same field, the last one wins.
    """
    col_idx: dict[str, int] = {}
    for i, h in enumerate(headers):
        if "type" in h or "court" in h:
            col_idx["case_type"] = i
        elif "filed" in h or "date" in h:
            col_idx["filed_date"] = i
        elif "party" in h or "name" in h or "defendant" in h or "plaintiff" in h:
            col_idx["parties"] = i
        elif "status" in h:
            col_idx["status"] = i
        elif "case" in h or "uniform" in h:
            col_idx["case_number"] = i
    return col_idx


def _parse_results(page) -> list[dict]:
    """Parse the results table from a Civitek search results page.

    Civitek renders results as a DataTable (PrimeFaces). Scans every <table>
    on the page, picks the first one whose header row contains a case-related
    keyword and that yields at least one data row, and returns its rows.

    Returns list of dicts with whichever of case_number, case_type,
    filed_date, parties and status could be mapped from the headers.
    """
    results: list[dict] = []
    for tbl in page.locator("table").all():
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            if row_count < 2:
                continue

            # Header row
            headers = [
                (h.inner_text() or "").strip().lower()
                for h in rows.first.locator("th, td").all()
            ]
            if not any(kw in h for h in headers for kw in _RESULT_HEADER_KEYWORDS):
                continue

            col_idx = _index_columns(headers)

            # Data rows
            for r in range(1, row_count):
                cell_texts = [
                    (c.inner_text() or "").strip()
                    for c in rows.nth(r).locator("td").all()
                ]
                if not cell_texts:
                    continue
                row_data = {
                    key: cell_texts[i]
                    for key, i in col_idx.items()
                    if i < len(cell_texts)
                }
                if row_data:
                    results.append(row_data)
        except Exception:
            # Best-effort scrape: a table that fails mid-read is abandoned,
            # keeping any rows already collected from it.
            pass

        # Stop at the first table that produced rows so rows from different
        # tables are never mixed.
        if results:
            break
    return results


def _error_result(
    county: str,
    case_number_searched: str,
    error: str,
    fetched_at: str,
) -> dict:
    """Build a contract-shaped result dict with status "ERROR" and one error."""
    return {
        "status": "ERROR",
        "county": county,
        "case_number_searched": case_number_searched,
        "search_method": None,
        "results": [],
        "case_data": None,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": "",
        "errors": [error],
        "fetched_at": fetched_at,
    }


# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════

def _main() -> None:
    """Parse command-line arguments, run one lookup and print it as JSON."""
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    parser.add_argument("--county", required=True, help="County name (e.g., Hernando)")
    parser.add_argument("--case", help="Case number (e.g., 2024-CA-001234)")
    parser.add_argument("--last-name", help="Last name for person search")
    parser.add_argument("--first-name", help="First name (optional with last-name)")
    parser.add_argument("--business", help="Business name search")
    parser.add_argument("--no-headless", action="store_true", help="Show browser window")
    args = parser.parse_args()

    result = fetch_civitek_court_records(
        county_name=args.county,
        case_number=args.case,
        party_lastname=args.last_name,
        party_firstname=args.first_name,
        business_name=args.business,
        headless=not args.no_headless,
    )
    print(json.dumps(result, indent=2, default=str))


if __name__ == "__main__":
    _main()