457 lines
18 KiB
Python
457 lines
18 KiB
Python
"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.
|
|
|
|
Civitek OCRS (Online Court Records Search) is the JSF/PrimeFaces platform
shared by 33 FL counties. A single implementation covers all of them.
|
|
|
|
COUNTIES COVERED (33):
|
|
Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
|
|
Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
|
|
Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
|
|
Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
|
|
Washington.
|
|
|
|
NOT INCLUDED (use other adapters):
|
|
Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
|
|
Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
|
|
Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
|
|
Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.
|
|
|
|
USAGE:
|
|
from data_fetchers.civitek_ocrs import fetch_civitek_court_records
|
|
result = fetch_civitek_court_records(
|
|
county_name="Hernando",
|
|
case_number="2024-CA-001234",
|
|
)
|
|
# → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}
|
|
|
|
TECHNICAL NOTES:
|
|
- Uses Playwright headless Chromium (free, ~$0 cost per query)
|
|
- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
|
|
- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
|
|
- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
|
|
- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
|
|
- Rate limit: not stated by Civitek — this module does not throttle between calls; callers should space requests (~1 req/2s)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# COUNTY CODE MAPPING (Civitek 2-digit codes)
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Canonical county name -> 2-digit Civitek code (kept as a string so the
# leading zero survives). fetch_civitek_court_records() interpolates the code
# into the per-county OCRS URL. Entries are listed alphabetically by county.
CIVITEK_COUNTY_CODES: dict[str, str] = dict(
    (
        ("Baker", "02"),
        ("Bradford", "04"),
        ("Calhoun", "07"),
        ("Columbia", "12"),
        ("DeSoto", "14"),
        ("Dixie", "15"),
        ("Franklin", "19"),
        ("Gilchrist", "21"),
        ("Glades", "22"),
        ("Gulf", "23"),
        ("Hamilton", "24"),
        ("Hardee", "25"),
        ("Hendry", "26"),
        ("Hernando", "27"),
        ("Highlands", "28"),
        ("Holmes", "30"),
        ("Jackson", "32"),
        ("Jefferson", "33"),
        ("Lafayette", "34"),
        ("Levy", "38"),
        ("Liberty", "39"),
        ("Madison", "40"),
        ("Marion", "42"),
        ("Nassau", "45"),
        ("Okeechobee", "47"),
        ("Pasco", "51"),
        ("Putnam", "54"),
        ("Santa Rosa", "57"),
        ("Sumter", "60"),
        ("Union", "63"),
        ("Wakulla", "65"),
        ("Walton", "66"),
        ("Washington", "67"),
    )
)
|
|
|
|
|
|
def is_civitek_county(county_name: Optional[str]) -> bool:
    """Report whether a county is served by Civitek OCRS.

    The name is trimmed and any " County" / " county" fragment is removed
    before an exact, case-sensitive lookup in CIVITEK_COUNTY_CODES.
    ``None`` and the empty string yield False.
    """
    if county_name:
        bare_name = county_name.strip()
        for fragment in (" County", " county"):
            bare_name = bare_name.replace(fragment, "")
        return bare_name in CIVITEK_COUNTY_CODES
    return False
|
|
|
|
|
|
def civitek_code_for(county_name: Optional[str]) -> Optional[str]:
    """Return the 2-digit Civitek code for a county name, or None.

    The name is trimmed and any " County" / " county" fragment is removed
    before an exact, case-sensitive lookup in CIVITEK_COUNTY_CODES.

    Args:
        county_name: County name such as "Hernando" or "Hernando County".
            ``None`` or a blank string is accepted and yields None, matching
            how is_civitek_county() treats missing input.

    Returns:
        The code string (e.g. "27"), or None when the county is unknown or
        no name was given.
    """
    if not county_name:
        return None
    cn = county_name.strip().replace(" County", "").replace(" county", "")
    if not cn:
        # Whitespace-only input: nothing left to look up.
        return None
    return CIVITEK_COUNTY_CODES.get(cn)
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# CASE NUMBER PARSER
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
|
|
# Common formats observed in realauction.com deals:
|
|
# "2024-CA-001234"
|
|
# "23-2024-CA-001234"
|
|
# "2024CA001234"
|
|
# "2024-001234-CA"
|
|
# "27-2024-CA-001234" (court code prefix)
|
|
# Patterns are tried in order with re.search(); the first hit wins.
# The former third "tight" pattern (yearTypeNNNNNN) was unreachable: the
# first pattern already allows zero separators, so it matches every such
# string first.
_CASE_PATTERNS = [
    # [court-]year-type-seq, separators optional ("27-2024-CA-001234", "2024CA001234")
    re.compile(r"(?:\d{2}-)?(?P<year>20\d{2})[\-\s]?(?:CA|CC|CF|MM|DR|CP)[\-\s]?(?P<seq>\d{3,8})", re.IGNORECASE),
    # year-seq-type ("2024-001234-CA")
    re.compile(r"(?P<year>20\d{2})[\-\s]?(?P<seq>\d{3,8})[\-\s]?(?:CA|CC|CF|MM|DR|CP)", re.IGNORECASE),
]


def parse_case_number(case_number: Optional[str]) -> Optional[tuple[str, str]]:
    """Parse a FL case number into ``(year, sequence)``.

    The sequence is returned with leading zeros stripped (an all-zero
    sequence becomes "0"). Matching is case-insensitive and only recognises
    4-digit years starting with "20" and the court types CA, CC, CF, MM, DR
    and CP.

    Args:
        case_number: Raw case number text; ``None`` or "" is accepted.

    Returns:
        ``(year, sequence)`` as strings, or None if the input is empty or
        matches none of the known layouts.

    Examples:
        "2024-CA-001234"    → ("2024", "1234")
        "23-2024-CA-001234" → ("2024", "1234")
        "2024CA001234"      → ("2024", "1234")
        "2024-001234-CA"    → ("2024", "1234")
    """
    if not case_number:
        return None
    text = case_number.strip()
    for pattern in _CASE_PATTERNS:
        match = pattern.search(text)
        if match:
            # "or '0'" keeps a sequence of only zeros from collapsing to "".
            return (match.group("year"), match.group("seq").lstrip("0") or "0")
    return None
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# PUBLIC API
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS by driving a Chromium browser.

    Provide ONE of the following (checked in this priority order):
      - case_number (e.g., "2024-CA-001234")
      - business_name
      - party_lastname (with optional party_firstname)

    Args:
        county_name: County name, with or without a " County" suffix.
        case_number: Case number; parsed by parse_case_number().
        party_lastname: Last name for a person search.
        party_firstname: Optional first name, used only for person search.
        business_name: Business name for a business search.
        headless: Passed to Chromium launch; False shows the browser window.
        timeout_seconds: Default Playwright timeout for every page action.

    Returns:
        A dict with these keys:
        {
          "status": str — this adapter itself emits "ERROR", "NOT_FOUND",
                    "FORECLOSURE_PENDING", "CLEAN" or "UNKNOWN",
          "county": str (normalized),
          "case_number_searched": str,
          "search_method": "case_number" | "person_name" | "business_name"
                           (None on early validation/navigation errors),
          "results": list of dicts (raw rows found),
          "case_data": dict (first result) | None,
          "lis_pendens": list (always empty here),
          "liens_inventory": dict (always empty here),
          "sources_used": ["civitek_ocrs"],
          "source_url": str,
          "errors": list of strings,
          "fetched_at": ISO timestamp ending in "Z",
        }

    The function does not raise for scraping failures: validation problems, a
    missing playwright install and unexpected page layouts are returned as an
    "ERROR" result; Playwright exceptions are recorded in "errors".
    """
    # Function-scope import so the module's top-level imports stay untouched.
    from datetime import timezone

    # datetime.utcnow() is deprecated; build the same "...Z" string from an
    # aware UTC timestamp instead.
    fetched_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")

    # Validate county
    code = civitek_code_for(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method
    year = seq = None
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        search_method = "business_name"
    else:
        search_method = "person_name"

    # Playwright is imported lazily so the module can be imported without it.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url
    status_from_results = "UNKNOWN"

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    errors.append("Disclaimer page didn't show 'I Agree' button")
                    return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                if search_method == "case_number":
                    # Switch to the Case Search tab
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Without the tab the case form cannot be filled.
                        # Submitting anyway would search an empty form and
                        # misreport a scraper failure as NOT_FOUND.
                        errors.append("Case Search tab not found")
                        return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                    case_tab.click()
                    page.wait_for_timeout(1500)
                    page.fill("#form\\:search_tab\\:year", year)
                    page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # No tab switch: the original flow fills the person fields directly.
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                # Step 5: Submit (exact-text button first, looser match as fallback)
                search_btn = page.locator(
                    "button:has(.ui-button-text:text-is('Search'))"
                ).first
                if search_btn.count() == 0:
                    search_btn = page.locator("button:has-text('Search')").first
                search_btn.click()
                page.wait_for_timeout(6000)

                # Step 6: Capture validation errors (at most the first 5 nodes)
                for m in page.locator(".ui-message-error, .ui-messages-error").all()[:5]:
                    try:
                        # Truncate before the duplicate check so long
                        # messages are compared in their stored form.
                        t = (m.inner_text() or "").strip()[:200]
                    except Exception:
                        # Best-effort: a message node we cannot read is skipped.
                        continue
                    if t and t not in errors:
                        errors.append(t)

                # Step 7: Parse results table
                results = _parse_results(page)
                final_url = page.url
            finally:
                # Close the browser on every path: success, early return or exception.
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
    except Exception as e:
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")

    # Determine status from results
    if errors and not results:
        # Only Playwright failures count as ERROR; page-level validation
        # messages without rows are reported as NOT_FOUND.
        status_from_results = "ERROR" if any("crashed" in e.lower() or "timeout" in e.lower() for e in errors) else "NOT_FOUND"
    elif not results:
        status_from_results = "NOT_FOUND"
    else:
        # Has results — infer from the first row's case_type.
        # NOTE(review): these are substring checks, so "CA" also matches any
        # case-type text that merely contains those letters — confirm intent.
        first_type = (results[0].get("case_type") or "").upper()
        if "CA" in first_type or "CIVIL" in first_type:
            status_from_results = "FORECLOSURE_PENDING"  # CA cases include foreclosures
        elif "CF" in first_type or "FELONY" in first_type:
            status_from_results = "CLEAN"  # unrelated criminal
        else:
            status_from_results = "UNKNOWN"

    # Top result
    case_data = results[0] if results else None

    return {
        "status": status_from_results,
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        "case_data": case_data,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Internal helpers
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def _parse_results(page) -> list[dict]:
|
|
"""Parse the results table from a Civitek search results page.
|
|
|
|
Civitek renders results as a DataTable (PrimeFaces). Look for tables with
|
|
case-related headers. Returns list of dicts with case_number, parties,
|
|
filed_date, case_type.
|
|
"""
|
|
results: list[dict] = []
|
|
tables = page.locator("table").all()
|
|
for tbl in tables:
|
|
try:
|
|
rows = tbl.locator("tr")
|
|
row_count = rows.count()
|
|
if row_count < 2:
|
|
continue
|
|
# Header row
|
|
headers_raw = rows.first.locator("th, td").all()
|
|
headers = [(h.inner_text() or "").strip().lower() for h in headers_raw]
|
|
# Heuristic: this table has case results if headers include any
|
|
# of "case", "uniform", "date", "party", "type"
|
|
if not any(any(kw in h for kw in ("case", "uniform", "filed", "party", "type"))
|
|
for h in headers):
|
|
continue
|
|
# Index columns
|
|
col_idx = {}
|
|
for i, h in enumerate(headers):
|
|
if "case" in h or "uniform" in h:
|
|
col_idx["case_number"] = i
|
|
elif "type" in h or "court" in h:
|
|
col_idx["case_type"] = i
|
|
elif "filed" in h or "date" in h:
|
|
col_idx["filed_date"] = i
|
|
elif "party" in h or "name" in h or "defendant" in h or "plaintiff" in h:
|
|
col_idx["parties"] = i
|
|
elif "status" in h:
|
|
col_idx["status"] = i
|
|
|
|
# Data rows
|
|
for r in range(1, row_count):
|
|
cells = rows.nth(r).locator("td").all()
|
|
if not cells:
|
|
continue
|
|
cell_texts = [(c.inner_text() or "").strip() for c in cells]
|
|
row_data = {}
|
|
for k, i in col_idx.items():
|
|
if i < len(cell_texts):
|
|
row_data[k] = cell_texts[i]
|
|
if row_data:
|
|
results.append(row_data)
|
|
except Exception:
|
|
continue
|
|
# If we found a results table with rows, stop
|
|
if results:
|
|
break
|
|
return results
|
|
|
|
|
|
def _error_result(
|
|
county: str,
|
|
case_number_searched: str,
|
|
error: str,
|
|
fetched_at: str,
|
|
) -> dict:
|
|
return {
|
|
"status": "ERROR",
|
|
"county": county,
|
|
"case_number_searched": case_number_searched,
|
|
"search_method": None,
|
|
"results": [],
|
|
"case_data": None,
|
|
"lis_pendens": [],
|
|
"liens_inventory": {},
|
|
"sources_used": ["civitek_ocrs"],
|
|
"source_url": "",
|
|
"errors": [error],
|
|
"fetched_at": fetched_at,
|
|
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# CLI for manual testing
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run one search from the command line and print the
    # resulting dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    # (flag, keyword options) pairs, registered in this order.
    cli_options = (
        ("--county", {"required": True, "help": "County name (e.g., Hernando)"}),
        ("--case", {"help": "Case number (e.g., 2024-CA-001234)"}),
        ("--last-name", {"help": "Last name for person search"}),
        ("--first-name", {"help": "First name (optional with last-name)"}),
        ("--business", {"help": "Business name search"}),
        ("--no-headless", {"action": "store_true", "help": "Show browser window"}),
    )
    for flag, options in cli_options:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    outcome = fetch_civitek_court_records(
        county_name=opts.county,
        case_number=opts.case,
        party_lastname=opts.last_name,
        party_firstname=opts.first_name,
        business_name=opts.business,
        headless=not opts.no_headless,
    )
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(outcome, indent=2, default=str))
|