feat: AR-House initial commit

This commit is contained in:
2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
+456
View File
@@ -0,0 +1,456 @@
"""civitek_ocrs.py — Court records adapter for 33 FL counties on Civitek OCRS platform.
Civitek OCRS (Online Court Records Search) is the JSF/PrimeFaces platform
shared by 33 FL counties. A single implementation covers all of them.
COUNTIES COVERED (33):
Baker, Bradford, Calhoun, Columbia, DeSoto, Dixie, Franklin, Gilchrist,
Glades, Gulf, Hamilton, Hardee, Hendry, Hernando, Highlands, Holmes,
Jackson, Jefferson, Lafayette, Levy, Liberty, Madison, Marion, Nassau,
Okeechobee, Pasco, Putnam, Santa Rosa, Sumter, Union, Wakulla, Walton,
Washington.
NOT INCLUDED (use other adapters):
Indian River, Brevard, Volusia, Lake, Citrus, Flagler, Charlotte, Manatee,
Sarasota, Polk, Osceola, Seminole, Alachua, Bay, Escambia, Leon, Monroe,
Collier, Lee, St. Lucie, Martin, St. Johns, Clay, Duval, Orange, Pinellas,
Hillsborough, Miami-Dade, Broward, Palm Beach, Suwannee, Taylor.
USAGE:
from data_fetchers.civitek_ocrs import fetch_civitek_court_records
result = fetch_civitek_court_records(
county_name="Hernando",
case_number="2024-CA-001234",
)
# → {status, case_data, lis_pendens, sources_used, source_url, errors, ...}
TECHNICAL NOTES:
- Uses Playwright headless Chromium (free, ~$0 cost per query)
- Civitek is PrimeFaces/JSF stateful — needs full browser, not curl/requests
- Auto-generated DOM ids (j_idt*) change per session — we use text selectors
- Field ids bound to managed beans (search_tab:lastname, search_tab:year) ARE stable
- Per-query latency: ~6-10s (entry → disclaimer → tab switch → search → parse)
- Rate limit: not stated by Civitek — target pace is 1 req/2s (NOTE: no throttle is enforced in this module; callers must pace their own requests)
"""
from __future__ import annotations
import re
import time
from datetime import datetime
from typing import Optional
# ════════════════════════════════════════════════════════════════════════════
# COUNTY CODE MAPPING (Civitek 2-digit codes)
# ════════════════════════════════════════════════════════════════════════════
# Canonical county name -> 2-digit Civitek county code (kept as a string so
# the leading zero survives). Entries are listed alphabetically, grouped by
# initial letter; the code is what gets interpolated into the OCRS entry URL.
CIVITEK_COUNTY_CODES: dict[str, str] = {
    # B - D
    "Baker": "02", "Bradford": "04", "Calhoun": "07", "Columbia": "12",
    "DeSoto": "14", "Dixie": "15",
    # F - G
    "Franklin": "19", "Gilchrist": "21", "Glades": "22", "Gulf": "23",
    # H
    "Hamilton": "24", "Hardee": "25", "Hendry": "26", "Hernando": "27",
    "Highlands": "28", "Holmes": "30",
    # J - M
    "Jackson": "32", "Jefferson": "33", "Lafayette": "34", "Levy": "38",
    "Liberty": "39", "Madison": "40", "Marion": "42",
    # N - P
    "Nassau": "45", "Okeechobee": "47", "Pasco": "51", "Putnam": "54",
    # S - W
    "Santa Rosa": "57", "Sumter": "60", "Union": "63", "Wakulla": "65",
    "Walton": "66", "Washington": "67",
}
def is_civitek_county(county_name: Optional[str]) -> bool:
    """Return True if *county_name* is one of the Civitek OCRS counties.

    Matching ignores letter case, surrounding/repeated whitespace and a
    trailing "County" suffix, so "Hernando", "hernando county" and
    "  SANTA ROSA COUNTY " are all recognised.

    Args:
        county_name: County name as received from upstream data; may be None.

    Returns:
        True when the name maps to an entry of CIVITEK_COUNTY_CODES.
        False for None, non-string or blank input, or an unknown county.
    """
    if not isinstance(county_name, str):
        return False
    # Collapse whitespace first so "Santa  Rosa" and the suffix match reliably.
    collapsed = " ".join(county_name.split())
    # Drop the suffix only at the end of the name, in any letter case.
    bare = re.sub(r"\s+county$", "", collapsed, flags=re.IGNORECASE)
    if not bare:
        return False
    wanted = bare.casefold()
    # The table has mixed-case keys ("DeSoto"), so compare case-folded.
    return any(known.casefold() == wanted for known in CIVITEK_COUNTY_CODES)
def civitek_code_for(county_name: str) -> Optional[str]:
    """Return the 2-digit Civitek code for a county name, or None.

    Matching ignores letter case, surrounding/repeated whitespace and a
    trailing "County" suffix ("hernando county" -> "27").

    Args:
        county_name: County name to look up. None, non-string and blank
            values are tolerated and yield None instead of raising.

    Returns:
        The code string from CIVITEK_COUNTY_CODES, or None when the county
        is not in that table.
    """
    if not isinstance(county_name, str):
        return None
    # Collapse whitespace, then strip a trailing "County" in any letter case.
    collapsed = " ".join(county_name.split())
    bare = re.sub(r"\s+county$", "", collapsed, flags=re.IGNORECASE)
    if not bare:
        return None
    wanted = bare.casefold()
    # The table has mixed-case keys ("DeSoto"), so compare case-folded.
    for known, code in CIVITEK_COUNTY_CODES.items():
        if known.casefold() == wanted:
            return code
    return None
# ════════════════════════════════════════════════════════════════════════════
# CASE NUMBER PARSER
# ════════════════════════════════════════════════════════════════════════════
# Real FL case numbers come in many shapes. Civitek wants (year, sequence) separately.
# Common formats observed in realauction.com deals:
# "2024-CA-001234"
# "23-2024-CA-001234"
# "2024CA001234"
# "2024-001234-CA"
# "27-2024-CA-001234" (court code prefix)
# Regex fragments shared by the case-number patterns below.
_CASE_YEAR = r"(?P<year>20\d{2})"            # 4-digit year, 20xx only
_CASE_SEQ = r"(?P<seq>\d{3,8})"              # sequence number, 3-8 digits
_CASE_TYPE = r"(?:CA|CC|CF|MM|DR|CP)"        # recognised court-type codes
_CASE_SEP = r"[\-\s]?"                       # optional dash/whitespace separator

# Tried in order by parse_case_number(); the first pattern that matches wins.
_CASE_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        # [NN-]year-type-seq (optional 2-digit prefix)
        r"(?:\d{2}-)?" + _CASE_YEAR + _CASE_SEP + _CASE_TYPE + _CASE_SEP + _CASE_SEQ,
        # year-seq-type
        _CASE_YEAR + _CASE_SEP + _CASE_SEQ + _CASE_SEP + _CASE_TYPE,
        # tight form with no separators: yearTypeNNNNNN
        _CASE_YEAR + _CASE_TYPE + _CASE_SEQ,
    )
]
def parse_case_number(case_number: str) -> Optional[tuple[str, str]]:
    """Split a FL case number into the (year, sequence) pair.

    The input is stripped and upper-cased, then each entry of _CASE_PATTERNS
    is searched in order; the first match supplies the result. Leading zeros
    are removed from the sequence (an all-zero sequence becomes "0").

    Examples:
        "2024-CA-001234"     -> ("2024", "1234")
        "23-2024-CA-001234"  -> ("2024", "1234")
        "2024CA001234"       -> ("2024", "1234")

    Returns:
        (year, sequence) as strings, or None when the input is empty or no
        pattern matches.
    """
    if not case_number:
        return None
    text = case_number.strip().upper()
    attempts = (pattern.search(text) for pattern in _CASE_PATTERNS)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return None
    sequence = hit.group("seq").lstrip("0")
    if not sequence:
        sequence = "0"
    return (hit.group("year"), sequence)
# ════════════════════════════════════════════════════════════════════════════
# PUBLIC API
# ════════════════════════════════════════════════════════════════════════════
def fetch_civitek_court_records(
    county_name: str,
    case_number: Optional[str] = None,
    party_lastname: Optional[str] = None,
    party_firstname: Optional[str] = None,
    business_name: Optional[str] = None,
    headless: bool = True,
    timeout_seconds: int = 45,
) -> dict:
    """Fetch court records from Civitek OCRS.

    Provide ONE of:
        - case_number (e.g., "2024-CA-001234") -> fastest, most precise
        - party_lastname (with optional firstname) -> person search
        - business_name -> business search
    When several are given, case_number wins, then business_name, then
    party_lastname.

    Args:
        county_name: County to search; a " County" suffix is stripped.
        case_number: Case number, parsed into year + sequence.
        party_lastname: Last name for a person search.
        party_firstname: Optional first name for a person search.
        business_name: Business name for a business search.
        headless: Run Chromium without a visible window.
        timeout_seconds: Default Playwright timeout for each page action.

    Returns:
        dict intended to match the court_records.py contract:
        {
            "status": "CLEAN" | "LIS_PENDENS_ACTIVE" | "FORECLOSURE_PENDING" |
                      "FORECLOSURE_COMPLETE" | "OWNER_VERIFIED" | "UNKNOWN" |
                      "NOT_FOUND" | "ERROR",
            "county": str (normalized),
            "case_number_searched": str,
            "search_method": "case_number" | "person_name" | "business_name"
                             (None on early-exit error results),
            "results": list of dicts (raw cases found),
            "case_data": dict (first result) | None,
            "lis_pendens": list (always empty here),
            "liens_inventory": dict (always empty here),
            "sources_used": ["civitek_ocrs"],
            "source_url": str,
            "errors": list of strings,
            "fetched_at": ISO timestamp,
        }
        This function itself only emits the statuses FORECLOSURE_PENDING,
        CLEAN, UNKNOWN, NOT_FOUND and ERROR. It does not raise for bad input
        or browser failures; those are reported through "status"/"errors".
    """
    # Function-scope import: the module only imports the ``datetime`` class.
    from datetime import timezone

    # datetime.utcnow() is deprecated; dropping tzinfo before isoformat()
    # keeps the exact "...Z" text format it used to produce.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    county_normalized = (county_name or "").strip().replace(" County", "").replace(" county", "")

    # Validate county
    code = civitek_code_for(county_normalized)
    if not code:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error=f"County '{county_normalized}' not on Civitek platform. "
                  f"Supported: {sorted(CIVITEK_COUNTY_CODES.keys())[:10]}...",
            fetched_at=fetched_at,
        )

    # Validate at least one search criterion
    if not (case_number or party_lastname or business_name):
        return _error_result(
            county=county_normalized,
            case_number_searched="",
            error="Must provide one of: case_number, party_lastname, or business_name",
            fetched_at=fetched_at,
        )

    # Determine search method
    year: Optional[str] = None
    seq: Optional[str] = None
    if case_number:
        parsed = parse_case_number(case_number)
        if not parsed:
            return _error_result(
                county=county_normalized,
                case_number_searched=case_number,
                error=f"Could not parse case_number '{case_number}' into year+sequence",
                fetched_at=fetched_at,
            )
        year, seq = parsed
        search_method = "case_number"
    elif business_name:
        search_method = "business_name"
    else:
        search_method = "person_name"

    # Playwright is imported lazily so the module loads without it installed.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        return _error_result(
            county=county_normalized,
            case_number_searched=case_number or "",
            error="playwright not installed. Run: pip install playwright && playwright install chromium",
            fetched_at=fetched_at,
        )

    base_url = f"https://www.civitekflorida.com/ocrs/county/{code}/"
    errors: list[str] = []
    results: list[dict] = []
    final_url = base_url

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120",
                )
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                # Step 1: Entry page
                page.goto(base_url)
                page.wait_for_timeout(1500)

                # Step 2: Click Public
                page.locator("button:has-text('Public')").first.click()
                page.wait_for_timeout(2500)

                # Step 3: Click I Agree (disclaimer)
                agree_btn = page.locator("button:has-text('I Agree')").first
                if agree_btn.count() == 0:
                    errors.append("Disclaimer page didn't show 'I Agree' button")
                    return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                agree_btn.click()
                page.wait_for_timeout(2500)
                final_url = page.url

                # Step 4: Fill form based on search_method
                if search_method == "case_number":
                    # Switch to Case Search tab (data-index=1)
                    case_tab = page.locator("li[role='tab']:has-text('Case Search')").first
                    if case_tab.count() == 0:
                        # Without the tab the year/sequence fields cannot be
                        # filled. Submitting anyway would run an empty search
                        # and report NOT_FOUND for a case never looked up.
                        errors.append("Case Search tab not found")
                        return _error_result(county_normalized, case_number or "", "; ".join(errors), fetched_at)
                    case_tab.click()
                    page.wait_for_timeout(1500)
                    page.fill("#form\\:search_tab\\:year", year)
                    page.fill("#form\\:search_tab\\:seq", seq)
                elif search_method == "person_name":
                    # Person Search tab is default (data-index=0)
                    page.fill("#form\\:search_tab\\:lastname", party_lastname)
                    if party_firstname:
                        page.fill("#form\\:search_tab\\:fname", party_firstname)
                elif search_method == "business_name":
                    # NOTE(review): assumes the business-name field is reachable
                    # without switching tabs — confirm against the live form.
                    page.fill("#form\\:search_tab\\:businessname", business_name)

                # Step 5: Submit (exact-text button first, looser match as fallback)
                search_btn = page.locator(
                    "button:has(.ui-button-text:text-is('Search'))"
                ).first
                if search_btn.count() == 0:
                    search_btn = page.locator("button:has-text('Search')").first
                search_btn.click()
                page.wait_for_timeout(6000)

                # Step 6: Capture validation errors (first 5, 200 chars each)
                err_msgs = page.locator(".ui-message-error, .ui-messages-error").all()
                for m in err_msgs[:5]:
                    try:
                        t = (m.inner_text() or "").strip()
                        if t and t not in errors:
                            errors.append(t[:200])
                    except Exception:
                        # Best-effort: an unreadable message node must not
                        # abort the scrape.
                        pass

                # Step 7: Parse results table
                results = _parse_results(page)
                final_url = page.url
            finally:
                # Close the browser on every exit path: success, early
                # return, or exception.
                browser.close()
    except PWTimeout as e:
        errors.append(f"Playwright timeout: {e}")
    except Exception as e:
        errors.append(f"Playwright crashed: {type(e).__name__}: {e}")

    # Determine status from results
    if not results:
        # Only a browser crash/timeout counts as ERROR; site-side validation
        # messages with no rows are reported as NOT_FOUND.
        hard_failure = any(
            "crashed" in msg.lower() or "timeout" in msg.lower() for msg in errors
        )
        status_from_results = "ERROR" if hard_failure else "NOT_FOUND"
    else:
        # Has results — infer from the first row's case_type
        first_type = (results[0].get("case_type") or "").upper()
        if "CA" in first_type or "CIVIL" in first_type:
            status_from_results = "FORECLOSURE_PENDING"  # CA cases include foreclosures
        elif "CF" in first_type or "FELONY" in first_type:
            status_from_results = "CLEAN"  # unrelated criminal
        else:
            status_from_results = "UNKNOWN"

    # The top result doubles as case_data
    case_data = results[0] if results else None

    return {
        "status": status_from_results,
        "county": county_normalized,
        "case_number_searched": case_number or "",
        "search_method": search_method,
        "results": results,
        "case_data": case_data,
        "lis_pendens": [],
        "liens_inventory": {},
        "sources_used": ["civitek_ocrs"],
        "source_url": final_url,
        "errors": errors,
        "fetched_at": fetched_at,
    }
# ════════════════════════════════════════════════════════════════════════════
# Internal helpers
# ════════════════════════════════════════════════════════════════════════════
def _parse_results(page) -> list[dict]:
    """Parse the results table from a Civitek search results page.

    Scans every <table> on the page and uses the first one whose header row
    looks case-related and that yields at least one data row.

    Args:
        page: Object exposing the Playwright locator API used here
            (locator(), all(), count(), first, nth(), inner_text()).

    Returns:
        One dict per data row, holding whichever of these keys could be
        mapped from the header: case_number, case_type, filed_date, parties,
        status. Empty list when no suitable table is found.
    """
    # Header keywords that mark a table as a case-results table.
    gate_keywords = ("case", "uniform", "filed", "party", "type")
    # Field rules ordered most-specific first. The generic "case" keyword
    # comes last so that "Case Type" / "Case Status" map to their own field
    # instead of overwriting the case-number column.
    field_rules = (
        ("status", ("status",)),
        ("case_type", ("type", "court")),
        ("filed_date", ("filed", "date")),
        ("parties", ("party", "name", "defendant", "plaintiff")),
        ("case_number", ("case", "uniform")),
    )

    results: list[dict] = []
    for tbl in page.locator("table").all():
        table_rows: list[dict] = []
        try:
            rows = tbl.locator("tr")
            row_count = rows.count()
            if row_count < 2:
                continue  # need a header row plus at least one data row

            # Header row
            headers = [
                (h.inner_text() or "").strip().lower()
                for h in rows.first.locator("th, td").all()
            ]
            if not any(kw in h for h in headers for kw in gate_keywords):
                continue

            # Index columns: each field is bound to the first header that
            # matches it, and each header binds at most one field.
            col_idx: dict[str, int] = {}
            for i, h in enumerate(headers):
                for field, keywords in field_rules:
                    if field not in col_idx and any(kw in h for kw in keywords):
                        col_idx[field] = i
                        break

            # Data rows
            for r in range(1, row_count):
                cells = rows.nth(r).locator("td").all()
                if not cells:
                    continue
                cell_texts = [(c.inner_text() or "").strip() for c in cells]
                row_data = {
                    key: cell_texts[i]
                    for key, i in col_idx.items()
                    if i < len(cell_texts)
                }
                if row_data:
                    table_rows.append(row_data)
        except Exception:
            # Best-effort scrape: a table that errors mid-read is abandoned,
            # but rows already read from it are kept (checked below).
            pass

        # If we found a results table with rows, stop. Rows from different
        # tables are never mixed.
        if table_rows:
            results = table_rows
            break
    return results
def _error_result(
    county: str,
    case_number_searched: str,
    error: str,
    fetched_at: str,
) -> dict:
    """Build the standard result payload for a failed lookup.

    The payload has the same keys as a successful result, with status
    "ERROR", no search method, empty result collections, an empty source
    URL, and *error* as the only entry of "errors".
    """
    payload: dict = dict(
        status="ERROR",
        county=county,
        case_number_searched=case_number_searched,
        search_method=None,
        results=[],
        case_data=None,
        lis_pendens=[],
        liens_inventory={},
        sources_used=["civitek_ocrs"],
        source_url="",
        errors=[error],
        fetched_at=fetched_at,
    )
    return payload
# ════════════════════════════════════════════════════════════════════════════
# CLI for manual testing
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    import argparse
    import json

    # Manual smoke test: run one lookup and dump the result dict as JSON.
    cli = argparse.ArgumentParser(description="Civitek OCRS adapter manual test")
    # (flag, add_argument keyword options), registered in display order.
    option_table = (
        ("--county", {"required": True, "help": "County name (e.g., Hernando)"}),
        ("--case", {"help": "Case number (e.g., 2024-CA-001234)"}),
        ("--last-name", {"help": "Last name for person search"}),
        ("--first-name", {"help": "First name (optional with last-name)"}),
        ("--business", {"help": "Business name search"}),
        ("--no-headless", {"action": "store_true", "help": "Show browser window"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    lookup = fetch_civitek_court_records(
        county_name=opts.county,
        case_number=opts.case,
        party_lastname=opts.last_name,
        party_firstname=opts.first_name,
        business_name=opts.business,
        headless=not opts.no_headless,
    )
    print(json.dumps(lookup, indent=2, default=str))