855 lines
43 KiB
Python
855 lines
43 KiB
Python
"""scrapers/realauction_clerk.py — Generic scraper para 5+ Florida counties.
|
|
|
|
REALAUCTION.COM WHITE-LABEL PLATFORM:
    Multiple county clerks use the same realforeclose.com SaaS under different
    subdomains. They share ~95% of the HTML structure, so a single scraper
    serves all of them.

Counties supported (FL):
    Miami-Dade      miamidade.realforeclose.com
    Duval           duval.realforeclose.com
    Broward         broward.realforeclose.com
    Palm Beach      mypalmbeach.realforeclose.com
    Hillsborough    hillsborough.realforeclose.com
    Orange          myorangeclerk.realforeclose.com
    NOTE: REALAUCTION_COUNTIES in this module is the authoritative list of
    counties and subdomains; the hosts above are the ones originally observed.

URL PATTERN:
    https://www.{subdomain}.realforeclose.com/index.cfm
        ?zaction=AUCTION
        &zmethod=PREVIEW
        &AuctionDate=MM/DD/YYYY
    (The code in this module builds the URL without the "www." prefix.)

PAGE STRUCTURE (discovered via DOM inspection on Miami-Dade, assumed to be the
same on the other county subdomains because they share the platform):
    - <div class="AUCTION_ITEM"> wrapper per listing
    - <div class="ASTAT_MSGB Astat_DATA">{STATUS}</div> with the auction status
    - <table> with key/value rows (Case#, Parcel ID, Property Address, etc.)
    - Possible dead statuses: Redeemed, Canceled per *, Sold, Title Transferred,
      Withdrawn, Dismissed (the parser filters these out)

ANTI-BOT: a real Chrome User-Agent bypasses the 403 the site returns for
non-standard User-Agents.
RATE LIMIT: 2.5s between requests to the same county.

USAGE:
    from scrapers.realauction_clerk import scrape_realauction_county
    deals = scrape_realauction_county(county="Duval", days_ahead=7)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Callable, Optional
|
|
|
|
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Configuration: county registry
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Subdomain mapping. If a county is not listed here,
# scrape_realauction_county() raises (via get_county_config).
# IMPORTANT: subdomains were validated via DOM inspection. When adding a new
# county, curl the URL pattern first to confirm it answers HTTP 200.
# Configuration is per (county, platform) tuple. The 'domain' field defaults to
# 'realforeclose.com' if omitted (backward compat). For tax deed sales it's
# 'realtaxdeed.com' instead. Both share the same Realauction.com platform with
# identical HTML structure — only the URL host differs.
REALAUCTION_DEFAULT_DOMAIN = "realforeclose.com"

REALAUCTION_COUNTIES = {
    # ALL 41 FL counties confirmed working on realforeclose.com (probe 2026-05-14)
    # Detection: title='RealForeclose- {County} County -Splash Page'
    # 26 small FL counties are NOT on realforeclose (they use other platforms)
    # ─── Major urban counties ──────────────────────────────────────────────
    "Miami-Dade": {"subdomain": "miamidade", "source_id": "miami_dade_clerk", "state": "FL", "label": "Miami-Dade"},
    "Duval": {"subdomain": "duval", "source_id": "duval_clerk", "state": "FL", "label": "Duval (Jacksonville)"},
    "Broward": {"subdomain": "broward", "source_id": "broward_clerk", "state": "FL", "label": "Broward (Fort Lauderdale)"},
    "Palm Beach": {"subdomain": "palmbeach", "source_id": "palm_beach_clerk", "state": "FL", "label": "Palm Beach (West Palm)"},
    "Hillsborough": {"subdomain": "hillsborough", "source_id": "hillsborough_clerk", "state": "FL", "label": "Hillsborough (Tampa)"},
    "Orange": {"subdomain": "orange", "source_id": "orange_clerk", "state": "FL", "label": "Orange (Orlando)"},
    "Pinellas": {"subdomain": "pinellas", "source_id": "pinellas_clerk", "state": "FL", "label": "Pinellas (St Petersburg)"},
    "Lee": {"subdomain": "lee", "source_id": "lee_clerk", "state": "FL", "label": "Lee (Fort Myers)"},
    "Polk": {"subdomain": "polk", "source_id": "polk_clerk", "state": "FL", "label": "Polk (Lakeland)"},
    "Brevard": {"subdomain": "brevard", "source_id": "brevard_clerk", "state": "FL", "label": "Brevard (Cocoa, Melbourne)"},
    "Volusia": {"subdomain": "volusia", "source_id": "volusia_clerk", "state": "FL", "label": "Volusia (Daytona Beach)"},
    "Pasco": {"subdomain": "pasco", "source_id": "pasco_clerk", "state": "FL", "label": "Pasco"},
    "Sarasota": {"subdomain": "sarasota", "source_id": "sarasota_clerk", "state": "FL", "label": "Sarasota"},
    "Manatee": {"subdomain": "manatee", "source_id": "manatee_clerk", "state": "FL", "label": "Manatee (Bradenton)"},
    "Seminole": {"subdomain": "seminole", "source_id": "seminole_clerk", "state": "FL", "label": "Seminole (Sanford)"},
    # ─── Mid-size counties ─────────────────────────────────────────────────
    "Marion": {"subdomain": "marion", "source_id": "marion_clerk", "state": "FL", "label": "Marion (Ocala)"},
    "Lake": {"subdomain": "lake", "source_id": "lake_clerk", "state": "FL", "label": "Lake (Mt Dora, Tavares)"},
    "Osceola": {"subdomain": "osceola", "source_id": "osceola_clerk", "state": "FL", "label": "Osceola (Kissimmee)"},
    "Leon": {"subdomain": "leon", "source_id": "leon_clerk", "state": "FL", "label": "Leon (Tallahassee)"},
    "Alachua": {"subdomain": "alachua", "source_id": "alachua_clerk", "state": "FL", "label": "Alachua (Gainesville)"},
    "Escambia": {"subdomain": "escambia", "source_id": "escambia_clerk", "state": "FL", "label": "Escambia (Pensacola)"},
    "Santa Rosa": {"subdomain": "santarosa", "source_id": "santa_rosa_clerk", "state": "FL", "label": "Santa Rosa (Milton)"},
    "Bay": {"subdomain": "bay", "source_id": "bay_clerk", "state": "FL", "label": "Bay (Panama City)"},
    "St Lucie": {"subdomain": "stlucie", "source_id": "st_lucie_clerk", "state": "FL", "label": "St Lucie (Port St Lucie)"},
    "Indian River": {"subdomain": "indianriver", "source_id": "indian_river_clerk", "state": "FL", "label": "Indian River (Vero Beach)"},
    "Martin": {"subdomain": "martin", "source_id": "martin_clerk", "state": "FL", "label": "Martin (Stuart)"},
    "Citrus": {"subdomain": "citrus", "source_id": "citrus_clerk", "state": "FL", "label": "Citrus (Crystal River)"},
    "Charlotte": {"subdomain": "charlotte", "source_id": "charlotte_clerk", "state": "FL", "label": "Charlotte (Port Charlotte)"},
    "Clay": {"subdomain": "clay", "source_id": "clay_clerk", "state": "FL", "label": "Clay (Green Cove Springs)"},
    "Nassau": {"subdomain": "nassau", "source_id": "nassau_clerk", "state": "FL", "label": "Nassau (Fernandina Beach)"},
    "Putnam": {"subdomain": "putnam", "source_id": "putnam_clerk", "state": "FL", "label": "Putnam (Palatka)"},
    "Flagler": {"subdomain": "flagler", "source_id": "flagler_clerk", "state": "FL", "label": "Flagler (Palm Coast)"},
    "Walton": {"subdomain": "walton", "source_id": "walton_clerk", "state": "FL", "label": "Walton (DeFuniak Springs)"},
    "Okeechobee": {"subdomain": "okeechobee", "source_id": "okeechobee_clerk", "state": "FL", "label": "Okeechobee"},
    # ─── Small rural counties (small auction volume but still on platform) ─
    "Baker": {"subdomain": "baker", "source_id": "baker_clerk", "state": "FL", "label": "Baker (Macclenny)"},
    "Calhoun": {"subdomain": "calhoun", "source_id": "calhoun_clerk", "state": "FL", "label": "Calhoun"},
    "Gilchrist": {"subdomain": "gilchrist", "source_id": "gilchrist_clerk", "state": "FL", "label": "Gilchrist"},
    "Gulf": {"subdomain": "gulf", "source_id": "gulf_clerk", "state": "FL", "label": "Gulf (Port St Joe)"},
    "Jackson": {"subdomain": "jackson", "source_id": "jackson_clerk", "state": "FL", "label": "Jackson (Marianna)"},
    "Suwannee": {"subdomain": "suwannee", "source_id": "suwannee_clerk", "state": "FL", "label": "Suwannee (Live Oak)"},
    "Washington": {"subdomain": "washington", "source_id": "washington_clerk", "state": "FL", "label": "Washington (Chipley)"},

    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO — same platform (realauction.com is multi-state)
    # Discovered via https://www.realauction.com/clients
    # ═══════════════════════════════════════════════════════════════════════
    "Denver": {"subdomain": "denver", "source_id": "denver_clerk_co", "state": "CO", "label": "Denver County, CO"},
    "Eagle": {"subdomain": "eagle", "source_id": "eagle_clerk_co", "state": "CO", "label": "Eagle County, CO"},
    "El Paso": {"subdomain": "elpasoco", "source_id": "el_paso_clerk_co", "state": "CO", "label": "El Paso County, CO (Colorado Springs)"},
    "Larimer": {"subdomain": "larimer", "source_id": "larimer_clerk_co", "state": "CO", "label": "Larimer County, CO (Fort Collins)"},
    "Mesa": {"subdomain": "mesa", "source_id": "mesa_clerk_co", "state": "CO", "label": "Mesa County, CO (Grand Junction)"},
    "Summit": {"subdomain": "summit", "source_id": "summit_clerk_co", "state": "CO", "label": "Summit County, CO (Breckenridge)"},
    "Weld": {"subdomain": "weld", "source_id": "weld_clerk_co", "state": "CO", "label": "Weld County, CO (Greeley)"},

    # ═══════════════════════════════════════════════════════════════════════
    # TAX DEED SALES via .realtaxdeed.com (same engine, diff domain)
    # 17 portals: 3 AZ + 14 FL. SAME HTML structure as realforeclose.com.
    # ═══════════════════════════════════════════════════════════════════════
    # Arizona
    "Apache TD": {"subdomain": "apache", "domain": "realtaxdeed.com", "source_id": "apache_taxdeed_az", "state": "AZ", "label": "Apache County, AZ — Tax Deed"},
    "Coconino TD": {"subdomain": "coconino", "domain": "realtaxdeed.com", "source_id": "coconino_taxdeed_az", "state": "AZ", "label": "Coconino County, AZ — Tax Deed (Flagstaff)"},
    "Mohave TD": {"subdomain": "mohave", "domain": "realtaxdeed.com", "source_id": "mohave_taxdeed_az", "state": "AZ", "label": "Mohave County, AZ — Tax Deed (Kingman)"},
    # Florida tax deed (distinct from foreclosure entries above)
    "Alachua TD": {"subdomain": "alachua", "domain": "realtaxdeed.com", "source_id": "alachua_taxdeed", "state": "FL", "label": "Alachua County — Tax Deed"},
    "Baker TD": {"subdomain": "baker", "domain": "realtaxdeed.com", "source_id": "baker_taxdeed", "state": "FL", "label": "Baker County — Tax Deed"},
    "Bay TD": {"subdomain": "bay", "domain": "realtaxdeed.com", "source_id": "bay_taxdeed", "state": "FL", "label": "Bay County — Tax Deed"},
    "Citrus TD": {"subdomain": "citrus", "domain": "realtaxdeed.com", "source_id": "citrus_taxdeed", "state": "FL", "label": "Citrus County — Tax Deed"},
    "Clay TD": {"subdomain": "clay", "domain": "realtaxdeed.com", "source_id": "clay_taxdeed", "state": "FL", "label": "Clay County — Tax Deed"},
    "Duval TD": {"subdomain": "duval", "domain": "realtaxdeed.com", "source_id": "duval_taxdeed", "state": "FL", "label": "Duval County — Tax Deed (Jacksonville)"},
    "Escambia TD": {"subdomain": "escambia", "domain": "realtaxdeed.com", "source_id": "escambia_taxdeed", "state": "FL", "label": "Escambia County — Tax Deed (Pensacola)"},
    "Flagler TD": {"subdomain": "flagler", "domain": "realtaxdeed.com", "source_id": "flagler_taxdeed", "state": "FL", "label": "Flagler County — Tax Deed"},
    "Gilchrist TD": {"subdomain": "gilchrist", "domain": "realtaxdeed.com", "source_id": "gilchrist_taxdeed", "state": "FL", "label": "Gilchrist County — Tax Deed"},
    "Gulf TD": {"subdomain": "gulf", "domain": "realtaxdeed.com", "source_id": "gulf_taxdeed", "state": "FL", "label": "Gulf County — Tax Deed"},
    "Hendry TD": {"subdomain": "hendry", "domain": "realtaxdeed.com", "source_id": "hendry_taxdeed", "state": "FL", "label": "Hendry County — Tax Deed"},
    "Hernando TD": {"subdomain": "hernando", "domain": "realtaxdeed.com", "source_id": "hernando_taxdeed", "state": "FL", "label": "Hernando County — Tax Deed"},
    "Highlands TD": {"subdomain": "highlands", "domain": "realtaxdeed.com", "source_id": "highlands_taxdeed", "state": "FL", "label": "Highlands County — Tax Deed (Sebring)"},
    "Hillsborough TD": {"subdomain": "hillsborough", "domain": "realtaxdeed.com", "source_id": "hillsborough_taxdeed", "state": "FL", "label": "Hillsborough County — Tax Deed (Tampa)"},

    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO TAX DEEDS via .treasurersdeedsale.com (same engine again)
    # ═══════════════════════════════════════════════════════════════════════
    "Adams TD": {"subdomain": "adams", "domain": "treasurersdeedsale.com", "source_id": "adams_taxdeed_co", "state": "CO", "label": "Adams County, CO — Treasurer's Deed"},
    "Denver TD": {"subdomain": "denver", "domain": "treasurersdeedsale.com", "source_id": "denver_taxdeed_co", "state": "CO", "label": "Denver County, CO — Treasurer's Deed"},
    "Eagle TD": {"subdomain": "eagle", "domain": "treasurersdeedsale.com", "source_id": "eagle_taxdeed_co", "state": "CO", "label": "Eagle County, CO — Treasurer's Deed"},
    "El Paso TD": {"subdomain": "elpasoco", "domain": "treasurersdeedsale.com", "source_id": "el_paso_taxdeed_co", "state": "CO", "label": "El Paso County, CO — Treasurer's Deed"},
    "Larimer TD": {"subdomain": "larimer", "domain": "treasurersdeedsale.com", "source_id": "larimer_taxdeed_co", "state": "CO", "label": "Larimer County, CO — Treasurer's Deed"},
    "Mesa TD": {"subdomain": "mesa", "domain": "treasurersdeedsale.com", "source_id": "mesa_taxdeed_co", "state": "CO", "label": "Mesa County, CO — Treasurer's Deed"},
    "Pitkin TD": {"subdomain": "pitkin", "domain": "treasurersdeedsale.com", "source_id": "pitkin_taxdeed_co", "state": "CO", "label": "Pitkin County, CO — Treasurer's Deed (Aspen)"},
    "Weld TD": {"subdomain": "weld", "domain": "treasurersdeedsale.com", "source_id": "weld_taxdeed_co", "state": "CO", "label": "Weld County, CO — Treasurer's Deed"},

    # ─── Counties NOT on realforeclose.com (parking only — they use other
    #     platforms): Bradford, Collier, Columbia, DeSoto, Dixie, Franklin,
    #     Gadsden, Glades, Hamilton, Hardee, Hendry, Hernando, Highlands,
    #     Holmes, Jefferson, Lafayette, Levy, Liberty, Madison, Monroe,
    #     Okaloosa, St. Johns, Sumter, Taylor, Union, Wakulla.
    # ─── Other Realauction platforms observed:
    #     - .realtaxdeed.com (Apache/Coconino/Mohave AZ, FL tax deed sales)
    #       → registered above via the "domain" field
    #     - .treasurersdeedsale.com (CO tax deed sales — Adams, Pitkin, etc.)
    #       → registered above via the "domain" field
    #     - california.taxdefaultsale.com (CA Fresno) — TODO add support
}
|
|
|
|
|
|
def get_county_config(county: str) -> dict:
    """Return the Realauction registry entry for *county*.

    Args:
        county: a key of REALAUCTION_COUNTIES (e.g. "Duval", "Duval TD").

    Returns:
        The config dict stored in REALAUCTION_COUNTIES for that key.

    Raises:
        ValueError: if the county is not registered; the message lists every
            valid key.
    """
    config = REALAUCTION_COUNTIES.get(county)
    if not config:
        valid = ", ".join(REALAUCTION_COUNTIES)
        raise ValueError(
            f"County '{county}' no soportado por realauction_clerk. "
            f"Validos: {valid}"
        )
    return config
|
|
|
|
|
|
# Real Chrome UA — bypassa anti-bot del sitio
|
|
_CHROME_UA = (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/131.0.0.0 Safari/537.36"
|
|
)
|
|
|
|
# Rate limit por dominio: 1 request c/2.5s para no joder al sitio
|
|
_REQUEST_INTERVAL_SECONDS = 2.5
|
|
|
|
# Mapping del clerk's "Auction Type" → nuestro deal_type canonico
|
|
_AUCTION_TYPE_MAP = {
|
|
"FORECLOSURE": "foreclosure",
|
|
"TAXDEED": "tax_deed",
|
|
"TAX DEED": "tax_deed",
|
|
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Parsing helpers (county-agnostic)
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def _parse_money(s: str) -> Optional[float]:
|
|
"""Parse '$353,041.78' → 353041.78. Return None si invalido."""
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d.]", "", s)
|
|
if not cleaned or cleaned == ".":
|
|
return None
|
|
try:
|
|
return float(cleaned)
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _parse_address(line1: str, line2: str) -> dict:
|
|
"""Parse property address.
|
|
|
|
line1 = "7355 POINCIANA CT" (street)
|
|
line2 = "MIAMI LAKES, FL- 33014" (city, state-zip)
|
|
"""
|
|
out = {"address": None, "city": None, "state": None, "zip": None}
|
|
line1 = (line1 or "").strip()
|
|
line2 = (line2 or "").strip()
|
|
|
|
line2_clean = re.sub(r"\bFL-\s*", "FL ", line2).strip()
|
|
if line1 and line2_clean:
|
|
out["address"] = f"{line1}, {line2_clean}"
|
|
elif line1:
|
|
out["address"] = line1
|
|
elif line2_clean:
|
|
out["address"] = line2_clean
|
|
|
|
if line2:
|
|
m = re.match(r"^(.+?),\s*([A-Z]{2})[-\s]\s*(\d{5})(?:-\d{4})?", line2)
|
|
if m:
|
|
out["city"] = m.group(1).title()
|
|
out["state"] = m.group(2)
|
|
out["zip"] = m.group(3)
|
|
else:
|
|
m2 = re.search(r"\b([A-Z]{2})[-\s]\s*(\d{5})", line2)
|
|
if m2:
|
|
out["state"] = m2.group(1)
|
|
out["zip"] = m2.group(2)
|
|
out["city"] = line2.split(",")[0].strip().title() if "," in line2 else None
|
|
|
|
return out
|
|
|
|
|
|
def _extract_case_from_table_rows(rows: list[list[str]]) -> Optional[dict]:
    """Build a case dict from the key/value rows of one case table.

    Args:
        rows: table rows, each a list of cell strings. A row with at least two
            non-empty cells is read as (label, value). The row that follows
            "Property Address" is read as the second address line
            (city/state/zip) unless it starts with a known field label.

    Returns:
        The case dict (deal_type, case_number, auction_type_raw, optional
        money/parcel fields, plus address/city/state/zip), or None when the
        rows hold no "Case #" or carry an explicit but unrecognized
        "Auction Type".
    """
    fields: dict[str, str] = {}
    addr_line2: Optional[str] = None
    next_row_is_addr_line2 = False

    # A row whose first cell contains one of these is a regular labeled field,
    # never the continuation line of the property address.
    addr_line2_disallowed_keywords = (
        "Assessed Value", "Plaintiff Max Bid", "Auction Type", "Case #",
        "Certificate #", "Final Judgment", "Opening Bid", "Parcel ID",
    )

    for row in rows:
        non_empty = [c for c in row if c]
        if not non_empty:
            continue

        if next_row_is_addr_line2:
            # Only the row immediately after "Property Address" is a candidate.
            next_row_is_addr_line2 = False
            candidate = non_empty[0].strip()
            if not any(kw in candidate for kw in addr_line2_disallowed_keywords):
                addr_line2 = candidate
                continue
            # Otherwise fall through: it is a normal labeled row.

        if len(non_empty) >= 2:
            key = non_empty[0].rstrip(":").strip()
            value = non_empty[1].strip()
            fields[key] = value
            if key == "Property Address":
                # The street is on this row; city/state/zip come on the next.
                next_row_is_addr_line2 = True

    case_number = fields.get("Case #")
    if not case_number:
        return None

    # An explicit Auction Type (Miami-Dade/Duval/Broward style) is the source
    # of truth. When it is absent (Orange style — only AD_LBL/AD_DTA divs with
    # no Auction Type field), infer from the case number:
    #   - contains "TAXDEED" or "-TD-" → tax_deed
    #   - otherwise → foreclosure (the most common listing type)
    auction_type_raw = (fields.get("Auction Type") or "").upper().strip()
    if auction_type_raw:
        deal_type = _AUCTION_TYPE_MAP.get(auction_type_raw)
        if not deal_type:
            return None  # explicit but unknown type — skip
    else:
        case_num_upper = case_number.upper()
        if "TAXDEED" in case_num_upper or "-TD-" in case_num_upper:
            deal_type = "tax_deed"
            auction_type_raw = "TAXDEED (inferred)"
        else:
            deal_type = "foreclosure"
            auction_type_raw = "FORECLOSURE (inferred)"

    case = {
        "deal_type": deal_type,
        "case_number": case_number,
        "auction_type_raw": auction_type_raw,
    }

    # Optional fields are only added when the page provided a non-empty value.
    if fields.get("Certificate #"):
        case["certificate_number"] = fields["Certificate #"]
    if fields.get("Final Judgment Amount"):
        case["final_judgment_amount"] = _parse_money(fields["Final Judgment Amount"])
    if fields.get("Opening Bid"):
        case["starting_bid"] = _parse_money(fields["Opening Bid"])
    if fields.get("Parcel ID"):
        case["parcel_id"] = fields["Parcel ID"]
    if fields.get("Assessed Value"):
        case["assessed_value"] = _parse_money(fields["Assessed Value"])
    if fields.get("Plaintiff Max Bid"):
        case["plaintiff_max_bid_raw"] = fields["Plaintiff Max Bid"]

    case.update(_parse_address(fields.get("Property Address", ""), addr_line2 or ""))
    return case
|
|
|
|
|
|
def _build_description(case: dict) -> str:
|
|
"""Compact text description from case facts — useful for DealClassifier context."""
|
|
bits = []
|
|
status = case.get("auction_status")
|
|
if status:
|
|
bits.append(f"Status: {status}")
|
|
if case.get("auction_type_raw"):
|
|
bits.append(f"Auction Type: {case['auction_type_raw']}")
|
|
if case.get("certificate_number"):
|
|
bits.append(f"Tax Cert #: {case['certificate_number']}")
|
|
if case.get("final_judgment_amount"):
|
|
bits.append(f"Final Judgment Amount: ${case['final_judgment_amount']:,.2f}")
|
|
if case.get("starting_bid"):
|
|
bits.append(f"Opening Bid: ${case['starting_bid']:,.2f}")
|
|
if case.get("assessed_value"):
|
|
bits.append(f"Assessed Value (PA): ${case['assessed_value']:,.2f}")
|
|
if case.get("parcel_id"):
|
|
bits.append(f"Parcel ID: {case['parcel_id']}")
|
|
if case.get("plaintiff_max_bid_raw"):
|
|
bits.append(f"Plaintiff Max Bid: {case['plaintiff_max_bid_raw']}")
|
|
return " | ".join(bits)
|
|
|
|
|
|
def _county_name_from_label(label: str) -> str:
    """Reduce a registry display label to the bare county name.

    "Duval (Jacksonville)" → "Duval"; "Denver County, CO" → "Denver";
    "Duval County — Tax Deed (Jacksonville)" → "Duval".
    """
    # Cut at the first ", ST", " (city)" or " — sale type" decoration.
    name = re.split(r",\s|\s+\(|\s+—\s+", label, maxsplit=1)[0].strip()
    suffix = " County"
    if name.endswith(suffix):
        name = name[: -len(suffix)]
    return name


def _build_deal_record(case: dict, auction_date_iso: str, county_config: dict) -> dict:
    """Convert a clerk case dict into a deal record compatible with deals_db.insert_deal.

    Pricing rules (inherited from the Miami-Dade v1.1 fix):
      - tax_deed: listing_price = starting_bid
      - foreclosure: listing_price = None (bid hidden pre-auction)
      - final_judgment_amount is stored separately (NOT confused with listing_price)

    Args:
        case: case dict as produced by the table extractor.
        auction_date_iso: auction date as "YYYY-MM-DD"; it is sliced by
            position to build the MM/DD/YYYY query parameter of source_url.
        county_config: registry entry with "subdomain", "source_id", "state",
            "label" and optionally "domain".

    Returns:
        The deal record dict.
    """
    deal_type = case.get("deal_type")
    starting_bid = case.get("starting_bid")
    assessed_value = case.get("assessed_value")

    if deal_type == "tax_deed":
        listing_price = starting_bid
    elif deal_type == "foreclosure":
        listing_price = None
    else:
        listing_price = starting_bid or assessed_value

    subdomain = county_config["subdomain"]
    domain = county_config.get("domain", REALAUCTION_DEFAULT_DOMAIN)

    year = auction_date_iso[0:4]
    month = auction_date_iso[5:7]
    day = auction_date_iso[8:10]
    source_url = (
        f"https://{subdomain}.{domain}/index.cfm"
        f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate="
        f"{month}/{day}/{year}"
    )

    return {
        "source": county_config["source_id"],
        "source_url": source_url,
        "address": case.get("address"),
        "city": case.get("city"),
        # Prefer the state parsed from the address; fall back to the registry.
        "state": case.get("state") or county_config["state"],
        "zip": case.get("zip"),
        # Bare county name, so foreclosure and tax-deed entries of the same
        # county share one value.
        "county": _county_name_from_label(county_config["label"]),
        "parcel_id": case.get("parcel_id"),
        "listing_price": listing_price,
        "deal_type": deal_type,
        "starting_bid": starting_bid,
        "estimated_arv": assessed_value,
        "final_judgment_amount": case.get("final_judgment_amount"),
        "auction_status": case.get("auction_status") or "scheduled",
        "case_number": case.get("case_number"),
        "auction_date": auction_date_iso,
        "listing_description": _build_description(case),
    }
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Status filtering (REDEEMED/CANCELED bug fix)
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Cases con estos statuses NO van a auction → NO incluir en results.
|
|
# Substring matching: "Canceled per Bankruptcy" → dead (contains "canceled").
|
|
_DEAD_STATUS_SUBSTRINGS = (
|
|
"redeemed",
|
|
"canceled",
|
|
"cancelled",
|
|
"sold",
|
|
"closed", # case closed/disposed
|
|
"title transferred",
|
|
"withdrawn",
|
|
"dismissed",
|
|
)
|
|
|
|
|
|
def _is_status_dead(status: Optional[str]) -> bool:
|
|
"""Returns True si el case esta inactivo (off-market)."""
|
|
if not status:
|
|
return False
|
|
s = status.strip().lower()
|
|
if not s:
|
|
return False
|
|
return any(dead in s for dead in _DEAD_STATUS_SUBSTRINGS)
|
|
|
|
|
|
def _parse_cases_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[dict]:
    """Parse all auction cases from a Realforeclose page HTML using stdlib only.

    Args:
        html: full page markup.
        log_fn: optional callback that receives diagnostic messages.

    Returns:
        One case dict per live AUCTION_ITEM, de-duplicated by case number.
        Items whose status is dead (see _is_status_dead) are dropped. An HTML
        parser error is logged and yields an empty list.
    """
    from html.parser import HTMLParser

    class AuctionItemParser(HTMLParser):
        """Collects, per <div class="AUCTION_ITEM">, its status text and tables."""

        def __init__(self):
            super().__init__()
            self.items: list[dict] = []
            # 1 while inside an AUCTION_ITEM div, 0 otherwise.
            self.auction_item_depth = 0
            # div nesting level at which the current item was opened.
            self.current_item_depth_start = None
            self.div_depth = 0
            self.status_label_active = False
            # True after an ASTAT_MSGA reading "Auction Status": the next
            # ASTAT_MSGB holds the status value.
            self.expecting_status_value = False
            self.in_astat_msga = False
            self.in_astat_msgb = False
            self.astat_msga_text = ""
            self.astat_msgb_text = ""
            self.in_table = 0
            self.current_table: list[list[str]] = []
            self.current_row: list[str] = []
            self.in_cell = 0
            self.cell_text_parts: list[str] = []
            self.current_status = ""
            self.current_tables: list[list[list[str]]] = []
            # Orange-style div-based fields: <div class="AD_LBL">Label:</div>
            # <div class="AD_DTA">Value</div>
            # These are collected as synthetic [Label, Value] rows so the
            # downstream extractor works unchanged.
            self.in_ad_lbl = False
            self.in_ad_dta = False
            self.ad_lbl_text = ""
            self.ad_dta_text = ""
            self.last_ad_lbl: Optional[str] = None
            self.current_ad_rows: list[list[str]] = []

        def handle_starttag(self, tag, attrs):
            attrs_d = dict(attrs)
            classes = (attrs_d.get("class") or "").split()

            if tag == "div":
                self.div_depth += 1
                if "AUCTION_ITEM" in classes:
                    # A nested AUCTION_ITEM is treated as part of the outer
                    # item, so the outer closing </div> always emits it.
                    if self.auction_item_depth == 0:
                        self.auction_item_depth = 1
                        self.current_item_depth_start = self.div_depth
                        self.current_status = ""
                        self.current_tables = []
                        self.current_ad_rows = []
                        self.last_ad_lbl = None
                        # Do not let a pending status label from a previous
                        # item attach to this one.
                        self.expecting_status_value = False
                elif self.auction_item_depth:
                    if "ASTAT_MSGA" in classes:
                        self.in_astat_msga = True
                        self.astat_msga_text = ""
                    elif "ASTAT_MSGB" in classes:
                        self.in_astat_msgb = True
                        self.astat_msgb_text = ""
                    elif "AD_LBL" in classes:
                        self.in_ad_lbl = True
                        self.ad_lbl_text = ""
                    elif "AD_DTA" in classes:
                        self.in_ad_dta = True
                        self.ad_dta_text = ""
            elif tag == "table" and self.auction_item_depth:
                self.in_table += 1
                self.current_table = []
            elif tag == "tr" and self.in_table:
                self.current_row = []
            elif tag in ("td", "th") and self.in_table:
                self.in_cell += 1
                self.cell_text_parts = []

        def handle_endtag(self, tag):
            if tag == "div":
                if self.in_astat_msga:
                    self.in_astat_msga = False
                    if "auction status" in self.astat_msga_text.strip().lower():
                        self.expecting_status_value = True
                elif self.in_astat_msgb:
                    self.in_astat_msgb = False
                    if self.expecting_status_value and self.auction_item_depth:
                        self.current_status = self.astat_msgb_text.strip()
                    self.expecting_status_value = False
                elif self.in_ad_lbl:
                    self.in_ad_lbl = False
                    self.last_ad_lbl = self.ad_lbl_text.strip()
                elif self.in_ad_dta:
                    self.in_ad_dta = False
                    # Pair with the most recent AD_LBL, if any.
                    if self.last_ad_lbl and self.auction_item_depth:
                        value = " ".join(self.ad_dta_text.split()).strip()
                        self.current_ad_rows.append([self.last_ad_lbl, value])
                    self.last_ad_lbl = None

                # This </div> closes the AUCTION_ITEM wrapper itself.
                if self.auction_item_depth and self.div_depth == self.current_item_depth_start:
                    self.auction_item_depth = 0
                    # If the item had AD_LBL/AD_DTA pairs (Orange-style), add
                    # them as a synthetic table for the downstream extractor.
                    tables = list(self.current_tables)
                    if self.current_ad_rows:
                        tables.append(self.current_ad_rows)
                    self.items.append({
                        "status": self.current_status,
                        "tables": tables,
                    })
                    self.current_status = ""
                    self.current_tables = []
                    self.current_ad_rows = []
                    self.current_item_depth_start = None
                self.div_depth -= 1
            elif tag in ("td", "th") and self.in_cell:
                self.in_cell -= 1
                # Collapse internal whitespace runs to single spaces.
                text = " ".join("".join(self.cell_text_parts).split()).strip()
                self.current_row.append(text)
                self.cell_text_parts = []
            elif tag == "tr" and self.in_table:
                if self.current_row:
                    self.current_table.append(self.current_row)
                self.current_row = []
            elif tag == "table":
                if self.in_table:
                    self.in_table -= 1
                if self.current_table and self.auction_item_depth:
                    self.current_tables.append(self.current_table)
                self.current_table = []

        def handle_data(self, data):
            # Table cells take priority over the div-based collectors.
            if self.in_cell:
                self.cell_text_parts.append(data)
            elif self.in_astat_msga:
                self.astat_msga_text += data
            elif self.in_astat_msgb:
                self.astat_msgb_text += data
            elif self.in_ad_lbl:
                self.ad_lbl_text += data
            elif self.in_ad_dta:
                self.ad_dta_text += data

    parser = AuctionItemParser()
    try:
        parser.feed(html)
    except Exception as e:  # best-effort boundary: log and return nothing
        if log_fn:
            log_fn(f" HTML parser error: {e}")
        return []

    cases_on_page: list[dict] = []
    seen_case_numbers: set = set()
    skipped_dead = 0
    for item in parser.items:
        status = item["status"] or ""
        if _is_status_dead(status):
            skipped_dead += 1
            continue
        # The first table that yields a valid case represents the item.
        for rows in item["tables"]:
            try:
                case = _extract_case_from_table_rows(rows)
            except Exception as e:
                if log_fn:
                    log_fn(f" table parse error: {e}")
                continue
            if not case or not case.get("case_number"):
                continue
            case["auction_status"] = status or "scheduled"
            if case["case_number"] not in seen_case_numbers:
                seen_case_numbers.add(case["case_number"])
                cases_on_page.append(case)
            break

    if skipped_dead and log_fn:
        log_fn(f" filtered {skipped_dead} dead case(s) (Redeemed/Canceled/Sold/etc)")

    return cases_on_page
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Main scraper entry point
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def scrape_realauction_county(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape the realauction.com auction calendar for one county.

    The work happens in three passes over the same date window:
      1. look every date's preview URL up in the page cache,
      2. fetch the cache misses with a headless Chromium (rate limited),
      3. parse every available page and build deal records.

    Args:
        county: county name (e.g. "Miami-Dade", "Duval", "Broward"); resolved
            through ``get_county_config``.
        days_ahead: number of days after today to include (default 14).
        days_back: number of days before today to include (default 0).
        status_cb: optional callback that receives progress/log messages.
        max_dates: if truthy, only the first ``max_dates`` dates of the window
            are scraped (intended for testing). ``None`` or ``0`` means no limit.
        use_cache: read pages from, and write fetched pages to, the page cache.
        cache_ttl_seconds: TTL passed to the cache on both read and write.

    Returns:
        A list of deal records built by ``_build_deal_record``. Records that
        have neither an ``address`` nor a ``listing_price`` are dropped.
    """
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    config = get_county_config(county)
    subdomain = config["subdomain"]
    domain = config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    source_id = config["source_id"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _preview_url(date_str: str) -> str:
        # Single source of truth for the URL: it doubles as the cache key and
        # as the key of cached_pages / fresh_pages, so all three passes must
        # produce exactly the same string.
        return (
            f"https://{subdomain}.{domain}/index.cfm"
            f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate={date_str}"
        )

    deals: list[dict] = []
    # NOTE(review): "today" is taken in UTC while the browser context below is
    # pinned to America/New_York; late in the US evening the window starts one
    # day ahead of the local calendar. Confirm whether that is intended.
    today = datetime.now(timezone.utc).date()
    dates_to_scrape = [
        today + timedelta(days=offset)
        for offset in range(-days_back, days_ahead + 1)
    ]
    if max_dates:
        dates_to_scrape = dates_to_scrape[:max_dates]

    _log(f"{config['label']} Clerk: scraping {len(dates_to_scrape)} dates (cache={'ON' if use_cache else 'OFF'})")

    # ── Pass 1: serve whatever we can from the cache ─────────────────────────
    cache_namespace = source_id  # e.g. "duval_clerk"
    cached_pages: dict[str, str] = {}
    dates_needing_fetch: list = []
    cache_hits = 0
    for date in dates_to_scrape:
        url = _preview_url(date.strftime("%m/%d/%Y"))
        if use_cache:
            cached = get_cached(cache_namespace, url, ttl_seconds=cache_ttl_seconds)
            if cached:
                cached_pages[url] = cached
                cache_hits += 1
                continue
        dates_needing_fetch.append(date)
    _log(f" cache hits: {cache_hits}/{len(dates_to_scrape)}; need to fetch {len(dates_needing_fetch)}")

    # ── Pass 2: fetch cache misses with a real browser ───────────────────────
    fresh_pages: dict[str, str] = {}
    if dates_needing_fetch:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1280, "height": 800},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(20_000)

                last_request_at = 0.0
                for date in dates_needing_fetch:
                    # Keep at least _REQUEST_INTERVAL_SECONDS between the
                    # start of consecutive requests to the same county site.
                    elapsed = time.time() - last_request_at
                    if elapsed < _REQUEST_INTERVAL_SECONDS:
                        time.sleep(_REQUEST_INTERVAL_SECONDS - elapsed)
                    last_request_at = time.time()

                    date_str = date.strftime("%m/%d/%Y")
                    url = _preview_url(date_str)
                    _log(f" Fetching {date_str}...")

                    try:
                        response = page.goto(url, wait_until="networkidle", timeout=20_000)
                        # goto() can return None (no main-resource response);
                        # treat it like any other unusable fetch.
                        if response is None:
                            _log(" no response — skipping")
                            continue
                        if response.status != 200:
                            _log(f" HTTP {response.status} — skipping")
                            continue
                        # Short settle delay after network idle before the
                        # DOM snapshot is taken.
                        time.sleep(1.5)
                        html = page.content()
                    except PlaywrightTimeout:
                        _log(" timeout — skipping")
                        continue
                    except Exception as e:
                        # Best effort per date: one bad page must not discard
                        # the pages that were already fetched.
                        _log(f" error {e} — skipping")
                        continue

                    fresh_pages[url] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Always release the browser, even if the loop above raised.
                browser.close()

    # ── Pass 3: parse every page we have, in calendar order ──────────────────
    for date in dates_to_scrape:
        date_str = date.strftime("%m/%d/%Y")
        auction_date_iso = date.isoformat()
        url = _preview_url(date_str)
        html = cached_pages.get(url) or fresh_pages.get(url)
        if not html:
            # Fetch was skipped or failed for this date.
            continue

        cases_on_page = _parse_cases_from_html(html, _log)
        _log(f" {date_str}: parsed {len(cases_on_page)} case(s)")

        for case in cases_on_page:
            deal = _build_deal_record(case, auction_date_iso, config)
            # A record with neither an address nor a price is dropped.
            if not deal.get("address") and not deal.get("listing_price"):
                continue
            deals.append(deal)

    _log(f"{config['label']} Clerk: scraped {len(deals)} total deals")
    return deals
|
|
|
|
|
|
def run_scraper_to_db(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
) -> dict:
    """Run the full pipeline for one county: scrape, persist, then classify.

    Steps, in order:
      1. initialise the DB and open a scraper-run record for the county's
         ``source_id``,
      2. scrape the county (a scrape failure is recorded as an error and
         treated as zero deals),
      3. insert every deal, tracking which ones are new versus updated,
      4. if ``auto_classify`` is set, classify each newly inserted deal,
      5. close the scraper-run record and return a summary.

    Args:
        county: county name, resolved through ``get_county_config``.
        days_ahead: forwarded to ``scrape_realauction_county``.
        days_back: forwarded to ``scrape_realauction_county``.
        auto_classify: classify newly inserted deals after persisting them.
        status_cb: optional callback that receives progress messages.
        max_dates: forwarded to ``scrape_realauction_county``.

    Returns:
        A summary dict with keys ``source``, ``scraper_run_id``,
        ``deals_found``, ``deals_new``, ``deals_updated``,
        ``deals_classified``, ``errors_count`` and ``errors``.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run
    init_db()

    source_id = get_county_config(county)["source_id"]
    run_id = record_scraper_run(source_id)

    # Every failure is collected here rather than raised, so the run record
    # can always report what went wrong.
    problems: list[str] = []

    # ── Scrape ───────────────────────────────────────────────────────────────
    try:
        scraped = scrape_realauction_county(
            county=county,
            days_ahead=days_ahead,
            days_back=days_back,
            status_cb=status_cb,
            max_dates=max_dates,
        )
    except Exception as exc:
        problems.append(f"scrape failed: {exc}")
        scraped = []

    # ── Persist ──────────────────────────────────────────────────────────────
    fresh_ids: list[int] = []
    updated_total = 0
    for record in scraped:
        try:
            inserted_id, created = insert_deal(record)
        except Exception as exc:
            problems.append(f"insert fail for {record.get('case_number')}: {exc}")
        else:
            if created:
                fresh_ids.append(inserted_id)
            else:
                updated_total += 1
    # Each new deal contributed exactly one id, so the list length is the count.
    new_total = len(fresh_ids)

    # ── Classify newly inserted deals ────────────────────────────────────────
    classified_total = 0
    if auto_classify and fresh_ids:
        if status_cb:
            status_cb(f"Auto-classifying {len(fresh_ids)} new deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification
        for new_id in fresh_ids:
            try:
                stored = get_deal_by_id(new_id)
                if stored:
                    verdict = classify_deal(stored)
                    update_classification(
                        deal_id=new_id,
                        status=verdict["classification_status"],
                        score=verdict["score"],
                        reasons=verdict["reasons"],
                        strategy=verdict["strategy"],
                    )
                    classified_total += 1
            except Exception as exc:
                problems.append(f"classify fail for deal_id={new_id}: {exc}")

    # ── Close the run record ─────────────────────────────────────────────────
    # No errors -> success; errors but some deals scraped -> partial;
    # errors and nothing scraped -> failed.
    if not problems:
        run_status = "success"
    elif scraped:
        run_status = "partial"
    else:
        run_status = "failed"

    found_total = len(scraped)
    problem_total = len(problems)
    finish_scraper_run(
        run_id,
        deals_found=found_total,
        deals_new=new_total,
        deals_updated=updated_total,
        errors_count=problem_total,
        errors_summary=problems or None,
        firecrawl_credits_used=0,
        status=run_status,
    )

    summary = {
        "source": source_id,
        "scraper_run_id": run_id,
        "deals_found": found_total,
        "deals_new": new_total,
        "deals_updated": updated_total,
        "deals_classified": classified_total,
        "errors_count": problem_total,
        "errors": problems,
    }
    return summary
|