feat: AR-House initial commit

This commit is contained in:
2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
+854
View File
@@ -0,0 +1,854 @@
"""scrapers/realauction_clerk.py — Generic scraper para 5+ Florida counties.
REALAUCTION.COM WHITE-LABEL PLATFORM:
Many county clerks use the same realforeclose.com SaaS under different
subdomains. They share ~95% of the HTML structure, so a single scraper serves
all of them. The registry (REALAUCTION_COUNTIES) covers FL, CO and AZ portals.
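Examples of supported counties (FL) — REALAUCTION_COUNTIES is the authoritative list:
Miami-Dade miamidade.realforeclose.com
Duval duval.realforeclose.com
Broward broward.realforeclose.com
Palm Beach mypalmbeach.realforeclose.com
Hillsborough hillsborough.realforeclose.com
Orange myorangeclerk.realforeclose.com
NOTE: the registry uses the subdomains "palmbeach" and "orange" for Palm Beach
and Orange, which differ from the hosts listed here — confirm which ones are live.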
URL PATTERN (as built by the code; {domain} defaults to realforeclose.com):
https://{subdomain}.{domain}/index.cfm
?zaction=AUCTION
&zmethod=PREVIEW
&AuctionDate=MM/DD/YYYY
ESTRUCTURA del page (descubierta via DOM inspection en Miami-Dade, asumida igual
en otros county subdomains porque comparten platform):
- <div class="AUCTION_ITEM"> wrapper por listing
- <div class="ASTAT_MSGB Astat_DATA">{STATUS}</div> con auction status
- <table> con rows key/value (Case#, Parcel ID, Property Address, etc.)
- Status posibles dead: Redeemed, Canceled per *, Sold, Title Transferred,
Withdrawn, Dismissed (parser filtra estos)
ANTI-BOT: Chrome UA real bypassa el 403 que tira el sitio para UA no-standard.
RATE LIMIT: 2.5s entre requests del mismo county.
USO:
from scrapers.realauction_clerk import scrape_realauction_county
deals = scrape_realauction_county(county="Duval", days_ahead=7)
"""
from __future__ import annotations
import re
import time
from datetime import datetime, timedelta, timezone
from typing import Callable, Optional
from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY
# ════════════════════════════════════════════════════════════════════════════
# Configuration: county registry
# ════════════════════════════════════════════════════════════════════════════
# Subdomain mapping. If a county is not listed here, get_county_config()
# raises ValueError.
# IMPORTANT: subdomains were validated via DOM inspection. When adding a new
# county, curl the URL pattern first to confirm it answers HTTP 200.
# Configuration is keyed per (county, platform) entry. The 'domain' field
# defaults to 'realforeclose.com' if omitted (backward compat). For tax deed
# sales it is 'realtaxdeed.com' instead. Both share the same Realauction.com
# platform with identical HTML structure — only the URL host differs.
REALAUCTION_DEFAULT_DOMAIN = "realforeclose.com"
REALAUCTION_COUNTIES = {
    # ALL 41 FL counties confirmed working on realforeclose.com (probe 2026-05-14)
    # Detection: title='RealForeclose- {County} County -Splash Page'
    # 26 small FL counties are NOT on realforeclose (they use other platforms)
    # ─── Major urban counties ──────────────────────────────────────────────
    "Miami-Dade": {"subdomain": "miamidade", "source_id": "miami_dade_clerk", "state": "FL", "label": "Miami-Dade"},
    "Duval": {"subdomain": "duval", "source_id": "duval_clerk", "state": "FL", "label": "Duval (Jacksonville)"},
    "Broward": {"subdomain": "broward", "source_id": "broward_clerk", "state": "FL", "label": "Broward (Fort Lauderdale)"},
    "Palm Beach": {"subdomain": "palmbeach", "source_id": "palm_beach_clerk", "state": "FL", "label": "Palm Beach (West Palm)"},
    "Hillsborough": {"subdomain": "hillsborough","source_id": "hillsborough_clerk", "state": "FL", "label": "Hillsborough (Tampa)"},
    "Orange": {"subdomain": "orange", "source_id": "orange_clerk", "state": "FL", "label": "Orange (Orlando)"},
    "Pinellas": {"subdomain": "pinellas", "source_id": "pinellas_clerk", "state": "FL", "label": "Pinellas (St Petersburg)"},
    "Lee": {"subdomain": "lee", "source_id": "lee_clerk", "state": "FL", "label": "Lee (Fort Myers)"},
    "Polk": {"subdomain": "polk", "source_id": "polk_clerk", "state": "FL", "label": "Polk (Lakeland)"},
    "Brevard": {"subdomain": "brevard", "source_id": "brevard_clerk", "state": "FL", "label": "Brevard (Cocoa, Melbourne)"},
    "Volusia": {"subdomain": "volusia", "source_id": "volusia_clerk", "state": "FL", "label": "Volusia (Daytona Beach)"},
    "Pasco": {"subdomain": "pasco", "source_id": "pasco_clerk", "state": "FL", "label": "Pasco"},
    "Sarasota": {"subdomain": "sarasota", "source_id": "sarasota_clerk", "state": "FL", "label": "Sarasota"},
    "Manatee": {"subdomain": "manatee", "source_id": "manatee_clerk", "state": "FL", "label": "Manatee (Bradenton)"},
    "Seminole": {"subdomain": "seminole", "source_id": "seminole_clerk", "state": "FL", "label": "Seminole (Sanford)"},
    # ─── Mid-size counties ─────────────────────────────────────────────────
    "Marion": {"subdomain": "marion", "source_id": "marion_clerk", "state": "FL", "label": "Marion (Ocala)"},
    "Lake": {"subdomain": "lake", "source_id": "lake_clerk", "state": "FL", "label": "Lake (Mt Dora, Tavares)"},
    "Osceola": {"subdomain": "osceola", "source_id": "osceola_clerk", "state": "FL", "label": "Osceola (Kissimmee)"},
    "Leon": {"subdomain": "leon", "source_id": "leon_clerk", "state": "FL", "label": "Leon (Tallahassee)"},
    "Alachua": {"subdomain": "alachua", "source_id": "alachua_clerk", "state": "FL", "label": "Alachua (Gainesville)"},
    "Escambia": {"subdomain": "escambia", "source_id": "escambia_clerk", "state": "FL", "label": "Escambia (Pensacola)"},
    "Santa Rosa": {"subdomain": "santarosa", "source_id": "santa_rosa_clerk", "state": "FL", "label": "Santa Rosa (Milton)"},
    "Bay": {"subdomain": "bay", "source_id": "bay_clerk", "state": "FL", "label": "Bay (Panama City)"},
    "St Lucie": {"subdomain": "stlucie", "source_id": "st_lucie_clerk", "state": "FL", "label": "St Lucie (Port St Lucie)"},
    "Indian River": {"subdomain": "indianriver", "source_id": "indian_river_clerk", "state": "FL", "label": "Indian River (Vero Beach)"},
    "Martin": {"subdomain": "martin", "source_id": "martin_clerk", "state": "FL", "label": "Martin (Stuart)"},
    "Citrus": {"subdomain": "citrus", "source_id": "citrus_clerk", "state": "FL", "label": "Citrus (Crystal River)"},
    "Charlotte": {"subdomain": "charlotte", "source_id": "charlotte_clerk", "state": "FL", "label": "Charlotte (Port Charlotte)"},
    "Clay": {"subdomain": "clay", "source_id": "clay_clerk", "state": "FL", "label": "Clay (Green Cove Springs)"},
    "Nassau": {"subdomain": "nassau", "source_id": "nassau_clerk", "state": "FL", "label": "Nassau (Fernandina Beach)"},
    "Putnam": {"subdomain": "putnam", "source_id": "putnam_clerk", "state": "FL", "label": "Putnam (Palatka)"},
    "Flagler": {"subdomain": "flagler", "source_id": "flagler_clerk", "state": "FL", "label": "Flagler (Palm Coast)"},
    "Walton": {"subdomain": "walton", "source_id": "walton_clerk", "state": "FL", "label": "Walton (DeFuniak Springs)"},
    "Okeechobee": {"subdomain": "okeechobee", "source_id": "okeechobee_clerk", "state": "FL", "label": "Okeechobee"},
    # ─── Small rural counties (small auction volume but still on platform) ─
    "Baker": {"subdomain": "baker", "source_id": "baker_clerk", "state": "FL", "label": "Baker (Macclenny)"},
    "Calhoun": {"subdomain": "calhoun", "source_id": "calhoun_clerk", "state": "FL", "label": "Calhoun"},
    "Gilchrist": {"subdomain": "gilchrist", "source_id": "gilchrist_clerk", "state": "FL", "label": "Gilchrist"},
    "Gulf": {"subdomain": "gulf", "source_id": "gulf_clerk", "state": "FL", "label": "Gulf (Port St Joe)"},
    "Jackson": {"subdomain": "jackson", "source_id": "jackson_clerk", "state": "FL", "label": "Jackson (Marianna)"},
    "Suwannee": {"subdomain": "suwannee", "source_id": "suwannee_clerk", "state": "FL", "label": "Suwannee (Live Oak)"},
    "Washington": {"subdomain": "washington", "source_id": "washington_clerk", "state": "FL", "label": "Washington (Chipley)"},
    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO — same platform (realauction.com is multi-state)
    # Discovered via https://www.realauction.com/clients
    # ═══════════════════════════════════════════════════════════════════════
    "Denver": {"subdomain": "denver", "source_id": "denver_clerk_co", "state": "CO", "label": "Denver County, CO"},
    "Eagle": {"subdomain": "eagle", "source_id": "eagle_clerk_co", "state": "CO", "label": "Eagle County, CO"},
    "El Paso": {"subdomain": "elpasoco", "source_id": "el_paso_clerk_co", "state": "CO", "label": "El Paso County, CO (Colorado Springs)"},
    "Larimer": {"subdomain": "larimer", "source_id": "larimer_clerk_co", "state": "CO", "label": "Larimer County, CO (Fort Collins)"},
    "Mesa": {"subdomain": "mesa", "source_id": "mesa_clerk_co", "state": "CO", "label": "Mesa County, CO (Grand Junction)"},
    "Summit": {"subdomain": "summit", "source_id": "summit_clerk_co", "state": "CO", "label": "Summit County, CO (Breckenridge)"},
    "Weld": {"subdomain": "weld", "source_id": "weld_clerk_co", "state": "CO", "label": "Weld County, CO (Greeley)"},
    # ═══════════════════════════════════════════════════════════════════════
    # TAX DEED SALES via .realtaxdeed.com (same engine, diff domain)
    # 17 portals: 3 AZ + 14 FL. SAME HTML structure as realforeclose.com.
    # ═══════════════════════════════════════════════════════════════════════
    # Arizona
    "Apache TD": {"subdomain": "apache", "domain": "realtaxdeed.com", "source_id": "apache_taxdeed_az", "state": "AZ", "label": "Apache County, AZ — Tax Deed"},
    "Coconino TD": {"subdomain": "coconino", "domain": "realtaxdeed.com", "source_id": "coconino_taxdeed_az", "state": "AZ", "label": "Coconino County, AZ — Tax Deed (Flagstaff)"},
    "Mohave TD": {"subdomain": "mohave", "domain": "realtaxdeed.com", "source_id": "mohave_taxdeed_az", "state": "AZ", "label": "Mohave County, AZ — Tax Deed (Kingman)"},
    # Florida tax deed (distinct from foreclosure entries above)
    "Alachua TD": {"subdomain": "alachua", "domain": "realtaxdeed.com", "source_id": "alachua_taxdeed", "state": "FL", "label": "Alachua County — Tax Deed"},
    "Baker TD": {"subdomain": "baker", "domain": "realtaxdeed.com", "source_id": "baker_taxdeed", "state": "FL", "label": "Baker County — Tax Deed"},
    "Bay TD": {"subdomain": "bay", "domain": "realtaxdeed.com", "source_id": "bay_taxdeed", "state": "FL", "label": "Bay County — Tax Deed"},
    "Citrus TD": {"subdomain": "citrus", "domain": "realtaxdeed.com", "source_id": "citrus_taxdeed", "state": "FL", "label": "Citrus County — Tax Deed"},
    "Clay TD": {"subdomain": "clay", "domain": "realtaxdeed.com", "source_id": "clay_taxdeed", "state": "FL", "label": "Clay County — Tax Deed"},
    "Duval TD": {"subdomain": "duval", "domain": "realtaxdeed.com", "source_id": "duval_taxdeed", "state": "FL", "label": "Duval County — Tax Deed (Jacksonville)"},
    "Escambia TD": {"subdomain": "escambia", "domain": "realtaxdeed.com", "source_id": "escambia_taxdeed", "state": "FL", "label": "Escambia County — Tax Deed (Pensacola)"},
    "Flagler TD": {"subdomain": "flagler", "domain": "realtaxdeed.com", "source_id": "flagler_taxdeed", "state": "FL", "label": "Flagler County — Tax Deed"},
    "Gilchrist TD": {"subdomain": "gilchrist", "domain": "realtaxdeed.com", "source_id": "gilchrist_taxdeed", "state": "FL", "label": "Gilchrist County — Tax Deed"},
    "Gulf TD": {"subdomain": "gulf", "domain": "realtaxdeed.com", "source_id": "gulf_taxdeed", "state": "FL", "label": "Gulf County — Tax Deed"},
    "Hendry TD": {"subdomain": "hendry", "domain": "realtaxdeed.com", "source_id": "hendry_taxdeed", "state": "FL", "label": "Hendry County — Tax Deed"},
    "Hernando TD": {"subdomain": "hernando", "domain": "realtaxdeed.com", "source_id": "hernando_taxdeed", "state": "FL", "label": "Hernando County — Tax Deed"},
    "Highlands TD": {"subdomain": "highlands", "domain": "realtaxdeed.com", "source_id": "highlands_taxdeed", "state": "FL", "label": "Highlands County — Tax Deed (Sebring)"},
    "Hillsborough TD": {"subdomain": "hillsborough", "domain": "realtaxdeed.com", "source_id": "hillsborough_taxdeed", "state": "FL", "label": "Hillsborough County — Tax Deed (Tampa)"},
    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO TAX DEEDS via .treasurersdeedsale.com (same engine again)
    # ═══════════════════════════════════════════════════════════════════════
    "Adams TD": {"subdomain": "adams", "domain": "treasurersdeedsale.com", "source_id": "adams_taxdeed_co", "state": "CO", "label": "Adams County, CO — Treasurer's Deed"},
    "Denver TD": {"subdomain": "denver", "domain": "treasurersdeedsale.com", "source_id": "denver_taxdeed_co", "state": "CO", "label": "Denver County, CO — Treasurer's Deed"},
    "Eagle TD": {"subdomain": "eagle", "domain": "treasurersdeedsale.com", "source_id": "eagle_taxdeed_co", "state": "CO", "label": "Eagle County, CO — Treasurer's Deed"},
    "El Paso TD": {"subdomain": "elpasoco", "domain": "treasurersdeedsale.com", "source_id": "el_paso_taxdeed_co", "state": "CO", "label": "El Paso County, CO — Treasurer's Deed"},
    "Larimer TD": {"subdomain": "larimer", "domain": "treasurersdeedsale.com", "source_id": "larimer_taxdeed_co", "state": "CO", "label": "Larimer County, CO — Treasurer's Deed"},
    "Mesa TD": {"subdomain": "mesa", "domain": "treasurersdeedsale.com", "source_id": "mesa_taxdeed_co", "state": "CO", "label": "Mesa County, CO — Treasurer's Deed"},
    "Pitkin TD": {"subdomain": "pitkin", "domain": "treasurersdeedsale.com", "source_id": "pitkin_taxdeed_co", "state": "CO", "label": "Pitkin County, CO — Treasurer's Deed (Aspen)"},
    "Weld TD": {"subdomain": "weld", "domain": "treasurersdeedsale.com", "source_id": "weld_taxdeed_co", "state": "CO", "label": "Weld County, CO — Treasurer's Deed"},
    # ─── Counties NOT on realforeclose.com (parking page only — they use
    # other platforms): Bradford, Collier, Columbia, DeSoto, Dixie, Franklin,
    # Gadsden, Glades, Hamilton, Hardee, Hendry, Hernando, Highlands,
    # Holmes, Jefferson, Lafayette, Levy, Liberty, Madison, Monroe,
    # Okaloosa, St. Johns, Sumter, Taylor, Union, Wakulla.
    # (Hendry, Hernando and Highlands do have tax deed entries above via
    # realtaxdeed.com.)
    # ─── Other Realauction platforms observed:
    # - .realtaxdeed.com (Apache/Coconino/Mohave AZ, FL tax deed sales)
    #   → registered above via the "domain" field
    # - .treasurersdeedsale.com (CO tax deed sales — Adams, Pitkin, etc.)
    #   → registered above via the "domain" field
    # - california.taxdefaultsale.com (CA Fresno) — TODO add support
}
def get_county_config(county: str) -> dict:
    """Look up the Realauction registry entry for *county*.

    Args:
        county: A key of REALAUCTION_COUNTIES (e.g. "Duval", "Duval TD").

    Returns:
        The registry dict stored for that key.

    Raises:
        ValueError: If the key is missing (or maps to an empty entry); the
            message lists every valid key.
    """
    entry = REALAUCTION_COUNTIES.get(county)
    if entry:
        return entry
    supported = ", ".join(REALAUCTION_COUNTIES.keys())
    raise ValueError(
        f"County '{county}' no soportado por realauction_clerk. "
        f"Validos: {supported}"
    )
# Real desktop-Chrome user agent; the site answers 403 to non-standard UAs.
_CHROME_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)
# Per-domain rate limit: at most one request every 2.5 s, to be gentle on the site.
_REQUEST_INTERVAL_SECONDS = 2.5
# Maps the clerk's "Auction Type" value to our canonical deal_type.
_AUCTION_TYPE_MAP = {
    "FORECLOSURE": "foreclosure",
    **dict.fromkeys(("TAXDEED", "TAX DEED"), "tax_deed"),
}
# ════════════════════════════════════════════════════════════════════════════
# Parsing helpers (county-agnostic)
# ════════════════════════════════════════════════════════════════════════════
def _parse_money(s: str) -> Optional[float]:
    """Convert a money string such as '$353,041.78' into a float.

    Every character other than a digit or '.' is discarded before
    conversion (so a leading minus sign is dropped as well).

    Returns:
        The parsed amount, or None when the input is empty, contains no
        digits, or the remaining text is not a valid float (e.g. '1.2.3').
    """
    if not s:
        return None
    # Keep only digits and dots, in their original order.
    numeric_text = "".join(re.findall(r"[\d.]", s))
    if numeric_text in ("", "."):
        return None
    try:
        amount = float(numeric_text)
    except ValueError:
        amount = None
    return amount
def _parse_address(line1: str, line2: str) -> dict:
    """Split a two-line property address into display and component fields.

    Example input:
        line1 = "7355 POINCIANA CT"       (street)
        line2 = "MIAMI LAKES, FL- 33014"  (city, state-zip)

    Args:
        line1: Street line; None/empty is tolerated.
        line2: "CITY, ST- ZIP" line; None/empty is tolerated.

    Returns:
        Dict with keys "address" (display string), "city" (title-cased),
        "state" (2-letter code) and "zip" (5 digits). Any component that
        cannot be extracted is None.
    """
    out = {"address": None, "city": None, "state": None, "zip": None}
    line1 = (line1 or "").strip()
    line2 = (line2 or "").strip()

    # Normalise the platform's "ST- 12345" quirk to "ST 12345" for display.
    # "FL-" is always rewritten (historical behaviour). Any other two-letter
    # state code is rewritten only when a 5-digit ZIP follows, so CO/AZ
    # portals get the same clean display string without touching hyphenated
    # words elsewhere in the line.
    line2_clean = re.sub(
        r"\b(FL|[A-Z]{2}(?=-\s*\d{5}))-\s*", r"\1 ", line2
    ).strip()

    if line1 and line2_clean:
        out["address"] = f"{line1}, {line2_clean}"
    elif line1:
        out["address"] = line1
    elif line2_clean:
        out["address"] = line2_clean

    if line2:
        # Preferred shape: "CITY, ST- 12345" (optionally ZIP+4).
        m = re.match(r"^(.+?),\s*([A-Z]{2})[-\s]\s*(\d{5})(?:-\d{4})?", line2)
        if m:
            out["city"] = m.group(1).title()
            out["state"] = m.group(2)
            out["zip"] = m.group(3)
        else:
            # Fallback: pick up state + ZIP anywhere in the line; the city is
            # only trusted when a comma separates it from the rest.
            m2 = re.search(r"\b([A-Z]{2})[-\s]\s*(\d{5})", line2)
            if m2:
                out["state"] = m2.group(1)
                out["zip"] = m2.group(2)
                out["city"] = line2.split(",")[0].strip().title() if "," in line2 else None
    return out
def _extract_case_from_table_rows(rows: list[list[str]]) -> Optional[dict]:
    """Build a case dict from the label/value rows of one auction-item table.

    Each row is a list of cell texts. Rows with at least two non-empty cells
    are read as ``label, value`` (a trailing ':' on the label is dropped).
    The row that follows "Property Address" is treated as the second address
    line ("CITY, ST- ZIP") unless it looks like another labelled field.

    Args:
        rows: Table rows, each a list of cell strings (empty cells allowed).

    Returns:
        A dict with "deal_type", "case_number", "auction_type_raw", the
        address components from _parse_address, and whichever optional
        fields were present; or None when the table has no "Case #" or
        carries an explicit "Auction Type" that is not in _AUCTION_TYPE_MAP.
    """
    fields: dict[str, str] = {}
    addr_line2: Optional[str] = None
    # Set right after a "Property Address" row: the NEXT row may be the
    # continuation line holding city/state/zip.
    expect_addr_line2 = False
    # Labels that prove the candidate row is a regular field, not an address.
    known_labels = (
        "Assessed Value", "Plaintiff Max Bid", "Auction Type", "Case #",
        "Certificate #", "Final Judgment", "Opening Bid", "Parcel ID",
    )

    for row in rows:
        non_empty = [c for c in row if c]
        if not non_empty:
            continue

        if expect_addr_line2:
            # The expectation only ever applies to the row directly after
            # "Property Address", so clear it whatever this row turns out to be.
            expect_addr_line2 = False
            candidate = non_empty[0].strip()
            is_labelled_field = candidate.endswith(":") or any(
                kw in candidate for kw in known_labels
            )
            if not is_labelled_field:
                addr_line2 = candidate
                continue
            # Otherwise fall through and parse it as a normal label/value row.

        if len(non_empty) >= 2:
            key = non_empty[0].rstrip(":").strip()
            fields[key] = non_empty[1].strip()
            if key == "Property Address":
                # The street value was just stored; the continuation row
                # (empty label cell + "CITY, ST- ZIP") comes next. The old
                # code armed this flag only when the value was empty, which
                # cannot happen for a row with two non-empty cells, so
                # city/state/zip were never captured.
                expect_addr_line2 = True

    case_number = fields.get("Case #")
    if not case_number:
        return None

    # An explicit "Auction Type" is the source of truth. When the field is
    # absent (div-based layouts without that field), infer it from the case
    # number: "TAXDEED" or "-TD-" means tax deed, anything else defaults to
    # foreclosure.
    auction_type_raw = (fields.get("Auction Type") or "").upper().strip()
    if auction_type_raw:
        deal_type = _AUCTION_TYPE_MAP.get(auction_type_raw)
        if not deal_type:
            return None  # explicit but unknown auction type — skip
    else:
        case_num_upper = case_number.upper()
        if "TAXDEED" in case_num_upper or "-TD-" in case_num_upper:
            deal_type = "tax_deed"
            auction_type_raw = "TAXDEED (inferred)"
        else:
            deal_type = "foreclosure"
            auction_type_raw = "FORECLOSURE (inferred)"

    case = {
        "deal_type": deal_type,
        "case_number": case_number,
        "auction_type_raw": auction_type_raw,
    }
    if fields.get("Certificate #"):
        case["certificate_number"] = fields["Certificate #"]
    if fields.get("Final Judgment Amount"):
        case["final_judgment_amount"] = _parse_money(fields["Final Judgment Amount"])
    if fields.get("Opening Bid"):
        case["starting_bid"] = _parse_money(fields["Opening Bid"])
    if fields.get("Parcel ID"):
        case["parcel_id"] = fields["Parcel ID"]
    if fields.get("Assessed Value"):
        case["assessed_value"] = _parse_money(fields["Assessed Value"])
    if fields.get("Plaintiff Max Bid"):
        # Kept as raw text; not converted to a number here.
        case["plaintiff_max_bid_raw"] = fields["Plaintiff Max Bid"]

    case.update(_parse_address(fields.get("Property Address", ""), addr_line2 or ""))
    return case
def _build_description(case: dict) -> str:
    """Render the known case facts as one ' | '-separated summary line.

    Only truthy values are included, always in the same fixed order; money
    fields are formatted as ``$1,234.56``. Returns '' when nothing applies.
    """
    # (case key, label shown in the summary, format as money?)
    layout = (
        ("auction_status", "Status", False),
        ("auction_type_raw", "Auction Type", False),
        ("certificate_number", "Tax Cert #", False),
        ("final_judgment_amount", "Final Judgment Amount", True),
        ("starting_bid", "Opening Bid", True),
        ("assessed_value", "Assessed Value (PA)", True),
        ("parcel_id", "Parcel ID", False),
        ("plaintiff_max_bid_raw", "Plaintiff Max Bid", False),
    )
    segments = []
    for key, label, is_money in layout:
        value = case.get(key)
        if not value:
            continue
        rendered = f"${value:,.2f}" if is_money else f"{value}"
        segments.append(f"{label}: {rendered}")
    return " | ".join(segments)
def _build_deal_record(case: dict, auction_date_iso: str, county_config: dict) -> dict:
    """Turn a parsed clerk case into a deal record for deals_db.insert_deal.

    Pricing rules:
        - tax_deed:     listing_price is the starting bid
        - foreclosure:  listing_price is None (bid hidden before the auction)
        - anything else: starting bid, falling back to the assessed value
        - final_judgment_amount is stored in its own field and is never
          used as the listing price

    Args:
        case: Case dict produced by the table extractor.
        auction_date_iso: Auction date as 'YYYY-MM-DD'.
        county_config: Registry entry for the county being scraped.
    """
    kind = case.get("deal_type")
    opening_bid = case.get("starting_bid")
    pa_value = case.get("assessed_value")
    judgment = case.get("final_judgment_amount")

    if kind == "foreclosure":
        price = None
    elif kind == "tax_deed":
        price = opening_bid
    else:
        price = opening_bid or pa_value

    sub = county_config["subdomain"]
    host_domain = county_config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    src = county_config["source_id"]
    fallback_state = county_config["state"]
    # Drop the parenthesised city hint: "Duval (Jacksonville)" → "Duval"
    county_name = county_config["label"].partition(" (")[0]

    # The site expects MM/DD/YYYY; slice the pieces out of the ISO date.
    year = auction_date_iso[0:4]
    month = auction_date_iso[5:7]
    day = auction_date_iso[8:10]
    preview_url = (
        f"https://{sub}.{host_domain}/index.cfm"
        f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate="
        f"{month}/{day}/{year}"
    )

    return {
        "source": src,
        "source_url": preview_url,
        "address": case.get("address"),
        "city": case.get("city"),
        "state": case.get("state") or fallback_state,
        "zip": case.get("zip"),
        "county": county_name,
        "parcel_id": case.get("parcel_id"),
        "listing_price": price,
        "deal_type": kind,
        "starting_bid": opening_bid,
        "estimated_arv": pa_value,
        "final_judgment_amount": judgment,
        "auction_status": case.get("auction_status") or "scheduled",
        "case_number": case.get("case_number"),
        "auction_date": auction_date_iso,
        "listing_description": _build_description(case),
    }
# ════════════════════════════════════════════════════════════════════════════
# Status filtering (REDEEMED/CANCELED bug fix)
# ════════════════════════════════════════════════════════════════════════════
# Cases con estos statuses NO van a auction → NO incluir en results.
# Substring matching: "Canceled per Bankruptcy" → dead (contains "canceled").
_DEAD_STATUS_SUBSTRINGS = (
    "redeemed",
    "canceled",
    "cancelled",
    "sold",
    "closed",  # case closed/disposed
    "title transferred",
    "withdrawn",
    "dismissed",
)


def _is_status_dead(status: Optional[str]) -> bool:
    """Tell whether an auction status marks the case as off-market.

    The check is a case-insensitive substring match against
    _DEAD_STATUS_SUBSTRINGS, so "Canceled per Bankruptcy" counts as dead.
    A missing or blank status is treated as alive.
    """
    normalized = (status or "").strip().lower()
    if not normalized:
        return False
    for marker in _DEAD_STATUS_SUBSTRINGS:
        if marker in normalized:
            return True
    return False
def _parse_cases_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[dict]:
    """Parse all auction cases from a Realforeclose page HTML using stdlib only.

    The page is walked with html.parser.HTMLParser. Every
    ``<div class="AUCTION_ITEM">`` becomes one item carrying its auction
    status and the tables found inside it. Items whose status is dead (per
    _is_status_dead) are skipped; for the rest, the first table that yields
    a case (via _extract_case_from_table_rows) is kept, de-duplicated by
    case number within the page.

    Args:
        html: Full page HTML.
        log_fn: Optional callback receiving progress/error messages.

    Returns:
        List of case dicts, each with "auction_status" set (the item status,
        or "scheduled" when the item had none). Returns [] if the HTML
        parser itself raises.
    """
    from html.parser import HTMLParser

    class AuctionItemParser(HTMLParser):
        # Stateful SAX-style collector: tracks <div> nesting to know when an
        # AUCTION_ITEM opens and closes, and gathers status text plus table
        # cells while inside one.
        def __init__(self):
            super().__init__()
            # Finished items: {"status": str, "tables": list of row lists}.
            self.items: list[dict] = []
            # Count of currently open AUCTION_ITEM divs (0 = outside any item).
            self.auction_item_depth = 0
            # div_depth value at which the outermost AUCTION_ITEM was opened.
            self.current_item_depth_start = None
            # Running count of open <div> tags.
            self.div_depth = 0
            # NOTE(review): status_label_active is initialised but not
            # referenced anywhere else in this block.
            self.status_label_active = False
            # True after an ASTAT_MSGA div whose text contains
            # "auction status"; the next ASTAT_MSGB div then holds the value.
            self.expecting_status_value = False
            self.in_astat_msga = False
            self.in_astat_msgb = False
            self.astat_msga_text = ""
            self.astat_msgb_text = ""
            # Table/row/cell accumulation state.
            self.in_table = 0
            self.current_table: list[list[str]] = []
            self.current_row: list[str] = []
            self.in_cell = 0
            self.cell_text_parts: list[str] = []
            # Per-item results, reset whenever a new outermost item opens.
            self.current_status = ""
            self.current_tables: list[list[list[str]]] = []
            # Orange-style div-based fields: <div class="AD_LBL">Label:</div>
            # <div class="AD_DTA">Value</div>
            # These are collected as synthetic [Label, Value] rows so the
            # downstream extractor works unchanged.
            self.in_ad_lbl = False
            self.in_ad_dta = False
            self.ad_lbl_text = ""
            self.ad_dta_text = ""
            self.last_ad_lbl: Optional[str] = None
            self.current_ad_rows: list[list[str]] = []

        def handle_starttag(self, tag, attrs):
            attrs_d = dict(attrs)
            # class may be missing or valueless (None) — treat both as empty.
            classes = (attrs_d.get("class") or "").split()
            if tag == "div":
                self.div_depth += 1
                if "AUCTION_ITEM" in classes:
                    if self.auction_item_depth == 0:
                        # Outermost item: remember its depth and reset the
                        # per-item accumulators.
                        self.current_item_depth_start = self.div_depth
                        self.current_status = ""
                        self.current_tables = []
                        self.current_ad_rows = []
                        self.last_ad_lbl = None
                    self.auction_item_depth += 1
                elif self.auction_item_depth:
                    # Only divs inside an item are inspected for status/field
                    # classes; each start resets that class's text buffer.
                    if "ASTAT_MSGA" in classes:
                        self.in_astat_msga = True
                        self.astat_msga_text = ""
                    elif "ASTAT_MSGB" in classes:
                        self.in_astat_msgb = True
                        self.astat_msgb_text = ""
                    elif "AD_LBL" in classes:
                        self.in_ad_lbl = True
                        self.ad_lbl_text = ""
                    elif "AD_DTA" in classes:
                        self.in_ad_dta = True
                        self.ad_dta_text = ""
            elif tag == "table" and self.auction_item_depth:
                # A table start (only inside an item) begins a fresh row list.
                self.in_table += 1
                self.current_table = []
            elif tag == "tr" and self.in_table:
                self.current_row = []
            elif tag in ("td", "th") and self.in_table:
                self.in_cell += 1
                self.cell_text_parts = []

        def handle_endtag(self, tag):
            if tag == "div":
                # First, close whichever tracked text div is currently open.
                if self.in_astat_msga:
                    self.in_astat_msga = False
                    if "auction status" in self.astat_msga_text.strip().lower():
                        self.expecting_status_value = True
                elif self.in_astat_msgb:
                    self.in_astat_msgb = False
                    if self.expecting_status_value and self.auction_item_depth:
                        self.current_status = self.astat_msgb_text.strip()
                        self.expecting_status_value = False
                elif self.in_ad_lbl:
                    self.in_ad_lbl = False
                    self.last_ad_lbl = self.ad_lbl_text.strip()
                elif self.in_ad_dta:
                    self.in_ad_dta = False
                    # Pair with most recent AD_LBL if any
                    if self.last_ad_lbl and self.auction_item_depth:
                        label = self.last_ad_lbl
                        # Collapse internal whitespace runs to single spaces.
                        value = " ".join(self.ad_dta_text.split()).strip()
                        self.current_ad_rows.append([label, value])
                    self.last_ad_lbl = None
                # Then check whether this </div> closes the item's own div:
                # div_depth still holds the pre-decrement depth here.
                if self.auction_item_depth and self.div_depth == self.current_item_depth_start:
                    self.auction_item_depth -= 1
                    if self.auction_item_depth == 0:
                        # If item had AD_LBL/AD_DTA pairs (Orange-style), add
                        # them as a synthetic table so the downstream
                        # extractor works unchanged.
                        tables = list(self.current_tables)
                        if self.current_ad_rows:
                            tables.append(self.current_ad_rows)
                        self.items.append({
                            "status": self.current_status,
                            "tables": tables,
                        })
                        self.current_status = ""
                        self.current_tables = []
                        self.current_ad_rows = []
                        self.current_item_depth_start = None
                elif self.auction_item_depth:
                    # Inner div of an item closing — nothing to do.
                    pass
                self.div_depth -= 1
            elif tag in ("td", "th") and self.in_cell:
                self.in_cell -= 1
                # Join the text fragments and normalise whitespace.
                text = " ".join("".join(self.cell_text_parts).split()).strip()
                self.current_row.append(text)
                self.cell_text_parts = []
            elif tag == "tr" and self.in_table:
                if self.current_row:
                    self.current_table.append(self.current_row)
                self.current_row = []
            elif tag == "table":
                if self.in_table:
                    self.in_table -= 1
                    # Keep only non-empty tables that belong to an open item.
                    if self.current_table and self.auction_item_depth:
                        self.current_tables.append(self.current_table)
                    self.current_table = []

        def handle_data(self, data):
            # Route text to exactly one buffer; table cells take priority.
            if self.in_cell:
                self.cell_text_parts.append(data)
            elif self.in_astat_msga:
                self.astat_msga_text += data
            elif self.in_astat_msgb:
                self.astat_msgb_text += data
            elif self.in_ad_lbl:
                self.ad_lbl_text += data
            elif self.in_ad_dta:
                self.ad_dta_text += data

    parser = AuctionItemParser()
    try:
        parser.feed(html)
    except Exception as e:
        # Best-effort: a parser failure yields no cases for this page.
        if log_fn:
            log_fn(f" HTML parser error: {e}")
        return []

    cases_on_page: list[dict] = []
    skipped_dead = 0
    for item in parser.items:
        status = item["status"] or ""
        if _is_status_dead(status):
            skipped_dead += 1
            continue
        for rows in item["tables"]:
            try:
                case = _extract_case_from_table_rows(rows)
                if case and case.get("case_number"):
                    case["auction_status"] = status or "scheduled"
                    # De-duplicate by case number within this page.
                    if not any(c.get("case_number") == case["case_number"] for c in cases_on_page):
                        cases_on_page.append(case)
                    # One case per item: stop at the first table that yields one.
                    break
            except Exception as e:
                # A bad table must not abort the page; log and try the next.
                if log_fn:
                    log_fn(f" table parse error: {e}")
    if skipped_dead and log_fn:
        log_fn(f" filtered {skipped_dead} dead case(s) (Redeemed/Canceled/Sold/etc)")
    return cases_on_page
# ════════════════════════════════════════════════════════════════════════════
# Main scraper entry point
# ════════════════════════════════════════════════════════════════════════════
def scrape_realauction_county(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
    use_cache: bool = True,
    cache_ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> list[dict]:
    """Scrape the Realauction auction calendar for one registered county.

    One preview page is loaded per calendar day in the requested window.
    Pages are served from the cache when possible; the rest are fetched with
    a headless Chromium (Playwright), rate-limited per request.

    Args:
        county: Key of REALAUCTION_COUNTIES (e.g. "Miami-Dade", "Duval").
        days_ahead: Days after today to include (default 14).
        days_back: Days before today to include (default 0).
        status_cb: Optional callback for progress messages.
        max_dates: If set, only the first ``max_dates`` days are scraped
            (useful for testing).
        use_cache: Read/write the page cache (default True).
        cache_ttl_seconds: TTL passed to the cache helpers.

    Returns:
        Deal records ready for deals_db.insert_deal.

    Raises:
        ValueError: If *county* is not registered.
    """
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    config = get_county_config(county)
    subdomain = config["subdomain"]
    domain = config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    source_id = config["source_id"]

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    def _preview_url(date_str: str) -> str:
        # Single source of truth for the URL: it doubles as the cache key, so
        # the cache-lookup, fetch and parse passes must all agree on it.
        return (
            f"https://{subdomain}.{domain}/index.cfm"
            f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate={date_str}"
        )

    deals: list[dict] = []
    # NOTE(review): "today" is taken in UTC while the browser context below
    # uses America/New_York — late in the US evening the window starts one
    # day ahead. Confirm whether that is intended.
    today = datetime.now(timezone.utc).date()
    dates_to_scrape = [
        today + timedelta(days=offset)
        for offset in range(-days_back, days_ahead + 1)
    ]
    if max_dates:
        dates_to_scrape = dates_to_scrape[:max_dates]
    _log(f"{config['label']} Clerk: scraping {len(dates_to_scrape)} dates (cache={'ON' if use_cache else 'OFF'})")

    cache_namespace = source_id  # e.g. "duval_clerk"
    cached_pages: dict[str, str] = {}
    dates_needing_fetch: list = []
    cache_hits = 0
    for day in dates_to_scrape:
        url = _preview_url(day.strftime("%m/%d/%Y"))
        if use_cache:
            cached = get_cached(cache_namespace, url, ttl_seconds=cache_ttl_seconds)
            if cached:
                cached_pages[url] = cached
                cache_hits += 1
                continue
        dates_needing_fetch.append(day)
    _log(f" cache hits: {cache_hits}/{len(dates_to_scrape)}; need to fetch {len(dates_needing_fetch)}")

    fresh_pages: dict[str, str] = {}
    if dates_needing_fetch:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=_CHROME_UA,
                    viewport={"width": 1280, "height": 800},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = context.new_page()
                page.set_default_timeout(20_000)
                # Monotonic clock: immune to wall-clock adjustments. None
                # means "no request sent yet", so the first one never waits.
                last_request_at: Optional[float] = None
                for day in dates_needing_fetch:
                    if last_request_at is not None:
                        remaining = _REQUEST_INTERVAL_SECONDS - (time.monotonic() - last_request_at)
                        if remaining > 0:
                            time.sleep(remaining)
                    last_request_at = time.monotonic()

                    date_str = day.strftime("%m/%d/%Y")
                    url = _preview_url(date_str)
                    _log(f" Fetching {date_str}...")
                    try:
                        response = page.goto(url, wait_until="networkidle", timeout=20_000)
                    except PlaywrightTimeout:
                        _log(" timeout — skipping")
                        continue
                    except Exception as e:
                        _log(f" error {e} — skipping")
                        continue
                    # goto() may return None; handle that explicitly instead
                    # of relying on an AttributeError being swallowed.
                    if response is None:
                        _log(" no response — skipping")
                        continue
                    if response.status != 200:
                        _log(f" HTTP {response.status} — skipping")
                        continue

                    # Short settle delay after network idle before the DOM
                    # is read.
                    time.sleep(1.5)
                    try:
                        html = page.content()
                    except Exception as e:
                        # Same best-effort policy as navigation errors: lose
                        # this date, keep the pages already collected.
                        _log(f" error {e} — skipping")
                        continue
                    fresh_pages[url] = html
                    if use_cache:
                        save_cache(cache_namespace, url, html,
                                   status_code=200, ttl_seconds=cache_ttl_seconds)
            finally:
                # Always release the browser, even if the loop raised.
                browser.close()

    for day in dates_to_scrape:
        date_str = day.strftime("%m/%d/%Y")
        auction_date_iso = day.isoformat()
        url = _preview_url(date_str)
        html = cached_pages.get(url) or fresh_pages.get(url)
        if not html:
            continue
        cases_on_page = _parse_cases_from_html(html, _log)
        _log(f" {date_str}: parsed {len(cases_on_page)} case(s)")
        for case in cases_on_page:
            deal = _build_deal_record(case, auction_date_iso, config)
            # A record with neither an address nor a price is not actionable.
            if not deal.get("address") and not deal.get("listing_price"):
                continue
            deals.append(deal)

    _log(f"{config['label']} Clerk: scraped {len(deals)} total deals")
    return deals
def run_scraper_to_db(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
) -> dict:
    """Run the whole pipeline for one county: scrape, persist, classify.

    Failures are collected rather than raised: a failed scrape, insert or
    classification adds a message to the error list and the run continues.
    The run is recorded as "success" (no errors), "partial" (errors but some
    deals were scraped) or "failed" (errors and no deals).

    Returns:
        Summary dict with the source id, scraper run id, the found / new /
        updated / classified counts, and the collected error messages.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run

    init_db()
    source_id = get_county_config(county)["source_id"]
    run_id = record_scraper_run(source_id)
    problems: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    # ── 1. Scrape ──────────────────────────────────────────────────────────
    try:
        deals = scrape_realauction_county(
            county=county,
            days_ahead=days_ahead,
            days_back=days_back,
            status_cb=status_cb,
            max_dates=max_dates,
        )
    except Exception as exc:
        problems.append(f"scrape failed: {exc}")
        deals = []

    # ── 2. Persist ─────────────────────────────────────────────────────────
    created_ids: list[int] = []
    updated_total = 0
    for record in deals:
        try:
            deal_id, is_new = insert_deal(record)
        except Exception as exc:
            problems.append(f"insert fail for {record.get('case_number')}: {exc}")
            continue
        if is_new:
            created_ids.append(deal_id)
        else:
            updated_total += 1
    created_total = len(created_ids)

    # ── 3. Classify newly created deals only ──────────────────────────────
    classified_total = 0
    if auto_classify and created_ids:
        _log(f"Auto-classifying {len(created_ids)} new deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification

        for new_id in created_ids:
            try:
                stored = get_deal_by_id(new_id)
                if not stored:
                    continue
                verdict = classify_deal(stored)
                update_classification(
                    deal_id=new_id,
                    status=verdict["classification_status"],
                    score=verdict["score"],
                    reasons=verdict["reasons"],
                    strategy=verdict["strategy"],
                )
                classified_total += 1
            except Exception as exc:
                problems.append(f"classify fail for deal_id={new_id}: {exc}")

    # ── 4. Close out the run record ───────────────────────────────────────
    if not problems:
        outcome = "success"
    elif deals:
        outcome = "partial"
    else:
        outcome = "failed"
    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=created_total,
        deals_updated=updated_total,
        errors_count=len(problems),
        errors_summary=problems if problems else None,
        firecrawl_credits_used=0,
        status=outcome,
    )
    return {
        "source": source_id,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": created_total,
        "deals_updated": updated_total,
        "deals_classified": classified_total,
        "errors_count": len(problems),
        "errors": problems,
    }