"""scrapers/realauction_clerk.py — Generic scraper para 5+ Florida counties. REALAUCTION.COM WHITE-LABEL PLATFORM: Multiples county clerks usan el mismo SaaS de realforeclose.com con subdominios distintos. Comparten ~95% del HTML structure → un solo scraper sirve para todos. Counties soportados (FL): Miami-Dade miamidade.realforeclose.com Duval duval.realforeclose.com Broward broward.realforeclose.com Palm Beach mypalmbeach.realforeclose.com Hillsborough hillsborough.realforeclose.com Orange myorangeclerk.realforeclose.com URL PATTERN: https://www.{subdomain}.realforeclose.com/index.cfm ?zaction=AUCTION &zmethod=PREVIEW &AuctionDate=MM/DD/YYYY ESTRUCTURA del page (descubierta via DOM inspection en Miami-Dade, asumida igual en otros county subdomains porque comparten platform): -
<div class="AUCTION_ITEM"> wrapper per listing
  - <div class="ASTAT_MSGB">{STATUS}</div> holding the auction status
    (preceded by a <div class="ASTAT_MSGA"> label such as "Auction Status")
  - <table> with key/value rows (Case#, Parcel ID, Property Address, etc.)
  - Dead statuses: Redeemed, Canceled per *, Sold, Closed, Title Transferred,
    Withdrawn, Dismissed (the parser filters these out)

ANTI-BOT: a real Chrome UA bypasses the 403 the site returns for non-standard UAs.

RATE LIMIT: 2.5s between requests to the same county.

USAGE:
    from scrapers.realauction_clerk import scrape_realauction_county
    deals = scrape_realauction_county(county="Duval", days_ahead=7)
"""

from __future__ import annotations

import re
import time
from datetime import datetime, timedelta, timezone
from typing import Callable, Optional

from scrapers._cache import get_cached, save_cache, DEFAULT_TTL_SECONDS_DAILY

# ════════════════════════════════════════════════════════════════════════════
# Configuration: county registry
# ════════════════════════════════════════════════════════════════════════════
# Subdomain mapping. If a county is not listed here, scrape_realauction_county() raises.
# IMPORTANT: subdomains were validated via DOM inspection. When adding a new
# county, curl the URL pattern first to confirm it answers HTTP 200.
# Configuration is per (county, platform) tuple. The 'domain' field defaults to
# 'realforeclose.com' if omitted (backward compat). For tax deed sales it is
# 'realtaxdeed.com' (or 'treasurersdeedsale.com' for Colorado) instead. All of
# them share the same Realauction.com platform with identical HTML structure —
# only the URL host differs.
REALAUCTION_DEFAULT_DOMAIN = "realforeclose.com"

REALAUCTION_COUNTIES = {
    # All 41 FL counties below were confirmed working on realforeclose.com (probe 2026-05-14).
    # Detection: title='RealForeclose- {County} County -Splash Page'
    # 26 small FL counties are NOT on realforeclose (they use other platforms).
    # ─── Major urban counties ──────────────────────────────────────────────
    "Miami-Dade": {"subdomain": "miamidade", "source_id": "miami_dade_clerk", "state": "FL", "label": "Miami-Dade"},
    "Duval": {"subdomain": "duval", "source_id": "duval_clerk", "state": "FL", "label": "Duval (Jacksonville)"},
    "Broward": {"subdomain": "broward", "source_id": "broward_clerk", "state": "FL", "label": "Broward (Fort Lauderdale)"},
    "Palm Beach": {"subdomain": "palmbeach", "source_id": "palm_beach_clerk", "state": "FL", "label": "Palm Beach (West Palm)"},
    "Hillsborough": {"subdomain": "hillsborough", "source_id": "hillsborough_clerk", "state": "FL", "label": "Hillsborough (Tampa)"},
    "Orange": {"subdomain": "orange", "source_id": "orange_clerk", "state": "FL", "label": "Orange (Orlando)"},
    "Pinellas": {"subdomain": "pinellas", "source_id": "pinellas_clerk", "state": "FL", "label": "Pinellas (St Petersburg)"},
    "Lee": {"subdomain": "lee", "source_id": "lee_clerk", "state": "FL", "label": "Lee (Fort Myers)"},
    "Polk": {"subdomain": "polk", "source_id": "polk_clerk", "state": "FL", "label": "Polk (Lakeland)"},
    "Brevard": {"subdomain": "brevard", "source_id": "brevard_clerk", "state": "FL", "label": "Brevard (Cocoa, Melbourne)"},
    "Volusia": {"subdomain": "volusia", "source_id": "volusia_clerk", "state": "FL", "label": "Volusia (Daytona Beach)"},
    "Pasco": {"subdomain": "pasco", "source_id": "pasco_clerk", "state": "FL", "label": "Pasco"},
    "Sarasota": {"subdomain": "sarasota", "source_id": "sarasota_clerk", "state": "FL", "label": "Sarasota"},
    "Manatee": {"subdomain": "manatee", "source_id": "manatee_clerk", "state": "FL", "label": "Manatee (Bradenton)"},
    "Seminole": {"subdomain": "seminole", "source_id": "seminole_clerk", "state": "FL", "label": "Seminole (Sanford)"},
    # ─── Mid-size counties ─────────────────────────────────────────────────
    "Marion": {"subdomain": "marion", "source_id": "marion_clerk", "state": "FL", "label": "Marion (Ocala)"},
    "Lake": {"subdomain": "lake", "source_id": "lake_clerk", "state": "FL", "label": "Lake (Mt Dora, Tavares)"},
    "Osceola": {"subdomain": "osceola", "source_id": "osceola_clerk", "state": "FL", "label": "Osceola (Kissimmee)"},
    "Leon": {"subdomain": "leon", "source_id": "leon_clerk", "state": "FL", "label": "Leon (Tallahassee)"},
    "Alachua": {"subdomain": "alachua", "source_id": "alachua_clerk", "state": "FL", "label": "Alachua (Gainesville)"},
    "Escambia": {"subdomain": "escambia", "source_id": "escambia_clerk", "state": "FL", "label": "Escambia (Pensacola)"},
    "Santa Rosa": {"subdomain": "santarosa", "source_id": "santa_rosa_clerk", "state": "FL", "label": "Santa Rosa (Milton)"},
    "Bay": {"subdomain": "bay", "source_id": "bay_clerk", "state": "FL", "label": "Bay (Panama City)"},
    "St Lucie": {"subdomain": "stlucie", "source_id": "st_lucie_clerk", "state": "FL", "label": "St Lucie (Port St Lucie)"},
    "Indian River": {"subdomain": "indianriver", "source_id": "indian_river_clerk", "state": "FL", "label": "Indian River (Vero Beach)"},
    "Martin": {"subdomain": "martin", "source_id": "martin_clerk", "state": "FL", "label": "Martin (Stuart)"},
    "Citrus": {"subdomain": "citrus", "source_id": "citrus_clerk", "state": "FL", "label": "Citrus (Crystal River)"},
    "Charlotte": {"subdomain": "charlotte", "source_id": "charlotte_clerk", "state": "FL", "label": "Charlotte (Port Charlotte)"},
    "Clay": {"subdomain": "clay", "source_id": "clay_clerk", "state": "FL", "label": "Clay (Green Cove Springs)"},
    "Nassau": {"subdomain": "nassau", "source_id": "nassau_clerk", "state": "FL", "label": "Nassau (Fernandina Beach)"},
    "Putnam": {"subdomain": "putnam", "source_id": "putnam_clerk", "state": "FL", "label": "Putnam (Palatka)"},
    "Flagler": {"subdomain": "flagler", "source_id": "flagler_clerk", "state": "FL", "label": "Flagler (Palm Coast)"},
    "Walton": {"subdomain": "walton", "source_id": "walton_clerk", "state": "FL", "label": "Walton (DeFuniak Springs)"},
    "Okeechobee": {"subdomain": "okeechobee", "source_id": "okeechobee_clerk", "state": "FL", "label": "Okeechobee"},
    # ─── Small rural counties (low auction volume but still on the platform) ─
    "Baker": {"subdomain": "baker", "source_id": "baker_clerk", "state": "FL", "label": "Baker (Macclenny)"},
    "Calhoun": {"subdomain": "calhoun", "source_id": "calhoun_clerk", "state": "FL", "label": "Calhoun"},
    "Gilchrist": {"subdomain": "gilchrist", "source_id": "gilchrist_clerk", "state": "FL", "label": "Gilchrist"},
    "Gulf": {"subdomain": "gulf", "source_id": "gulf_clerk", "state": "FL", "label": "Gulf (Port St Joe)"},
    "Jackson": {"subdomain": "jackson", "source_id": "jackson_clerk", "state": "FL", "label": "Jackson (Marianna)"},
    "Suwannee": {"subdomain": "suwannee", "source_id": "suwannee_clerk", "state": "FL", "label": "Suwannee (Live Oak)"},
    "Washington": {"subdomain": "washington", "source_id": "washington_clerk", "state": "FL", "label": "Washington (Chipley)"},
    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO — same platform (realauction.com is multi-state)
    # Discovered via https://www.realauction.com/clients
    # ═══════════════════════════════════════════════════════════════════════
    "Denver": {"subdomain": "denver", "source_id": "denver_clerk_co", "state": "CO", "label": "Denver County, CO"},
    "Eagle": {"subdomain": "eagle", "source_id": "eagle_clerk_co", "state": "CO", "label": "Eagle County, CO"},
    "El Paso": {"subdomain": "elpasoco", "source_id": "el_paso_clerk_co", "state": "CO", "label": "El Paso County, CO (Colorado Springs)"},
    "Larimer": {"subdomain": "larimer", "source_id": "larimer_clerk_co", "state": "CO", "label": "Larimer County, CO (Fort Collins)"},
    "Mesa": {"subdomain": "mesa", "source_id": "mesa_clerk_co", "state": "CO", "label": "Mesa County, CO (Grand Junction)"},
    "Summit": {"subdomain": "summit", "source_id": "summit_clerk_co", "state": "CO", "label": "Summit County, CO (Breckenridge)"},
    "Weld": {"subdomain": "weld", "source_id": "weld_clerk_co", "state": "CO", "label": "Weld County, CO (Greeley)"},
    # ═══════════════════════════════════════════════════════════════════════
    # TAX DEED SALES via .realtaxdeed.com (same engine, different domain)
    # 17 portals: 3 AZ + 14 FL. Same HTML structure as realforeclose.com.
    # ═══════════════════════════════════════════════════════════════════════
    # Arizona
    "Apache TD": {"subdomain": "apache", "domain": "realtaxdeed.com", "source_id": "apache_taxdeed_az", "state": "AZ", "label": "Apache County, AZ — Tax Deed"},
    "Coconino TD": {"subdomain": "coconino", "domain": "realtaxdeed.com", "source_id": "coconino_taxdeed_az", "state": "AZ", "label": "Coconino County, AZ — Tax Deed (Flagstaff)"},
    "Mohave TD": {"subdomain": "mohave", "domain": "realtaxdeed.com", "source_id": "mohave_taxdeed_az", "state": "AZ", "label": "Mohave County, AZ — Tax Deed (Kingman)"},
    # Florida tax deed (distinct from the foreclosure entries above)
    "Alachua TD": {"subdomain": "alachua", "domain": "realtaxdeed.com", "source_id": "alachua_taxdeed", "state": "FL", "label": "Alachua County — Tax Deed"},
    "Baker TD": {"subdomain": "baker", "domain": "realtaxdeed.com", "source_id": "baker_taxdeed", "state": "FL", "label": "Baker County — Tax Deed"},
    "Bay TD": {"subdomain": "bay", "domain": "realtaxdeed.com", "source_id": "bay_taxdeed", "state": "FL", "label": "Bay County — Tax Deed"},
    "Citrus TD": {"subdomain": "citrus", "domain": "realtaxdeed.com", "source_id": "citrus_taxdeed", "state": "FL", "label": "Citrus County — Tax Deed"},
    "Clay TD": {"subdomain": "clay", "domain": "realtaxdeed.com", "source_id": "clay_taxdeed", "state": "FL", "label": "Clay County — Tax Deed"},
    "Duval TD": {"subdomain": "duval", "domain": "realtaxdeed.com", "source_id": "duval_taxdeed", "state": "FL", "label": "Duval County — Tax Deed (Jacksonville)"},
    "Escambia TD": {"subdomain": "escambia", "domain": "realtaxdeed.com", "source_id": "escambia_taxdeed", "state": "FL", "label": "Escambia County — Tax Deed (Pensacola)"},
    "Flagler TD": {"subdomain": "flagler", "domain": "realtaxdeed.com", "source_id": "flagler_taxdeed", "state": "FL", "label": "Flagler County — Tax Deed"},
    "Gilchrist TD": {"subdomain": "gilchrist", "domain": "realtaxdeed.com", "source_id": "gilchrist_taxdeed", "state": "FL", "label": "Gilchrist County — Tax Deed"},
    "Gulf TD": {"subdomain": "gulf", "domain": "realtaxdeed.com", "source_id": "gulf_taxdeed", "state": "FL", "label": "Gulf County — Tax Deed"},
    "Hendry TD": {"subdomain": "hendry", "domain": "realtaxdeed.com", "source_id": "hendry_taxdeed", "state": "FL", "label": "Hendry County — Tax Deed"},
    "Hernando TD": {"subdomain": "hernando", "domain": "realtaxdeed.com", "source_id": "hernando_taxdeed", "state": "FL", "label": "Hernando County — Tax Deed"},
    "Highlands TD": {"subdomain": "highlands", "domain": "realtaxdeed.com", "source_id": "highlands_taxdeed", "state": "FL", "label": "Highlands County — Tax Deed (Sebring)"},
    "Hillsborough TD": {"subdomain": "hillsborough", "domain": "realtaxdeed.com", "source_id": "hillsborough_taxdeed", "state": "FL", "label": "Hillsborough County — Tax Deed (Tampa)"},
    # ═══════════════════════════════════════════════════════════════════════
    # COLORADO TAX DEEDS via .treasurersdeedsale.com (same engine again)
    # ═══════════════════════════════════════════════════════════════════════
    "Adams TD": {"subdomain": "adams", "domain": "treasurersdeedsale.com", "source_id": "adams_taxdeed_co", "state": "CO", "label": "Adams County, CO — Treasurer's Deed"},
    "Denver TD": {"subdomain": "denver", "domain": "treasurersdeedsale.com", "source_id": "denver_taxdeed_co", "state": "CO", "label": "Denver County, CO — Treasurer's Deed"},
    "Eagle TD": {"subdomain": "eagle", "domain": "treasurersdeedsale.com", "source_id": "eagle_taxdeed_co", "state": "CO", "label": "Eagle County, CO — Treasurer's Deed"},
    "El Paso TD": {"subdomain": "elpasoco", "domain": "treasurersdeedsale.com", "source_id": "el_paso_taxdeed_co", "state": "CO", "label": "El Paso County, CO — Treasurer's Deed"},
    "Larimer TD": {"subdomain": "larimer", "domain": "treasurersdeedsale.com", "source_id": "larimer_taxdeed_co", "state": "CO", "label": "Larimer County, CO — Treasurer's Deed"},
    "Mesa TD": {"subdomain": "mesa", "domain": "treasurersdeedsale.com", "source_id": "mesa_taxdeed_co", "state": "CO", "label": "Mesa County, CO — Treasurer's Deed"},
    "Pitkin TD": {"subdomain": "pitkin", "domain": "treasurersdeedsale.com", "source_id": "pitkin_taxdeed_co", "state": "CO", "label": "Pitkin County, CO — Treasurer's Deed (Aspen)"},
    "Weld TD": {"subdomain": "weld", "domain": "treasurersdeedsale.com", "source_id": "weld_taxdeed_co", "state": "CO", "label": "Weld County, CO — Treasurer's Deed"},
    # ─── Counties NOT on realforeclose.com (parked — they use other platforms):
    #     Bradford, Collier, Columbia, DeSoto, Dixie, Franklin,
    #     Gadsden, Glades, Hamilton, Hardee, Hendry, Hernando, Highlands,
    #     Holmes, Jefferson, Lafayette, Levy, Liberty, Madison, Monroe,
    #     Okaloosa, St. Johns, Sumter, Taylor, Union, Wakulla.
    # ─── Other Realauction platforms observed:
    #     - .realtaxdeed.com / .treasurersdeedsale.com — supported through the
    #       "domain" field of the entries above
    #     - california.taxdefaultsale.com (CA Fresno) — TODO add support
}


def get_county_config(county: str) -> dict:
    """Return the registry entry for ``county``.

    Args:
        county: a key of ``REALAUCTION_COUNTIES`` (e.g. ``"Duval"``, ``"Denver TD"``).

    Returns:
        The config dict (``subdomain``, ``source_id``, ``state``, ``label`` and
        optionally ``domain``).

    Raises:
        ValueError: if ``county`` is not registered; the message lists every
            valid key.
    """
    config = REALAUCTION_COUNTIES.get(county)
    if not config:
        valid = ", ".join(REALAUCTION_COUNTIES)
        raise ValueError(
            f"County '{county}' no soportado por realauction_clerk. "
            f"Validos: {valid}"
        )
    return config


# Real Chrome UA — gets past the site's anti-bot check
_CHROME_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Rate limit: at most one request every 2.5s so we do not hammer the site
_REQUEST_INTERVAL_SECONDS = 2.5

# Extra pause after "networkidle" before the DOM is snapshotted
_POST_LOAD_SETTLE_SECONDS = 1.5

# Mapping from the clerk's "Auction Type" to our canonical deal_type
_AUCTION_TYPE_MAP = {
    "FORECLOSURE": "foreclosure",
    "TAXDEED": "tax_deed",
    "TAX DEED": "tax_deed",
}

# A cell containing any of these is a field label, never the second address line
_ADDR_LINE2_DISALLOWED_KEYWORDS = (
    "Assessed Value",
    "Plaintiff Max Bid",
    "Auction Type",
    "Case #",
    "Certificate #",
    "Final Judgment",
    "Opening Bid",
    "Parcel ID",
)


def _auction_url(subdomain: str, domain: str, date_str: str) -> str:
    """Build the auction-preview URL for one day (``date_str`` is MM/DD/YYYY)."""
    return (
        f"https://{subdomain}.{domain}/index.cfm"
        f"?zaction=AUCTION&zmethod=PREVIEW&AuctionDate={date_str}"
    )


# ════════════════════════════════════════════════════════════════════════════
# Parsing helpers (county-agnostic)
# ════════════════════════════════════════════════════════════════════════════


def _parse_money(s: str) -> Optional[float]:
    """Parse ``'$353,041.78'`` into ``353041.78``; return None when not a number."""
    if not s:
        return None
    # Keep digits and dots only ("$", "," and any text are dropped).
    cleaned = re.sub(r"[^\d.]", "", s)
    if not cleaned or cleaned == ".":
        return None
    try:
        return float(cleaned)
    except ValueError:
        # e.g. "1.2.3" — more than one dot survives the cleanup
        return None


def _parse_address(line1: str, line2: str) -> dict:
    """Split a two-line property address into address/city/state/zip.

    Args:
        line1: street line, e.g. ``"7355 POINCIANA CT"``.
        line2: city/state/zip line, e.g. ``"MIAMI LAKES, FL- 33014"``.

    Returns:
        Dict with keys ``address``, ``city``, ``state``, ``zip``; any part that
        cannot be determined is None.
    """
    out = {"address": None, "city": None, "state": None, "zip": None}
    line1 = (line1 or "").strip()
    line2 = (line2 or "").strip()

    # The platform prints "ST- 12345"; normalise it to "ST 12345" for display.
    # Any two-letter state code is accepted (the registry covers FL, CO and AZ),
    # but only right before a ZIP or at the end, so hyphenated city names are
    # left untouched.
    line2_clean = re.sub(r"\b([A-Z]{2})-\s*(?=\d{5}|$)", r"\1 ", line2).strip()

    if line1 and line2_clean:
        out["address"] = f"{line1}, {line2_clean}"
    elif line1:
        out["address"] = line1
    elif line2_clean:
        out["address"] = line2_clean

    if not line2:
        return out

    full = re.match(r"^(.+?),\s*([A-Z]{2})[-\s]\s*(\d{5})(?:-\d{4})?", line2)
    if full:
        out["city"] = full.group(1).title()
        out["state"] = full.group(2)
        out["zip"] = full.group(3)
        return out

    # Fallback: state + ZIP found somewhere in the line, city only if a comma
    # separates it.
    loose = re.search(r"\b([A-Z]{2})[-\s]\s*(\d{5})", line2)
    if loose:
        out["state"] = loose.group(1)
        out["zip"] = loose.group(2)
        out["city"] = line2.split(",")[0].strip().title() if "," in line2 else None
    return out


def _extract_case_from_table_rows(rows: list[list[str]]) -> Optional[dict]:
    """Turn the key/value rows of one case table into a case dict.

    Each row is a list of cell texts; the first non-empty cell is the label
    (trailing ``:`` stripped) and the second the value. The row that follows
    "Property Address" is taken as the second address line (city/state/zip)
    unless its first cell looks like a field label.

    Returns:
        The case dict, or None when the rows have no "Case #" or carry an
        explicit "Auction Type" that is not in ``_AUCTION_TYPE_MAP``.
    """
    fields: dict[str, str] = {}
    addr_line2: Optional[str] = None
    next_row_is_addr_line2 = False

    for row in rows:
        non_empty = [c for c in row if c]
        if not non_empty:
            continue

        if next_row_is_addr_line2:
            next_row_is_addr_line2 = False
            candidate = non_empty[0].strip()
            is_label = candidate.endswith(":") or any(
                kw in candidate for kw in _ADDR_LINE2_DISALLOWED_KEYWORDS
            )
            if not is_label:
                addr_line2 = candidate
                continue
            # Otherwise fall through: this row is a regular labelled field.

        if len(non_empty) >= 2:
            key = non_empty[0].rstrip(":").strip()
            fields[key] = non_empty[1].strip()
            if key == "Property Address":
                # The continuation row (empty label cell) carries city/state/zip.
                # The previous code cleared this flag whenever line 1 had a
                # value, so line 2 was never captured.
                next_row_is_addr_line2 = True

    if not fields.get("Case #"):
        return None

    # An explicit Auction Type (Miami-Dade/Duval/Broward style) is the source of
    # truth. When absent (Orange style — only AD_LBL/AD_DTA pairs, no Auction
    # Type field) infer it from the case number, defaulting to foreclosure:
    #   - "TAXDEED" or "-TD-" in the case number → tax_deed
    #   - anything else → foreclosure
    auction_type_raw = (fields.get("Auction Type") or "").upper().strip()
    if auction_type_raw:
        deal_type = _AUCTION_TYPE_MAP.get(auction_type_raw)
        if not deal_type:
            return None  # explicit but unknown type — skip
    else:
        case_num_upper = (fields.get("Case #") or "").upper()
        if "TAXDEED" in case_num_upper or "-TD-" in case_num_upper:
            deal_type = "tax_deed"
            auction_type_raw = "TAXDEED (inferred)"
        else:
            deal_type = "foreclosure"
            auction_type_raw = "FORECLOSURE (inferred)"

    case = {
        "deal_type": deal_type,
        "case_number": fields.get("Case #"),
        "auction_type_raw": auction_type_raw,
    }
    if fields.get("Certificate #"):
        case["certificate_number"] = fields["Certificate #"]
    if fields.get("Final Judgment Amount"):
        case["final_judgment_amount"] = _parse_money(fields["Final Judgment Amount"])
    if fields.get("Opening Bid"):
        case["starting_bid"] = _parse_money(fields["Opening Bid"])
    if fields.get("Parcel ID"):
        case["parcel_id"] = fields["Parcel ID"]
    if fields.get("Assessed Value"):
        case["assessed_value"] = _parse_money(fields["Assessed Value"])
    if fields.get("Plaintiff Max Bid"):
        case["plaintiff_max_bid_raw"] = fields["Plaintiff Max Bid"]

    case.update(_parse_address(fields.get("Property Address", ""), addr_line2 or ""))
    return case


def _build_description(case: dict) -> str:
    """Return a compact ``" | "``-joined summary of the case facts.

    Only facts that are present (truthy) are included; the text is meant as
    context for the deal classifier.
    """
    # (case key, label, is_money) in output order
    layout = (
        ("auction_status", "Status", False),
        ("auction_type_raw", "Auction Type", False),
        ("certificate_number", "Tax Cert #", False),
        ("final_judgment_amount", "Final Judgment Amount", True),
        ("starting_bid", "Opening Bid", True),
        ("assessed_value", "Assessed Value (PA)", True),
        ("parcel_id", "Parcel ID", False),
        ("plaintiff_max_bid_raw", "Plaintiff Max Bid", False),
    )
    bits = []
    for key, label, is_money in layout:
        value = case.get(key)
        if not value:
            continue
        bits.append(f"{label}: ${value:,.2f}" if is_money else f"{label}: {value}")
    return " | ".join(bits)


def _build_deal_record(case: dict, auction_date_iso: str, county_config: dict) -> dict:
    """Convert a clerk case dict into a deal record for ``deals_db.insert_deal``.

    Pricing rules (inherited from the Miami-Dade v1.1 fix):
      - tax_deed: ``listing_price`` is the starting bid
      - foreclosure: ``listing_price`` is None (bid hidden pre-auction)
      - ``final_judgment_amount`` is stored separately, never as listing price

    Args:
        case: output of ``_extract_case_from_table_rows`` (plus ``auction_status``).
        auction_date_iso: auction date as ``YYYY-MM-DD``.
        county_config: registry entry from ``get_county_config``.
    """
    deal_type = case.get("deal_type")
    starting_bid = case.get("starting_bid")
    assessed_value = case.get("assessed_value")

    if deal_type == "tax_deed":
        listing_price = starting_bid
    elif deal_type == "foreclosure":
        listing_price = None
    else:
        listing_price = starting_bid or assessed_value

    domain = county_config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    # The site expects MM/DD/YYYY; slice it out of the ISO date.
    date_mdy = f"{auction_date_iso[5:7]}/{auction_date_iso[8:10]}/{auction_date_iso[0:4]}"
    county_label = county_config["label"].split(" (")[0]  # "Duval (Jacksonville)" → "Duval"

    return {
        "source": county_config["source_id"],
        "source_url": _auction_url(county_config["subdomain"], domain, date_mdy),
        "address": case.get("address"),
        "city": case.get("city"),
        "state": case.get("state") or county_config["state"],
        "zip": case.get("zip"),
        "county": county_label,
        "parcel_id": case.get("parcel_id"),
        "listing_price": listing_price,
        "deal_type": deal_type,
        "starting_bid": starting_bid,
        "estimated_arv": assessed_value,
        "final_judgment_amount": case.get("final_judgment_amount"),
        "auction_status": case.get("auction_status") or "scheduled",
        "case_number": case.get("case_number"),
        "auction_date": auction_date_iso,
        "listing_description": _build_description(case),
    }


# ════════════════════════════════════════════════════════════════════════════
# Status filtering (REDEEMED/CANCELED bug fix)
# ════════════════════════════════════════════════════════════════════════════
# Cases with these statuses do NOT go to auction, so they are left out of the
# results. Matching is by substring: "Canceled per Bankruptcy" is dead because
# it contains "canceled".
_DEAD_STATUS_SUBSTRINGS = (
    "redeemed",
    "canceled",
    "cancelled",
    "sold",
    "closed",  # case closed/disposed
    "title transferred",
    "withdrawn",
    "dismissed",
)


def _is_status_dead(status: Optional[str]) -> bool:
    """Return True when ``status`` marks the case as inactive (off-market)."""
    normalized = (status or "").strip().lower()
    if not normalized:
        return False
    return any(dead in normalized for dead in _DEAD_STATUS_SUBSTRINGS)


def _parse_cases_from_html(html: str, log_fn: Optional[Callable[[str], None]] = None) -> list[dict]:
    """Parse every live auction case out of one preview page (stdlib only).

    Args:
        html: page source of an auction-preview page.
        log_fn: optional logger for parse problems and the dead-case count.

    Returns:
        One case dict per auction item, de-duplicated by case number, with
        dead-status items dropped. An empty list if the HTML parser fails.
    """
    from html.parser import HTMLParser

    class AuctionItemParser(HTMLParser):
        """Collects status text and table rows for each AUCTION_ITEM div."""

        def __init__(self):
            super().__init__()
            self.items: list[dict] = []
            # Used as an in-item flag (0 or 1); the item ends when div_depth
            # returns to current_item_depth_start.
            self.auction_item_depth = 0
            self.current_item_depth_start = None
            self.div_depth = 0
            self.status_label_active = False
            self.expecting_status_value = False
            self.in_astat_msga = False
            self.in_astat_msgb = False
            self.astat_msga_text = ""
            self.astat_msgb_text = ""
            self.in_table = 0
            self.current_table: list[list[str]] = []
            self.current_row: list[str] = []
            self.in_cell = 0
            self.cell_text_parts: list[str] = []
            self.current_status = ""
            self.current_tables: list[list[list[str]]] = []
            # Orange-style div-based fields: an AD_LBL div (label) followed by
            # an AD_DTA div (value). They are collected as synthetic
            # [label, value] rows so the downstream extractor works unchanged.
            self.in_ad_lbl = False
            self.in_ad_dta = False
            self.ad_lbl_text = ""
            self.ad_dta_text = ""
            self.last_ad_lbl: Optional[str] = None
            self.current_ad_rows: list[list[str]] = []

        def _start_div(self, classes):
            self.div_depth += 1
            if "AUCTION_ITEM" in classes:
                if self.auction_item_depth == 0:
                    self.current_item_depth_start = self.div_depth
                    self.current_status = ""
                    self.current_tables = []
                    self.current_ad_rows = []
                    self.last_ad_lbl = None
                    # Do not let a dangling "Auction Status" label from the
                    # previous item leak into this one.
                    self.expecting_status_value = False
                    self.auction_item_depth = 1
                return
            if not self.auction_item_depth:
                return
            if "ASTAT_MSGA" in classes:
                self.in_astat_msga = True
                self.astat_msga_text = ""
            elif "ASTAT_MSGB" in classes:
                self.in_astat_msgb = True
                self.astat_msgb_text = ""
            elif "AD_LBL" in classes:
                self.in_ad_lbl = True
                self.ad_lbl_text = ""
            elif "AD_DTA" in classes:
                self.in_ad_dta = True
                self.ad_dta_text = ""

        def handle_starttag(self, tag, attrs):
            classes = (dict(attrs).get("class") or "").split()
            if tag == "div":
                self._start_div(classes)
            elif tag == "table" and self.auction_item_depth:
                self.in_table += 1
                self.current_table = []
            elif tag == "tr" and self.in_table:
                self.current_row = []
            elif tag in ("td", "th") and self.in_table:
                self.in_cell += 1
                self.cell_text_parts = []

        def _end_div(self):
            if self.in_astat_msga:
                self.in_astat_msga = False
                # Only an "Auction Status" label announces a status value; other
                # labels are followed by something else in ASTAT_MSGB.
                if "auction status" in self.astat_msga_text.strip().lower():
                    self.expecting_status_value = True
            elif self.in_astat_msgb:
                self.in_astat_msgb = False
                if self.expecting_status_value and self.auction_item_depth:
                    self.current_status = self.astat_msgb_text.strip()
                    self.expecting_status_value = False
            elif self.in_ad_lbl:
                self.in_ad_lbl = False
                self.last_ad_lbl = self.ad_lbl_text.strip()
            elif self.in_ad_dta:
                self.in_ad_dta = False
                # Pair the value with the most recent label, if there is one.
                if self.last_ad_lbl and self.auction_item_depth:
                    value = " ".join(self.ad_dta_text.split()).strip()
                    self.current_ad_rows.append([self.last_ad_lbl, value])
                self.last_ad_lbl = None

            if self.auction_item_depth and self.div_depth == self.current_item_depth_start:
                # Closing the AUCTION_ITEM wrapper itself: emit the item.
                self.auction_item_depth = 0
                tables = list(self.current_tables)
                if self.current_ad_rows:
                    tables.append(self.current_ad_rows)
                self.items.append({"status": self.current_status, "tables": tables})
                self.current_status = ""
                self.current_tables = []
                self.current_ad_rows = []
                self.current_item_depth_start = None
            self.div_depth -= 1

        def handle_endtag(self, tag):
            if tag == "div":
                self._end_div()
            elif tag in ("td", "th") and self.in_cell:
                self.in_cell -= 1
                # Collapse internal whitespace so cells compare cleanly.
                text = " ".join("".join(self.cell_text_parts).split()).strip()
                self.current_row.append(text)
                self.cell_text_parts = []
            elif tag == "tr" and self.in_table:
                if self.current_row:
                    self.current_table.append(self.current_row)
                self.current_row = []
            elif tag == "table":
                if self.in_table:
                    self.in_table -= 1
                    if self.current_table and self.auction_item_depth:
                        self.current_tables.append(self.current_table)
                    self.current_table = []

        def handle_data(self, data):
            if self.in_cell:
                self.cell_text_parts.append(data)
            elif self.in_astat_msga:
                self.astat_msga_text += data
            elif self.in_astat_msgb:
                self.astat_msgb_text += data
            elif self.in_ad_lbl:
                self.ad_lbl_text += data
            elif self.in_ad_dta:
                self.ad_dta_text += data

    parser = AuctionItemParser()
    try:
        parser.feed(html)
        parser.close()  # flush any text still buffered by the parser
    except Exception as e:
        if log_fn:
            log_fn(f" HTML parser error: {e}")
        return []

    cases_on_page: list[dict] = []
    seen_case_numbers: set = set()
    skipped_dead = 0
    for item in parser.items:
        status = item["status"] or ""
        if _is_status_dead(status):
            skipped_dead += 1
            continue
        for rows in item["tables"]:
            try:
                case = _extract_case_from_table_rows(rows)
            except Exception as e:
                if log_fn:
                    log_fn(f" table parse error: {e}")
                continue
            if not (case and case.get("case_number")):
                continue
            case["auction_status"] = status or "scheduled"
            if case["case_number"] not in seen_case_numbers:
                seen_case_numbers.add(case["case_number"])
                cases_on_page.append(case)
            break  # one case per auction item

    if skipped_dead and log_fn:
        log_fn(f" filtered {skipped_dead} dead case(s) (Redeemed/Canceled/Sold/etc)")
    return cases_on_page


# ════════════════════════════════════════════════════════════════════════════
# Main scraper entry point
# ════════════════════════════════════════════════════════════════════════════


def _fetch_fresh_pages(
    dated_urls: list,
    *,
    log: Callable[[str], None],
    cache_namespace: str,
    use_cache: bool,
    cache_ttl_seconds: int,
) -> dict[str, str]:
    """Fetch each ``(date_str, url)`` pair with headless Chromium.

    Requests are spaced ``_REQUEST_INTERVAL_SECONDS`` apart. Pages that time
    out, fail to navigate or do not answer HTTP 200 are logged and skipped.
    Successful pages are written to the cache when ``use_cache`` is set.

    Returns:
        ``{url: html}`` for every page fetched successfully.
    """
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

    pages: dict[str, str] = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=_CHROME_UA,
                viewport={"width": 1280, "height": 800},
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = context.new_page()
            page.set_default_timeout(20_000)

            last_request_at: Optional[float] = None
            for date_str, url in dated_urls:
                if last_request_at is not None:
                    # The monotonic clock is immune to wall-clock adjustments.
                    wait = _REQUEST_INTERVAL_SECONDS - (time.monotonic() - last_request_at)
                    if wait > 0:
                        time.sleep(wait)
                last_request_at = time.monotonic()

                log(f" Fetching {date_str}...")
                try:
                    response = page.goto(url, wait_until="networkidle", timeout=20_000)
                except PlaywrightTimeout:
                    log(" timeout — skipping")
                    continue
                except Exception as e:
                    log(f" error {e} — skipping")
                    continue
                if response is None:
                    # goto() may return None when no main-resource response
                    # is produced for the navigation.
                    log(" no response — skipping")
                    continue
                if response.status != 200:
                    log(f" HTTP {response.status} — skipping")
                    continue

                time.sleep(_POST_LOAD_SETTLE_SECONDS)
                html = page.content()
                pages[url] = html
                if use_cache:
                    save_cache(
                        cache_namespace,
                        url,
                        html,
                        status_code=200,
                        ttl_seconds=cache_ttl_seconds,
                    )
        finally:
            # Always release the browser, even if a page snapshot or cache
            # write raises.
            browser.close()
    return pages


def scrape_realauction_county(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
    use_cache: bool = True,
    cache_ttl_seconds: Optional[int] = None,
) -> list[dict]:
    """Scrape the realauction.com auction calendar of one county.

    Args:
        county: registry key (e.g. "Miami-Dade", "Duval", "Broward"); must be
            in ``REALAUCTION_COUNTIES``.
        days_ahead: number of days after today (UTC) to scrape (default 14).
        days_back: number of days before today to scrape (default 0).
        status_cb: optional logging callback.
        max_dates: cap on the number of days scraped (testing aid); a falsy
            value means no cap.
        use_cache: read and write the page cache (default True).
        cache_ttl_seconds: cache TTL; None means ``DEFAULT_TTL_SECONDS_DAILY``.

    Returns:
        Deal records ready for ``deals_db.insert_deal``. Cases with neither an
        address nor a listing price are dropped.

    Raises:
        ValueError: if ``county`` is not registered.
    """
    config = get_county_config(county)
    subdomain = config["subdomain"]
    domain = config.get("domain", REALAUCTION_DEFAULT_DOMAIN)
    cache_namespace = config["source_id"]  # e.g. "duval_clerk"
    if cache_ttl_seconds is None:
        cache_ttl_seconds = DEFAULT_TTL_SECONDS_DAILY

    def _log(msg: str) -> None:
        if status_cb:
            status_cb(msg)

    today = datetime.now(timezone.utc).date()
    dates_to_scrape = [
        today + timedelta(days=offset) for offset in range(-days_back, days_ahead + 1)
    ]
    if max_dates:
        dates_to_scrape = dates_to_scrape[:max_dates]

    _log(f"{config['label']} Clerk: scraping {len(dates_to_scrape)} dates (cache={'ON' if use_cache else 'OFF'})")

    # (date, "MM/DD/YYYY", url) for every day in the window
    schedule = []
    for day in dates_to_scrape:
        date_str = day.strftime("%m/%d/%Y")
        schedule.append((day, date_str, _auction_url(subdomain, domain, date_str)))

    cached_pages: dict[str, str] = {}
    to_fetch = []
    for _day, date_str, url in schedule:
        cached = (
            get_cached(cache_namespace, url, ttl_seconds=cache_ttl_seconds)
            if use_cache
            else None
        )
        if cached:
            cached_pages[url] = cached
        else:
            to_fetch.append((date_str, url))

    _log(f" cache hits: {len(cached_pages)}/{len(dates_to_scrape)}; need to fetch {len(to_fetch)}")

    fresh_pages: dict[str, str] = {}
    if to_fetch:
        fresh_pages = _fetch_fresh_pages(
            to_fetch,
            log=_log,
            cache_namespace=cache_namespace,
            use_cache=use_cache,
            cache_ttl_seconds=cache_ttl_seconds,
        )

    deals: list[dict] = []
    for day, date_str, url in schedule:
        html = cached_pages.get(url) or fresh_pages.get(url)
        if not html:
            continue
        cases_on_page = _parse_cases_from_html(html, _log)
        _log(f" {date_str}: parsed {len(cases_on_page)} case(s)")
        for case in cases_on_page:
            deal = _build_deal_record(case, day.isoformat(), config)
            if not deal.get("address") and not deal.get("listing_price"):
                continue
            deals.append(deal)

    _log(f"{config['label']} Clerk: scraped {len(deals)} total deals")
    return deals


def run_scraper_to_db(
    *,
    county: str,
    days_ahead: int = 14,
    days_back: int = 0,
    auto_classify: bool = True,
    status_cb: Optional[Callable[[str], None]] = None,
    max_dates: Optional[int] = None,
) -> dict:
    """Full pipeline: scrape a county, persist the deals, optionally classify new ones.

    Scrape, insert and classify failures are collected into the returned
    ``errors`` list instead of being raised. The scraper run is recorded as
    "success" (no errors), "partial" (errors but some deals) or "failed".

    Returns:
        Summary dict with ``source``, ``scraper_run_id``, ``deals_found``,
        ``deals_new``, ``deals_updated``, ``deals_classified``,
        ``errors_count`` and ``errors``.

    Raises:
        ValueError: if ``county`` is not registered.
    """
    from deals_db import init_db, insert_deal, record_scraper_run, finish_scraper_run

    init_db()
    config = get_county_config(county)
    source_id = config["source_id"]
    run_id = record_scraper_run(source_id)
    errors: list[str] = []

    def _log(m: str) -> None:
        if status_cb:
            status_cb(m)

    try:
        deals = scrape_realauction_county(
            county=county,
            days_ahead=days_ahead,
            days_back=days_back,
            status_cb=status_cb,
            max_dates=max_dates,
        )
    except Exception as e:
        # Best effort: a failed scrape is reported through the run record.
        errors.append(f"scrape failed: {e}")
        deals = []

    deals_new = 0
    deals_updated = 0
    new_deal_ids: list[int] = []
    for deal in deals:
        try:
            deal_id, is_new = insert_deal(deal)
        except Exception as e:
            errors.append(f"insert fail for {deal.get('case_number')}: {e}")
            continue
        if is_new:
            deals_new += 1
            new_deal_ids.append(deal_id)
        else:
            deals_updated += 1

    classified_count = 0
    if auto_classify and new_deal_ids:
        _log(f"Auto-classifying {len(new_deal_ids)} new deals...")
        from deal_classifier import classify_deal
        from deals_db import get_deal_by_id, update_classification

        for did in new_deal_ids:
            try:
                d = get_deal_by_id(did)
                if not d:
                    continue
                result = classify_deal(d)
                update_classification(
                    deal_id=did,
                    status=result["classification_status"],
                    score=result["score"],
                    reasons=result["reasons"],
                    strategy=result["strategy"],
                )
                classified_count += 1
            except Exception as e:
                errors.append(f"classify fail for deal_id={did}: {e}")

    if not errors:
        run_status = "success"
    elif deals:
        run_status = "partial"
    else:
        run_status = "failed"

    finish_scraper_run(
        run_id,
        deals_found=len(deals),
        deals_new=deals_new,
        deals_updated=deals_updated,
        errors_count=len(errors),
        errors_summary=errors if errors else None,
        firecrawl_credits_used=0,
        status=run_status,
    )

    return {
        "source": source_id,
        "scraper_run_id": run_id,
        "deals_found": len(deals),
        "deals_new": deals_new,
        "deals_updated": deals_updated,
        "deals_classified": classified_count,
        "errors_count": len(errors),
        "errors": errors,
    }