"""Full pipeline run: Miami-Dade Clerk scraper → deals.db → auto-classify. Reports: - Total deals scraped (today + N days ahead) - Deals new / updated / errors - Classifications by status (potential_winner / maybe / pass / red_flag) - Sample of top 5 deals by classification_score """ from __future__ import annotations import io, sys, time from pathlib import Path sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) def main() -> int: print("=" * 70) print("Miami-Dade Clerk FULL PIPELINE (scrape + persist + classify)") print("=" * 70) from scrapers.miami_dade_clerk import run_scraper_to_db from deals_db import init_db, list_deals, count_deals_by_status init_db() def log(m: str) -> None: print(f" {m}") t0 = time.perf_counter() summary = run_scraper_to_db( days_ahead=14, days_back=0, auto_classify=True, status_cb=log, # max_dates not set → full 15 days ) elapsed = time.perf_counter() - t0 print() print("=" * 70) print(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)") print("=" * 70) for k, v in summary.items(): if k == "errors": print(f" {k}: ({len(v)} items)") for e in v[:5]: print(f" - {e}") else: print(f" {k}: {v}") # Show count breakdown print() print("--- deals.db counts by status ---") counts = count_deals_by_status() for s, n in sorted(counts.items()): print(f" {s}: {n}") # Show top 5 by classification score print() print("--- TOP 5 by classification_score ---") top = list_deals( classification=None, source="miami_dade_clerk", limit=200, order_by="classification_score DESC NULLS LAST", )[:5] for i, d in enumerate(top, 1): score = d.get("classification_score") cls = d.get("classification_status") strategy = d.get("classification_strategy") addr = (d.get("address") or "(no address)")[:60] sb = d.get("starting_bid") sb_str = f"${sb:,.0f}" if sb else "Hidden/None" av = d.get("estimated_arv") av_str = f"${av:,.0f}" if av else "N/A" fj = d.get("final_judgment_amount") fj_str = f"${fj:,.0f}" if fj else "N/A" reasons_raw = d.get("classification_reasons", "[]") import json as _json try: reasons = _json.loads(reasons_raw) if reasons_raw else [] except Exception: reasons = [] print(f"\n [{i}] score={score} status={cls} strategy={strategy}") print(f" Case: {d.get('case_number')} | Type: {d.get('deal_type')}") print(f" Address: {addr}") print(f" Starting bid: {sb_str} | Assessed: {av_str} | Final Judgment: {fj_str}") print(f" Reasons:") for r in reasons[:4]: print(f" - {r}") # Show classifications by status print() print("--- Classifications by status (Miami-Dade only) ---") by_class = {} all_md = list_deals(source="miami_dade_clerk", limit=500) for d in all_md: cs = d.get("classification_status") or "(unclassified)" by_class[cs] = by_class.get(cs, 0) + 1 for cs, n in sorted(by_class.items()): print(f" {cs}: {n}") print() print(f"✅ B1.4 COMPLETE — {summary['deals_new']} new deals persisted + classified") return 0 if __name__ == "__main__": sys.exit(main())