"""Full HUD pipeline: scrape FL → persist → classify.""" from __future__ import annotations import io, sys, time from pathlib import Path sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) def main() -> int: print("=" * 70) print("HUD Homestore FULL PIPELINE (FL only)") print("=" * 70) from scrapers.hud_homestore import run_scraper_to_db from deals_db import init_db, list_deals init_db() def log(m): print(f" {m}") t0 = time.perf_counter() summary = run_scraper_to_db(states=["FL"], auto_classify=True, status_cb=log) elapsed = time.perf_counter() - t0 print() print("=" * 70) print(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)") print("=" * 70) for k, v in summary.items(): if k == "errors": print(f" {k}: ({len(v)} items)") for e in v[:3]: print(f" - {e}") else: print(f" {k}: {v}") # Show classification breakdown for HUD source print() print("--- HUD deals by classification ---") hud = list_deals(source="hud_homestore", limit=200) by_class = {} for d in hud: cs = d.get("classification_status") or "(unclassified)" by_class[cs] = by_class.get(cs, 0) + 1 for cs in sorted(by_class.keys()): print(f" {cs}: {by_class[cs]}") print() print("--- TOP 10 HUD deals by classification_score ---") top = sorted(hud, key=lambda d: (d.get("classification_score") or 0), reverse=True)[:10] print(f"{'#':<3} {'Score':<6} {'Status':<20} {'Strategy':<14} {'Price':<10} {'Beds':<5} Address (county)") print("-" * 130) import json as _json for i, d in enumerate(top, 1): score = d.get("classification_score") or 0 cls = d.get("classification_status") or "?" strat = d.get("classification_strategy") or "?" price = d.get("listing_price") price_str = f"${price:,.0f}" if price else "N/A" beds = d.get("beds") addr = (d.get("address") or "?")[:70] county = d.get("county") or "" print(f"{i:<3} {score:<6} {cls:<20} {strat:<14} {price_str:<10} {beds!s:<5} {addr} ({county})") # Print 3 sample reasons print() print("--- Sample reasons (top 3) ---") for i, d in enumerate(top[:3], 1): print(f"\n [{i}] {d.get('case_number')} — {d.get('classification_status')} score {d.get('classification_score')}") try: reasons = _json.loads(d.get("classification_reasons") or "[]") for r in reasons: print(f" - {r}") except Exception: pass return 0 if __name__ == "__main__": sys.exit(main())