# AR-House/scripts/run_scraper_miami_dade_full.py

"""Full pipeline run: Miami-Dade Clerk scraper → deals.db → auto-classify.

Reports:
- Total deals scraped (today + N days ahead)
- Deals new / updated / errors
- Classifications by status (potential_winner / maybe / pass / red_flag)
- Sample of top 5 deals by classification_score
"""
from __future__ import annotations
import io, sys, time
from pathlib import Path

# Force UTF-8 console output so non-ASCII characters in the report (e.g. the
# final check mark) never raise UnicodeEncodeError; unencodable characters are
# replaced instead. reconfigure() switches the encoding of the existing stream
# in place, which keeps its buffering mode (progress lines stay visible on a
# terminal) instead of stacking a second wrapper on the same buffer.
# Best-effort: stdout may be None or a stream without reconfigure() (for
# example an in-memory capture), in which case it is left untouched.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
except (AttributeError, ValueError, OSError):
    pass

# Directory two levels above this file; placed first on sys.path so the
# imports performed inside main() resolve against it.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))


def _format_money(amount, missing: str) -> str:
    """Format *amount* as whole dollars with thousands separators.

    Falsy values (``None``, ``0``, ``""``) render as *missing*. Numeric strings
    are converted to float first; a value that still cannot be formatted is
    shown verbatim instead of aborting the report.
    """
    if not amount:
        return missing
    try:
        value = float(amount) if isinstance(amount, str) else amount
        return f"${value:,.0f}"
    except (TypeError, ValueError):
        return str(amount)


def _parse_reasons(raw) -> list:
    """Return the classification reasons of a deal as a list.

    *raw* is expected to be a JSON-encoded list, but an already-decoded
    list/tuple is accepted too. Empty or malformed input yields ``[]``; a JSON
    string becomes a one-item list so it is not iterated character by
    character; any other decoded value yields ``[]``.
    """
    import json

    if not raw:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        return []
    if isinstance(decoded, list):
        return decoded
    if isinstance(decoded, str):
        return [decoded]
    return []


def _print_summary(summary, elapsed: float) -> None:
    """Print the elapsed time and every key/value of the scraper summary."""
    print()
    print("=" * 70)
    print(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)")
    print("=" * 70)
    for key, value in summary.items():
        # Only expand "errors" when it really is a sequence; a plain count
        # falls through to the generic line instead of crashing on len().
        if key == "errors" and isinstance(value, (list, tuple)):
            print(f"  {key}: ({len(value)} items)")
            for err in value[:5]:
                print(f"    - {err}")
        else:
            print(f"  {key}: {value}")


def _print_deal(rank: int, deal) -> None:
    """Print the multi-line report entry for a single deal row."""
    address = (deal.get("address") or "(no address)")[:60]
    bid = _format_money(deal.get("starting_bid"), "Hidden/None")
    arv = _format_money(deal.get("estimated_arv"), "N/A")
    judgment = _format_money(deal.get("final_judgment_amount"), "N/A")
    reasons = _parse_reasons(deal.get("classification_reasons", "[]"))

    print(
        f"\n  [{rank}] score={deal.get('classification_score')} "
        f"status={deal.get('classification_status')} "
        f"strategy={deal.get('classification_strategy')}"
    )
    print(f"      Case: {deal.get('case_number')} | Type: {deal.get('deal_type')}")
    print(f"      Address: {address}")
    print(f"      Starting bid: {bid} | Assessed: {arv} | Final Judgment: {judgment}")
    print("      Reasons:")
    for reason in reasons[:4]:
        print(f"        - {reason}")


def main() -> int:
    """Run the scraper pipeline once and print a plain-text report.

    Steps, in order: initialise the deals database, run the Miami-Dade Clerk
    scraper with auto-classification (``days_ahead=14``, ``days_back=0``),
    then print the returned summary, the database counts by status, the five
    highest-scored Miami-Dade deals, and a tally of Miami-Dade deals by
    classification status.

    Returns:
        Always ``0``. Errors listed in the scraper summary are printed but do
        not change the return value.
    """
    print("=" * 70)
    print("Miami-Dade Clerk FULL PIPELINE (scrape + persist + classify)")
    print("=" * 70)

    # Imported after the banner (as before) so the banner is printed even if
    # one of the project imports fails.
    from collections import Counter

    from scrapers.miami_dade_clerk import run_scraper_to_db
    from deals_db import init_db, list_deals, count_deals_by_status
    init_db()

    def log(m: str) -> None:
        print(f"  {m}")

    source = "miami_dade_clerk"
    tally_limit = 500

    started = time.perf_counter()
    summary = run_scraper_to_db(
        days_ahead=14,
        days_back=0,
        auto_classify=True,
        status_cb=log,
        # max_dates not set → full 15 days
    )
    elapsed = time.perf_counter() - started

    _print_summary(summary, elapsed)

    # Show count breakdown
    print()
    print("--- deals.db counts by status ---")
    counts = count_deals_by_status()
    # Sort on the string form so a None status key cannot break the ordering.
    for status, total in sorted(counts.items(), key=lambda item: str(item[0])):
        print(f"  {status}: {total}")

    # Show top 5 by classification score
    print()
    print("--- TOP 5 by classification_score ---")
    top = list_deals(
        classification=None,
        source=source,
        limit=200,
        order_by="classification_score DESC NULLS LAST",
    )[:5]
    for rank, deal in enumerate(top, 1):
        _print_deal(rank, deal)

    # Show classifications by status
    print()
    print("--- Classifications by status (Miami-Dade only) ---")
    all_md = list_deals(source=source, limit=tally_limit)
    by_class = Counter(
        deal.get("classification_status") or "(unclassified)" for deal in all_md
    )
    for label, total in sorted(by_class.items(), key=lambda item: str(item[0])):
        print(f"  {label}: {total}")
    if len(all_md) >= tally_limit:
        # The query is capped, so make the truncation visible in the report.
        print(f"  (tally capped at {tally_limit} deals; counts may be incomplete)")

    print()
    # .get() keeps the closing line from raising KeyError after a long run
    # when the summary carries no 'deals_new' entry.
    print(f"✅ B1.4 COMPLETE — {summary.get('deals_new', 0)} new deals persisted + classified")
    return 0


if __name__ == "__main__":
    sys.exit(main())