Files
AR-House/scripts/run_scraper_miami_dade_full.py
2026-07-03 12:24:58 -04:00

111 lines
3.5 KiB
Python

"""Full pipeline run: Miami-Dade Clerk scraper → deals.db → auto-classify.
Reports:
- Total deals scraped (today + N days ahead)
- Deals new / updated / errors
- Classifications by status (potential_winner / maybe / pass / red_flag)
- Sample of top 5 deals by classification_score
"""
from __future__ import annotations
import io, sys, time
from pathlib import Path
# Force UTF-8 console output, replacing anything unencodable, so non-ASCII
# report text (e.g. the "✅" in the completion banner) presumably cannot fail on
# a console using a legacy encoding.  reconfigure() adjusts the existing stream
# in place and keeps its current line-buffering setting, so progress lines still
# appear as they are printed; a freshly constructed TextIOWrapper would default
# to line_buffering=False and hold them back.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    # Stream without reconfigure(): fall back to re-wrapping its byte buffer,
    # explicitly asking for line buffering so output stays live.
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer,
        encoding="utf-8",
        errors="replace",
        line_buffering=True,
    )

# Directory two levels above this file (the parent of the script's folder),
# placed first on sys.path before main() performs its project-local imports.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
def _format_money(value, missing: str) -> str:
    """Render *value* as a whole-dollar string such as ``$12,345``.

    Falsy values (``None``, ``0``, ``""``) yield *missing*.  Numeric strings
    are converted before formatting, and anything that still cannot be
    formatted as a number is shown verbatim instead of aborting the report.
    """
    if not value:
        return missing
    try:
        amount = float(value) if isinstance(value, str) else value
        return f"${amount:,.0f}"
    except (TypeError, ValueError):
        return str(value)


def _parse_reasons(raw) -> list:
    """Return the classification reasons held in *raw* as a list.

    *raw* is expected to be a JSON-encoded array.  An already-decoded
    list/tuple is accepted as-is; missing, malformed, or non-array input
    yields an empty list.
    """
    import json  # function-scope so this block needs no file-level import

    if not raw:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError subclasses ValueError
        return []
    return decoded if isinstance(decoded, list) else []


def main() -> int:
    """Run the Miami-Dade Clerk pipeline and print a report to stdout.

    Steps, in order: initialise the deals database, run the scraper with
    auto-classification enabled, print the returned summary, print the
    per-status counts from ``count_deals_by_status()``, print details for the
    five highest-scoring ``miami_dade_clerk`` deals, and print a tally of
    ``classification_status`` values for that source.

    Returns:
        ``0`` once the report has been printed.  Exceptions raised by the
        scraper or database helpers are not caught here.
    """
    from collections import Counter

    banner = "=" * 70
    print(banner)
    print("Miami-Dade Clerk FULL PIPELINE (scrape + persist + classify)")
    print(banner)

    # Project-local imports, performed after the banner; expected to resolve
    # through the ROOT entry the module puts on sys.path.
    from scrapers.miami_dade_clerk import run_scraper_to_db
    from deals_db import init_db, list_deals, count_deals_by_status

    init_db()

    def log(m: str) -> None:
        print(f" {m}")

    t0 = time.perf_counter()
    summary = run_scraper_to_db(
        days_ahead=14,
        days_back=0,
        auto_classify=True,
        status_cb=log,
        # max_dates is left unset → full 15 days (today + 14 ahead)
    )
    elapsed = time.perf_counter() - t0

    print()
    print(banner)
    print(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)")
    print(banner)
    for k, v in summary.items():
        if k == "errors":
            # Tolerate a missing/None error collection; show at most five entries.
            errs = list(v or ())
            print(f" {k}: ({len(errs)} items)")
            for e in errs[:5]:
                print(f" - {e}")
        else:
            print(f" {k}: {v}")

    # Count breakdown across the whole database.
    print()
    print("--- deals.db counts by status ---")
    counts = count_deals_by_status()
    # Sort on the string form so a None status key (if the helper ever returns
    # one) cannot make the comparison raise; ordering of string keys is unchanged.
    for s, n in sorted(counts.items(), key=lambda item: str(item[0])):
        print(f" {s}: {n}")

    # Top 5 by classification score.
    print()
    print("--- TOP 5 by classification_score ---")
    # NOTE(review): up to 200 rows are requested and only the first five kept;
    # a smaller limit would presumably do — confirm how list_deals orders/limits.
    top = list_deals(
        classification=None,
        source="miami_dade_clerk",
        limit=200,
        order_by="classification_score DESC NULLS LAST",
    )[:5]
    for i, deal in enumerate(top, 1):
        score = deal.get("classification_score")
        cls = deal.get("classification_status")
        strategy = deal.get("classification_strategy")
        addr = (deal.get("address") or "(no address)")[:60]
        sb_str = _format_money(deal.get("starting_bid"), "Hidden/None")
        # NOTE(review): the value printed under "Assessed" is read from the
        # "estimated_arv" field — confirm the label matches the intended figure.
        av_str = _format_money(deal.get("estimated_arv"), "N/A")
        fj_str = _format_money(deal.get("final_judgment_amount"), "N/A")
        reasons = _parse_reasons(deal.get("classification_reasons"))

        print(f"\n [{i}] score={score} status={cls} strategy={strategy}")
        print(f" Case: {deal.get('case_number')} | Type: {deal.get('deal_type')}")
        print(f" Address: {addr}")
        print(f" Starting bid: {sb_str} | Assessed: {av_str} | Final Judgment: {fj_str}")
        print(" Reasons:")
        for r in reasons[:4]:
            print(f" - {r}")

    # Classification tally for this source; covers at most the 500 rows
    # requested below.
    print()
    print("--- Classifications by status (Miami-Dade only) ---")
    all_md = list_deals(source="miami_dade_clerk", limit=500)
    by_class = Counter(
        d.get("classification_status") or "(unclassified)" for d in all_md
    )
    for cs, n in sorted(by_class.items()):
        print(f" {cs}: {n}")

    print()
    # .get() so a summary lacking "deals_new" cannot crash the final line after
    # the whole pipeline has already run.
    print(f"✅ B1.4 COMPLETE — {summary.get('deals_new', '?')} new deals persisted + classified")
    return 0
# Script entry point: run the pipeline and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())