111 lines
3.5 KiB
Python
111 lines
3.5 KiB
Python
"""Full pipeline run: Miami-Dade Clerk scraper → deals.db → auto-classify.

Reports:
- Total deals scraped (today + N days ahead)
- Deals new / updated / errors
- Classifications by status (potential_winner / maybe / pass / red_flag)
- Sample of top 5 deals by classification_score
"""
|
|
from __future__ import annotations

import io
import json
import sys
import time
from collections import Counter
from pathlib import Path
|
|
|
|
# Emit UTF-8 on stdout and replace anything unencodable, so the non-ASCII
# characters this script prints cannot raise UnicodeEncodeError.
# reconfigure() changes the existing stream in place, which keeps its
# line-buffering behaviour; the guard skips stdout replacements that do not
# offer reconfigure() instead of crashing on them.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# Directory two levels above this file; put first on sys.path so imports
# resolve against it before anything else.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
|
|
|
|
|
|
def _format_money(value: float | None, missing: str) -> str:
    """Render *value* as whole dollars with thousands separators.

    Returns *missing* when *value* is falsy, so a zero amount is shown as
    *missing* just like ``None``.
    """
    return f"${value:,.0f}" if value else missing


def _parse_reasons(raw: object) -> list:
    """Return the classification reasons held in *raw* as a list.

    *raw* may be a JSON-encoded string, an already-decoded list, or empty.
    A decoded bare string becomes a one-item list; anything else that cannot
    be turned into a list yields ``[]``.
    """
    if not raw:
        return []
    if isinstance(raw, list):
        return raw
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return []
    if isinstance(decoded, list):
        return decoded
    if isinstance(decoded, str):
        return [decoded]
    return []


def main() -> int:
    """Run the Miami-Dade Clerk pipeline end to end and print a report.

    Calls the scraper for today plus 14 days ahead with auto-classification
    enabled, then prints the scraper's summary, deal counts by status, the
    five highest-scoring Miami-Dade deals, and a per-status classification
    tally.

    Returns:
        Always ``0``. Scraper errors are listed in the output but do not
        change the return value.
    """
    banner = "=" * 70
    print(banner)
    print("Miami-Dade Clerk FULL PIPELINE (scrape + persist + classify)")
    print(banner)

    # Project-local modules, imported at call time rather than module import
    # time (presumably so they resolve through the sys.path entry added at
    # module level).
    from scrapers.miami_dade_clerk import run_scraper_to_db
    from deals_db import init_db, list_deals, count_deals_by_status

    init_db()

    def log(m: str) -> None:
        print(f" {m}")

    t0 = time.perf_counter()
    summary = run_scraper_to_db(
        days_ahead=14,
        days_back=0,
        auto_classify=True,
        status_cb=log,
        # max_dates not set → full 15 days
    )
    elapsed = time.perf_counter() - t0

    print()
    print(banner)
    print(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)")
    print(banner)
    for k, v in summary.items():
        if k == "errors":
            # Tolerate a missing/None error collection; show at most five.
            errors = v or []
            print(f" {k}: ({len(errors)} items)")
            for e in errors[:5]:
                print(f" - {e}")
        else:
            print(f" {k}: {v}")

    # Show count breakdown
    print()
    print("--- deals.db counts by status ---")
    counts = count_deals_by_status()
    for s, n in sorted(counts.items()):
        print(f" {s}: {n}")

    # Show top 5 by classification score
    print()
    print("--- TOP 5 by classification_score ---")
    # NOTE(review): 200 rows are requested and only 5 kept; if list_deals
    # applies order_by in the query, limit=5 would be enough — confirm.
    top = list_deals(
        classification=None,
        source="miami_dade_clerk",
        limit=200,
        order_by="classification_score DESC NULLS LAST",
    )[:5]
    for i, d in enumerate(top, 1):
        score = d.get("classification_score")
        cls = d.get("classification_status")
        strategy = d.get("classification_strategy")
        addr = (d.get("address") or "(no address)")[:60]
        sb_str = _format_money(d.get("starting_bid"), "Hidden/None")
        # NOTE(review): printed under the label "Assessed" but read from
        # estimated_arv — confirm the label is intended.
        av_str = _format_money(d.get("estimated_arv"), "N/A")
        fj_str = _format_money(d.get("final_judgment_amount"), "N/A")
        reasons = _parse_reasons(d.get("classification_reasons", "[]"))

        print(f"\n [{i}] score={score} status={cls} strategy={strategy}")
        print(f" Case: {d.get('case_number')} | Type: {d.get('deal_type')}")
        print(f" Address: {addr}")
        print(f" Starting bid: {sb_str} | Assessed: {av_str} | Final Judgment: {fj_str}")
        print(" Reasons:")
        for r in reasons[:4]:
            print(f" - {r}")

    # Show classifications by status
    print()
    print("--- Classifications by status (Miami-Dade only) ---")
    # NOTE(review): the tally only covers the rows returned under limit=500.
    all_md = list_deals(source="miami_dade_clerk", limit=500)
    by_class = Counter(
        d.get("classification_status") or "(unclassified)" for d in all_md
    )
    for cs, n in sorted(by_class.items()):
        print(f" {cs}: {n}")

    print()
    # .get() so a summary without 'deals_new' cannot crash the final line
    # after all the work has already been done.
    print(f"✅ B1.4 COMPLETE — {summary.get('deals_new', 0)} new deals persisted + classified")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())
|