Files
AR-House/scripts/run_scraper_hud_full.py
T
2026-07-03 12:24:58 -04:00

83 lines
2.7 KiB
Python

"""Full HUD pipeline: scrape FL → persist → classify."""
from __future__ import annotations
import io, sys, time
from pathlib import Path
# Force UTF-8 output (the script prints non-ASCII characters) and substitute
# anything the console cannot encode.  reconfigure() adjusts the existing
# stream in place, so its buffering mode is kept and progress lines are not
# held back; the guard skips streams that are missing or cannot be
# reconfigured instead of crashing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# Directory two levels above this file; put first on sys.path so the
# project-local imports performed inside main() are looked up there.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
def _banner(title: str) -> None:
    """Print *title* framed above and below by a 70-character rule."""
    rule = "=" * 70
    print(rule)
    print(title)
    print(rule)


def _print_run_summary(summary) -> None:
    """Print each key/value of the scraper summary, one per line.

    A list/tuple stored under ``"errors"`` is shown as a count followed by
    at most its first three entries; any other value is printed as-is, so
    an unexpected ``errors`` value (e.g. ``None`` or a count) cannot crash
    the report.
    """
    for key, value in summary.items():
        if key == "errors" and isinstance(value, (list, tuple)):
            print(f" {key}: ({len(value)} items)")
            for err in value[:3]:
                print(f" - {err}")
        else:
            print(f" {key}: {value}")


def _print_classification_breakdown(deals) -> None:
    """Print how many of *deals* fall under each classification status."""
    from collections import Counter

    tally = Counter(
        d.get("classification_status") or "(unclassified)" for d in deals
    )
    for status in sorted(tally):
        print(f" {status}: {tally[status]}")


def _print_top_deals(top) -> None:
    """Print a fixed-width table with one row per deal in *top*."""
    print(f"{'#':<3} {'Score':<6} {'Status':<20} {'Strategy':<14} {'Price':<10} {'Beds':<5} Address (county)")
    print("-" * 130)
    for i, d in enumerate(top, 1):
        score = d.get("classification_score") or 0
        cls = d.get("classification_status") or "?"
        strat = d.get("classification_strategy") or "?"
        price = d.get("listing_price")
        price_str = f"${price:,.0f}" if price else "N/A"
        beds = d.get("beds")
        addr = (d.get("address") or "?")[:70]  # cap long addresses at 70 chars
        county = d.get("county") or ""
        print(f"{i:<3} {score:<6} {cls:<20} {strat:<14} {price_str:<10} {beds!s:<5} {addr} ({county})")


def _load_reasons(raw) -> list:
    """Return the classification reasons in *raw* as a list.

    *raw* may be empty/None (-> ``[]``), an already-decoded list/tuple, or a
    JSON document.  A decoded JSON value that is not a list is wrapped in a
    one-element list so it is printed whole.

    Raises:
        ValueError: *raw* is a string that is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        TypeError: *raw* is of a type ``json.loads`` does not accept.
    """
    import json

    if not raw:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    decoded = json.loads(raw)
    return decoded if isinstance(decoded, list) else [decoded]


def _print_sample_reasons(deals) -> None:
    """Print the identifying line and decoded reasons for each of *deals*."""
    for i, d in enumerate(deals, 1):
        # Separator added between case number and status; they used to be
        # printed fused together.
        print(f"\n [{i}] {d.get('case_number')} | {d.get('classification_status')} score {d.get('classification_score')}")
        try:
            reasons = _load_reasons(d.get("classification_reasons"))
        except (TypeError, ValueError) as exc:
            # Still best-effort, but say so instead of silently printing nothing.
            print(f" (could not decode classification_reasons: {exc})")
            continue
        for r in reasons:
            print(f" - {r}")


def main() -> int:
    """Run the FL HUD scrape with auto-classification and print a report.

    Steps: initialise the deals DB, call ``run_scraper_to_db`` for
    ``states=["FL"]`` with progress lines echoed to stdout, then print the
    returned summary, a per-status count of stored ``hud_homestore`` deals,
    the ten highest ``classification_score`` deals, and the decoded reasons
    for the top three.

    Returns:
        Always ``0``; exceptions from the scraper or the DB layer propagate.
    """
    _banner("HUD Homestore FULL PIPELINE (FL only)")

    # Project-local imports stay inside main(), after the banner, exactly
    # where the original performed them.
    from scrapers.hud_homestore import run_scraper_to_db
    from deals_db import init_db, list_deals

    init_db()

    def log(m):
        print(f" {m}")

    t0 = time.perf_counter()
    summary = run_scraper_to_db(states=["FL"], auto_classify=True, status_cb=log)
    elapsed = time.perf_counter() - t0

    print()
    _banner(f"PIPELINE SUMMARY (elapsed {elapsed:.0f}s = {elapsed/60:.1f} min)")
    _print_run_summary(summary)

    # Show classification breakdown for HUD source.
    # NOTE(review): the query is capped at limit=200, so the breakdown and
    # ranking below only cover the rows list_deals returns under that cap.
    print()
    print("--- HUD deals by classification ---")
    hud = list_deals(source="hud_homestore", limit=200)
    _print_classification_breakdown(hud)

    print()
    print("--- TOP 10 HUD deals by classification_score ---")
    # Missing/None scores rank as 0.
    top = sorted(hud, key=lambda d: (d.get("classification_score") or 0), reverse=True)[:10]
    _print_top_deals(top)

    # Print 3 sample reasons
    print()
    print("--- Sample reasons (top 3) ---")
    _print_sample_reasons(top[:3])
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())