Files
AR-House/scripts/test_deal_classifier.py
2026-07-03 12:24:58 -04:00

273 lines
10 KiB
Python

"""Unit + smoke tests para Phase 3A — deals_db + DealClassifier.
Tests:
1. deals_db CRUD: init, insert, dedup, list, update_classification, update_status
2. firecrawl tracking: record_usage, get_month_usage, alert levels
3. DealClassifier: precompute_heuristics, build_prompt, parse output
4. Smoke test: clasificar 4 deals reales (cada uno con expectativa clara)
"""
from __future__ import annotations
import io, sys, time, os
from pathlib import Path
# Force UTF-8 output so the non-ASCII glyphs this script prints (em dashes,
# arrows, status emoji) cannot raise UnicodeEncodeError on consoles that use a
# legacy code page.  ``reconfigure`` updates the existing stream in place and
# leaves every setting that is not passed (notably line buffering) untouched,
# so progress lines still appear as they are printed.  Wrapping the raw buffer
# in a new TextIOWrapper is kept only as a fallback for stream objects that do
# not offer ``reconfigure``.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

# Directory one level above the folder containing this script; it is put
# first on sys.path so the function-level imports of ``deals_db`` and
# ``deal_classifier`` below are looked up there first.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
def _close_thread_local_connection(db_module):
    """Close and forget the connection cached on ``db_module._LOCAL``, if any.

    Safe to call when no connection has been opened yet on this thread.
    """
    cached = getattr(db_module._LOCAL, "conn", None)
    if cached is not None:
        cached.close()
    if hasattr(db_module._LOCAL, "conn"):
        del db_module._LOCAL.conn


def run_unit_tests():
    """Exercise deals_db CRUD and Firecrawl usage tracking without calling Ollama.

    The checks run against a throwaway database file at
    ``ROOT / "data" / "deals_test.db"``.  The file is removed before the run
    and again afterwards; the cleanup happens in a ``finally`` clause so a
    failing assertion does not leave the cached connection open or the test
    database on disk.

    Returns:
        int: ``0`` when every check passes.

    Raises:
        AssertionError: on the first check that fails (propagated to the
            caller after cleanup).
    """
    print("=" * 70)
    print("UNIT TESTS — deals_db + Firecrawl tracking")
    print("=" * 70)

    # Set DB path to a temp location for isolated testing
    test_db = ROOT / "data" / "deals_test.db"
    if test_db.exists():
        test_db.unlink()
    # setdefault does not replace a value that is already present in the
    # environment; the explicit _DB_PATH assignment below is what actually
    # pins the module to the test database.
    os.environ.setdefault("DEALS_DB_PATH", str(test_db.relative_to(ROOT)))

    import deals_db

    # Override module-level _DB_PATH to point at test DB
    deals_db._DB_PATH = test_db
    # Reset thread-local connection so the next access opens the test DB
    _close_thread_local_connection(deals_db)

    from deals_db import (
        init_db, insert_deal, get_deal_by_hash, get_deal_by_id,
        update_classification, update_status, list_deals,
        record_scraper_run, finish_scraper_run, list_recent_scraper_runs,
        record_firecrawl_usage, get_firecrawl_month_usage,
        firecrawl_alert_level, is_firecrawl_paused, firecrawl_budget_status,
        count_deals_by_status, compute_deal_hash,
    )

    try:
        init_db()
        print("init_db OK")

        # Test 1: insert + dedup
        d1 = {
            "source": "miami_dade_clerk",
            "source_url": "https://example.com/case/12345",
            "address": "123 Main St, Miami, FL 33101",
            "city": "Miami", "state": "FL", "zip": "33101", "county": "Miami-Dade",
            "listing_price": 150000,
            "deal_type": "foreclosure",
            "starting_bid": 80000,
            "estimated_arv": 240000,
            "beds": 3, "baths": 2.0, "sqft": 1400, "year_built": 1985,
            "case_number": "2025-CA-001234",
            "auction_date": "2026-06-15",
        }
        id1, is_new = insert_deal(d1)
        assert is_new, f"first insert should be new, got is_new={is_new}"
        assert id1 > 0
        print(f"INSERT 1: id={id1}, is_new={is_new} OK")

        # Test 2: re-insert same → should update, not insert
        id1b, is_new_b = insert_deal(d1)
        assert id1b == id1
        assert not is_new_b
        print(f"INSERT 1 (re-insert): id={id1b} same as first, is_new={is_new_b} OK")

        # Test 3: different source → new row
        d2 = dict(d1)
        d2["source"] = "zillow"
        d2["source_url"] = "https://zillow.com/123"
        id2, is_new_2 = insert_deal(d2)
        assert is_new_2
        assert id2 != id1
        print(f"INSERT 2 (different source): id={id2} OK")

        # Test 4: hash function deterministic
        h1 = compute_deal_hash("miami_dade_clerk", "123 main st miami fl", 150000)
        h2 = compute_deal_hash("miami_dade_clerk", "123 Main St Miami FL", 150000)
        assert h1 == h2, "case-insensitive hash failed"
        print("compute_deal_hash case-insensitive OK")

        # Test 5: update_classification
        update_classification(
            deal_id=id1,
            status="potential_winner",
            score=85,
            reasons=["price_per_sqft $107 in Class C → 25% below market",
                     "cap_rate_rough 8.5% above buy_hold threshold"],
            strategy="buy_hold",
        )
        deal = get_deal_by_id(id1)
        assert deal["classification_status"] == "potential_winner"
        assert deal["classification_score"] == 85
        assert deal["status"] == "classified", f"status should auto-flip to classified, got {deal['status']}"
        assert "Class C" in deal["classification_reasons"]
        print("update_classification OK (auto-flipped status new→classified)")

        # Test 6: list_deals filter
        winners = list_deals(classification="potential_winner")
        assert len(winners) == 1
        assert winners[0]["id"] == id1
        print(f"list_deals(classification=potential_winner): {len(winners)} deal OK")

        # Test 7: update_status
        update_status(id1, "interesting")
        deal = get_deal_by_id(id1)
        assert deal["status"] == "interesting"
        print("update_status OK")

        # Test 8: count_deals_by_status
        # At this point deal 1 is "interesting" and deal 2 was never touched.
        counts = count_deals_by_status()
        print(f"count_deals_by_status: {counts}")
        assert counts.get("interesting", 0) == 1
        assert counts.get("new", 0) == 1

        # Test 9: scraper runs
        run_id = record_scraper_run("miami_dade_clerk")
        assert run_id > 0
        finish_scraper_run(run_id, deals_found=15, deals_new=3, deals_updated=12,
                           errors_count=0, firecrawl_credits_used=0, status="success")
        runs = list_recent_scraper_runs(source="miami_dade_clerk")
        assert len(runs) == 1
        assert runs[0]["status"] == "success"
        assert runs[0]["deals_new"] == 3
        print(f"scraper_runs: id={run_id} deals_new={runs[0]['deals_new']} OK")

        # Test 10: firecrawl tracking (5 + 8 credits recorded on a fresh DB)
        record_firecrawl_usage(source="zillow_scraper", credits=5, url="https://...")
        record_firecrawl_usage(source="realtor_scraper", credits=8, url="https://...")
        total = get_firecrawl_month_usage()
        assert total == 13, f"expected 13, got {total}"
        print(f"firecrawl_month_usage: {total} credits OK")

        # Test 11: alert level
        # NOTE(review): the expectations in tests 11-13 assume a monthly
        # budget of 500 credits, as their messages state — confirm against
        # the deals_db configuration.
        level = firecrawl_alert_level()
        assert level == "ok", f"with 13 credits and budget 500, should be 'ok', got {level}"
        paused = is_firecrawl_paused()
        assert not paused
        print(f"firecrawl_alert_level: {level} OK, paused={paused}")

        # Test 12: simulate hitting 80% threshold (13 + 400 = 413 credits)
        record_firecrawl_usage(source="bulk_test", credits=400)
        level = firecrawl_alert_level()
        assert level == "warn", f"with 413/500 credits should be 'warn', got {level}"
        print(f"firecrawl alert at 82% usage: {level} OK")

        # Test 13: simulate hitting 95% pause (413 + 65 = 478 credits)
        record_firecrawl_usage(source="bulk_test", credits=65)
        level = firecrawl_alert_level()
        assert level == "pause", f"with 478/500 credits should be 'pause', got {level}"
        assert is_firecrawl_paused()
        print(f"firecrawl auto-pause at 95.6% usage: {level} OK")

        # Test 14: budget snapshot (printed for inspection only, not asserted)
        snap = firecrawl_budget_status()
        print(f"firecrawl_budget_status: {snap}")
    finally:
        # Cleanup runs on success and on failure alike, so a failed check
        # never leaves an open connection or a stale test database behind.
        _close_thread_local_connection(deals_db)
        if test_db.exists():
            test_db.unlink()

    print()
    print("=== ALL UNIT TESTS PASSED ===")
    return 0
def run_classifier_smoke():
    """Smoke test: classify 4 real-world style deals and compare with expectations.

    Each case is passed to ``deal_classifier.classify_deal``; the returned
    status, score, strategy and reasons are printed next to the expected
    status.  A mismatch is reported with a warning glyph but does not fail
    the run.

    Returns:
        int: always ``0``.
    """
    print()
    print("=" * 70)
    print("SMOKE TEST — DealClassifier con 4 deals reales")
    print("=" * 70)

    from deal_classifier import classify_deal

    # Each entry: (label, expected classification status, deal payload).
    scenarios = [
        (
            "Miami foreclosure $80K starting bid, ARV $240K",
            "potential_winner",
            {
                "source": "miami_dade_clerk",
                "deal_type": "foreclosure",
                "address": "789 NE 1st St, Miami, FL 33132",
                "city": "Miami", "county": "Miami-Dade", "state": "FL", "zip": "33132",
                "listing_price": 80000, "starting_bid": 80000, "estimated_arv": 240000,
                "beds": 3, "baths": 2.0, "sqft": 1400, "year_built": 1995,
                "case_number": "2025-CA-001234",
                "auction_date": "2026-06-15",
            },
        ),
        (
            "Miami MLS retail $450K Class B normal price",
            "maybe",  # ordinary MLS listing priced within market
            {
                "source": "zillow",
                "deal_type": "mls",
                "address": "100 Brickell Ave, Miami, FL 33131",
                "city": "Miami", "county": "Miami-Dade", "state": "FL", "zip": "33131",
                "listing_price": 450000,
                "beds": 3, "baths": 2.0, "sqft": 1800, "year_built": 2005,
            },
        ),
        (
            "Jacksonville $25K tax_deed 1967 build (red flag)",
            "red_flag",
            {
                "source": "duval_tax_collector",
                "deal_type": "tax_deed",
                "address": "456 W 21st St, Jacksonville, FL 32209",
                "city": "Jacksonville", "county": "Duval", "state": "FL", "zip": "32209",
                "listing_price": 25000, "starting_bid": 25000,
                "beds": 2, "baths": 1.0, "sqft": 900, "year_built": 1967,
            },
        ),
        (
            "Hialeah MLS $600K Class C overpriced",
            "pass",
            {
                "source": "realtor",
                "deal_type": "mls",
                "address": "1234 W 49th St, Hialeah, FL 33012",
                "city": "Hialeah", "county": "Miami-Dade", "state": "FL", "zip": "33012",
                "listing_price": 600000,
                "beds": 3, "baths": 2.0, "sqft": 1500, "year_built": 1970,
            },
        ),
    ]

    total = len(scenarios)
    outcomes = []    # True where the actual status equalled the expected one
    durations = []   # wall-clock seconds spent in classify_deal, per case

    for position, (label, expected, payload) in enumerate(scenarios, 1):
        print(f"\n--- [{position}/{total}] {label} ---")
        print(f" Expected: {expected}")

        started = time.perf_counter()
        verdict = classify_deal(payload)
        elapsed = time.perf_counter() - started

        actual = verdict["classification_status"]
        agreed = actual == expected
        if agreed:
            glyph = "✅"
        else:
            glyph = "⚠️"

        print(f" Actual: {actual} (score {verdict['score']})")
        print(f" Strategy: {verdict['strategy']}")
        print(" Reasons:")
        for reason in verdict["reasons"]:
            print(f" - {reason}")

        meta = verdict["_meta"]
        print(f" Match: {glyph} | Duration: {elapsed:.1f}s | tokens: {meta['tokens']}")
        if meta.get("ollama_error"):
            print(f" ❌ Ollama error: {meta['ollama_error']}")
        if meta.get("parse_error"):
            print(f" ⚠️ Parse error: {meta.get('parse_error_detail')}")

        outcomes.append(agreed)
        durations.append(elapsed)

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    matched = sum(1 for ok in outcomes if ok)
    mean_seconds = sum(durations) / len(durations)
    print(f" Match rate: {matched}/{len(outcomes)}")
    print(f" Avg duration: {mean_seconds:.1f}s per deal")
    return 0
if __name__ == "__main__":
    # Run the unit tests first, then the classifier smoke test; the process
    # exit status is the first non-zero return value, or the smoke test's
    # value when the unit tests returned a falsy one.
    unit_status = run_unit_tests()
    smoke_status = run_classifier_smoke()
    sys.exit(unit_status if unit_status else smoke_status)