"""E2E test of Zillow scraper using CACHED markdown (0 Firecrawl credits).""" from __future__ import annotations import io, sys from pathlib import Path sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) import data_fetchers # noqa: F401 — load .env import scrapers.zillow as zillow_mod import scrapers._cache as cache_mod from scrapers.zillow import run_scraper_to_db from deals_db import list_deals, init_db def main() -> int: init_db() # Pre-populate cache with the test markdown from prior exploration test_url = zillow_mod._build_zillow_url("Miami-Dade", "FL", 1) print(f"Cache target URL: {test_url}") md_file = ROOT / "scripts" / "_zillow_miami_md.txt" if not md_file.exists(): print(f"❌ Test markdown not found at {md_file}") return 1 md = md_file.read_text(encoding="utf-8") cache_mod.save_cache( "zillow", test_url, md, status_code=200, ttl_seconds=cache_mod.DEFAULT_TTL_SECONDS_HOURLY, ) print(f"Cached: {len(md):,} chars") # Run pipeline (cache hit, 0 credits, auto_classify=True to test full flow) print() print("Running zillow.run_scraper_to_db (auto_classify=True — ~5s/deal LLM)...") result = run_scraper_to_db( counties=["Miami-Dade"], state="FL", pages_per_county=1, auto_classify=True, status_cb=lambda m: print(f" {m}"), ) print() print("Result:") for k, v in result.items(): print(f" {k}: {v}") # Verify in DB print() print("=== zillow source in deals.db ===") zd = list_deals(source="zillow", limit=20) print(f"Total zillow deals: {len(zd)}") for d in zd[:5]: addr = (d.get("address") or "?")[:55] price = d.get("listing_price") or 0 beds = d.get("beds") baths = d.get("baths") sqft = d.get("sqft") print(f" zpid {d.get('case_number'):<10} | ${price:>11,.0f} | {beds!s:>2}bd/{baths!s:>3}ba/{sqft!s:>5}sqft | {addr}") return 0 if __name__ == "__main__": sys.exit(main())