68 lines
2.1 KiB
Python
68 lines
2.1 KiB
Python
"""E2E test of Zillow scraper using CACHED markdown (0 Firecrawl credits)."""
|
|
from __future__ import annotations
|
|
import io
import sys
from pathlib import Path

# Force UTF-8 output with replacement of unencodable characters, so the
# non-ASCII glyphs this script prints cannot raise UnicodeEncodeError.
# reconfigure() switches the encoding in place and keeps the stream's current
# buffering mode; wrapping sys.stdout.buffer in a brand-new TextIOWrapper
# would silently drop line buffering and hold progress messages back.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    # Fallback for replaced stdout objects that lack reconfigure().
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer,
        encoding="utf-8",
        errors="replace",
        line_buffering=True,
    )

# Directory two levels above this file, put at the front of sys.path.
# NOTE(review): presumably this is what lets the project imports below
# resolve when the script is run directly — confirm against the repo layout.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

import data_fetchers  # noqa: F401 — load .env

import scrapers.zillow as zillow_mod
import scrapers._cache as cache_mod
from scrapers.zillow import run_scraper_to_db
from deals_db import list_deals, init_db
|
|
|
|
|
|
def _format_deal_row(deal: dict) -> str:
    """Render one deal record as a fixed-width summary line.

    Every field that may be missing is converted with ``!s`` before the
    alignment spec is applied: ``None`` does not accept a format spec, so
    ``f"{None:<10}"`` raises ``TypeError``.
    """
    addr = (deal.get("address") or "?")[:55]
    price = deal.get("listing_price") or 0
    zpid = deal.get("case_number")
    beds = deal.get("beds")
    baths = deal.get("baths")
    sqft = deal.get("sqft")
    return (
        f" zpid {zpid!s:<10} | ${price:>11,.0f} | "
        f"{beds!s:>2}bd/{baths!s:>3}ba/{sqft!s:>5}sqft | {addr}"
    )


def main() -> int:
    """Run the Zillow pipeline against a cached markdown fixture.

    Saves the stored markdown into the scraper cache under the URL built for
    Miami-Dade, FL, page 1, runs ``run_scraper_to_db`` with
    ``auto_classify=True``, then prints the result mapping and up to five of
    the stored ``zillow`` deals.

    Returns:
        0 when the run completes, 1 when the markdown fixture is missing.
    """
    init_db()

    # Pre-populate cache with the test markdown from prior exploration
    test_url = zillow_mod._build_zillow_url("Miami-Dade", "FL", 1)
    print(f"Cache target URL: {test_url}")

    md_file = ROOT / "scripts" / "_zillow_miami_md.txt"
    # is_file() also rejects a directory at that path, which would otherwise
    # pass an exists() check and then crash inside read_text().
    if not md_file.is_file():
        print(f"❌ Test markdown not found at {md_file}")
        return 1

    md = md_file.read_text(encoding="utf-8")
    cache_mod.save_cache(
        "zillow", test_url, md,
        status_code=200, ttl_seconds=cache_mod.DEFAULT_TTL_SECONDS_HOURLY,
    )
    print(f"Cached: {len(md):,} chars")

    # Run pipeline (cache hit, 0 credits, auto_classify=True to test full flow)
    print()
    print("Running zillow.run_scraper_to_db (auto_classify=True — ~5s/deal LLM)...")
    result = run_scraper_to_db(
        counties=["Miami-Dade"], state="FL", pages_per_county=1,
        auto_classify=True,
        status_cb=lambda m: print(f" {m}"),
    )
    print()
    print("Result:")
    for k, v in result.items():
        print(f" {k}: {v}")

    # Verify in DB
    print()
    print("=== zillow source in deals.db ===")
    zd = list_deals(source="zillow", limit=20)
    print(f"Total zillow deals: {len(zd)}")
    for d in zd[:5]:
        print(_format_deal_row(d))

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|