Files
AR-House/scripts/explore_zillow_firecrawl.py
2026-07-03 12:24:58 -04:00

96 lines
2.8 KiB
Python

"""Test Firecrawl scrape on Zillow Miami-Dade county page.
EXPECTED COST: ~3-5 Firecrawl credits (1 page scrape).
Goals:
1. Verify Firecrawl can bypass Zillow's anti-bot
2. Inspect the markdown structure of search results
3. Decide on parser strategy (regex vs LLM extract)
"""
from __future__ import annotations
import io, os, sys
from pathlib import Path
# Re-wrap stdout as UTF-8 with errors="replace" so the non-ASCII characters this
# script prints do not raise encoding errors (presumably aimed at consoles whose
# default encoding is not UTF-8 — confirm).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
# Parent of the directory that contains this file; main() later writes its
# output under ROOT / "scripts".
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT first on the import path before the project-local import that follows
# (presumably so `data_fetchers` resolves from ROOT — confirm its location).
sys.path.insert(0, str(ROOT))
# Force-load .env via data_fetchers
import data_fetchers # noqa: F401
from firecrawl import FirecrawlApp
def main() -> int:
    """Run a one-page Firecrawl scrape of the Zillow Miami-Dade listing page.

    Reads the API key from the ``FIRECRAWL_API_KEY`` environment variable,
    scrapes a single URL as markdown, saves the markdown under
    ``ROOT / "scripts"`` for offline inspection, and prints two previews of
    the content plus whatever metadata the result object exposes.

    Returns:
        0 when the scrape call completed (even if it carried no markdown),
        1 when the API key is missing or the scrape call raised.
    """
    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        print("❌ FIRECRAWL_API_KEY not set")
        return 1

    banner = "=" * 70
    print(banner)
    print("Zillow Firecrawl smoke test — Miami-Dade County FL")
    print(banner)
    # Only a prefix of the key is echoed, never the full secret.
    print(f"Using API key: {api_key[:10]}...")
    url = "https://www.zillow.com/miami-dade-county-fl/houses/"
    print(f"URL: {url}")
    print()

    app = FirecrawlApp(api_key=api_key)

    # Credit usage is printed before and after the scrape so the two values
    # can be compared; a failed lookup is reported but never aborts the run.
    _report_credit_usage(app, "BEFORE")
    print()

    print("Calling scrape() (formats=markdown)...")
    try:
        result = app.scrape(url, formats=["markdown"])
    except Exception as e:
        # Broad on purpose: this is the top-level boundary of a smoke test and
        # any failure of the call should be shown with its type, not a traceback.
        print(f"❌ Firecrawl call failed: {type(e).__name__}: {e}")
        return 1

    _report_credit_usage(app, "AFTER")
    print(f"Result type: {type(result).__name__}")

    # The result may be an object with a `.markdown` attribute or a plain dict,
    # so both shapes are accepted.
    if hasattr(result, "markdown"):
        md = result.markdown
    elif isinstance(result, dict):
        md = result.get("markdown")
    else:
        md = None

    if md:
        print(f"Markdown length: {len(md):,} chars")
        # Save the full markdown for inspection.
        out_file = ROOT / "scripts" / "_zillow_miami_md.txt"
        out_file.write_text(md, encoding="utf-8")
        print(f"Saved: {out_file}")
        # Sample of the markdown content.
        print()
        print("--- FIRST 3000 CHARS ---")
        print(md[:3000])
        print()
        print("--- CHARS 3000-6000 (probably listings) ---")
        print(md[3000:6000])
    else:
        # Make an empty or blocked scrape visible instead of skipping silently.
        print("(no markdown in result; nothing saved)")

    # Inspect result attrs / keys (Firecrawl SDK may have changed).
    print()
    print("--- RESULT META ---")
    if hasattr(result, "metadata"):
        print(f"metadata: {result.metadata}")
    if hasattr(result, "credits_used"):
        print(f"credits_used: {result.credits_used}")
    if isinstance(result, dict):
        for k in result:
            print(f" key: {k}")
    return 0


def _report_credit_usage(app, when: str) -> None:
    """Print the credit usage reported by *app*, labelled with *when*.

    Best-effort: any error from the lookup is printed and swallowed so that a
    failing usage endpoint cannot abort the smoke test.
    """
    try:
        usage = app.get_credit_usage()
    except Exception as e:
        print(f"(could not fetch credit usage {when.lower()}: {e})")
    else:
        print(f"Credit usage {when}: {usage}")
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())