"""Test Firecrawl scrape on Zillow Miami-Dade county page. EXPECTED COST: ~3-5 Firecrawl credits (1 page scrape). Goals: 1. Verify Firecrawl can bypass Zillow's anti-bot 2. Inspect the markdown structure of search results 3. Decide on parser strategy (regex vs LLM extract) """ from __future__ import annotations import io, os, sys from pathlib import Path sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) # Force-load .env via data_fetchers import data_fetchers # noqa: F401 from firecrawl import FirecrawlApp def main() -> int: api_key = os.getenv("FIRECRAWL_API_KEY", "") if not api_key: print("❌ FIRECRAWL_API_KEY not set") return 1 print("=" * 70) print("Zillow Firecrawl smoke test — Miami-Dade County FL") print("=" * 70) print(f"Using API key: {api_key[:10]}...") url = "https://www.zillow.com/miami-dade-county-fl/houses/" print(f"URL: {url}") print() app = FirecrawlApp(api_key=api_key) # Print credit usage before try: usage_before = app.get_credit_usage() print(f"Credit usage BEFORE: {usage_before}") except Exception as e: print(f"(could not fetch credit usage before: {e})") print() print("Calling scrape() (formats=markdown)...") try: result = app.scrape(url, formats=["markdown"]) except Exception as e: print(f"❌ Firecrawl call failed: {type(e).__name__}: {e}") return 1 # Print credit usage after try: usage_after = app.get_credit_usage() print(f"Credit usage AFTER: {usage_after}") except Exception: pass print(f"Result type: {type(result).__name__}") # Save full markdown for inspection md = result.markdown if hasattr(result, "markdown") else (result.get("markdown") if isinstance(result, dict) else None) if md: print(f"Markdown length: {len(md):,} chars") out_file = ROOT / "scripts" / "_zillow_miami_md.txt" out_file.write_text(md, encoding="utf-8") print(f"Saved: {out_file}") # Sample of markdown content print() print("--- FIRST 3000 CHARS ---") print(md[:3000]) print() print("--- CHARS 3000-6000 (probably listings) ---") print(md[3000:6000]) # Inspect result attrs / keys (Firecrawl SDK may have changed) print() print("--- RESULT META ---") if hasattr(result, "metadata"): print(f"metadata: {result.metadata}") if hasattr(result, "credits_used"): print(f"credits_used: {result.credits_used}") if isinstance(result, dict): for k in result.keys(): print(f" key: {k}") return 0 if __name__ == "__main__": sys.exit(main())