feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
"""Test Firecrawl scrape on Zillow Miami-Dade county page.
|
||||
|
||||
EXPECTED COST: ~3-5 Firecrawl credits (1 page scrape).
|
||||
Goals:
|
||||
1. Verify Firecrawl can bypass Zillow's anti-bot
|
||||
2. Inspect the markdown structure of search results
|
||||
3. Decide on parser strategy (regex vs LLM extract)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import io, os, sys
|
||||
from pathlib import Path
|
||||
|
||||
# Force UTF-8 output with replacement of unencodable characters: this script
# prints non-ASCII text (e.g. the "❌" status marker).  Prefer the in-place
# ``reconfigure`` so the existing stream object is kept; fall back to
# re-wrapping the byte buffer, and do nothing when stdout exposes neither
# (instead of failing at import time with AttributeError).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

# Directory two levels above this file; put first on sys.path, presumably so
# that project-root modules (such as ``data_fetchers``, imported below) resolve.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
|
||||
|
||||
# Force-load .env via data_fetchers
|
||||
import data_fetchers # noqa: F401
|
||||
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
|
||||
def main() -> int:
    """Run a one-page Firecrawl scrape of a Zillow listings URL and dump the result.

    Reads ``FIRECRAWL_API_KEY`` from the environment, scrapes the hard-coded
    Miami-Dade houses URL requesting markdown, prints the credit usage
    reported before and after the call, saves the markdown to
    ``ROOT / "scripts" / "_zillow_miami_md.txt"``, and prints two 3000-char
    samples plus whatever metadata the result object exposes.

    Returns:
        0 when non-empty markdown was returned; 1 when the API key is not
        set, the scrape call raised, or the result carried no markdown.
    """
    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        print("❌ FIRECRAWL_API_KEY not set")
        return 1

    print("=" * 70)
    print("Zillow Firecrawl smoke test — Miami-Dade County FL")
    print("=" * 70)
    # Show just enough of the key to tell which one is loaded, without
    # echoing a large part of the secret into terminal/CI logs.
    print(f"Using API key: {api_key[:4]}...")

    url = "https://www.zillow.com/miami-dade-county-fl/houses/"
    print(f"URL: {url}")
    print()

    app = FirecrawlApp(api_key=api_key)

    # Credit usage is informational only: a failure here must not abort the
    # test, so any SDK error is reported and ignored.
    try:
        usage_before = app.get_credit_usage()
        print(f"Credit usage BEFORE: {usage_before}")
    except Exception as e:
        print(f"(could not fetch credit usage before: {e})")

    print()
    print("Calling scrape() (formats=markdown)...")
    try:
        result = app.scrape(url, formats=["markdown"])
    except Exception as e:
        print(f"❌ Firecrawl call failed: {type(e).__name__}: {e}")
        return 1

    # Same best-effort policy as the "before" reading, and now reported the
    # same way instead of being silently swallowed.
    try:
        usage_after = app.get_credit_usage()
        print(f"Credit usage AFTER: {usage_after}")
    except Exception as e:
        print(f"(could not fetch credit usage after: {e})")

    print(f"Result type: {type(result).__name__}")

    # The result may be an object with a ``markdown`` attribute or a plain
    # dict, depending on the SDK version; accept both shapes.
    if hasattr(result, "markdown"):
        md = result.markdown
    elif isinstance(result, dict):
        md = result.get("markdown")
    else:
        md = None

    if md:
        print(f"Markdown length: {len(md):,} chars")

        # Save full markdown for inspection
        out_file = ROOT / "scripts" / "_zillow_miami_md.txt"
        # Don't lose an already-paid-for scrape because the folder is missing.
        out_file.parent.mkdir(parents=True, exist_ok=True)
        out_file.write_text(md, encoding="utf-8")
        print(f"Saved: {out_file}")

        # Sample of markdown content
        print()
        print("--- FIRST 3000 CHARS ---")
        print(md[:3000])
        print()
        print("--- CHARS 3000-6000 (probably listings) ---")
        print(md[3000:6000])
    else:
        # Make the failure visible; previously this case printed nothing and
        # the script still exited with status 0.
        print("❌ No markdown returned in result")

    # Inspect result attrs / keys (Firecrawl SDK may have changed)
    print()
    print("--- RESULT META ---")
    if hasattr(result, "metadata"):
        print(f"metadata: {result.metadata}")
    if hasattr(result, "credits_used"):
        print(f"credits_used: {result.credits_used}")
    if isinstance(result, dict):
        for k in result:
            print(f" key: {k}")

    return 0 if md else 1
|
||||
|
||||
|
||||
# Script entry point: run the smoke test and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user