feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,127 @@
|
||||
"""data_fetchers/zillow_photo_lookup.py — Buscar fotos de Zillow por address.
|
||||
|
||||
PROPOSITO:
|
||||
Los scrapers de county clerks (Miami-Dade, Duval, Broward, etc.) no exponen
|
||||
fotos de la propiedad. Sin embargo, Zillow tiene fotos para casi cualquier
|
||||
address en USA (incluso para foreclosures off-market).
|
||||
|
||||
Estrategia:
|
||||
1. Construir URL de Zillow address search: https://www.zillow.com/homes/{slug}_rb/
|
||||
2. Firecrawl scrape → markdown
|
||||
3. Regex sobre markdown para extraer photos.zillowstatic.com URLs
|
||||
4. Retornar (list[str] de URLs, dict de metadata); el cap es max_photos (default 1)
|
||||
|
||||
COSTO: 1 Firecrawl credit por address lookup.
|
||||
|
||||
USO:
|
||||
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
|
||||
urls, meta = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL")
|
||||
# → ["https://photos.zillowstatic.com/fp/X.jpg", ...]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Markdown image links (``![alt](url)``) whose target lives on the Zillow
# photo CDN (photos.zillowstatic.com) and ends in a known image extension.
# Group 1 captures the image URL; matching is case-insensitive.
_PHOTO_PAT = re.compile(
    r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _build_address_search_url(address: str) -> str:
    """Build the Zillow address-search URL for a free-form street address.

    The address is upper-cased, commas and periods are dropped, and runs of
    whitespace (and repeated dashes) collapse into a single dash, producing
    ``https://www.zillow.com/homes/{slug}_rb/``.

    Any character left in the slug that is not safe inside a URL path
    segment is percent-encoded, so it cannot truncate or split the path:
    ``#`` (unit numbers) would otherwise start a fragment, ``/`` (fractional
    house numbers) a new path segment and ``?`` a query string.

    Example:
        "2837 BLACK BUCK CIR, JACKSONVILLE, FL" ->
        https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/

    Args:
        address: Free-form address text; callers are expected to have
            rejected empty input already.

    Returns:
        The absolute search URL as a string.
    """
    # Function-scope import: the module's import header is left untouched.
    from urllib.parse import quote

    slug = address.upper().replace(",", "").replace(".", "")
    slug = re.sub(r"\s+", "-", slug.strip())
    slug = re.sub(r"-+", "-", slug)
    # safe="" so that "/" is encoded too; letters, digits and "-_.~" are
    # never encoded, so ordinary addresses yield the same URL as before.
    return f"https://www.zillow.com/homes/{quote(slug, safe='')}_rb/"
|
||||
|
||||
|
||||
def _address_matches_markdown(address: str, md: str) -> bool:
    """Return True when the scraped markdown appears to describe *address*.

    Two conditions must both hold:

    1. The leading street number of the address occurs in the markdown as a
       standalone number (not embedded in a longer digit run such as a zip
       code or price).
    2. At least one of the first three alphabetic words of the address with
       four or more letters occurs in the markdown (case-insensitive).

    An address with no leading number, or with no word of four or more
    letters, never matches.
    """
    addr_upper = address.upper()
    number = re.match(r"\d+", addr_upper.strip())
    if number is None:
        return False
    # Digit boundaries: "28" must not match inside "2837" or "32837".
    if not re.search(rf"(?<!\d){number.group()}(?!\d)", md):
        return False
    words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4]
    md_upper = md.upper()  # upper-case once, not once per candidate word
    return any(w in md_upper for w in words[:3])


def fetch_zillow_photos_by_address(
    address: str,
    max_photos: int = 1,  # Only the main photo; the rest can be viewed on Zillow.
    debug: bool = False,
) -> tuple[list[str], dict]:
    """Fetch photo URLs for an address from the Zillow address-search page.

    The page is scraped through Firecrawl as markdown, and image links that
    point at the Zillow photo CDN are extracted from it.

    Args:
        address: Free-form street address; must be at least 5 characters
            after stripping whitespace.
        max_photos: Maximum number of URLs to return. ``None`` returns all
            of them; zero or a negative value returns none.
        debug: When True, photos are returned even if the address could not
            be confirmed in the scraped page.

    Returns:
        ``(photo_urls, metadata)``. ``photo_urls`` is de-duplicated in page
        order. ``metadata`` has the keys:

        - ``url_attempted``: the Zillow URL that was scraped, or None.
        - ``address_matched_in_md``: whether the address was found in the page.
        - ``credits_used``: 1 once the Firecrawl scrape call returned, else 0.
        - ``error``: a description of what went wrong, or None.
        - ``markdown_size``: length of the scraped markdown.

    Caveats:
        - If Zillow does not have the property, ``[]`` is returned and
          ``metadata["error"]`` says the address was not matched.
        - If Firecrawl fails (or is not installed), ``[]`` is returned with
          the exception described in ``metadata["error"]``; nothing is raised.
        - Callers should persist the result and NOT retry on an empty list:
          a retry spends another credit without gaining anything.
    """
    meta: dict = {
        "url_attempted": None,
        "address_matched_in_md": False,
        "credits_used": 0,
        "error": None,
        "markdown_size": 0,
    }

    if not address or len(address.strip()) < 5:
        meta["error"] = "address too short / empty"
        return [], meta

    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        meta["error"] = "FIRECRAWL_API_KEY not configured"
        return [], meta

    url = _build_address_search_url(address)
    meta["url_attempted"] = url

    # Best-effort boundary around the third-party SDK: any failure is
    # reported through meta["error"] instead of propagating to the caller.
    try:
        from firecrawl import FirecrawlApp

        app = FirecrawlApp(api_key=api_key)
        resp = app.scrape(url, formats=["markdown"])
        # The scrape call returned, so count the credit before inspecting
        # the payload; a malformed payload must not hide the spend.
        # NOTE(review): assumes Firecrawl bills every returned scrape - confirm.
        meta["credits_used"] = 1
        md = resp.markdown if hasattr(resp, "markdown") else resp.get("markdown", "")
        md = md or ""  # a response without markdown yields None, not ""
        meta["markdown_size"] = len(md)
    except Exception as e:
        meta["error"] = f"firecrawl error: {type(e).__name__}: {e}"
        return [], meta

    # Sanity check: Zillow sometimes answers with a "no results" page or a
    # different property, whose photos must not be attributed to this address.
    matched = _address_matches_markdown(address, md)
    meta["address_matched_in_md"] = matched
    if not matched and not debug:
        meta["error"] = "address not matched in Zillow markdown (no result page)"
        return [], meta

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(_PHOTO_PAT.findall(md)))

    # Clamp the cap: a negative slice bound would drop items from the end
    # instead of returning nothing.
    limit = None if max_photos is None else max(0, max_photos)
    return unique[:limit], meta
|
||||
Reference in New Issue
Block a user