128 lines
4.3 KiB
Python
128 lines
4.3 KiB
Python
"""data_fetchers/zillow_photo_lookup.py — Buscar fotos de Zillow por address.

PROPÓSITO:
Los scrapers de county clerks (Miami-Dade, Duval, Broward, etc.) no exponen
fotos de la propiedad. Sin embargo, Zillow tiene fotos para casi cualquier
address en USA (incluso para foreclosures off-market).

Estrategia:
    1. Construir URL de Zillow address search: https://www.zillow.com/homes/{slug}_rb/
    2. Firecrawl scrape → markdown
    3. Regex sobre markdown para extraer photos.zillowstatic.com URLs
    4. Retornar (list[str] de URLs, dict de metadata); cap = max_photos (default 1)

COSTO: 1 Firecrawl credit por address lookup.

USO:
    from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
    urls, meta = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL")
    # urls → ["https://photos.zillowstatic.com/fp/X.jpg"]
    # meta → {"url_attempted": ..., "address_matched_in_md": ..., "credits_used": ..., ...}
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
from typing import Optional
|
|
|
|
# Photo URL pattern (Zillow CDN)
|
|
_PHOTO_PAT = re.compile(
|
|
r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _build_address_search_url(address: str) -> str:
|
|
"""Build Zillow address search URL.
|
|
|
|
Format: https://www.zillow.com/homes/{slug}_rb/
|
|
Slug = uppercased address with dashes, no commas/extras.
|
|
|
|
e.g., "2837 BLACK BUCK CIR, JACKSONVILLE, FL" →
|
|
https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/
|
|
"""
|
|
s = address.upper().replace(",", "").replace(".", "")
|
|
s = re.sub(r"\s+", "-", s.strip())
|
|
s = re.sub(r"-+", "-", s)
|
|
return f"https://www.zillow.com/homes/{s}_rb/"
|
|
|
|
|
|
def _address_matches_markdown(address: str, md: str) -> bool:
    """Return True when the scraped markdown plausibly describes *address*.

    Requires both the leading street number (as a standalone number) and at
    least one of the first three 4+-letter words of the address to appear in
    the markdown. Word comparison is case-insensitive.
    """
    addr_upper = address.upper().strip()
    num_match = re.match(r"\d+", addr_upper)
    if num_match is None:
        return False
    # Digit-boundary lookarounds keep "12" from matching inside "3120".
    if not re.search(rf"(?<!\d){num_match.group()}(?!\d)", md):
        return False
    words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4]
    md_upper = md.upper()  # upper-case once, not once per candidate word
    return any(w in md_upper for w in words[:3])


def fetch_zillow_photos_by_address(
    address: str,
    max_photos: int = 1,  # Only the main photo; the user can see the rest on Zillow directly.
    debug: bool = False,
) -> tuple[list[str], dict]:
    """Fetch photo URLs from the Zillow address-search page for *address*.

    Args:
        address: Free-form address, e.g. "2837 BLACK BUCK CIR, JACKSONVILLE, FL".
        max_photos: Maximum number of URLs to return. ``None`` means no cap;
            negative values are treated as 0.
        debug: When True, return whatever photos were found even if the
            address could not be confirmed in the scraped markdown.

    Returns:
        ``(photo_urls, metadata)`` where metadata has the keys
        ``url_attempted``, ``address_matched_in_md``, ``credits_used``,
        ``error`` and ``markdown_size``.

    Caveats:
        - If the address is not confirmed in the page (e.g. Zillow has no such
          property), returns ``([], meta)`` with ``meta["error"]`` set.
        - If Firecrawl fails, returns ``([], meta)`` with ``meta["error"]`` set.
        - The caller should persist the result and NOT retry when it is empty
          (that would spend credits for nothing).
    """
    meta = {
        "url_attempted": None,
        "address_matched_in_md": False,
        "credits_used": 0,
        "error": None,
        "markdown_size": 0,
    }

    if not address or len(address.strip()) < 5:
        meta["error"] = "address too short / empty"
        return [], meta

    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        meta["error"] = "FIRECRAWL_API_KEY not configured"
        return [], meta

    url = _build_address_search_url(address)
    meta["url_attempted"] = url

    # Best-effort boundary: any SDK/import/network failure is reported through
    # meta["error"] instead of propagating to the caller.
    try:
        from firecrawl import FirecrawlApp

        app = FirecrawlApp(api_key=api_key)
        resp = app.scrape(url, formats=["markdown"])
    except Exception as e:
        meta["error"] = f"firecrawl error: {type(e).__name__}: {e}"
        return [], meta

    # The scrape call completed, so record the credit before inspecting the
    # payload; an empty/None markdown must not be reported as an SDK failure.
    meta["credits_used"] = 1
    md = getattr(resp, "markdown", None)
    if md is None and isinstance(resp, dict):
        md = resp.get("markdown")
    if not isinstance(md, str):
        md = ""
    meta["markdown_size"] = len(md)

    # Sanity check: Zillow sometimes returns a "no results" page or a
    # different property, so only trust photos when the address shows up.
    matched = _address_matches_markdown(address, md)
    meta["address_matched_in_md"] = matched
    if not matched and not debug:
        meta["error"] = "address not matched in Zillow markdown (no result page)"
        return [], meta

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(_PHOTO_PAT.findall(md)))
    limit = None if max_photos is None else max(0, max_photos)
    return unique[:limit], meta
|