Files
AR-House/data_fetchers/zillow_photo_lookup.py
T
2026-07-03 12:24:58 -04:00

128 lines
4.3 KiB
Python

"""data_fetchers/zillow_photo_lookup.py — Buscar fotos de Zillow por address.
PROPOSITO:
Los scrapers de county clerks (Miami-Dade, Duval, Broward, etc.) no exponen
fotos de la propiedad. Sin embargo, Zillow tiene fotos para casi cualquier
address en USA (incluso para foreclosures off-market).
Estrategia:
1. Construir URL de Zillow address search: https://www.zillow.com/homes/{slug}_rb/
2. Firecrawl scrape → markdown
3. Regex sobre markdown para extraer photos.zillowstatic.com URLs
4. Retornar list[str] de URLs (cap 5)
COSTO: 1 Firecrawl credit por address lookup.
USO:
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
urls = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL")
# → ["https://photos.zillowstatic.com/fp/X.jpg", ...]
"""
from __future__ import annotations
import os
import re
from typing import Optional
# Markdown image syntax (`![alt](url)`) whose target is hosted on the Zillow
# photo CDN. The single capture group is the image URL; matching is
# case-insensitive and limited to webp/jpg/png/jpeg files.
_PHOTO_PAT = re.compile(
    r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)",
    flags=re.IGNORECASE,
)
def _build_address_search_url(address: str) -> str:
    """Build the Zillow address-search URL for a free-form address.

    Format: https://www.zillow.com/homes/{slug}_rb/

    The slug is the uppercased address reduced to letters and digits joined
    by single dashes:
      * periods and apostrophes are dropped ("ST." -> "ST"),
      * every other run of non-alphanumeric characters (spaces, commas,
        "#", "/", "?", ...) becomes one dash,
      * leading/trailing dashes are stripped.

    e.g. "2837 BLACK BUCK CIR, JACKSONVILLE, FL" ->
         https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/
    """
    slug = address.upper().replace(".", "").replace("'", "")
    # Treat anything that is not A-Z/0-9 as a separator. This keeps
    # URL-structural characters out of the path (a unit like "#4" would
    # otherwise turn the rest of the URL into a fragment) and stops
    # "ST,MIAMI" from fusing into "STMIAMI".
    slug = re.sub(r"[^A-Z0-9]+", "-", slug).strip("-")
    return f"https://www.zillow.com/homes/{slug}_rb/"
def fetch_zillow_photos_by_address(
    address: str,
    max_photos: int = 1,  # Only the main photo; the user can see the rest on Zillow directly.
    debug: bool = False,
) -> tuple[list[str], dict]:
    """Fetch photo URLs from the Zillow address-search page for *address*.

    Args:
        address: Free-form street address; must start with the street number
            for the sanity check to succeed.
        max_photos: Upper bound on the number of URLs returned. Negative
            values are treated as 0; ``None`` disables the cap.
        debug: When True, photos are returned even if the address could not
            be confirmed in the scraped page.

    Returns:
        ``(photo_urls, metadata)`` where metadata has the keys
        ``url_attempted``, ``address_matched_in_md``, ``credits_used``,
        ``error`` and ``markdown_size``.

    Caveats:
        - If Zillow does not have the property, returns ``([], {...})`` with
          ``error`` describing the mismatch.
        - If Firecrawl fails, returns ``([], {"error": ...})``; no exception
          is propagated to the caller.
        - The caller should persist the result and NOT retry on an empty
          result (it would spend credits without gaining anything).
    """
    meta = {
        "url_attempted": None,
        "address_matched_in_md": False,
        "credits_used": 0,
        "error": None,
        "markdown_size": 0,
    }
    if not address or len(address.strip()) < 5:
        meta["error"] = "address too short / empty"
        return [], meta

    api_key = os.getenv("FIRECRAWL_API_KEY", "")
    if not api_key:
        meta["error"] = "FIRECRAWL_API_KEY not configured"
        return [], meta

    url = _build_address_search_url(address)
    meta["url_attempted"] = url
    try:
        # Imported lazily so the module can be loaded without the SDK installed.
        from firecrawl import FirecrawlApp

        app = FirecrawlApp(api_key=api_key)
        resp = app.scrape(url, formats=["markdown"])
        # The scrape call returned, so count the credit before inspecting the
        # payload; a malformed payload must not hide the spend.
        meta["credits_used"] = 1
        raw_md = resp.markdown if hasattr(resp, "markdown") else resp.get("markdown", "")
        # The markdown field may be present but None; normalise to "".
        md = raw_md or ""
        meta["markdown_size"] = len(md)
    except Exception as e:
        meta["error"] = f"firecrawl error: {type(e).__name__}: {e}"
        return [], meta

    # Sanity check: Zillow sometimes returns a "no results" page or a
    # different property.
    addr_in_md = _address_matches_markdown(address, md)
    meta["address_matched_in_md"] = addr_in_md

    # Only return photos if the address matched (defensive), unless debugging.
    if not addr_in_md and not debug:
        meta["error"] = "address not matched in Zillow markdown (no result page)"
        return [], meta

    # De-duplicate while keeping first-seen order.
    unique = list(dict.fromkeys(_PHOTO_PAT.findall(md)))
    limit = None if max_photos is None else max(0, max_photos)
    return unique[:limit], meta


def _address_matches_markdown(address: str, md: str) -> bool:
    """Return True when the scraped markdown appears to describe *address*.

    Requires the leading street number to appear as a standalone number and,
    when the address contains words of four or more letters, at least one of
    the first three such words to appear (case-insensitively) in the page.
    """
    addr_upper = address.upper().strip()
    num_match = re.match(r"\d+", addr_upper)
    if not num_match:
        return False
    # Digit boundaries so that "12" does not match inside "3120".
    if not re.search(rf"(?<!\d){num_match.group(0)}(?!\d)", md):
        return False
    words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4][:3]
    if not words:
        # Nothing distinctive to check (e.g. "12 A ST, RYE, NY"); rely on the
        # street number alone instead of rejecting the address outright.
        return True
    md_upper = md.upper()
    return any(w in md_upper for w in words)