"""data_fetchers/zillow_photo_lookup.py — Buscar fotos de Zillow por address. PROPOSITO: Los scrapers de county clerks (Miami-Dade, Duval, Broward, etc.) no exponen fotos de la propiedad. Sin embargo, Zillow tiene fotos para casi cualquier address en USA (incluso para foreclosures off-market). Estrategia: 1. Construir URL de Zillow address search: https://www.zillow.com/homes/{slug}_rb/ 2. Firecrawl scrape → markdown 3. Regex sobre markdown para extraer photos.zillowstatic.com URLs 4. Retornar list[str] de URLs (cap 5) COSTO: 1 Firecrawl credit por address lookup. USO: from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address urls = fetch_zillow_photos_by_address("2837 BLACK BUCK CIR, JACKSONVILLE, FL") # → ["https://photos.zillowstatic.com/fp/X.jpg", ...] """ from __future__ import annotations import os import re from typing import Optional # Photo URL pattern (Zillow CDN) _PHOTO_PAT = re.compile( r"!\[[^\]]*\]\((https?://photos\.zillowstatic\.com/[^)]+\.(?:webp|jpg|png|jpeg))\)", re.IGNORECASE, ) def _build_address_search_url(address: str) -> str: """Build Zillow address search URL. Format: https://www.zillow.com/homes/{slug}_rb/ Slug = uppercased address with dashes, no commas/extras. e.g., "2837 BLACK BUCK CIR, JACKSONVILLE, FL" → https://www.zillow.com/homes/2837-BLACK-BUCK-CIR-JACKSONVILLE-FL_rb/ """ s = address.upper().replace(",", "").replace(".", "") s = re.sub(r"\s+", "-", s.strip()) s = re.sub(r"-+", "-", s) return f"https://www.zillow.com/homes/{s}_rb/" def fetch_zillow_photos_by_address( address: str, max_photos: int = 1, # Solo 1 foto (la principal). Las demas ve user en Zillow directly. debug: bool = False, ) -> tuple[list[str], dict]: """Fetch photo URLs from Zillow address search. Returns (photo_urls, metadata). metadata: {url_attempted, address_matched_in_md, credits_used, error} Caveats: - Si Zillow no tiene la propiedad, returns ([], {...}) silently. - Si Firecrawl falla, returns ([], {"error": ...}). - Caller debe persistir el resultado y NO reintentar si vacío (perderia credits sin ganar nada). """ meta = { "url_attempted": None, "address_matched_in_md": False, "credits_used": 0, "error": None, "markdown_size": 0, } if not address or len(address.strip()) < 5: meta["error"] = "address too short / empty" return [], meta api_key = os.getenv("FIRECRAWL_API_KEY", "") if not api_key: meta["error"] = "FIRECRAWL_API_KEY not configured" return [], meta url = _build_address_search_url(address) meta["url_attempted"] = url try: from firecrawl import FirecrawlApp app = FirecrawlApp(api_key=api_key) resp = app.scrape(url, formats=["markdown"]) md = resp.markdown if hasattr(resp, "markdown") else resp.get("markdown", "") meta["markdown_size"] = len(md) meta["credits_used"] = 1 except Exception as e: meta["error"] = f"firecrawl error: {type(e).__name__}: {e}" return [], meta # Verify address actually matched (sanity check — Zillow sometimes returns # a "no results" page or different property) # Extract street number + a distinctive street word addr_upper = address.upper() street_num_match = re.match(r"(\d+)", addr_upper.strip()) street_num = street_num_match.group(1) if street_num_match else "" # Check if street number AND some distinctive word from address appears in markdown addr_in_md = bool(street_num and street_num in md) if addr_in_md: # Look for at least one >3-char word from address words = [w for w in re.findall(r"[A-Z]+", addr_upper) if len(w) >= 4] word_match = any(w in md.upper() for w in words[:3]) addr_in_md = addr_in_md and word_match meta["address_matched_in_md"] = addr_in_md # Extract photos photos = _PHOTO_PAT.findall(md) # Dedup keeping order seen = set() unique = [] for p in photos: if p not in seen: seen.add(p) unique.append(p) # Only return photos if address matched (defensive) if not addr_in_md and not debug: meta["error"] = "address not matched in Zillow markdown (no result page)" return [], meta return unique[:max_photos], meta