""" Scraper de Zillow usando el perfil real de Chrome. Extrae datos completos del JSON embebido en la página. """ import re, time, random, json, os from playwright.sync_api import sync_playwright CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe" TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter" def human_delay(a=1.5, b=3.5): time.sleep(random.uniform(a, b)) def slow_scroll(page, steps=4): for _ in range(steps): page.mouse.wheel(0, random.randint(300, 600)) time.sleep(random.uniform(0.5, 1.0)) def parse_zillow_listings(html, max_price=230000, min_price=40000): """Extrae listings del JSON embebido en Zillow.""" results = [] # Buscar el JSON principal de Zillow (__NEXT_DATA__ o searchResults) match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.*?)', html, re.DOTALL) if match: try: data = json.loads(match.group(1)) # Navegar en la estructura JSON de Zillow props = data.get("props", {}) page_props = props.get("pageProps", {}) search = page_props.get("searchPageState", {}) cat1 = search.get("cat1", {}) search_results = cat1.get("searchResults", {}) list_results = search_results.get("listResults", []) for prop in list_results: price = prop.get("unformattedPrice", 0) if not (min_price <= price <= max_price): continue results.append({ "source": "zillow", "address": prop.get("address", "?"), "price": price, "beds": prop.get("beds", 0), "baths": prop.get("baths", 0), "sqft": prop.get("area", 0), "city": prop.get("addressCity", ""), "state": prop.get("addressState", ""), "zip": prop.get("addressZipcode", ""), "status": prop.get("statusType", ""), "url": "https://www.zillow.com" + prop.get("detailUrl", ""), "img": prop.get("imgSrc", ""), "property_type": prop.get("hdpData", {}).get("homeInfo", {}).get("homeType", ""), }) except Exception as e: print(f"Error parsing __NEXT_DATA__: {e}") if not results: # Fallback: buscar unformattedPrice directamente raw_prices = re.findall(r'"unformattedPrice":\s*(\d+)', html) raw_addrs = re.findall(r'"address":\s*"([^"]+)"', html) for i, (p, a) in enumerate(zip(raw_prices, raw_addrs)): price = int(p) if min_price <= price <= max_price: results.append({"source": "zillow", "price": price, "address": a}) return results cities_fl = [ ("Vero Beach FL", "https://www.zillow.com/homes/for_sale/vero-beach-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%2C%22min%22%3A40000%7D%7D%7D"), ("Jacksonville FL", "https://www.zillow.com/homes/for_sale/jacksonville-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%2C%22min%22%3A40000%7D%7D%7D"), ("Melbourne FL", "https://www.zillow.com/homes/for_sale/melbourne-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%2C%22min%22%3A40000%7D%7D%7D"), ("Stuart FL", "https://www.zillow.com/homes/for_sale/stuart-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%2C%22min%22%3A40000%7D%7D%7D"), ] all_results = [] with sync_playwright() as p: ctx = p.chromium.launch_persistent_context( user_data_dir=TEMP_PROFILE, executable_path=CHROME_PATH, headless=False, args=[ "--profile-directory=Default", "--disable-blink-features=AutomationControlled", "--start-maximized", "--no-first-run", "--no-default-browser-check", ], viewport={"width": 1366, "height": 768}, ) page = ctx.new_page() for city, url in cities_fl: print(f"\n--- {city} ---") try: page.goto(url, wait_until="load", timeout=45000) human_delay(2, 3) slow_scroll(page, 4) human_delay(1, 2) html = page.content() title = page.title() print(f"Título: {title[:60]}") listings = parse_zillow_listings(html) print(f"Listings encontrados: {len(listings)}") for l in listings[:5]: print(f" ${l['price']:,} | {l.get('beds','?')}bd | {l.get('address','?')[:50]}, {l.get('city','?')}, {l.get('state','?')}") all_results.extend(listings) except Exception as e: print(f"ERROR en {city}: {e}") ctx.close() print(f"\n=== TOTAL: {len(all_results)} listings en rango $40K-$230K ===") with open("zillow_all.json", "w", encoding="utf-8") as f: json.dump(all_results, f, indent=2, ensure_ascii=False) print("Guardado en zillow_all.json")