# File-viewer header captured with the paste (not part of the program):
# Files | 121 lines | 5.0 KiB | Python

"""
Scraper de Zillow usando el perfil real de Chrome.
Extrae datos completos del JSON embebido en la página.
"""
import re, time, random, json, os
from playwright.sync_api import sync_playwright
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
def human_delay(a=1.5, b=3.5):
    """Block for a random duration drawn uniformly between ``a`` and ``b`` seconds."""
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
def slow_scroll(page, steps=4):
    """Scroll ``page`` downwards in ``steps`` separate wheel movements.

    Each movement calls ``page.mouse.wheel`` with a horizontal delta of 0 and
    a random vertical delta between 300 and 600, then sleeps for a random
    0.5-1.0 seconds before the next one.
    """
    for _step in range(steps):
        vertical_delta = random.randint(300, 600)
        page.mouse.wheel(0, vertical_delta)
        pause_seconds = random.uniform(0.5, 1.0)
        time.sleep(pause_seconds)
def parse_zillow_listings(html, max_price=230000, min_price=40000):
    """Extract listings within a price range from a Zillow search page.

    The primary source is the ``__NEXT_DATA__`` JSON script embedded in the
    page; listings are read from
    ``props.pageProps.searchPageState.cat1.searchResults.listResults``.
    When that path yields no results, a regex fallback scans the raw HTML for
    ``"unformattedPrice"`` and ``"address"`` values.

    Args:
        html: Full HTML source of the search results page.
        max_price: Inclusive upper bound for the listing price.
        min_price: Inclusive lower bound for the listing price.

    Returns:
        A list of dicts. Entries built from the JSON carry the keys
        ``source``, ``address``, ``price``, ``beds``, ``baths``, ``sqft``,
        ``city``, ``state``, ``zip``, ``status``, ``url``, ``img`` and
        ``property_type``. Fallback entries only carry ``source``, ``price``
        and ``address``.
    """
    base_url = "https://www.zillow.com"
    results = []
    list_results = []

    match = re.search(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL
    )
    if match:
        # Only the decoding and the walk down the JSON tree are guarded; a
        # malformed document is reported and treated as "no listings".
        try:
            data = json.loads(match.group(1))
            list_results = (
                data.get("props", {})
                .get("pageProps", {})
                .get("searchPageState", {})
                .get("cat1", {})
                .get("searchResults", {})
                .get("listResults", [])
            )
        except (ValueError, AttributeError, TypeError) as e:
            print(f"Error parsing __NEXT_DATA__: {e}")
            list_results = []

    if not isinstance(list_results, list):
        list_results = []

    for prop in list_results:
        if not isinstance(prop, dict):
            continue

        # A null or non-numeric price skips this listing only, instead of
        # raising and discarding every listing that follows it.
        price = prop.get("unformattedPrice", 0)
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        if not (min_price <= price <= max_price):
            continue

        # detailUrl may be site-relative or already absolute; only prefix the
        # host when it is relative so absolute URLs are not doubled.
        detail_url = prop.get("detailUrl")
        if not isinstance(detail_url, str):
            detail_url = ""
        if detail_url.startswith(("http://", "https://")):
            url = detail_url
        else:
            url = base_url + detail_url

        # hdpData / homeInfo can be missing or null; default to "".
        hdp_data = prop.get("hdpData")
        home_info = hdp_data.get("homeInfo") if isinstance(hdp_data, dict) else None
        home_type = home_info.get("homeType", "") if isinstance(home_info, dict) else ""

        results.append({
            "source": "zillow",
            "address": prop.get("address", "?"),
            "price": price,
            "beds": prop.get("beds", 0),
            "baths": prop.get("baths", 0),
            "sqft": prop.get("area", 0),
            "city": prop.get("addressCity", ""),
            "state": prop.get("addressState", ""),
            "zip": prop.get("addressZipcode", ""),
            "status": prop.get("statusType", ""),
            "url": url,
            "img": prop.get("imgSrc", ""),
            "property_type": home_type,
        })

    if not results:
        # Fallback: pair prices and addresses by their order of appearance in
        # the raw HTML. NOTE(review): the pairing is positional, so it can
        # misalign if the page has an address without a price or vice versa.
        raw_prices = re.findall(r'"unformattedPrice":\s*(\d+)', html)
        raw_addrs = re.findall(r'"address":\s*"([^"]+)"', html)
        for raw_price, address in zip(raw_prices, raw_addrs):
            price = int(raw_price)
            if min_price <= price <= max_price:
                results.append({"source": "zillow", "price": price, "address": address})

    return results
# (label, search URL) pairs for the Florida cities to scrape. Every URL ends
# with the same URL-encoded searchQueryState, which decodes to
# {"filterState":{"price":{"max":230000,"min":40000}}}.
cities_fl = [
    (
        f"{city_name} FL",
        f"https://www.zillow.com/homes/for_sale/{city_name.lower().replace(' ', '-')}-fl/"
        "?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%2C%22min%22%3A40000%7D%7D%7D",
    )
    for city_name in ("Vero Beach", "Jacksonville", "Melbourne", "Stuart")
]
# Scrape every city in cities_fl with one persistent Chrome context, collect
# the parsed listings in all_results, and dump them to zillow_all.json.
all_results = []
with sync_playwright() as p:
    ctx = p.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )
    # try/finally so the browser context is closed even if the run is
    # interrupted or page creation fails.
    try:
        page = ctx.new_page()
        for city, url in cities_fl:
            print(f"\n--- {city} ---")
            # Best-effort per city: a failure is reported and the loop moves
            # on to the next city.
            try:
                page.goto(url, wait_until="load", timeout=45000)
                human_delay(2, 3)
                slow_scroll(page, 4)
                human_delay(1, 2)
                html = page.content()
                title = page.title()
                print(f"Título: {title[:60]}")
                listings = parse_zillow_listings(html)
                # Store the listings before printing the preview, so a
                # formatting problem in the preview cannot discard them.
                all_results.extend(listings)
                print(f"Listings encontrados: {len(listings)}")
                for listing in listings[:5]:
                    # str() keeps the preview from failing on a non-string
                    # (e.g. null) address.
                    address = str(listing.get('address', '?'))[:50]
                    print(f" ${listing['price']:,} | {listing.get('beds','?')}bd | {address}, {listing.get('city','?')}, {listing.get('state','?')}")
            except Exception as e:
                print(f"ERROR en {city}: {e}")
    finally:
        ctx.close()
print(f"\n=== TOTAL: {len(all_results)} listings en rango $40K-$230K ===")
with open("zillow_all.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)
print("Guardado en zillow_all.json")