121 lines
5.0 KiB
Python
121 lines
5.0 KiB
Python
"""
|
|
Scraper de Zillow usando el perfil real de Chrome.
|
|
Extrae datos completos del JSON embebido en la página.
|
|
"""
|
|
import re, time, random, json, os
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Path to the Chrome binary; passed as `executable_path` when the persistent
# browser context is launched further down in this script.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Directory passed as `user_data_dir` for that persistent browser context.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
|
|
|
|
def human_delay(a=1.5, b=3.5):
    """Block for a random duration drawn uniformly between ``a`` and ``b``.

    Args:
        a: One bound of the pause, in seconds.
        b: The other bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
|
|
|
|
def slow_scroll(page, steps=4):
    """Scroll ``page`` down in several randomly sized wheel movements.

    Each step sends one vertical wheel event of 300-600 units through
    ``page.mouse.wheel`` and then sleeps for 0.5-1.0 seconds.

    Args:
        page: Object exposing ``mouse.wheel(dx, dy)``.
        steps: Number of wheel events to send.
    """
    for _step in range(steps):
        mouse = page.mouse
        mouse.wheel(0, random.randint(300, 600))
        pause_seconds = random.uniform(0.5, 1.0)
        time.sleep(pause_seconds)
|
|
|
|
def parse_zillow_listings(html, max_price=230000, min_price=40000):
    """Extract listings from the JSON embedded in a Zillow search page.

    The primary source is the ``__NEXT_DATA__`` script tag, read at
    ``props.pageProps.searchPageState.cat1.searchResults.listResults``.
    When that yields no listing in range, a regex fallback scans the raw
    HTML for ``"unformattedPrice"`` and ``"address"`` values.

    Args:
        html: Page source as a string.
        max_price: Inclusive upper bound for the listing price.
        min_price: Inclusive lower bound for the listing price.

    Returns:
        A list of dicts. Entries built from ``__NEXT_DATA__`` carry the keys
        ``source``, ``address``, ``price``, ``beds``, ``baths``, ``sqft``,
        ``city``, ``state``, ``zip``, ``status``, ``url``, ``img`` and
        ``property_type``. Fallback entries only carry ``source``,
        ``price`` and ``address``.
    """
    results = []
    list_results = []

    # Look for Zillow's main embedded JSON payload (__NEXT_DATA__).
    match = re.search(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL
    )
    if match:
        try:
            node = json.loads(match.group(1))
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            print(f"Error parsing __NEXT_DATA__: {e}")
            node = None

        # Walk down the nested structure; a missing or non-dict level simply
        # ends the walk instead of raising.
        for key in (
            "props",
            "pageProps",
            "searchPageState",
            "cat1",
            "searchResults",
            "listResults",
        ):
            node = node.get(key) if isinstance(node, dict) else None
        if isinstance(node, list):
            list_results = node

    for prop in list_results:
        if not isinstance(prop, dict):
            continue

        # Skip listings whose price is null or non-numeric rather than letting
        # one malformed entry discard every listing that follows it.
        price = prop.get("unformattedPrice", 0)
        if not isinstance(price, (int, float)):
            continue
        if not (min_price <= price <= max_price):
            continue

        # detailUrl may be relative or already absolute; only prefix the host
        # when it is missing.
        detail_url = prop.get("detailUrl") or ""
        if not isinstance(detail_url, str):
            detail_url = ""
        if not detail_url.startswith(("http://", "https://")):
            detail_url = "https://www.zillow.com" + detail_url

        # hdpData / homeInfo can be absent or null.
        hdp_data = prop.get("hdpData")
        home_info = hdp_data.get("homeInfo") if isinstance(hdp_data, dict) else None
        if isinstance(home_info, dict):
            property_type = home_info.get("homeType", "")
        else:
            property_type = ""

        results.append({
            "source": "zillow",
            "address": prop.get("address", "?"),
            "price": price,
            "beds": prop.get("beds", 0),
            "baths": prop.get("baths", 0),
            "sqft": prop.get("area", 0),
            "city": prop.get("addressCity", ""),
            "state": prop.get("addressState", ""),
            "zip": prop.get("addressZipcode", ""),
            "status": prop.get("statusType", ""),
            "url": detail_url,
            "img": prop.get("imgSrc", ""),
            "property_type": property_type,
        })

    if not results:
        # Fallback: scan the raw HTML for unformattedPrice values directly.
        # Prices and addresses are paired purely by order of appearance, so
        # this is best-effort only.
        raw_prices = re.findall(r'"unformattedPrice":\s*(\d+)', html)
        raw_addrs = re.findall(r'"address":\s*"([^"]+)"', html)
        for raw_price, address in zip(raw_prices, raw_addrs):
            price = int(raw_price)
            if min_price <= price <= max_price:
                results.append({"source": "zillow", "price": price, "address": address})

    return results
|
|
|
|
# Florida search pages to visit, as (label, url) pairs. Every URL carries the
# same URL-encoded searchQueryState filter: price between 40000 and 230000.
cities_fl = [
    (
        label,
        "https://www.zillow.com/homes/for_sale/" + slug + "/?searchQueryState="
        "%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000"
        "%2C%22min%22%3A40000%7D%7D%7D",
    )
    for label, slug in (
        ("Vero Beach FL", "vero-beach-fl"),
        ("Jacksonville FL", "jacksonville-fl"),
        ("Melbourne FL", "melbourne-fl"),
        ("Stuart FL", "stuart-fl"),
    )
]
|
|
|
|
# Listings collected across every city; written to zillow_all.json at the end.
all_results = []

with sync_playwright() as p:
    # Launch the installed Chrome with a persistent profile directory, headed.
    ctx = p.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )
    try:
        page = ctx.new_page()

        for city, url in cities_fl:
            print(f"\n--- {city} ---")
            try:
                page.goto(url, wait_until="load", timeout=45000)
                human_delay(2, 3)
                slow_scroll(page, 4)
                human_delay(1, 2)

                html = page.content()
                title = page.title()
                print(f"Título: {title[:60]}")

                listings = parse_zillow_listings(html)
                print(f"Listings encontrados: {len(listings)}")
                # Preview only the first five listings per city.
                for listing in listings[:5]:
                    print(f" ${listing['price']:,} | {listing.get('beds','?')}bd | {listing.get('address','?')[:50]}, {listing.get('city','?')}, {listing.get('state','?')}")
                all_results.extend(listings)

            except Exception as e:
                # Best-effort per city: report the failure and move on.
                print(f"ERROR en {city}: {e}")
    finally:
        # Close the context even if new_page() fails or the run is
        # interrupted, not only on the success path.
        ctx.close()

print(f"\n=== TOTAL: {len(all_results)} listings en rango $40K-$230K ===")
with open("zillow_all.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)
print("Guardado en zillow_all.json")
|