156 lines
5.5 KiB
Python
156 lines
5.5 KiB
Python
"""
Zillow scraper: drives the search box like a human and extracts the JSON embedded in the page.
"""
|
|
import re, time, random, json
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Chrome binary that Playwright launches (passed as executable_path below).
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# User-data directory handed to the persistent browser context below.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
|
|
|
|
def hd(a=1.0, b=2.5):
    """Block for a random delay drawn uniformly between ``a`` and ``b`` seconds.

    Args:
        a: One bound of the delay, in seconds.
        b: The other bound of the delay, in seconds.
    """
    delay = random.uniform(a, b)
    time.sleep(delay)
|
|
|
|
def scroll(page, steps=4):
    """Scroll the page down with several uneven mouse-wheel movements.

    Args:
        page: Object exposing ``mouse.wheel(delta_x, delta_y)``; the script
            passes a Playwright page.
        steps: Number of wheel movements to perform.
    """
    for _step in range(steps):
        # Random distance and random pause so the movements are not uniform.
        delta_y = random.randint(250, 550)
        page.mouse.wheel(0, delta_y)
        pause = random.uniform(0.4, 0.9)
        time.sleep(pause)
|
|
|
|
def parse_listings(html, min_p=40000, max_p=230000):
    """Extract the listings within a price range from a Zillow search page.

    The page state is read from the JSON inside the
    ``<script id="__NEXT_DATA__">`` tag, at
    ``props.pageProps.searchPageState.cat1.searchResults.listResults``.

    Args:
        html: Full HTML source of the search results page.
        min_p: Lowest price to keep (inclusive).
        max_p: Highest price to keep (inclusive).

    Returns:
        A list of dicts with the keys ``source``, ``address``, ``price``,
        ``beds``, ``baths``, ``sqft``, ``city``, ``state``, ``zip``,
        ``status``, ``url``, ``img`` and ``type``. The list is empty when the
        script tag is absent, its JSON is invalid, the expected path is
        missing, or no listing falls inside the range.
    """
    base_url = "https://www.zillow.com"
    results = []

    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return results

    try:
        data = json.loads(m.group(1))
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f" Parse error: {e}")
        return results

    # Walk the nested path defensively: any level may be missing or null.
    node = data
    for key in ("props", "pageProps", "searchPageState", "cat1",
                "searchResults", "listResults"):
        node = node.get(key) if isinstance(node, dict) else None
    if not isinstance(node, list):
        return results

    for p in node:
        if not isinstance(p, dict):
            continue

        price = p.get("unformattedPrice", 0)
        # Skip a listing with a null/non-numeric price on its own instead of
        # letting the comparison raise and discard every remaining listing.
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        if not (min_p <= price <= max_p):
            continue

        # detailUrl may already be absolute; only prefix the host when it is
        # a site-relative path, so the host is never duplicated.
        detail_url = p.get("detailUrl") or ""
        if not isinstance(detail_url, str):
            detail_url = str(detail_url)
        if detail_url.startswith(("http://", "https://")):
            url = detail_url
        else:
            url = base_url + detail_url

        # hdpData / homeInfo can be absent or null for some listings.
        hdp = p.get("hdpData")
        home_info = hdp.get("homeInfo") if isinstance(hdp, dict) else None
        home_type = home_info.get("homeType", "") if isinstance(home_info, dict) else ""

        results.append({
            "source": "zillow",
            "address": p.get("address", "?"),
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": p.get("addressCity", ""),
            "state": p.get("addressState", ""),
            "zip": p.get("addressZipcode", ""),
            "status": p.get("statusType", ""),
            "url": url,
            "img": p.get("imgSrc", ""),
            "type": home_type,
        })

    return results
|
|
|
|
def search_city(page, city_query, max_price=230000):
    """Search Zillow for one city and return the listings on the results page.

    The query is typed into the site's search box one character at a time.
    When no search box is found, a for-sale URL with a price filter is opened
    directly instead.

    Args:
        page: Playwright page used to drive the browser.
        city_query: Text to search for, e.g. ``"Stuart, FL"``.
        max_price: Upper price bound. It is applied to the fallback URL filter
            and to the parsed results.

    Returns:
        The listing dicts produced by ``parse_listings``, or an empty list if
        anything fails (the error is printed, not raised).
    """
    print(f"\n--- Buscando: {city_query} ---")
    try:
        # Start from the home page.
        page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
        hd(1.5, 2.5)

        # Locate the search box, trying a second set of selectors if needed.
        search_box = page.query_selector(
            "input[id*='search'], input[placeholder*='address'], input[placeholder*='city']"
        )
        if not search_box:
            search_box = page.query_selector(
                "#search-box-input, [data-testid='search-input'], input[name='searchQueryState']"
            )

        if search_box:
            search_box.click()
            hd(0.3, 0.6)
            # Select and clear whatever text is already in the box.
            page.keyboard.down("Control")
            page.keyboard.press("a")
            page.keyboard.up("Control")
            hd(0.2, 0.4)
            page.keyboard.press("Delete")
            hd(0.3, 0.5)
            # Type one character at a time with small random pauses.
            for char in city_query:
                page.keyboard.type(char)
                time.sleep(random.uniform(0.07, 0.18))
            hd(1.0, 1.8)
            page.keyboard.press("Enter")
            # NOTE(review): wait_for_load_state resolves immediately when the
            # current document is already loaded, so the pauses below are what
            # actually give the results time to render - confirm.
            page.wait_for_load_state("load", timeout=30000)
            hd(2, 3)
            scroll(page, 4)
            hd(1, 2)
        else:
            # No search box found: open the results URL directly.
            city_slug = city_query.lower().replace(" ", "-").replace(",", "")
            url = f"https://www.zillow.com/homes/for_sale/{city_slug}/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A40000%7D%7D%7D"
            page.goto(url, wait_until="load", timeout=45000)
            hd(2, 3)
            scroll(page, 4)

        title = page.title()
        html = page.content()
        print(f" Título: {title[:60]}")

        # Honor the caller's max_price when filtering, not just the default.
        listings = parse_listings(html, max_p=max_price)
        print(f" Encontrados: {len(listings)} en rango $40K-${max_price / 1000:g}K")
        for item in listings[:3]:
            # A null address must not raise here: the catch-all below would
            # throw away every listing found for this city.
            address = str(item.get('address') or '?')
            print(f" ${item['price']:,} | {item.get('beds','?')}bd | {address[:50]}, {item['city']}")
        return listings

    except Exception as e:
        # Best-effort boundary: one failing city must not stop the whole run.
        print(f" ERROR: {e}")
        return []
|
|
|
|
# Target cities, searched in this order.
CITIES = [
    "Vero Beach, FL",
    "Melbourne, FL",
    "Jacksonville, FL",
    "Stuart, FL",
    "Daytona Beach, FL",
    "St. Augustine, FL",
    "Palm Coast, FL",
    "New Smyrna Beach, FL",
]
|
|
|
|
# Listings collected from every city (may contain duplicates).
all_results = []

with sync_playwright() as p:
    # Launch the installed Chrome with a dedicated persistent profile.
    ctx = p.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )
    try:
        page = ctx.new_page()

        for city in CITIES:
            listings = search_city(page, city)
            all_results.extend(listings)
            hd(2, 4)  # pause between cities
    finally:
        # Close the browser even when the loop is interrupted (e.g. Ctrl+C),
        # so no Chrome process is left holding the profile directory.
        ctx.close()
|
|
|
|
# Deduplicate by address, keeping the first occurrence of each listing.
seen = set()
unique = []
for r in all_results:
    address = str(r.get("address") or "").lower().strip()
    # "?" is the placeholder stored when a listing has no address; such
    # listings are keyed by URL so they do not all collapse into one entry.
    if address in ("", "?"):
        key = str(r.get("url") or "").lower().strip()
    else:
        key = address
    if key:
        if key in seen:
            continue
        seen.add(key)
    # Listings with no usable key at all are kept rather than dropped.
    unique.append(r)

print(f"\n=== TOTAL: {len(unique)} listings únicos en $40K-$230K ===")
with open("zillow_final.json", "w", encoding="utf-8") as f:
    json.dump(unique, f, indent=2, ensure_ascii=False)
print("Guardado en zillow_final.json")
|