138 lines
4.9 KiB
Python
138 lines
4.9 KiB
Python
"""
|
|
Usa el perfil real de Chrome (con sesión de Google) para scrapear Zillow y Realtor.com.
|
|
Primero copia el perfil a un directorio temporal para no corromper el original.
|
|
"""
|
|
import re, time, random, shutil, os, json
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
|
|
CHROME_PROFILE = r"C:\Users\aerom\AppData\Local\Google\Chrome\User Data"
|
|
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
|
|
|
|
def human_delay(a=1.5, b=3.5):
    """Block for a random duration drawn uniformly between ``a`` and ``b`` seconds.

    Args:
        a: One bound of the pause, in seconds.
        b: The other bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
|
|
|
|
def slow_scroll(page, steps=4):
    """Scroll ``page`` down in ``steps`` randomized wheel movements.

    Each step sends a vertical wheel delta of 300-600 through
    ``page.mouse.wheel`` and then pauses for 0.4-0.9 seconds.

    Args:
        page: Object exposing ``mouse.wheel(delta_x, delta_y)``.
        steps: Number of wheel movements to perform.
    """
    for _step in range(steps):
        delta_y = random.randint(300, 600)
        page.mouse.wheel(0, delta_y)
        pause_seconds = random.uniform(0.4, 0.9)
        time.sleep(pause_seconds)
|
|
|
|
def wait_for_content(page, selector, timeout=60):
    """Poll ``page`` until ``selector`` matches at least one element.

    The page is queried at least once, even when ``timeout`` is zero, and
    then every two seconds until the deadline passes.

    Args:
        page: Object exposing ``query_selector_all(selector)``.
        selector: Selector string passed unchanged to ``query_selector_all``.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        The first non-empty result of ``query_selector_all``, or an empty
        list when nothing matched before the deadline.
    """
    print(f" Esperando '{selector}' (max {timeout}s)...")
    poll_interval = 2
    # Monotonic clock: the elapsed time must not jump when the wall clock is
    # adjusted while we are waiting.
    deadline = time.monotonic() + timeout
    last_error = None
    while True:
        try:
            items = page.query_selector_all(selector)
        except Exception as exc:
            # Best effort: a failed query is simply retried, but the error is
            # remembered so that a timeout does not hide its likely cause.
            last_error = exc
        else:
            if items:
                return items
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
    if last_error is not None:
        print(f" Último error consultando '{selector}': {last_error!r}")
    return []
|
|
|
|
# Copy the session files into the temporary profile when it does not exist yet.
if not os.path.exists(TEMP_PROFILE):
    print("Copiando perfil de Chrome (solo Default)...")
    src_default = os.path.join(CHROME_PROFILE, "Default")
    dst_default = os.path.join(TEMP_PROFILE, "Default")
    # Only session files are copied, not the cache, to keep the copy fast.
    session_files = [
        "Cookies",
        # NOTE(review): newer Chrome builds appear to keep the cookie store
        # under "Network" -- confirm against the installed Chrome version.
        os.path.join("Network", "Cookies"),
        "Login Data",
        "Web Data",
        "Preferences",
    ]
    try:
        os.makedirs(TEMP_PROFILE, exist_ok=True)
        for item in session_files:
            src = os.path.join(src_default, item)
            if os.path.exists(src):
                dst = os.path.join(dst_default, item)
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copy2(src, dst)
    except BaseException:
        # The check above only tests whether the directory exists, so a
        # half-copied profile would be reused as if it were complete on the
        # next run. Remove it before propagating the failure.
        shutil.rmtree(TEMP_PROFILE, ignore_errors=True)
        raise
    print("Perfil copiado.")
else:
    print("Usando perfil temporal existente.")
|
|
|
|
print("\n=== Zillow con sesión de Google ===")
|
|
with sync_playwright() as p:
|
|
ctx = p.chromium.launch_persistent_context(
|
|
user_data_dir=TEMP_PROFILE,
|
|
executable_path=CHROME_PATH,
|
|
headless=False,
|
|
args=[
|
|
"--profile-directory=Default",
|
|
"--disable-blink-features=AutomationControlled",
|
|
"--start-maximized",
|
|
"--no-first-run",
|
|
"--no-default-browser-check",
|
|
],
|
|
viewport={"width": 1366, "height": 768},
|
|
)
|
|
page = ctx.new_page()
|
|
|
|
# Ir a Zillow directamente
|
|
print("Navegando a Zillow...")
|
|
try:
|
|
page.goto(
|
|
"https://www.zillow.com/homes/for_sale/vero-beach-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%7D%7D%7D",
|
|
wait_until="load", timeout=45000
|
|
)
|
|
except Exception as e:
|
|
print(f"Timeout en load (continuando): {e}")
|
|
|
|
human_delay(2, 3)
|
|
|
|
# Si hay captcha Cloudflare, esperar que el usuario lo resuelva
|
|
print("Título actual:", page.title()[:60])
|
|
if "denied" in page.title().lower() or "verification" in page.title().lower():
|
|
print(">> Cloudflare challenge detectado. Resuélvelo en el browser (90s)...")
|
|
time.sleep(90)
|
|
|
|
slow_scroll(page, 5)
|
|
human_delay(1, 2)
|
|
|
|
# Buscar listings
|
|
cards = wait_for_content(page, "[data-test='property-card']", timeout=30)
|
|
print(f"\nListings Zillow: {len(cards)}")
|
|
results = []
|
|
for card in cards[:10]:
|
|
txt = card.inner_text()
|
|
price_m = re.search(r'\$[\d,]+', txt)
|
|
addr_lines = txt.strip().split('\n')
|
|
price = price_m.group() if price_m else "?"
|
|
addr = addr_lines[0] if addr_lines else "?"
|
|
print(f" {price} | {addr[:60]}")
|
|
results.append({"price": price, "address": addr})
|
|
|
|
if results:
|
|
with open("zillow_results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
print(f"\nGuardado en zillow_results.json ({len(results)} propiedades)")
|
|
else:
|
|
content = page.content()
|
|
prices = re.findall(r'"unformattedPrice":\s*(\d+)', content)
|
|
print("Precios en HTML:", prices[:5])
|
|
|
|
input("\n[Enter para ir a Realtor.com]")
|
|
|
|
print("\n=== Realtor.com con sesión de Google ===")
|
|
page.goto(
|
|
"https://www.realtor.com/realestateandhomes-search/Vero-Beach_FL/price-na-230000",
|
|
wait_until="load", timeout=45000
|
|
)
|
|
human_delay(2, 3)
|
|
slow_scroll(page, 5)
|
|
|
|
cards2 = wait_for_content(page, "[data-testid='property-card-content']", timeout=30)
|
|
print(f"\nListings Realtor: {len(cards2)}")
|
|
results2 = []
|
|
for card in cards2[:10]:
|
|
txt = card.inner_text()
|
|
price_m = re.search(r'\$[\d,]+', txt)
|
|
lines = txt.strip().split('\n')
|
|
price = price_m.group() if price_m else "?"
|
|
addr = lines[0] if lines else "?"
|
|
print(f" {price} | {addr[:60]}")
|
|
results2.append({"price": price, "address": addr})
|
|
|
|
if results2:
|
|
with open("realtor_results.json", "w") as f:
|
|
json.dump(results2, f, indent=2)
|
|
print(f"Guardado en realtor_results.json ({len(results2)} propiedades)")
|
|
|
|
input("\n[Enter para cerrar el browser]")
|
|
ctx.close()
|