Files

138 lines
4.9 KiB
Python

"""
Usa el perfil real de Chrome (con sesión de Google) para scrapear Zillow y Realtor.com.
Primero copia el perfil a un directorio temporal para no corromper el original.
"""
import re, time, random, shutil, os, json
from playwright.sync_api import sync_playwright
# Chrome executable handed to Playwright as the browser binary to launch.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Chrome "User Data" directory of the current Windows user. It is resolved from
# %LOCALAPPDATA% so the script is not tied to a single account; the previously
# hard-coded location is kept as the fallback when the variable is unset/empty.
CHROME_PROFILE = (
    (os.environ.get("LOCALAPPDATA") or r"C:\Users\aerom\AppData\Local")
    + r"\Google\Chrome\User Data"
)
# Scratch copy of the profile; this is the directory the automated browser uses.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
def human_delay(a=1.5, b=3.5):
    """Pause for a random interval so actions are not perfectly regular.

    Args:
        a: One bound of the pause, in seconds.
        b: The other bound of the pause, in seconds.

    The duration is drawn uniformly between ``a`` and ``b``. A negative draw
    is clamped to zero because ``time.sleep`` raises ``ValueError`` for
    negative durations.
    """
    time.sleep(max(0.0, random.uniform(a, b)))
def slow_scroll(
    page,
    steps=4,
    *,
    delta_range=(300, 600),
    pause_range=(0.4, 0.9),
):
    """Scroll the page down in several uneven steps, pausing after each one.

    Args:
        page: Object exposing ``mouse.wheel(delta_x, delta_y)``; in this
            script it is the Playwright page being scraped.
        steps: Number of wheel events to send. Zero or less sends none.
        delta_range: Inclusive ``(low, high)`` bounds for the vertical wheel
            delta of each step.
        pause_range: ``(low, high)`` bounds, in seconds, for the pause taken
            after each step.
    """
    low_delta, high_delta = delta_range
    low_pause, high_pause = pause_range
    for _ in range(steps):
        # Horizontal delta stays 0: only vertical scrolling is simulated.
        page.mouse.wheel(0, random.randint(low_delta, high_delta))
        time.sleep(random.uniform(low_pause, high_pause))
def wait_for_content(page, selector, timeout=60, poll_interval=2.0):
    """Poll the page until at least one element matches ``selector``.

    Args:
        page: Object exposing ``query_selector_all(selector)``.
        selector: Selector string passed through to ``query_selector_all``.
        timeout: Maximum time to keep polling, in seconds.
        poll_interval: Pause between two attempts, in seconds.

    Returns:
        The non-empty result of ``query_selector_all``, or ``[]`` when nothing
        matched before the deadline.

    The page is always queried at least once, even with ``timeout <= 0``.
    Errors raised by a query are treated as "nothing yet" and retried; the
    last one is printed if the wait ends without a match.
    """
    print(f" Esperando '{selector}' (max {timeout}s)...")
    # monotonic() cannot jump when the system clock is adjusted mid-wait.
    deadline = time.monotonic() + timeout
    last_error = None
    while True:
        try:
            found = page.query_selector_all(selector)
        except Exception as exc:
            # Best effort: keep polling, but remember the failure for the report.
            last_error = exc
        else:
            if found:
                return found
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline, and never pass a negative duration.
        time.sleep(max(0.0, min(poll_interval, remaining)))
    if last_error is not None:
        print(f" Sin resultados para '{selector}'; último error: {last_error}")
    return []
# Copy the Chrome profile unless the temporary copy already exists.
if not os.path.exists(TEMP_PROFILE):
    print("Copiando perfil de Chrome (solo Default)...")
    os.makedirs(TEMP_PROFILE, exist_ok=True)
    # Only session-related files are copied, not the cache, to keep this fast.
    # Paths are relative to the "User Data" directory; missing ones are skipped.
    # NOTE(review): newer Chrome builds appear to keep the cookie database under
    # Default/Network and the cookie encryption key in "Local State" — confirm
    # against the installed Chrome version.
    session_files = [
        os.path.join("Default", "Cookies"),
        os.path.join("Default", "Network", "Cookies"),
        os.path.join("Default", "Login Data"),
        os.path.join("Default", "Web Data"),
        os.path.join("Default", "Preferences"),
        "Local State",
    ]
    try:
        for rel_path in session_files:
            src = os.path.join(CHROME_PROFILE, rel_path)
            if not os.path.isfile(src):
                continue
            dst = os.path.join(TEMP_PROFILE, rel_path)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)
    except OSError:
        # A partial copy must not survive: the next run would find the
        # directory and reuse it as if it were a complete profile.
        shutil.rmtree(TEMP_PROFILE, ignore_errors=True)
        raise
    print("Perfil copiado.")
else:
    print("Usando perfil temporal existente.")
def _goto_tolerant(page, url):
    """Navigate to ``url``; report a navigation error instead of aborting."""
    try:
        page.goto(url, wait_until="load", timeout=45000)
    except Exception as e:
        # The page is often usable even when the "load" event never fires.
        print(f"Timeout en load (continuando): {e}")


def _extract_listings(cards, limit=10):
    """Print and return ``{"price", "address"}`` dicts for the first cards.

    The price is the first ``$1,234``-style token in the card text and the
    address is the first line of that text; ``"?"`` stands in for either one
    when it is missing.
    """
    listings = []
    for card in cards[:limit]:
        txt = card.inner_text()
        price_m = re.search(r'\$[\d,]+', txt)
        price = price_m.group() if price_m else "?"
        # split() always returns at least one item, so test the line itself.
        addr = txt.strip().split('\n')[0] or "?"
        print(f" {price} | {addr[:60]}")
        listings.append({"price": price, "address": addr})
    return listings


def _save_json(path, listings):
    """Write ``listings`` to ``path`` as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(listings, f, indent=2)


print("\n=== Zillow con sesión de Google ===")
with sync_playwright() as p:
    ctx = p.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )
    # The finally clause closes the browser context even if scraping fails.
    try:
        page = ctx.new_page()

        # Go straight to the Zillow search results.
        print("Navegando a Zillow...")
        _goto_tolerant(
            page,
            "https://www.zillow.com/homes/for_sale/vero-beach-fl/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A230000%7D%7D%7D",
        )
        human_delay(2, 3)

        # If a challenge page is shown, give the user time to solve it by hand.
        title = page.title()
        print("Título actual:", title[:60])
        if "denied" in title.lower() or "verification" in title.lower():
            print(">> Cloudflare challenge detectado. Resuélvelo en el browser (90s)...")
            time.sleep(90)
        slow_scroll(page, 5)
        human_delay(1, 2)

        # Look for listing cards.
        cards = wait_for_content(page, "[data-test='property-card']", timeout=30)
        print(f"\nListings Zillow: {len(cards)}")
        results = _extract_listings(cards)
        if results:
            _save_json("zillow_results.json", results)
            print(f"\nGuardado en zillow_results.json ({len(results)} propiedades)")
        else:
            # No cards matched: show prices embedded in the raw HTML instead.
            content = page.content()
            prices = re.findall(r'"unformattedPrice":\s*(\d+)', content)
            print("Precios en HTML:", prices[:5])

        input("\n[Enter para ir a Realtor.com]")
        print("\n=== Realtor.com con sesión de Google ===")
        # Same timeout tolerance as the Zillow navigation above.
        _goto_tolerant(
            page,
            "https://www.realtor.com/realestateandhomes-search/Vero-Beach_FL/price-na-230000",
        )
        human_delay(2, 3)
        slow_scroll(page, 5)
        cards2 = wait_for_content(page, "[data-testid='property-card-content']", timeout=30)
        print(f"\nListings Realtor: {len(cards2)}")
        results2 = _extract_listings(cards2)
        if results2:
            _save_json("realtor_results.json", results2)
            print(f"Guardado en realtor_results.json ({len(results2)} propiedades)")
        input("\n[Enter para cerrar el browser]")
    finally:
        ctx.close()