feat: Agente-Marketing initial commit
This commit is contained in:
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Scraper Zillow: usa el search box como humano + extrae JSON de la página.
|
||||
"""
|
||||
import re, time, random, json
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# Chrome executable handed to Playwright as ``executable_path`` when the
# persistent context is launched.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Directory used as the browser's ``user_data_dir`` (the profile location).
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
|
||||
|
||||
def hd(a=1.0, b=2.5):
    """Pause for a random duration drawn uniformly between *a* and *b* seconds.

    Args:
        a: One bound of the delay range, in seconds.
        b: The other bound of the delay range, in seconds.
    """
    delay = random.uniform(a, b)
    time.sleep(delay)
|
||||
|
||||
def scroll(page, steps=4):
    """Scroll *page* with the mouse wheel in several randomized increments.

    Args:
        page: Object exposing ``mouse.wheel(delta_x, delta_y)``; the caller
            in this file passes a Playwright page.
        steps: Number of wheel movements to perform.
    """
    for _ in range(steps):
        # Vertical wheel delta first, then the pause, so the sequence of
        # random draws stays the same on every iteration.
        wheel_delta = random.randint(250, 550)
        page.mouse.wheel(0, wheel_delta)
        pause = random.uniform(0.4, 0.9)
        time.sleep(pause)
|
||||
|
||||
def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def parse_listings(html, min_p=40000, max_p=230000):
    """Extract in-range listings from the ``__NEXT_DATA__`` JSON embedded in a page.

    The JSON payload is read from the ``<script id="__NEXT_DATA__">`` tag and
    the listings are taken from
    ``props.pageProps.searchPageState.cat1.searchResults.listResults``.

    Args:
        html: Full HTML source of the search results page.
        min_p: Minimum accepted ``unformattedPrice`` (inclusive).
        max_p: Maximum accepted ``unformattedPrice`` (inclusive).

    Returns:
        A list of dicts with the keys ``source``, ``address``, ``price``,
        ``beds``, ``baths``, ``sqft``, ``city``, ``state``, ``zip``,
        ``status``, ``url``, ``img`` and ``type``. The list is empty when the
        script tag is missing, the JSON is invalid, or nothing is in range.
    """
    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return []

    try:
        data = json.loads(m.group(1))
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f" Parse error: {e}")
        return []

    # Walk down the nested structure, tolerating missing or null levels.
    node = _as_dict(data)
    for key in ("props", "pageProps", "searchPageState", "cat1", "searchResults"):
        node = _as_dict(node.get(key))
    list_results = node.get("listResults")
    if not isinstance(list_results, list):
        return []

    results = []
    for p in list_results:
        if not isinstance(p, dict):
            continue

        # Validate per listing: one entry with a null or non-numeric price
        # must not abort parsing of the listings that follow it.
        price = p.get("unformattedPrice", 0)
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        if not min_p <= price <= max_p:
            continue

        # Only prefix the site origin when the detail URL is relative, so an
        # already-absolute URL is not turned into a doubled, broken one.
        detail_url = str(p.get("detailUrl") or "")
        if not detail_url.startswith(("http://", "https://")):
            detail_url = "https://www.zillow.com" + detail_url

        home_info = _as_dict(_as_dict(p.get("hdpData")).get("homeInfo"))

        results.append({
            "source": "zillow",
            "address": p.get("address") or "?",
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": p.get("addressCity", ""),
            "state": p.get("addressState", ""),
            "zip": p.get("addressZipcode", ""),
            "status": p.get("statusType", ""),
            "url": detail_url,
            "img": p.get("imgSrc", ""),
            "type": home_info.get("homeType", ""),
        })
    return results
|
||||
|
||||
def search_city(page, city_query, max_price=230000):
    """Search Zillow for one city and return the listings found on the results page.

    The home page is opened and the query is typed into the search box one
    character at a time. If no search box is found, a search URL built from
    the query is opened directly instead. The resulting HTML is handed to
    ``parse_listings``.

    Args:
        page: Playwright page used to drive the browser.
        city_query: Text to search for, e.g. ``"Vero Beach, FL"``.
        max_price: Upper price bound. It is applied both to the fallback URL
            filter and to the filtering of the parsed listings.

    Returns:
        The list of listing dicts produced by ``parse_listings``, or an empty
        list if any step raised an exception (the error is printed).
    """
    print(f"\n--- Buscando: {city_query} ---")
    try:
        # Go to zillow.com
        page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
        hd(1.5, 2.5)

        # Locate the search box
        search_box = page.query_selector("input[id*='search'], input[placeholder*='address'], input[placeholder*='city']")
        if not search_box:
            # Try alternative selectors
            search_box = page.query_selector("#search-box-input, [data-testid='search-input'], input[name='searchQueryState']")

        if search_box:
            search_box.click()
            hd(0.3, 0.6)

            # Select any existing text (Ctrl+A) and delete it
            page.keyboard.down("Control")
            page.keyboard.press("a")
            page.keyboard.up("Control")
            hd(0.2, 0.4)
            page.keyboard.press("Delete")
            hd(0.3, 0.5)

            # Type like a human, character by character
            for char in city_query:
                page.keyboard.type(char)
                time.sleep(random.uniform(0.07, 0.18))
            hd(1.0, 1.8)

            page.keyboard.press("Enter")
            page.wait_for_load_state("load", timeout=30000)
            hd(2, 3)
            scroll(page, 4)
            hd(1, 2)
        else:
            # No search box found: open a search URL directly
            city_slug = city_query.lower().replace(" ", "-").replace(",", "")
            url = f"https://www.zillow.com/homes/for_sale/{city_slug}/?searchQueryState=%7B%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A40000%7D%7D%7D"
            page.goto(url, wait_until="load", timeout=45000)
            hd(2, 3)
            scroll(page, 4)

        title = page.title()
        html = page.content()
        print(f" Título: {title[:60]}")

        # Honour max_price on both paths; previously the parser always used
        # its own default upper bound and the argument was ignored here.
        listings = parse_listings(html, max_p=max_price)
        print(f" Encontrados: {len(listings)} en rango $40K-${max_price / 1000:g}K")

        # Preview of the first few results. Missing fields are tolerated so
        # that a display problem cannot discard the parsed listings.
        for item in listings[:3]:
            address = str(item.get("address") or "?")[:50]
            print(f" ${item['price']:,} | {item.get('beds','?')}bd | {address}, {item.get('city', '')}")
        return listings

    except Exception as e:
        print(f" ERROR: {e}")
        return []
|
||||
|
||||
# Target cities, searched in this order.
CITIES = [
    "Vero Beach, FL",
    "Melbourne, FL",
    "Jacksonville, FL",
    "Stuart, FL",
    "Daytona Beach, FL",
    "St. Augustine, FL",
    "Palm Coast, FL",
    "New Smyrna Beach, FL",
]
|
||||
|
||||
# Listings accumulated across every city searched in this run.
all_results = []

with sync_playwright() as p:
    # Launch the Chrome executable at CHROME_PATH in a visible window, using
    # TEMP_PROFILE as the persistent profile directory.
    ctx = p.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )
    try:
        page = ctx.new_page()

        for city in CITIES:
            listings = search_city(page, city)
            all_results.extend(listings)
            hd(2, 4)  # pause between cities
    finally:
        # Close the browser context even if the loop is interrupted.
        ctx.close()

# De-duplicate by address
seen = set()
unique = []
for r in all_results:
    address = str(r.get("address") or "?").lower().strip()
    # "?" is the placeholder used for a missing address; fall back to the
    # listing URL so that distinct address-less listings are not merged.
    key = address if address != "?" else (r.get("url") or address)
    if key not in seen:
        seen.add(key)
        unique.append(r)

print(f"\n=== TOTAL: {len(unique)} listings únicos en $40K-$230K ===")
with open("zillow_final.json", "w", encoding="utf-8") as f:
    json.dump(unique, f, indent=2, ensure_ascii=False)
print("Guardado en zillow_final.json")
|
||||
Reference in New Issue
Block a user