# NOTE: the lines below look like file-viewer metadata pasted along with the code.
# Files
#
# 462 lines
# 19 KiB
# Python

import re, json, time, random, shutil, os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
# ── Config ────────────────────────────────────────────────────────────────────
# NOTE(review): these three paths are hard-coded for one Windows machine/user.
# Chrome executable launched by the Playwright scrapers.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Real Chrome user-data directory that ensure_chrome_profile() copies from.
CHROME_PROFILE = r"C:\Users\aerom\AppData\Local\Google\Chrome\User Data"
# Scratch user-data directory the automated browser actually runs with.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"

# Browser-like headers sent with the plain `requests` calls (HUD, HomePath).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Lower-case city name -> county name, used by get_county_for_city().
CITY_COUNTY_MAP = {
    # Indian River
    "vero beach": "Indian River",
    "sebastian": "Indian River",
    "fellsmere": "Indian River",
    "indian river shores": "Indian River",
    # Martin
    "stuart": "Martin",
    "jensen beach": "Martin",
    "hobe sound": "Martin",
    "palm city": "Martin",
    "port salerno": "Martin",
    # St. Lucie
    "fort pierce": "St. Lucie",
    "port st. lucie": "St. Lucie",
    # Brevard
    "melbourne": "Brevard",
    "palm bay": "Brevard",
    "titusville": "Brevard",
    "cocoa": "Brevard",
    "cocoa beach": "Brevard",
    "rockledge": "Brevard",
    "merritt island": "Brevard",
    "cape canaveral": "Brevard",
    "satellite beach": "Brevard",
    "west melbourne": "Brevard",
    # Volusia
    "daytona beach": "Volusia",
    "ormond beach": "Volusia",
    "new smyrna beach": "Volusia",
    "edgewater": "Volusia",
    "port orange": "Volusia",
    "deltona": "Volusia",
    # Flagler
    "palm coast": "Flagler",
    "flagler beach": "Flagler",
    "bunnell": "Flagler",
    # St. Johns
    "st. augustine": "St. Johns",
    "ponte vedra beach": "St. Johns",
    "nocatee": "St. Johns",
    "st. augustine beach": "St. Johns",
    # Duval
    "jacksonville": "Duval",
    "jacksonville beach": "Duval",
    "atlantic beach": "Duval",
    "neptune beach": "Duval",
    # Nassau
    "fernandina beach": "Nassau",
    "yulee": "Nassau",
}

# County name -> code string that scrape_hud() sends as the `sCounty` query
# parameter of the HUD search URL.
COUNTY_CODES = {
    "Brevard": "9",
    "Duval": "31",
    "Flagler": "35",
    "Indian River": "61",
    "Martin": "86",
    "Nassau": "89",
    "St. Johns": "109",
    "St. Lucie": "111",
    "Volusia": "127",
}
def get_county_for_city(city: str) -> str:
    """Return the county for *city*, or "" when it is not in CITY_COUNTY_MAP.

    The lookup ignores case and surrounding whitespace. ``None`` and other
    non-string values yield "" instead of raising, because the scrapers pass
    city values taken straight from third-party JSON.
    """
    if not isinstance(city, str):
        return ""
    return CITY_COUNTY_MAP.get(city.strip().lower(), "")
def score_property(prop: dict, search_cities: list, max_price: int) -> int:
    """Score a listing from 0 to 100; higher means a better match.

    Args:
        prop: Listing dict; the keys read are ``price``, ``city``, ``county``,
            ``beds`` and ``status`` (all optional).
        search_cities: City or county names the user searched for.
        max_price: Budget ceiling used for the price bonus.

    Returns:
        0 when the listing has no positive numeric price; otherwise a base of
        40 plus bonuses for price (up to 35), location (12), bedrooms (up to 8)
        and status keywords (up to 10), capped at 100.
    """
    price = prop.get("price", 0)
    if isinstance(price, bool) or not isinstance(price, (int, float)) or price <= 0:
        return 0

    score = 40

    # Price bonus: the further below budget, the more points. Skipped when
    # there is no positive budget, which would otherwise divide by zero.
    if isinstance(max_price, (int, float)) and max_price > 0:
        ratio = price / max_price
        if ratio <= 0.60:
            score += 35
        elif ratio <= 0.75:
            score += 25
        elif ratio <= 0.90:
            score += 15
        elif ratio <= 1.0:
            score += 8

    # Location bonus: substring match in either direction between a search
    # term and the listing's city/county. Empty strings are excluded because
    # `"" in anything` is always True and would give every listing with an
    # unknown city or county the bonus.
    city = (prop.get("city") or "").lower()
    county = (prop.get("county") or "").lower()
    places = [place for place in (city, county) if place]
    for raw_term in search_cities or []:
        term = raw_term.lower() if isinstance(raw_term, str) else ""
        if not term:
            continue
        if any(term in place or place in term for place in places):
            score += 12
            break

    beds = prop.get("beds") or 0
    if not isinstance(beds, (int, float)):
        beds = 0
    if beds >= 3:
        score += 8
    elif beds >= 2:
        score += 4

    # Distressed-sale keywords outrank new-construction keywords.
    status = (prop.get("status") or "").lower()
    if any(w in status for w in ["foreclosure", "reo", "bank owned", "hud", "price reduced"]):
        score += 10
    elif any(w in status for w in ["new construction", "newly built"]):
        score += 5

    return min(score, 100)
# ── Chrome profile setup ───────────────────────────────────────────────────────
def ensure_chrome_profile():
    """Seed the temporary Chrome profile from the real one if it does not exist.

    Copies ``Cookies``, ``Login Data``, ``Web Data`` and ``Preferences`` from
    ``CHROME_PROFILE/Default`` into ``TEMP_PROFILE/Default``.

    Returns:
        True when ``TEMP_PROFILE/Default`` already exists or every file that
        is present in the source was copied; False when the source profile is
        missing, the target directory cannot be created, or any copy failed.
    """
    dst = os.path.join(TEMP_PROFILE, "Default")
    if os.path.exists(dst):
        return True
    if not os.path.exists(CHROME_PROFILE):
        return False

    try:
        os.makedirs(dst, exist_ok=True)
    except OSError as e:
        print(f" Profile copy warning: {e}")
        return False

    src = os.path.join(CHROME_PROFILE, "Default")
    all_copied = True
    for item in ["Cookies", "Login Data", "Web Data", "Preferences"]:
        s = os.path.join(src, item)
        if not os.path.exists(s):
            continue
        # Handle each file on its own so that one failure (for example a file
        # that is locked by another process) does not skip the remaining files.
        try:
            shutil.copy2(s, dst)
        except OSError as e:
            print(f" Profile copy warning: {e}")
            all_copied = False
    return all_copied
# ── Playwright helpers ─────────────────────────────────────────────────────────
def _hd(a=1.2, b=3.0):
    """Sleep for a random duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)
def _scroll(page, steps=4):
    """Scroll *page* down with *steps* mouse-wheel moves, pausing after each.

    Each move scrolls a random 250-600 units vertically and is followed by a
    random 0.4-0.9 second pause.
    """
    for _step in range(steps):
        wheel_delta = random.randint(250, 600)
        page.mouse.wheel(0, wheel_delta)
        pause = random.uniform(0.4, 0.9)
        time.sleep(pause)
def _type_human(page, text):
    """Type *text* into *page* one character at a time.

    A random 0.07-0.16 second pause follows every character.
    """
    for char in text:
        page.keyboard.type(char)
        delay = random.uniform(0.07, 0.16)
        time.sleep(delay)
def _parse_zillow_html(html, min_p=40000, max_p=230000):
    """Extract listings from the ``__NEXT_DATA__`` JSON embedded in a Zillow page.

    Args:
        html: Page HTML; ``None`` or empty yields no results.
        min_p: Lowest ``unformattedPrice`` to keep (inclusive).
        max_p: Highest ``unformattedPrice`` to keep (inclusive).

    Returns:
        A list of property dicts (``source``, ``address``, ``price``, ``beds``,
        ``baths``, ``sqft``, ``city``, ``state``, ``county``, ``zipcode``,
        ``status``, ``url``, ``image_url``, ``property_type``). Empty when the
        script tag is missing, the JSON is invalid, or nothing is in range.
    """
    m = re.search(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html or "", re.DOTALL
    )
    if not m:
        return []
    try:
        data = json.loads(m.group(1))
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f" JSON parse error: {e}")
        return []

    # Walk props.pageProps.searchPageState.cat1.searchResults.listResults,
    # tolerating missing or null levels.
    node = data
    for key in ("props", "pageProps", "searchPageState", "cat1",
                "searchResults", "listResults"):
        node = node.get(key) if isinstance(node, dict) else None
    list_results = node if isinstance(node, list) else []

    results = []
    for p in list_results:
        # Validate each listing on its own so one malformed entry cannot
        # discard the listings that follow it.
        if not isinstance(p, dict):
            continue
        price = p.get("unformattedPrice")
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        if not min_p <= price <= max_p:
            continue

        city_val = p.get("addressCity") or ""
        detail_url = p.get("detailUrl") or ""
        # Only prefix the domain when the URL is relative; an absolute URL
        # would otherwise end up with the domain twice.
        if not detail_url.startswith(("http://", "https://")):
            detail_url = "https://www.zillow.com" + detail_url
        hdp_data = p.get("hdpData")
        home_info = hdp_data.get("homeInfo") if isinstance(hdp_data, dict) else None
        if not isinstance(home_info, dict):
            home_info = {}

        results.append({
            "source": "Zillow",
            "address": p.get("address", ""),
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": city_val,
            "state": p.get("addressState", "FL"),
            "county": get_county_for_city(city_val),
            "zipcode": str(p.get("addressZipcode", "")),
            "status": p.get("statusType", "For Sale"),
            "url": detail_url,
            "image_url": p.get("imgSrc", ""),
            "property_type": home_info.get("homeType", ""),
        })
    return results
# ── Zillow via Playwright + Chrome profile ────────────────────────────────────
def _zillow_is_blocked(title, html):
    """Return True when the title/HTML carry one of the anti-bot challenge markers."""
    return (
        "denied" in (title or "").lower()
        or "px-captcha" in html
        or "cf-browser-verification" in html
    )


def _zillow_wait_for_challenge(page, timeout_s=90, poll_s=4):
    """Poll *page* until the challenge markers disappear.

    Returns the page HTML once it no longer looks blocked, or None when
    *timeout_s* seconds pass first.
    """
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        time.sleep(poll_s)
        html = page.content()
        title = page.title()
        if not _zillow_is_blocked(title, html):
            return html
    return None


def scrape_zillow(cities: list, max_price: int) -> list:
    """Scrape Zillow for-sale listings for each city with a visible Chrome window.

    For every city the function opens zillow.com, types "<city>, FL" into the
    search box (or, if no search box is found, loads a direct search URL with
    the price filter), scrolls, and parses the page with _parse_zillow_html().
    If the page looks like an anti-bot challenge it waits up to 90 seconds for
    the user to solve it in the browser.

    Args:
        cities: City names to search (", FL" is appended).
        max_price: Upper price bound; the lower bound is fixed at 40000.

    Returns:
        Listings de-duplicated by lower-cased address. Empty when Playwright
        is not installed. Per-city errors are printed and skipped.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Zillow")
        return []

    ensure_chrome_profile()
    all_results = []
    min_price = 40000

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
                "--no-default-browser-check",
            ],
            viewport={"width": 1366, "height": 768},
        )
        # Close the browser context even if something outside the per-city
        # handler fails (for example opening the page, or Ctrl+C).
        try:
            page = ctx.new_page()
            for city in cities or []:
                city_q = f"{city}, FL"
                print(f" Zillow: buscando {city_q}...")
                try:
                    page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
                    _hd(1.5, 2.5)
                    search = page.query_selector(
                        "#search-box-input, input[id*='search'], "
                        "input[placeholder*='address'], input[placeholder*='city']"
                    )
                    if search:
                        search.click()
                        _hd(0.3, 0.6)
                        # Select any existing text and clear it before typing.
                        page.keyboard.press("Control+A")
                        page.keyboard.press("Delete")
                        _hd(0.2, 0.4)
                        _type_human(page, city_q)
                        _hd(0.8, 1.5)
                        page.keyboard.press("Enter")
                        page.wait_for_load_state("load", timeout=30000)
                        _hd(2, 3)
                        _scroll(page, 4)
                        _hd(1, 2)
                    else:
                        # Fallback: direct search URL with the price filter.
                        slug = re.sub(r"[^a-z0-9\s-]", "", city.lower()).strip().replace(" ", "-")
                        url = (f"https://www.zillow.com/homes/for_sale/{slug}-fl/"
                               f"?searchQueryState=%7B%22filterState%22%3A%7B"
                               f"%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A{min_price}%7D%7D%7D")
                        page.goto(url, wait_until="load", timeout=45000)
                        _hd(2, 3)
                        _scroll(page, 4)

                    html = page.content()
                    title = page.title()
                    # If an anti-bot challenge is showing, wait for the user
                    # to solve it. The same markers are used to detect the
                    # challenge and to decide that it is gone.
                    if _zillow_is_blocked(title, html):
                        print(f" >> Cloudflare en {city}: resuelve el challenge en el browser (90s max)...")
                        html = _zillow_wait_for_challenge(page)
                        if html is None:
                            print(f" Timeout esperando challenge - saltando {city}")
                            continue
                        print(f" Challenge resuelto!")

                    listings = _parse_zillow_html(html, min_price, max_price)
                    print(f" -> {len(listings)} encontradas")
                    all_results.extend(listings)
                except Exception as e:
                    # Best effort: report the city and move on to the next.
                    print(f" ERROR {city}: {e}")
                _hd(4, 7)  # pause between cities to reduce the chance of a block
        finally:
            ctx.close()

    # De-duplicate by address; entries without an address are dropped.
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
# ── Realtor.com via Playwright + Chrome profile ───────────────────────────────
def _realtor_listing_url(permalink):
    """Build an absolute Realtor.com URL from a listing's ``permalink`` value."""
    base = "https://www.realtor.com"
    permalink = permalink or ""
    if permalink.startswith(("http://", "https://")):
        return permalink
    if not permalink or permalink.startswith("/"):
        return base + permalink
    # A bare slug needs a path separator; plain concatenation would produce
    # "https://www.realtor.com<slug>".
    # NOTE(review): "/realestateandhomes-detail/" is assumed to be the listing
    # path prefix - confirm against a live listing URL.
    return f"{base}/realestateandhomes-detail/{permalink}"


def _realtor_item_to_property(item, fallback_city, max_price):
    """Convert one Realtor.com result to a property dict, or None if filtered out."""
    if not isinstance(item, dict):
        return None
    price = item.get("list_price", 0)
    if not isinstance(price, (int, float)):
        return None
    price = int(price)
    if not 40000 <= price <= max_price:
        return None

    # Any of these sub-objects may be null in the payload; treat null as empty
    # so one sparse listing cannot abort the rest of the city.
    loc = (item.get("location") or {}).get("address") or {}
    desc = item.get("description") or {}
    photo = item.get("primary_photo") or {}
    city_val = loc.get("city") or fallback_city
    return {
        "source": "Realtor.com",
        "address": loc.get("line", ""),
        "city": city_val,
        "county": get_county_for_city(city_val),
        "zipcode": str(loc.get("postal_code", "")),
        "price": price,
        "beds": desc.get("beds", 0),
        "baths": desc.get("baths_consolidated", 0),
        "sqft": desc.get("sqft", 0),
        "status": item.get("status", "For Sale"),
        "url": _realtor_listing_url(item.get("permalink")),
        "image_url": photo.get("href", ""),
        "property_type": desc.get("type", ""),
    }


def scrape_realtor(cities: list, max_price: int) -> list:
    """Scrape Realtor.com search pages for each city with a visible Chrome window.

    Loads the price-capped search URL for every city, scrolls, and reads the
    listings from the page's ``__NEXT_DATA__`` JSON.

    Args:
        cities: City names to search (Florida is assumed by the URL).
        max_price: Upper price bound; the lower bound is fixed at 40000.

    Returns:
        Listings de-duplicated by lower-cased address. Empty when Playwright
        is not installed. Per-city errors are printed and skipped.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        return []

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
            ],
            viewport={"width": 1366, "height": 768},
        )
        # Close the browser context even if something outside the per-city
        # handler fails.
        try:
            page = ctx.new_page()
            for city in cities or []:
                # NOTE(review): spaces become "_" here; verify that Realtor.com
                # accepts this slug form for multi-word cities.
                city_slug = re.sub(r"[^a-z0-9\s]", "", city.lower()).strip().replace(" ", "_")
                url = f"https://www.realtor.com/realestateandhomes-search/{city_slug}_FL/price-na-{max_price}"
                print(f" Realtor.com: buscando {city}...")
                try:
                    page.goto(url, wait_until="load", timeout=45000)
                    _hd(2, 3)
                    _scroll(page, 4)
                    _hd(1, 2)
                    html = page.content()
                    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
                    if m:
                        data = json.loads(m.group(1))
                        page_props = (data.get("props") or {}).get("pageProps") or {}
                        properties = page_props.get("properties") or []
                        for item in properties:
                            prop = _realtor_item_to_property(item, city, max_price)
                            if prop is not None:
                                all_results.append(prop)
                        print(f" -> {len(properties)} revisadas")
                except Exception as e:
                    # Best effort: report the city and move on to the next.
                    print(f" ERROR {city}: {e}")
                # Pause after every city, including pages that had no data.
                _hd(3, 5)
        finally:
            ctx.close()

    # De-duplicate by address; entries without an address are dropped.
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
# ── HUD Homes (requests - gobierno, sin anti-bot) ─────────────────────────────
def scrape_hud(cities: list, max_price: int) -> list:
    """Scrape HUD Home Store search pages for the counties of the given cities.

    Cities are mapped to counties with get_county_for_city(); if none map, a
    default set of five counties is used. At most six counties are queried
    and at most eight rows are read per county.

    Args:
        cities: City names used to pick counties and to score results.
        max_price: Upper price bound sent to HUD and applied to parsed rows.

    Returns:
        Property dicts with ``source``, ``address``, ``city``, ``county``,
        ``price``, ``url``, ``status`` and ``score``. Per-county errors are
        printed and skipped.
    """
    counties = {county for county in map(get_county_for_city, cities or []) if county}
    if not counties:
        counties = {"Brevard", "Indian River", "Duval", "St. Johns", "Volusia"}

    results = []
    # Sort before truncating: set order is arbitrary, so slicing it directly
    # would query a different subset of counties from run to run.
    for county in sorted(counties)[:6]:
        code = COUNTY_CODES.get(county, "")
        if not code:
            continue
        url = (f"https://www.hudhomestore.gov/HudHomes/Index.aspx"
               f"?sState=FL&sCounty={code}&sPriceMax={max_price}&sPriceMin=30000")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Report HTTP errors instead of silently parsing an error page.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            rows = soup.select("tr.propRow, .property-row, tr[id^='prop']")
            for row in rows[:8]:
                text = row.get_text(" ", strip=True)
                # Require at least one digit so "$," cannot reach int("").
                price_m = re.search(r'\$\d[\d,]*', text)
                if not price_m:
                    continue
                price = int(re.sub(r'[^\d]', '', price_m.group()))
                if not 0 < price <= max_price:
                    continue
                addr_el = row.select_one("a")
                prop = {
                    "source": "HUD Homes",
                    "address": addr_el.get_text(strip=True) if addr_el else text[:80],
                    # The row's city is not parsed; the county name is used for both.
                    "city": county, "county": county,
                    "price": price, "url": url,
                    "status": "HUD Foreclosure",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:
            # Best effort: report the county and move on to the next.
            print(f" HUD {county}: {e}")
    return results
# ── Fannie Mae HomePath (REO) ──────────────────────────────────────────────────
def scrape_homepath(cities: list, max_price: int) -> list:
    """Query Fannie Mae HomePath listings for up to eight search terms.

    Each term is requested as "<term> FL"; the response is used only when its
    content type mentions JSON. At most six listings are read per term.

    Args:
        cities: Search terms (city names); ``None`` is treated as empty.
        max_price: Upper price bound sent to the site and applied to results.

    Returns:
        Property dicts including a ``score``. Per-term errors are printed and
        skipped.
    """
    results = []
    for term in (cities or [])[:8]:
        url = (f"https://www.homepath.fanniemae.com/listings"
               f"?searchTerm={requests.utils.quote(term + ' FL')}"
               f"&maxPrice={max_price}&state=FL")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            data = r.json() if "json" in r.headers.get("content-type", "") else {}
            if not isinstance(data, dict):
                continue
            listings = data.get("listings", data.get("results", []))
            if not isinstance(listings, list):
                continue
            for item in listings[:6]:
                # Skip malformed entries individually so one bad listing does
                # not discard the remaining ones for this term.
                if not isinstance(item, dict):
                    continue
                price = item.get("listPrice", item.get("price", 0))
                if isinstance(price, bool) or not isinstance(price, (int, float)):
                    continue
                if not 0 < price <= max_price:
                    continue
                city_val = item.get("city") or term
                prop = {
                    "source": "Fannie Mae HomePath",
                    "address": item.get("address", item.get("streetAddress", "")),
                    "city": city_val,
                    "county": get_county_for_city(city_val),
                    "zipcode": str(item.get("postalCode", "")),
                    "price": price,
                    "beds": item.get("bedrooms", 0),
                    "baths": item.get("bathrooms", 0),
                    "sqft": item.get("squareFeet", 0),
                    "url": f"https://www.homepath.fanniemae.com/listings/{item.get('id','')}",
                    "status": "Fannie Mae REO",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:
            # Best effort: report the term and move on to the next.
            print(f" HomePath {term}: {e}")
    return results
# ── Main runner ───────────────────────────────────────────────────────────────
def run_all_scrapers(cities: list = None, max_price: int = 230000) -> dict:
    """Run every scraper, score the results and return them best-first.

    Args:
        cities: City names to search; when empty or None a default list of
            four cities is used.
        max_price: Upper price bound passed to every scraper and the scorer.

    Returns:
        A dict with ``properties`` (sorted by descending ``score``), ``log``
        (per-source ``found`` count and ``status``), ``cities_searched``,
        ``max_price`` and ``ran_at`` (naive UTC timestamp in ISO format).
        A failing source is recorded in ``log`` and does not stop the others.
    """
    from datetime import timezone

    if not cities:
        cities = ["Vero Beach", "Jacksonville", "Melbourne", "St. Augustine"]

    all_props = []
    log = {}
    # Lambdas defer the call so that a failure stays inside the per-source
    # try/except below.
    sources = {
        "Zillow (Browser)": lambda: scrape_zillow(cities, max_price),
        "Realtor.com (Browser)": lambda: scrape_realtor(cities, max_price),
        "HUD Homes": lambda: scrape_hud(cities, max_price),
        "Fannie Mae HomePath": lambda: scrape_homepath(cities, max_price),
    }
    for name, fn in sources.items():
        try:
            print(f"\n[{name}]")
            props = fn()
            # De-duplicate within the source by (address, price); entries
            # without an address are dropped.
            seen, unique = set(), []
            for p in props:
                key = ((p.get("address") or "").lower().strip(), p.get("price", 0))
                if key[0] and key not in seen:
                    seen.add(key)
                    p["score"] = score_property(p, cities, max_price)
                    unique.append(p)
            all_props.extend(unique)
            log[name] = {"found": len(unique), "status": "ok"}
            print(f" -> {len(unique)} propiedades validas")
        except Exception as e:
            log[name] = {"found": 0, "status": f"error: {e}"}
            print(f" ERROR {name}: {e}")

    all_props.sort(key=lambda x: x.get("score", 0), reverse=True)
    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the string keeps the same naive ISO format as before.
    ran_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "properties": all_props,
        "log": log,
        "cities_searched": cities,
        "max_price": max_price,
        "ran_at": ran_at,
    }