# Property-listing scrapers: Zillow, Realtor.com, HUD Homes and Fannie Mae HomePath.
import json
import os
import random
import re
import shutil
import time
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

# ── Config ────────────────────────────────────────────────────────────────────
# Chrome executable that Playwright launches.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# The user's real Chrome "User Data" directory. It is derived from
# %LOCALAPPDATA% so the module is not tied to a single Windows account; the
# previously hard-coded location is used when the variable is not set.
CHROME_PROFILE = (
    os.environ.get("LOCALAPPDATA", r"C:\Users\aerom\AppData\Local")
    + r"\Google\Chrome\User Data"
)
# Scratch profile handed to Playwright as its user-data directory.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"

# Browser-like request headers sent with every plain `requests` call.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/124 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Lower-cased city name -> county name. Lookups go through
# get_county_for_city(), which lower-cases and strips its argument first.
CITY_COUNTY_MAP = {
    # Indian River
    "vero beach": "Indian River",
    "sebastian": "Indian River",
    "fellsmere": "Indian River",
    "indian river shores": "Indian River",
    # Martin
    "stuart": "Martin",
    "jensen beach": "Martin",
    "hobe sound": "Martin",
    "palm city": "Martin",
    "port salerno": "Martin",
    # St. Lucie
    "fort pierce": "St. Lucie",
    "port st. lucie": "St. Lucie",
    # Brevard
    "melbourne": "Brevard",
    "palm bay": "Brevard",
    "titusville": "Brevard",
    "cocoa": "Brevard",
    "cocoa beach": "Brevard",
    "rockledge": "Brevard",
    "merritt island": "Brevard",
    "cape canaveral": "Brevard",
    "satellite beach": "Brevard",
    "west melbourne": "Brevard",
    # Volusia
    "daytona beach": "Volusia",
    "ormond beach": "Volusia",
    "new smyrna beach": "Volusia",
    "edgewater": "Volusia",
    "port orange": "Volusia",
    "deltona": "Volusia",
    # Flagler
    "palm coast": "Flagler",
    "flagler beach": "Flagler",
    "bunnell": "Flagler",
    # St. Johns
    "st. augustine": "St. Johns",
    "ponte vedra beach": "St. Johns",
    "nocatee": "St. Johns",
    "st. augustine beach": "St. Johns",
    # Duval
    "jacksonville": "Duval",
    "jacksonville beach": "Duval",
    "atlantic beach": "Duval",
    "neptune beach": "Duval",
    # Nassau
    "fernandina beach": "Nassau",
    "yulee": "Nassau",
}

# County name -> code sent as the `sCounty` query parameter by scrape_hud().
# NOTE(review): most values match the counties' 3-digit FIPS codes, but
# Martin's FIPS code is 085 (086 is Miami-Dade) — confirm "86" against the
# HUD Home Store before relying on Martin results.
COUNTY_CODES = {
    "Brevard": "9",
    "Duval": "31",
    "Flagler": "35",
    "Indian River": "61",
    "Martin": "86",
    "Nassau": "89",
    "St. Johns": "109",
    "St. Lucie": "111",
    "Volusia": "127",
}

def get_county_for_city(city: str) -> str:
    """Return the county for *city*, or "" when the city is not in the map.

    The lookup ignores case and surrounding whitespace. Scraped listings can
    carry a missing (None) or non-string city, so anything that is not a
    string yields "" instead of raising.
    """
    if not isinstance(city, str):
        return ""
    return CITY_COUNTY_MAP.get(city.strip().lower(), "")

def _score_num(value) -> float:
    """Return *value* as a float, or 0.0 when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def score_property(prop: dict, search_cities: list, max_price: int) -> int:
    """Rate a listing from 0 to 100; a higher score is a better match.

    Scoring starts at 40 and adds:
      * up to 35 points the further the price sits below *max_price*
        (nothing when the price exceeds it or *max_price* is not positive),
      * 12 points when the listing's city or county matches a searched city
        (substring match in either direction),
      * 8 points for 3+ bedrooms, 4 points for 2,
      * 10 points for distressed/discounted statuses, 5 for new construction.

    Args:
        prop: Listing dict; reads "price", "city", "county", "beds", "status".
        search_cities: City names the user searched for.
        max_price: Budget ceiling used for the price ratio.

    Returns:
        0 when the listing has no usable positive price, otherwise the score
        capped at 100.
    """
    price = _score_num(prop.get("price"))
    if not price > 0:  # also rejects NaN
        return 0

    score = 40

    # Price bonus: first tier whose ceiling the price/budget ratio fits under.
    budget = _score_num(max_price)
    if budget > 0:
        ratio = price / budget
        for ceiling, bonus in ((0.60, 35), (0.75, 25), (0.90, 15), (1.0, 8)):
            if ratio <= ceiling:
                score += bonus
                break

    # Location bonus. Empty strings are excluded on both sides: "" is a
    # substring of everything, so a listing with no city/county would
    # otherwise always collect the bonus.
    places = [
        value
        for value in (
            str(prop.get("city") or "").lower(),
            str(prop.get("county") or "").lower(),
        )
        if value
    ]
    for raw_term in search_cities or []:
        term = str(raw_term or "").strip().lower()
        if not term:
            continue
        if any(term in place or place in term for place in places):
            score += 12
            break

    beds = _score_num(prop.get("beds"))
    if beds >= 3:
        score += 8
    elif beds >= 2:
        score += 4

    status = str(prop.get("status") or "").lower()
    if any(w in status for w in ("foreclosure", "reo", "bank owned", "hud", "price reduced")):
        score += 10
    elif any(w in status for w in ("new construction", "newly built")):
        score += 5

    return min(score, 100)

# ── Chrome profile setup ───────────────────────────────────────────────────────
def ensure_chrome_profile():
    """Seed the temporary Chrome profile from the user's real profile.

    Does nothing when TEMP_PROFILE already has a "Default" directory.
    Otherwise creates it and copies a few files (cookies, logins, web data,
    preferences) from CHROME_PROFILE's "Default" directory. Each file is
    copied independently, so one file that cannot be copied no longer
    prevents the remaining ones from being copied.

    Returns:
        True when the temporary profile already existed or every available
        file was copied; False when the source profile is missing, the
        target directory could not be created, or any copy failed.
    """
    dst = os.path.join(TEMP_PROFILE, "Default")
    if os.path.exists(dst):
        return True
    if not os.path.exists(CHROME_PROFILE):
        return False

    try:
        os.makedirs(dst, exist_ok=True)
    except OSError as e:
        print(f" Profile copy warning: {e}")
        return False

    src = os.path.join(CHROME_PROFILE, "Default")
    all_copied = True
    for item in ("Cookies", "Login Data", "Web Data", "Preferences"):
        s = os.path.join(src, item)
        if not os.path.exists(s):
            continue
        try:
            shutil.copy2(s, dst)
        except OSError as e:
            # Best effort: report the failure and carry on with the rest.
            print(f" Profile copy warning: {e}")
            all_copied = False
    return all_copied

# ── Playwright helpers ─────────────────────────────────────────────────────────
def _hd(a=1.2, b=3.0):
    """Sleep for a random duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)

def _scroll(page, steps=4):
    """Scroll *page* down in *steps* mouse-wheel moves with short random pauses.

    Each move scrolls a random 250-600 units vertically and is followed by a
    pause of 0.4-0.9 seconds.
    """
    for _step in range(steps):
        distance = random.randint(250, 600)
        page.mouse.wheel(0, distance)
        pause = random.uniform(0.4, 0.9)
        time.sleep(pause)

def _type_human(page, text):
    """Type *text* into *page* one character at a time.

    A random pause of 0.07-0.16 seconds follows every keystroke.
    """
    for char in text:
        page.keyboard.type(char)
        pause = random.uniform(0.07, 0.16)
        time.sleep(pause)

def _parse_zillow_html(html, min_p=40000, max_p=230000):
    """Extract listings priced within [min_p, max_p] from a Zillow results page.

    The listings are read from the JSON embedded in the page's
    ``__NEXT_DATA__`` script tag, under
    props.pageProps.searchPageState.cat1.searchResults.listResults.
    Listings are validated one by one, so a malformed entry (for example a
    null price) is skipped instead of discarding the rest of the page.

    Args:
        html: Full page HTML.
        min_p: Lowest accepted price (inclusive).
        max_p: Highest accepted price (inclusive).

    Returns:
        A list of listing dicts; empty when the script tag is missing, the
        JSON is invalid, or nothing matches.
    """
    match = re.search(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL
    )
    if not match:
        return []
    try:
        node = json.loads(match.group(1))
    except ValueError as e:
        print(f" JSON parse error: {e}")
        return []

    # Walk down to the listing array, tolerating missing or null levels.
    for key in ("props", "pageProps", "searchPageState", "cat1",
                "searchResults", "listResults"):
        node = node.get(key) if isinstance(node, dict) else None
    if not isinstance(node, list):
        return []

    results = []
    for p in node:
        if not isinstance(p, dict):
            continue
        price = p.get("unformattedPrice")
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        if not min_p <= price <= max_p:
            continue

        city_val = p.get("addressCity") or ""
        # Only prefix the host when detailUrl is relative; an absolute URL
        # would otherwise end up with the host twice.
        detail_url = p.get("detailUrl") or ""
        if not detail_url.startswith(("http://", "https://")):
            detail_url = "https://www.zillow.com" + detail_url
        home_info = (p.get("hdpData") or {}).get("homeInfo") or {}

        results.append({
            "source": "Zillow",
            "address": p.get("address") or "",
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": city_val,
            "state": p.get("addressState", "FL"),
            "county": get_county_for_city(city_val),
            "zipcode": str(p.get("addressZipcode") or ""),
            "status": p.get("statusType", "For Sale"),
            "url": detail_url,
            "image_url": p.get("imgSrc", ""),
            "property_type": home_info.get("homeType", ""),
        })
    return results

# ── Zillow via Playwright + Chrome profile ────────────────────────────────────
def _zillow_is_blocked(title, html):
    """Return True when the page looks like an anti-bot denial/challenge page."""
    return (
        "denied" in title.lower()
        or "px-captcha" in html
        or "cf-browser-verification" in html
    )


def _zillow_open_search(page, city, min_price, max_price):
    """Bring *page* to Zillow's results for *city*.

    Types the query into the home-page search box when one is found;
    otherwise falls back to a direct search URL that carries the price filter.
    """
    city_q = f"{city}, FL"
    page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
    _hd(1.5, 2.5)

    search = page.query_selector(
        "#search-box-input, input[id*='search'], "
        "input[placeholder*='address'], input[placeholder*='city']"
    )
    if search:
        search.click()
        _hd(0.3, 0.6)
        # Select-all + Delete clears whatever text the box already holds.
        page.keyboard.down("Control")
        page.keyboard.press("a")
        page.keyboard.up("Control")
        page.keyboard.press("Delete")
        _hd(0.2, 0.4)
        _type_human(page, city_q)
        _hd(0.8, 1.5)
        page.keyboard.press("Enter")
        page.wait_for_load_state("load", timeout=30000)
        _hd(2, 3)
        _scroll(page, 4)
        _hd(1, 2)
        return

    # Fallback: direct URL.
    slug = re.sub(r"[^a-z0-9\s-]", "", city.lower()).strip().replace(" ", "-")
    url = (f"https://www.zillow.com/homes/for_sale/{slug}-fl/"
           f"?searchQueryState=%7B%22filterState%22%3A%7B"
           f"%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A{min_price}%7D%7D%7D")
    page.goto(url, wait_until="load", timeout=45000)
    _hd(2, 3)
    _scroll(page, 4)


def _zillow_wait_for_challenge(page, city):
    """Wait up to 90 s for the user to clear a challenge in the browser window.

    Returns the fresh page HTML once the block markers are gone, or None on
    timeout.
    """
    print(f" >> Cloudflare en {city}: resuelve el challenge en el browser (90s max)...")
    deadline = time.time() + 90
    while time.time() < deadline:
        time.sleep(4)
        html = page.content()
        # Same markers as the initial check, so a page that was flagged is
        # never reported as solved while a marker is still present.
        if not _zillow_is_blocked(page.title(), html):
            print(" Challenge resuelto!")
            return html
    print(f" Timeout esperando challenge - saltando {city}")
    return None


def scrape_zillow(cities: list, max_price: int) -> list:
    """Scrape Zillow listings for each city using a visible Chrome window.

    Launches Chrome through Playwright with the temporary profile, searches
    every city in turn, parses the embedded listing data, and pauses between
    cities. A failure on one city is printed and does not stop the others.

    Args:
        cities: City names; ", FL" is appended to each query.
        max_price: Highest accepted price. The lowest accepted price is 40000.

    Returns:
        Listing dicts de-duplicated by address. Empty when *cities* is empty
        or Playwright is not installed.
    """
    if not cities:
        return []  # nothing to search; do not launch a browser

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Zillow")
        return []

    ensure_chrome_profile()
    all_results = []
    min_price = 40000

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
                "--no-default-browser-check",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()
            for city in cities:
                print(f" Zillow: buscando {city}, FL...")
                try:
                    _zillow_open_search(page, city, min_price, max_price)
                    html = page.content()
                    # If an anti-bot page came back, let the user solve it.
                    if _zillow_is_blocked(page.title(), html):
                        html = _zillow_wait_for_challenge(page, city)
                    if html is not None:
                        listings = _parse_zillow_html(html, min_price, max_price)
                        print(f" -> {len(listings)} encontradas")
                        all_results.extend(listings)
                except Exception as e:
                    # Per-city boundary: report and move on to the next city.
                    print(f" ERROR {city}: {e}")
                _hd(4, 7)  # pause between cities to reduce the risk of a block
        finally:
            # Always close the browser, even after an unexpected error.
            ctx.close()

    # De-duplicate by normalized address; tolerate a missing/None address.
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique

# ── Realtor.com via Playwright + Chrome profile ───────────────────────────────
def _realtor_item_to_listing(item, fallback_city, max_price):
    """Convert one Realtor.com search item into a listing dict.

    Returns None when the item is malformed, has no numeric list price, or
    is priced outside 40000..max_price. Null sub-objects (location,
    description, primary_photo) are treated as empty.
    """
    if not isinstance(item, dict):
        return None
    price = item.get("list_price", 0)
    if not isinstance(price, (int, float)):
        return None
    price = int(price)
    if not 40000 <= price <= max_price:
        return None

    loc = (item.get("location") or {}).get("address") or {}
    desc = item.get("description") or {}
    photo = item.get("primary_photo") or {}
    city_val = loc.get("city") or fallback_city

    # Join host and permalink with exactly one "/" and leave an absolute
    # permalink untouched.
    # NOTE(review): confirm whether permalinks need a path prefix.
    permalink = item.get("permalink") or ""
    if permalink.startswith(("http://", "https://")):
        url = permalink
    else:
        if permalink and not permalink.startswith("/"):
            permalink = "/" + permalink
        url = "https://www.realtor.com" + permalink

    return {
        "source": "Realtor.com",
        "address": loc.get("line") or "",
        "city": city_val,
        "county": get_county_for_city(city_val),
        "zipcode": str(loc.get("postal_code") or ""),
        "price": price,
        "beds": desc.get("beds", 0),
        "baths": desc.get("baths_consolidated", 0),
        "sqft": desc.get("sqft", 0),
        "status": item.get("status", "For Sale"),
        "url": url,
        "image_url": photo.get("href", ""),
        "property_type": desc.get("type", ""),
    }


def scrape_realtor(cities: list, max_price: int) -> list:
    """Scrape Realtor.com listings for each city using a visible Chrome window.

    Opens the price-filtered search URL for every city, reads the listing
    data embedded in the page's ``__NEXT_DATA__`` script, and keeps items
    priced between 40000 and *max_price*. A failure on one city is printed
    and does not stop the others.

    Args:
        cities: City names used to build the search URL.
        max_price: Highest accepted price.

    Returns:
        Listing dicts de-duplicated by address. Empty when *cities* is empty
        or Playwright is not installed.
    """
    if not cities:
        return []  # nothing to search; do not launch a browser

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Realtor.com")
        return []

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()
            for city in cities:
                # NOTE(review): the slug joins words with "_" and is
                # lower-cased; confirm this matches the URL format
                # Realtor.com expects.
                city_slug = re.sub(r"[^a-z0-9\s]", "", city.lower()).strip().replace(" ", "_")
                url = f"https://www.realtor.com/realestateandhomes-search/{city_slug}_FL/price-na-{max_price}"
                print(f" Realtor.com: buscando {city}...")
                try:
                    page.goto(url, wait_until="load", timeout=45000)
                    _hd(2, 3)
                    _scroll(page, 4)
                    _hd(1, 2)

                    html = page.content()
                    m = re.search(
                        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>',
                        html, re.DOTALL,
                    )
                    if m:
                        data = json.loads(m.group(1))
                        page_props = (data.get("props") or {}).get("pageProps") or {}
                        properties = page_props.get("properties") or []
                        for item in properties:
                            listing = _realtor_item_to_listing(item, city, max_price)
                            if listing is not None:
                                all_results.append(listing)
                        print(f" -> {len(properties)} revisadas")
                except Exception as e:
                    # Per-city boundary: report and move on to the next city.
                    print(f" ERROR {city}: {e}")
                # Pause after every city, including ones that yielded no data.
                _hd(3, 5)
        finally:
            # Always close the browser, even after an unexpected error.
            ctx.close()

    # De-duplicate by normalized address; tolerate a missing/None address.
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique

# ── HUD Homes (requests — government site, no anti-bot) ───────────────────────
def scrape_hud(cities: list, max_price: int) -> list:
    """Scrape HUD Home Store listings for the counties of the given cities.

    Cities are mapped to counties; when none map, a default set of five
    counties is used. At most six counties are queried (in alphabetical
    order, so the selection is the same on every run) and at most eight rows
    are read per county. A failure on one county is printed and does not
    stop the others.

    Args:
        cities: City names used to pick counties and to score results.
        max_price: Highest accepted price; also sent as the site's max filter.

    Returns:
        Listing dicts, each already carrying a "score".
    """
    counties = {c for c in (get_county_for_city(city) for city in cities) if c}
    if not counties:
        counties = {"Brevard", "Indian River", "Duval", "St. Johns", "Volusia"}

    results = []
    for county in sorted(counties)[:6]:
        code = COUNTY_CODES.get(county, "")
        if not code:
            continue
        url = (f"https://www.hudhomestore.gov/HudHomes/Index.aspx"
               f"?sState=FL&sCounty={code}&sPriceMax={max_price}&sPriceMin=30000")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Surface HTTP errors instead of silently parsing an error page.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            rows = soup.select("tr.propRow, .property-row, tr[id^='prop']")
            for row in rows[:8]:
                text = row.get_text(" ", strip=True)
                # The first "$..." amount in the row is taken as the price.
                price_m = re.search(r'\$[\d,]+', text)
                if not price_m:
                    continue
                digits = re.sub(r'[^\d]', '', price_m.group())
                if not digits:  # e.g. "$," — no digits to convert
                    continue
                price = int(digits)
                if not 0 < price <= max_price:
                    continue
                addr_el = row.select_one("a")
                prop = {
                    "source": "HUD Homes",
                    "address": addr_el.get_text(strip=True) if addr_el else text[:80],
                    # The county name is stored in both location fields.
                    "city": county, "county": county,
                    "price": price, "url": url,
                    "status": "HUD Foreclosure",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:
            # Per-county boundary: report and move on to the next county.
            print(f" HUD {county}: {e}")
    return results

# ── Fannie Mae HomePath (REO) ──────────────────────────────────────────────────
def scrape_homepath(cities: list, max_price: int) -> list:
    """Query Fannie Mae HomePath for listings near the given cities.

    Only the first eight cities are queried and at most six listings are
    read per city. A response is used only when its content type mentions
    JSON. Items are validated one by one, so an item without a numeric price
    is skipped instead of aborting the rest of that city. A failure on one
    city is printed and does not stop the others.

    Args:
        cities: City names; " FL" is appended to each search term.
        max_price: Highest accepted price; also sent as the site's max filter.

    Returns:
        Listing dicts, each already carrying a "score".
    """
    results = []
    for term in cities[:8]:
        url = (f"https://www.homepath.fanniemae.com/listings"
               f"?searchTerm={requests.utils.quote(term + ' FL')}"
               f"&maxPrice={max_price}&state=FL")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Surface HTTP errors instead of silently treating them as empty.
            r.raise_for_status()
            is_json = "json" in r.headers.get("content-type", "").lower()
            data = r.json() if is_json else {}
            if not isinstance(data, dict):
                data = {}
            listings = data.get("listings", data.get("results", []))
            if not isinstance(listings, list):
                listings = []

            for item in listings[:6]:
                if not isinstance(item, dict):
                    continue
                price = item.get("listPrice", item.get("price", 0))
                if isinstance(price, bool) or not isinstance(price, (int, float)):
                    continue
                if not 0 < price <= max_price:
                    continue
                city_val = item.get("city") or term
                prop = {
                    "source": "Fannie Mae HomePath",
                    "address": item.get("address", item.get("streetAddress", "")),
                    "city": city_val,
                    "county": get_county_for_city(city_val),
                    "zipcode": str(item.get("postalCode", "")),
                    "price": price,
                    "beds": item.get("bedrooms", 0),
                    "baths": item.get("bathrooms", 0),
                    "sqft": item.get("squareFeet", 0),
                    "url": f"https://www.homepath.fanniemae.com/listings/{item.get('id','')}",
                    "status": "Fannie Mae REO",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:
            # Per-city boundary: report and move on to the next city.
            print(f" HomePath {term}: {e}")
    return results

# ── Main runner ───────────────────────────────────────────────────────────────
def run_all_scrapers(cities: list = None, max_price: int = 230000) -> dict:
    """Run every scraper, score the results and return them with a run log.

    Each source runs in isolation: an exception from one source is recorded
    in the log and does not stop the others. Within a source, listings are
    de-duplicated by (address, price) and listings without an address are
    dropped. Results from different sources are not de-duplicated against
    each other.

    Args:
        cities: Cities to search; four default cities are used when empty.
        max_price: Highest accepted price.

    Returns:
        A dict with "properties" (sorted by score, best first), "log"
        (per-source count and status), "cities_searched", "max_price" and
        "ran_at" (naive UTC timestamp in ISO format).
    """
    if not cities:
        cities = ["Vero Beach", "Jacksonville", "Melbourne", "St. Augustine"]

    sources = {
        "Zillow (Browser)": lambda: scrape_zillow(cities, max_price),
        "Realtor.com (Browser)": lambda: scrape_realtor(cities, max_price),
        "HUD Homes": lambda: scrape_hud(cities, max_price),
        "Fannie Mae HomePath": lambda: scrape_homepath(cities, max_price),
    }

    all_props = []
    log = {}
    for name, fn in sources.items():
        try:
            print(f"\n[{name}]")
            props = fn()
            seen, unique = set(), []
            for p in props:
                key = ((p.get("address") or "").lower().strip(), p.get("price", 0))
                if key[0] and key not in seen:
                    seen.add(key)
                    p["score"] = score_property(p, cities, max_price)
                    unique.append(p)
            all_props.extend(unique)
            log[name] = {"found": len(unique), "status": "ok"}
            print(f" -> {len(unique)} propiedades validas")
        except Exception as e:
            # Per-source boundary: record the failure and keep going.
            log[name] = {"found": 0, "status": f"error: {e}"}
            print(f" ERROR {name}: {e}")

    all_props.sort(key=lambda x: x.get("score", 0), reverse=True)

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the string keeps the same naive format as before.
    ran_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "properties": all_props,
        "log": log,
        "cities_searched": cities,
        "max_price": max_price,
        "ran_at": ran_at,
    }