"""Scrapers for low-priced home listings in Florida.

Four sources are queried and merged by :func:`run_all_scrapers`:

* Zillow and Realtor.com, driven through Playwright using a copy of the local
  Chrome profile (both embed their search results as JSON inside a
  ``<script id="__NEXT_DATA__">`` tag).
* HUD Homes and Fannie Mae HomePath, fetched with plain ``requests``.

Every listing is a plain ``dict``; :func:`score_property` assigns each one a
0-100 attractiveness score.
"""
import json
import os
import random
import re
import shutil
import time
from datetime import datetime, timezone

# The HTTP/HTML libraries are treated as optional at import time, mirroring
# how the Playwright-based scrapers already handle a missing Playwright: the
# pure helpers (scoring, parsing) stay importable and the affected sources are
# skipped with a message instead of breaking the whole module.
try:
    import requests
except ImportError:
    requests = None
try:
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

# ── Config ────────────────────────────────────────────────────────────────────
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
CHROME_PROFILE = r"C:\Users\aerom\AppData\Local\Google\Chrome\User Data"
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Lower bound applied to Zillow and Realtor.com listings.
DEFAULT_MIN_PRICE = 40000
# Lower bound sent to the HUD Homes search.
HUD_MIN_PRICE = 30000

# Lower-cased city name -> county name.
CITY_COUNTY_MAP = {
    "vero beach": "Indian River",
    "sebastian": "Indian River",
    "fellsmere": "Indian River",
    "indian river shores": "Indian River",
    "stuart": "Martin",
    "jensen beach": "Martin",
    "hobe sound": "Martin",
    "palm city": "Martin",
    "port salerno": "Martin",
    "fort pierce": "St. Lucie",
    "port st. lucie": "St. Lucie",
    "melbourne": "Brevard",
    "palm bay": "Brevard",
    "titusville": "Brevard",
    "cocoa": "Brevard",
    "cocoa beach": "Brevard",
    "rockledge": "Brevard",
    "merritt island": "Brevard",
    "cape canaveral": "Brevard",
    "satellite beach": "Brevard",
    "west melbourne": "Brevard",
    "daytona beach": "Volusia",
    "ormond beach": "Volusia",
    "new smyrna beach": "Volusia",
    "edgewater": "Volusia",
    "port orange": "Volusia",
    "deltona": "Volusia",
    "palm coast": "Flagler",
    "flagler beach": "Flagler",
    "bunnell": "Flagler",
    "st. augustine": "St. Johns",
    "ponte vedra beach": "St. Johns",
    "nocatee": "St. Johns",
    "st. augustine beach": "St. Johns",
    "jacksonville": "Duval",
    "jacksonville beach": "Duval",
    "atlantic beach": "Duval",
    "neptune beach": "Duval",
    "fernandina beach": "Nassau",
    "yulee": "Nassau",
}

# County name -> value sent as ``sCounty`` in the HUD Homes search URL.
COUNTY_CODES = {
    "Brevard": "9",
    "Duval": "31",
    "Flagler": "35",
    "Indian River": "61",
    "Martin": "86",
    "Nassau": "89",
    "St. Johns": "109",
    "St. Lucie": "111",
    "Volusia": "127",
}

# (maximum price/budget ratio, bonus points), checked in order.
_PRICE_TIERS = ((0.60, 35), (0.75, 25), (0.90, 15), (1.0, 8))
_DISTRESSED_WORDS = ("foreclosure", "reo", "bank owned", "hud", "price reduced")
_NEW_BUILD_WORDS = ("new construction", "newly built")

# Captures the JSON payload of the ``__NEXT_DATA__`` script tag.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', re.DOTALL
)


# ── Generic helpers ───────────────────────────────────────────────────────────
def _is_number(value) -> bool:
    """Return True for int/float values (bool is deliberately excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _dig(data, *keys):
    """Walk nested dicts by *keys*; return None as soon as a level is missing."""
    for key in keys:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data


def _extract_next_data(html: str):
    """Return the raw JSON text inside the ``__NEXT_DATA__`` tag, or None."""
    match = _NEXT_DATA_RE.search(html or "")
    return match.group(1) if match else None


def _dedupe_by_address(listings: list) -> list:
    """Keep the first listing per case-insensitive address; drop blank ones."""
    seen = set()
    unique = []
    for item in listings:
        key = (item.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def get_county_for_city(city: str) -> str:
    """Return the county for *city* (case/whitespace-insensitive), or ``""``."""
    return CITY_COUNTY_MAP.get((city or "").lower().strip(), "")


def score_property(prop: dict, search_cities: list, max_price: int) -> int:
    """Score a listing from 0 to 100.

    Starting from a base of 40, points are added for:

    * price relative to *max_price* (up to +35, none above budget or when
      *max_price* is not positive),
    * the listing's city or county matching one of *search_cities* by
      substring in either direction (+12),
    * bedrooms (+8 for 3 or more, +4 for 2),
    * status keywords (+10 distressed/HUD/price reduced, +5 new construction).

    Returns 0 when the listing has no positive numeric ``price``.
    """
    price = prop.get("price", 0)
    if not _is_number(price) or price <= 0:
        return 0

    score = 40

    if _is_number(max_price) and max_price > 0:
        ratio = price / max_price
        for limit, bonus in _PRICE_TIERS:
            if ratio <= limit:
                score += bonus
                break

    # Empty city/county values are excluded: "" is a substring of every
    # string and would otherwise grant the location bonus unconditionally.
    places = [
        place
        for place in (
            (prop.get("city") or "").lower(),
            (prop.get("county") or "").lower(),
        )
        if place
    ]
    terms = [c.lower() for c in (search_cities or []) if c]
    if any(term in place or place in term for term in terms for place in places):
        score += 12

    beds = prop.get("beds") or 0
    if not _is_number(beds):
        beds = 0
    if beds >= 3:
        score += 8
    elif beds >= 2:
        score += 4

    status = (prop.get("status") or "").lower()
    if any(word in status for word in _DISTRESSED_WORDS):
        score += 10
    elif any(word in status for word in _NEW_BUILD_WORDS):
        score += 5

    return min(score, 100)


# ── Chrome profile setup ───────────────────────────────────────────────────────
def ensure_chrome_profile():
    """Copy the Chrome profile into the temporary one if it does not exist yet.

    Copies ``Cookies``, ``Login Data``, ``Web Data`` and ``Preferences`` from
    ``CHROME_PROFILE/Default`` into ``TEMP_PROFILE/Default``.

    Returns True when the temporary ``Default`` directory already exists or
    the copy succeeded, False when the source profile is missing or the copy
    failed. Note that the existence check only looks at the directory, so a
    copy that failed partway is not retried on later calls.
    """
    dst = os.path.join(TEMP_PROFILE, "Default")
    if os.path.exists(dst):
        return True
    if not os.path.exists(CHROME_PROFILE):
        return False

    src = os.path.join(CHROME_PROFILE, "Default")
    try:
        os.makedirs(dst, exist_ok=True)
        for item in ["Cookies", "Login Data", "Web Data", "Preferences"]:
            item_path = os.path.join(src, item)
            if os.path.exists(item_path):
                shutil.copy2(item_path, dst)
        return True
    except OSError as e:
        print(f" Profile copy warning: {e}")
        return False


# ── Playwright helpers ─────────────────────────────────────────────────────────
def _hd(a=1.2, b=3.0):
    """Sleep for a random duration between *a* and *b* seconds."""
    time.sleep(random.uniform(a, b))


def _scroll(page, steps=4):
    """Scroll the page down *steps* times with random distances and pauses."""
    for _ in range(steps):
        page.mouse.wheel(0, random.randint(250, 600))
        time.sleep(random.uniform(0.4, 0.9))


def _type_human(page, text):
    """Type *text* one character at a time with a short random delay."""
    for ch in text:
        page.keyboard.type(ch)
        time.sleep(random.uniform(0.07, 0.16))


def _launch_context(playwright):
    """Open a visible Chrome window backed by the temporary profile."""
    return playwright.chromium.launch_persistent_context(
        user_data_dir=TEMP_PROFILE,
        executable_path=CHROME_PATH,
        headless=False,
        args=[
            "--profile-directory=Default",
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        viewport={"width": 1366, "height": 768},
    )


def _is_blocked(title: str, html: str) -> bool:
    """Return True when the page looks like an anti-bot challenge/denial."""
    return (
        "denied" in (title or "").lower()
        or "px-captcha" in html
        or "cf-browser-verification" in html
    )


def _parse_zillow_html(html, min_p=40000, max_p=230000):
    """Extract listings priced within ``[min_p, max_p]`` from a Zillow page.

    Reads the ``__NEXT_DATA__`` JSON and walks to
    ``props.pageProps.searchPageState.cat1.searchResults.listResults``.
    Entries that are malformed or lack a numeric price are skipped
    individually so one bad entry cannot discard the rest of the page.

    Returns a list of listing dicts; empty when the tag is missing or its
    content is not valid JSON.
    """
    raw = _extract_next_data(html)
    if raw is None:
        return []
    try:
        data = json.loads(raw)
    except ValueError as e:
        print(f" JSON parse error: {e}")
        return []

    list_results = _dig(
        data, "props", "pageProps", "searchPageState",
        "cat1", "searchResults", "listResults",
    )
    if not isinstance(list_results, list):
        return []

    results = []
    for p in list_results:
        if not isinstance(p, dict):
            continue
        price = p.get("unformattedPrice", 0)
        if not _is_number(price) or not (min_p <= price <= max_p):
            continue
        city_val = p.get("addressCity") or ""
        detail_url = p.get("detailUrl") or ""
        # detailUrl may already be absolute; only prefix relative paths.
        if not detail_url.startswith("http"):
            detail_url = "https://www.zillow.com" + detail_url
        results.append({
            "source": "Zillow",
            "address": p.get("address", ""),
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": city_val,
            "state": p.get("addressState", "FL"),
            "county": get_county_for_city(city_val),
            "zipcode": str(p.get("addressZipcode", "")),
            "status": p.get("statusType", "For Sale"),
            "url": detail_url,
            "image_url": p.get("imgSrc", ""),
            "property_type": _dig(p, "hdpData", "homeInfo", "homeType") or "",
        })
    return results


# ── Zillow via Playwright + Chrome profile ────────────────────────────────────
def _load_zillow_search(page, city: str, max_price: int) -> None:
    """Navigate *page* to Zillow's results for *city*.

    Types the query into the home-page search box when one is found;
    otherwise opens a price-filtered search URL directly.
    """
    city_q = f"{city}, FL"
    print(f" Zillow: buscando {city_q}...")
    page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
    _hd(1.5, 2.5)

    search = page.query_selector(
        "#search-box-input, input[id*='search'], "
        "input[placeholder*='address'], input[placeholder*='city']"
    )
    if search:
        search.click()
        _hd(0.3, 0.6)
        # Select-all + delete clears any text already in the box.
        page.keyboard.down("Control")
        page.keyboard.press("a")
        page.keyboard.up("Control")
        page.keyboard.press("Delete")
        _hd(0.2, 0.4)
        _type_human(page, city_q)
        _hd(0.8, 1.5)
        page.keyboard.press("Enter")
        page.wait_for_load_state("load", timeout=30000)
        _hd(2, 3)
        _scroll(page, 4)
        _hd(1, 2)
        return

    # Fallback: direct URL.
    slug = re.sub(r"[^a-z0-9\s-]", "", city.lower()).strip().replace(" ", "-")
    url = (f"https://www.zillow.com/homes/for_sale/{slug}-fl/"
           f"?searchQueryState=%7B%22filterState%22%3A%7B"
           f"%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A{DEFAULT_MIN_PRICE}%7D%7D%7D")
    page.goto(url, wait_until="load", timeout=45000)
    _hd(2, 3)
    _scroll(page, 4)


def _html_after_challenge(page, city: str):
    """Return the page HTML, waiting up to 90 s for a challenge to be solved.

    If the page is blocked, polls every 4 seconds so the user can solve the
    challenge in the browser window. Returns None on timeout.
    """
    html = page.content()
    if not _is_blocked(page.title(), html):
        return html

    print(f" >> Cloudflare en {city}: resuelve el challenge en el browser (90s max)...")
    deadline = time.time() + 90
    while time.time() < deadline:
        time.sleep(4)
        html = page.content()
        if not _is_blocked(page.title(), html):
            print(" Challenge resuelto!")
            return html
    print(f" Timeout esperando challenge - saltando {city}")
    return None


def scrape_zillow(cities: list, max_price: int) -> list:
    """Scrape Zillow for each city; return listings deduplicated by address.

    Returns an empty list when Playwright is not installed. A failure in one
    city is printed and does not stop the remaining cities.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Zillow")
        return []

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = _launch_context(p)
        try:
            page = ctx.new_page()
            for city in cities:
                try:
                    _load_zillow_search(page, city, max_price)
                    html = _html_after_challenge(page, city)
                    if html is not None:
                        listings = _parse_zillow_html(html, DEFAULT_MIN_PRICE, max_price)
                        print(f" -> {len(listings)} encontradas")
                        all_results.extend(listings)
                except Exception as e:  # per-city boundary: log and move on
                    print(f" ERROR {city}: {e}")
                _hd(4, 7)  # pause between cities to avoid getting blocked
        finally:
            ctx.close()

    return _dedupe_by_address(all_results)


# ── Realtor.com via Playwright + Chrome profile ───────────────────────────────
def _realtor_listings(properties: list, fallback_city: str, max_price: int) -> list:
    """Convert Realtor.com ``properties`` entries into listing dicts.

    Keeps entries whose numeric ``list_price`` lies within
    ``[DEFAULT_MIN_PRICE, max_price]``; *fallback_city* is used when an entry
    carries no city of its own.
    """
    listings = []
    for item in properties:
        if not isinstance(item, dict):
            continue
        price = item.get("list_price", 0)
        if not isinstance(price, (int, float)):
            continue
        price = int(price)
        if not (DEFAULT_MIN_PRICE <= price <= max_price):
            continue
        # Nested objects can be null in the payload, hence the `or {}`.
        loc = _dig(item, "location", "address") or {}
        desc = item.get("description") or {}
        city_val = loc.get("city") or fallback_city
        listings.append({
            "source": "Realtor.com",
            "address": loc.get("line") or "",
            "city": city_val,
            "county": get_county_for_city(city_val),
            "zipcode": str(loc.get("postal_code", "")),
            "price": price,
            "beds": desc.get("beds", 0),
            "baths": desc.get("baths_consolidated", 0),
            "sqft": desc.get("sqft", 0),
            "status": item.get("status", "For Sale"),
            "url": "https://www.realtor.com" + (item.get("permalink") or ""),
            "image_url": _dig(item, "primary_photo", "href") or "",
            "property_type": desc.get("type", ""),
        })
    return listings


def scrape_realtor(cities: list, max_price: int) -> list:
    """Scrape Realtor.com for each city; return listings deduplicated by address.

    Returns an empty list when Playwright is not installed. A failure in one
    city is printed and does not stop the remaining cities.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Realtor.com")
        return []

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = _launch_context(p)
        try:
            page = ctx.new_page()
            for city in cities:
                city_slug = re.sub(r"[^a-z0-9\s]", "", city.lower()).strip().replace(" ", "_")
                url = f"https://www.realtor.com/realestateandhomes-search/{city_slug}_FL/price-na-{max_price}"
                print(f" Realtor.com: buscando {city}...")
                try:
                    page.goto(url, wait_until="load", timeout=45000)
                    _hd(2, 3)
                    _scroll(page, 4)
                    _hd(1, 2)
                    raw = _extract_next_data(page.content())
                    if raw is not None:
                        data = json.loads(raw)
                        properties = _dig(data, "props", "pageProps", "properties")
                        if not isinstance(properties, list):
                            properties = []
                        all_results.extend(_realtor_listings(properties, city, max_price))
                        print(f" -> {len(properties)} revisadas")
                except Exception as e:  # per-city boundary: log and move on
                    print(f" ERROR {city}: {e}")
                _hd(3, 5)  # pause between cities
        finally:
            ctx.close()

    return _dedupe_by_address(all_results)


# ── HUD Homes (requests - government site, no anti-bot) ───────────────────────
def scrape_hud(cities: list, max_price: int) -> list:
    """Scrape HUD Homes listings for the counties of *cities*.

    Cities are mapped to counties through ``CITY_COUNTY_MAP``; when none
    maps, a default set of counties is used. At most 6 counties (in
    alphabetical order) and 8 rows per county are read. Each listing's
    ``city`` is set to the county name and its ``url`` to the search URL.

    Returns scored listing dicts; errors for a county are printed and that
    county is skipped.
    """
    if requests is None or BeautifulSoup is None:
        print(" requests/bs4 no instalado — saltando HUD Homes")
        return []

    counties = {c for c in map(get_county_for_city, cities) if c}
    if not counties:
        counties = {"Brevard", "Indian River", "Duval", "St. Johns", "Volusia"}

    results = []
    # sorted() makes the selection deterministic; set order is arbitrary.
    for county in sorted(counties)[:6]:
        code = COUNTY_CODES.get(county, "")
        if not code:
            continue
        url = (f"https://www.hudhomestore.gov/HudHomes/Index.aspx"
               f"?sState=FL&sCounty={code}&sPriceMax={max_price}&sPriceMin={HUD_MIN_PRICE}")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            r.raise_for_status()  # surface HTTP errors instead of parsing an error page
            soup = BeautifulSoup(r.text, "html.parser")
            rows = soup.select("tr.propRow, .property-row, tr[id^='prop']")
            for row in rows[:8]:
                text = row.get_text(" ", strip=True)
                price_m = re.search(r'\$[\d,]+', text)
                if not price_m:
                    continue
                price = int(re.sub(r'[^\d]', '', price_m.group()))
                if not (0 < price <= max_price):
                    continue
                addr_el = row.select_one("a")
                prop = {
                    "source": "HUD Homes",
                    "address": addr_el.get_text(strip=True) if addr_el else text[:80],
                    "city": county,
                    "county": county,
                    "price": price,
                    "url": url,
                    "status": "HUD Foreclosure",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:  # per-county boundary: log and move on
            print(f" HUD {county}: {e}")
    return results


# ── Fannie Mae HomePath (REO) ──────────────────────────────────────────────────
def scrape_homepath(cities: list, max_price: int) -> list:
    """Query Fannie Mae HomePath for the first 8 entries of *cities*.

    Only JSON responses are used, and at most 6 listings per search term are
    read. Entries without a positive numeric price up to *max_price* are
    skipped.

    Returns scored listing dicts; errors for a term are printed and that term
    is skipped.
    """
    if requests is None:
        print(" requests no instalado — saltando Fannie Mae HomePath")
        return []

    results = []
    for term in cities[:8]:
        url = (f"https://www.homepath.fanniemae.com/listings"
               f"?searchTerm={requests.utils.quote(term + ' FL')}"
               f"&maxPrice={max_price}&state=FL")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            data = r.json() if "json" in r.headers.get("content-type", "") else {}
            if not isinstance(data, dict):
                data = {}
            listings = data.get("listings", data.get("results", []))
            if not isinstance(listings, list):
                listings = []
            for item in listings[:6]:
                if not isinstance(item, dict):
                    continue
                price = item.get("listPrice", item.get("price", 0))
                if not _is_number(price) or not (0 < price <= max_price):
                    continue
                city_val = item.get("city") or term
                prop = {
                    "source": "Fannie Mae HomePath",
                    "address": item.get("address", item.get("streetAddress", "")),
                    "city": city_val,
                    "county": get_county_for_city(city_val),
                    "zipcode": str(item.get("postalCode", "")),
                    "price": price,
                    "beds": item.get("bedrooms", 0),
                    "baths": item.get("bathrooms", 0),
                    "sqft": item.get("squareFeet", 0),
                    "url": f"https://www.homepath.fanniemae.com/listings/{item.get('id','')}",
                    "status": "Fannie Mae REO",
                }
                prop["score"] = score_property(prop, cities, max_price)
                results.append(prop)
        except Exception as e:  # per-term boundary: log and move on
            print(f" HomePath {term}: {e}")
    return results


# ── Main runner ───────────────────────────────────────────────────────────────
def run_all_scrapers(cities: list = None, max_price: int = 230000) -> dict:
    """Run every scraper and return the merged, scored results.

    Args:
        cities: City names to search; a default list is used when empty/None.
        max_price: Upper price bound passed to every scraper and to scoring.

    Returns:
        A dict with ``properties`` (listings sorted by descending score),
        ``log`` (per-source ``found`` count and ``status``),
        ``cities_searched``, ``max_price`` and ``ran_at`` (naive UTC
        timestamp in ISO format).

    Within each source, listings are deduplicated on (address, price); a
    source that raises is recorded in ``log`` and does not stop the others.
    """
    if not cities:
        cities = ["Vero Beach", "Jacksonville", "Melbourne", "St. Augustine"]

    all_props = []
    log = {}

    sources = {
        "Zillow (Browser)": scrape_zillow,
        "Realtor.com (Browser)": scrape_realtor,
        "HUD Homes": scrape_hud,
        "Fannie Mae HomePath": scrape_homepath,
    }

    for name, scraper in sources.items():
        try:
            print(f"\n[{name}]")
            props = scraper(cities, max_price)
            seen = set()
            unique = []
            for p in props:
                key = ((p.get("address") or "").lower().strip(), p.get("price", 0))
                if key[0] and key not in seen:
                    seen.add(key)
                    p["score"] = score_property(p, cities, max_price)
                    unique.append(p)
            all_props.extend(unique)
            log[name] = {"found": len(unique), "status": "ok"}
            print(f" -> {len(unique)} propiedades validas")
        except Exception as e:  # per-source boundary: record and continue
            log[name] = {"found": 0, "status": f"error: {e}"}
            print(f" ERROR {name}: {e}")

    all_props.sort(key=lambda x: x.get("score", 0), reverse=True)

    return {
        "properties": all_props,
        "log": log,
        "cities_searched": cities,
        "max_price": max_price,
        # Same naive-UTC string as the deprecated datetime.utcnow().
        "ran_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    }