# Agente-Marketing/casa-hunter/app/scrapers.py

import re, json, time, random, shutil, os
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# ── Config ────────────────────────────────────────────────────────────────────
# Chrome executable passed to Playwright as `executable_path` by the browser scrapers.
CHROME_PATH    = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Real Chrome user-data directory; ensure_chrome_profile() copies files out of its "Default" folder.
# NOTE(review): hard-coded to one Windows user account — confirm before running elsewhere.
CHROME_PROFILE = r"C:\Users\aerom\AppData\Local\Google\Chrome\User Data"
# Scratch user-data directory that the browser scrapers launch Chrome with.
TEMP_PROFILE   = r"C:\Temp\chrome_casa_hunter"

# Browser-like HTTP headers sent with the plain `requests` calls
# (the HUD and HomePath scrapers); the Playwright scrapers do not use them.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Lower-cased Florida city name -> county name.
# Looked up through get_county_for_city(), which lower-cases its argument.
CITY_COUNTY_MAP = {
    # Indian River
    "vero beach": "Indian River",
    "sebastian": "Indian River",
    "fellsmere": "Indian River",
    "indian river shores": "Indian River",
    # Martin
    "stuart": "Martin",
    "jensen beach": "Martin",
    "hobe sound": "Martin",
    "palm city": "Martin",
    "port salerno": "Martin",
    # St. Lucie
    "fort pierce": "St. Lucie",
    "port st. lucie": "St. Lucie",
    # Brevard
    "melbourne": "Brevard",
    "palm bay": "Brevard",
    "titusville": "Brevard",
    "cocoa": "Brevard",
    "cocoa beach": "Brevard",
    "rockledge": "Brevard",
    "merritt island": "Brevard",
    "cape canaveral": "Brevard",
    "satellite beach": "Brevard",
    "west melbourne": "Brevard",
    # Volusia
    "daytona beach": "Volusia",
    "ormond beach": "Volusia",
    "new smyrna beach": "Volusia",
    "edgewater": "Volusia",
    "port orange": "Volusia",
    "deltona": "Volusia",
    # Flagler
    "palm coast": "Flagler",
    "flagler beach": "Flagler",
    "bunnell": "Flagler",
    # St. Johns
    "st. augustine": "St. Johns",
    "ponte vedra beach": "St. Johns",
    "nocatee": "St. Johns",
    "st. augustine beach": "St. Johns",
    # Duval
    "jacksonville": "Duval",
    "jacksonville beach": "Duval",
    "atlantic beach": "Duval",
    "neptune beach": "Duval",
    # Nassau
    "fernandina beach": "Nassau",
    "yulee": "Nassau",
}

# County name -> code placed in the `sCounty` query parameter of the HUD search URL.
# NOTE(review): the values look like Florida county FIPS codes without leading
# zeros, but Martin's FIPS code is 085 — confirm that "86" is intended.
COUNTY_CODES = {
    "Brevard": "9",
    "Duval": "31",
    "Flagler": "35",
    "Indian River": "61",
    "Martin": "86",
    "Nassau": "89",
    "St. Johns": "109",
    "St. Lucie": "111",
    "Volusia": "127",
}


def get_county_for_city(city: str) -> str:
    """Return the county name for *city*, or "" when the city is unknown.

    The lookup is case-insensitive and ignores surrounding whitespace.
    Missing or non-string input (for example a JSON null taken from a
    scraped listing) yields "" instead of raising AttributeError.
    """
    if not city or not isinstance(city, str):
        return ""
    return CITY_COUNTY_MAP.get(city.strip().lower(), "")


def score_property(prop: dict, search_cities: list, max_price: int) -> int:
    """Score a listing from 0 to 100; higher means a better match.

    Scoring starts at 40 and adds:
      * up to 35 points the further the price is below ``max_price``
        (nothing when the price is above it);
      * 12 points when the listing's city or county matches a searched place;
      * 8 points for 3+ bedrooms, 4 points for 2;
      * 10 points for distressed/discounted statuses, 5 for new construction.

    Args:
        prop: Listing dict; reads "price", "city", "county", "beds", "status".
        search_cities: Place names the user searched for (may be None/empty).
        max_price: Budget ceiling used for the price ratio.

    Returns:
        0 when the price is missing, non-numeric or not positive; otherwise
        the score capped at 100.
    """
    price = prop.get("price", 0)
    if not isinstance(price, (int, float)) or price <= 0:
        return 0

    score = 40

    # Without a positive budget there is no meaningful ratio (and dividing
    # would raise), so the price bonus is simply skipped.
    if max_price and max_price > 0:
        ratio = price / max_price
        if ratio <= 0.60:
            score += 35
        elif ratio <= 0.75:
            score += 25
        elif ratio <= 0.90:
            score += 15
        elif ratio <= 1.0:
            score += 8

    city = (prop.get("city") or "").lower()
    county = (prop.get("county") or "").lower()
    # Only compare non-empty strings: "" is a substring of everything, so an
    # empty city/county would otherwise always earn the location bonus.
    places = [place for place in (city, county) if place]
    for term in search_cities or []:
        term = (term or "").lower()
        if term and any(term in place or place in term for place in places):
            score += 12
            break

    beds = prop.get("beds") or 0
    if beds >= 3:
        score += 8
    elif beds >= 2:
        score += 4

    status = (prop.get("status") or "").lower()
    if any(w in status for w in ["foreclosure", "reo", "bank owned", "hud", "price reduced"]):
        score += 10
    elif any(w in status for w in ["new construction", "newly built"]):
        score += 5

    return min(score, 100)


# ── Chrome profile setup ───────────────────────────────────────────────────────
def ensure_chrome_profile():
    """Copy the Chrome profile files if the temporary profile does not exist.

    Returns True when TEMP_PROFILE already contains a "Default" folder or the
    copy attempt completed, and False when CHROME_PROFILE is missing or the
    copy raised an exception.
    """
    temp_default = os.path.join(TEMP_PROFILE, "Default")
    if os.path.exists(temp_default):
        return True
    if not os.path.exists(CHROME_PROFILE):
        return False

    wanted = ("Cookies", "Login Data", "Web Data", "Preferences")
    try:
        os.makedirs(temp_default, exist_ok=True)
        source_default = os.path.join(CHROME_PROFILE, "Default")
        for name in wanted:
            candidate = os.path.join(source_default, name)
            if not os.path.exists(candidate):
                continue  # this file is absent from the source profile
            shutil.copy2(candidate, temp_default)
    except Exception as e:
        print(f"  Profile copy warning: {e}")
        return False
    return True


# ── Playwright helpers ─────────────────────────────────────────────────────────
def _hd(a=1.2, b=3.0):
    time.sleep(random.uniform(a, b))

def _scroll(page, steps=4):
    for _ in range(steps):
        page.mouse.wheel(0, random.randint(250, 600))
        time.sleep(random.uniform(0.4, 0.9))

def _type_human(page, text):
    for ch in text:
        page.keyboard.type(ch)
        time.sleep(random.uniform(0.07, 0.16))

def _parse_zillow_html(html, min_p=40000, max_p=230000):
    results = []
    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return results
    try:
        data = json.loads(m.group(1))
        list_results = (data.get("props", {}).get("pageProps", {})
                        .get("searchPageState", {}).get("cat1", {})
                        .get("searchResults", {}).get("listResults", []))
        for p in list_results:
            price = p.get("unformattedPrice", 0)
            if min_p <= price <= max_p:
                city_val = p.get("addressCity", "")
                results.append({
                    "source": "Zillow",
                    "address": p.get("address", ""),
                    "price": price,
                    "beds": p.get("beds", 0),
                    "baths": p.get("baths", 0),
                    "sqft": p.get("area", 0),
                    "city": city_val,
                    "state": p.get("addressState", "FL"),
                    "county": get_county_for_city(city_val),
                    "zipcode": str(p.get("addressZipcode", "")),
                    "status": p.get("statusType", "For Sale"),
                    "url": "https://www.zillow.com" + p.get("detailUrl", ""),
                    "image_url": p.get("imgSrc", ""),
                    "property_type": p.get("hdpData", {}).get("homeInfo", {}).get("homeType", ""),
                })
    except Exception as e:
        print(f"  JSON parse error: {e}")
    return results


# ── Zillow via Playwright + Chrome profile ────────────────────────────────────
def scrape_zillow(cities: list, max_price: int) -> list:
    """Scrape Zillow for-sale listings for each city through a Chrome window.

    Chrome is launched via Playwright with the profile stored in
    TEMP_PROFILE.  For every city the function types "<city>, FL" into the
    site's search box (or, when no box is found, opens a direct search URL
    carrying the price filter) and parses the page with
    _parse_zillow_html().  If an anti-bot challenge page is detected it
    polls for up to 90 seconds so a person can solve it in the open window.

    Args:
        cities: City names without the state suffix.
        max_price: Upper price bound; the lower bound is fixed at 40000.

    Returns:
        Listing dicts de-duplicated by address; [] when Playwright is not
        installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("  Playwright no instalado — saltando Zillow")
        return []

    ensure_chrome_profile()
    all_results = []
    MIN_PRICE = 40000

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
                "--no-default-browser-check",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()

            for city in cities:
                city_q = f"{city}, FL"
                print(f"  Zillow: buscando {city_q}...")
                try:
                    page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
                    _hd(1.5, 2.5)

                    search = page.query_selector(
                        "#search-box-input, input[id*='search'], "
                        "input[placeholder*='address'], input[placeholder*='city']"
                    )
                    if search:
                        search.click()
                        _hd(0.3, 0.6)
                        # Select-all + Delete clears whatever the box held before.
                        page.keyboard.down("Control")
                        page.keyboard.press("a")
                        page.keyboard.up("Control")
                        page.keyboard.press("Delete")
                        _hd(0.2, 0.4)
                        _type_human(page, city_q)
                        _hd(0.8, 1.5)
                        page.keyboard.press("Enter")
                        page.wait_for_load_state("load", timeout=30000)
                        _hd(2, 3)
                        _scroll(page, 4)
                        _hd(1, 2)
                    else:
                        # Fallback: open the search URL directly.
                        slug = re.sub(r"[^a-z0-9\s-]", "", city.lower()).strip().replace(" ", "-")
                        url = (f"https://www.zillow.com/homes/for_sale/{slug}-fl/"
                               f"?searchQueryState=%7B%22filterState%22%3A%7B"
                               f"%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A{MIN_PRICE}%7D%7D%7D")
                        page.goto(url, wait_until="load", timeout=45000)
                        _hd(2, 3)
                        _scroll(page, 4)

                    html = page.content()
                    title = page.title()
                    # If a challenge page came back, wait (max 90s) for the
                    # user to solve it.  The same predicate is used for
                    # detection and for the "solved" check so that every
                    # recognised challenge type is actually waited on.
                    if _zillow_blocked(title, html):
                        print(f"    >> Cloudflare en {city}: resuelve el challenge en el browser (90s max)...")
                        deadline = time.time() + 90
                        while time.time() < deadline:
                            time.sleep(4)
                            html = page.content()
                            if not _zillow_blocked(page.title(), html):
                                print(f"    Challenge resuelto!")
                                break
                        else:
                            print(f"    Timeout esperando challenge - saltando {city}")
                            continue

                    listings = _parse_zillow_html(html, MIN_PRICE, max_price)
                    print(f"    -> {len(listings)} encontradas")
                    all_results.extend(listings)

                except Exception as e:
                    print(f"    ERROR {city}: {e}")

                _hd(4, 7)  # pause between cities to reduce the chance of being blocked
        finally:
            # Close the browser even when an unexpected error escapes the loop.
            ctx.close()

    # De-duplicate by normalised address (tolerating a missing/None address).
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique


def _zillow_blocked(title: str, html: str) -> bool:
    """Return True when the title/HTML look like an anti-bot challenge page."""
    return (
        "denied" in (title or "").lower()
        or "px-captcha" in html
        or "cf-browser-verification" in html
    )


# ── Realtor.com via Playwright + Chrome profile ───────────────────────────────
def scrape_realtor(cities: list, max_price: int) -> list:
    """Scrape Realtor.com listings for each city through a Chrome window.

    Chrome is launched via Playwright with the profile stored in
    TEMP_PROFILE.  Each city's search page is opened directly and the
    listings are read from the JSON in its ``__NEXT_DATA__`` script tag.
    Nested fields are read defensively so that a listing with a null photo,
    location or description does not abort the rest of the city's results.

    Args:
        cities: City names without the state suffix.
        max_price: Upper price bound; the lower bound is fixed at 40000.

    Returns:
        Listing dicts de-duplicated by address; [] when Playwright is not
        installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("  Playwright no instalado — saltando Realtor.com")
        return []

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()

            for city in cities:
                # NOTE(review): words are joined with "_" here — confirm this
                # is the slug form Realtor.com expects for multi-word cities.
                city_slug = re.sub(r"[^a-z0-9\s]", "", city.lower()).strip().replace(" ", "_")
                url = f"https://www.realtor.com/realestateandhomes-search/{city_slug}_FL/price-na-{max_price}"
                print(f"  Realtor.com: buscando {city}...")
                try:
                    page.goto(url, wait_until="load", timeout=45000)
                    _hd(2, 3)
                    _scroll(page, 4)
                    _hd(1, 2)

                    html = page.content()
                    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
                    if not m:
                        continue
                    data = json.loads(m.group(1))
                    page_props = (data.get("props") or {}).get("pageProps") or {}
                    properties = page_props.get("properties") or []
                    for item in properties:
                        if not isinstance(item, dict):
                            continue
                        price = item.get("list_price", 0)
                        if not isinstance(price, (int, float)):
                            continue
                        price = int(price)
                        if not (40000 <= price <= max_price):
                            continue
                        # Any of these nested objects may be null in the payload.
                        loc = (item.get("location") or {}).get("address") or {}
                        desc = item.get("description") or {}
                        photo = item.get("primary_photo") or {}
                        city_val = loc.get("city") or city
                        all_results.append({
                            "source": "Realtor.com",
                            "address": loc.get("line") or "",
                            "city": city_val,
                            "county": get_county_for_city(city_val),
                            "zipcode": str(loc.get("postal_code", "")),
                            "price": price,
                            "beds": desc.get("beds", 0),
                            "baths": desc.get("baths_consolidated", 0),
                            "sqft": desc.get("sqft", 0),
                            "status": item.get("status", "For Sale"),
                            "url": "https://www.realtor.com" + (item.get("permalink") or ""),
                            "image_url": photo.get("href", ""),
                            "property_type": desc.get("type", ""),
                        })
                    print(f"    -> {len(properties)} revisadas")

                except Exception as e:
                    print(f"    ERROR {city}: {e}")

                _hd(3, 5)
        finally:
            # Close the browser even when an unexpected error escapes the loop.
            ctx.close()

    # De-duplicate by normalised address (tolerating a missing/None address).
    seen, unique = set(), []
    for r in all_results:
        key = (r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique


# ── HUD Homes (requests - government site, no anti-bot) ──────────────────────
def scrape_hud(cities: list, max_price: int) -> list:
    """Scrape HUD Home Store listings for the counties of the given cities.

    Cities are mapped to counties with get_county_for_city(); when none map,
    a fixed default list of counties is used.  At most six counties are
    queried and at most eight result rows are read per county.

    Args:
        cities: City names used both to pick counties and to score results.
        max_price: Upper price bound, sent in the URL and re-checked locally.

    Returns:
        Scored listing dicts.  Each listing's "city" is set to the county
        name and its "url" to the county search URL.
    """
    cities = cities or []
    results = []

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # six-county cap below always drops the same counties for the same input
    # (slicing a set dropped arbitrary ones).
    counties = list(dict.fromkeys(
        county for county in (get_county_for_city(city) for city in cities) if county
    ))
    if not counties:
        counties = ["Brevard", "Indian River", "Duval", "St. Johns", "Volusia"]

    for county in counties[:6]:
        code = COUNTY_CODES.get(county, "")
        if not code:
            continue
        url = (f"https://www.hudhomestore.gov/HudHomes/Index.aspx"
               f"?sState=FL&sCounty={code}&sPriceMax={max_price}&sPriceMin=30000")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Report HTTP errors instead of parsing an error page as results.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            rows = soup.select("tr.propRow, .property-row, tr[id^='prop']")
            for row in rows[:8]:
                text = row.get_text(" ", strip=True)
                # The first dollar amount in the row text is taken as the price.
                price_m = re.search(r'\$[\d,]+', text)
                if not price_m:
                    continue
                price = int(re.sub(r'[^\d]', '', price_m.group()))
                if 0 < price <= max_price:
                    addr_el = row.select_one("a")
                    prop = {
                        "source": "HUD Homes",
                        "address": addr_el.get_text(strip=True) if addr_el else text[:80],
                        "city": county, "county": county,
                        "price": price, "url": url,
                        "status": "HUD Foreclosure",
                    }
                    prop["score"] = score_property(prop, cities, max_price)
                    results.append(prop)
        except Exception as e:
            print(f"  HUD {county}: {e}")
    return results


# ── Fannie Mae HomePath (REO) ──────────────────────────────────────────────────
def scrape_homepath(cities: list, max_price: int) -> list:
    """Query Fannie Mae HomePath for REO listings near the given cities.

    At most eight search terms are queried and at most six listings are read
    per term.  Responses that are not JSON objects are ignored, and each
    listing is validated on its own so a null or non-numeric price skips
    that listing instead of discarding the rest of the term's results.

    Args:
        cities: Search terms (city names); None or [] yields [].
        max_price: Upper price bound, sent in the URL and re-checked locally.

    Returns:
        Scored listing dicts.
    """
    results = []
    for term in (cities or [])[:8]:
        url = (f"https://www.homepath.fanniemae.com/listings"
               f"?searchTerm={requests.utils.quote(term + ' FL')}"
               f"&maxPrice={max_price}&state=FL")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            r.raise_for_status()
            data = r.json() if "json" in r.headers.get("content-type", "") else {}
        except Exception as e:
            print(f"  HomePath {term}: {e}")
            continue

        if not isinstance(data, dict):
            continue
        listings = data.get("listings") or data.get("results") or []
        if not isinstance(listings, list):
            continue

        for item in listings[:6]:
            if not isinstance(item, dict):
                continue
            price = item.get("listPrice", item.get("price", 0))
            if isinstance(price, bool) or not isinstance(price, (int, float)):
                continue
            if not (0 < price <= max_price):
                continue
            city_val = item.get("city") or term
            prop = {
                "source": "Fannie Mae HomePath",
                "address": item.get("address", item.get("streetAddress", "")),
                "city": city_val,
                "county": get_county_for_city(city_val),
                "zipcode": str(item.get("postalCode", "")),
                "price": price,
                "beds": item.get("bedrooms", 0),
                "baths": item.get("bathrooms", 0),
                "sqft": item.get("squareFeet", 0),
                "url": f"https://www.homepath.fanniemae.com/listings/{item.get('id','')}",
                "status": "Fannie Mae REO",
            }
            prop["score"] = score_property(prop, cities, max_price)
            results.append(prop)
    return results


# ── Main runner ───────────────────────────────────────────────────────────────
def run_all_scrapers(cities: list = None, max_price: int = 230000) -> dict:
    """Run every scraper, score the results and return them best-first.

    Each source runs inside its own try/except, so one failing scraper is
    recorded in the log without stopping the others.  Results are
    de-duplicated per source on (address, price) and scored with
    score_property().

    Args:
        cities: City names to search; a default list is used when empty/None.
        max_price: Upper price bound passed to every scraper.

    Returns:
        A dict with "properties" (sorted by descending score), "log"
        (per-source count and status), "cities_searched", "max_price" and
        "ran_at" (naive UTC timestamp in ISO format).
    """
    # Local import: the module header only imports `datetime` itself.
    from datetime import timezone

    if not cities:
        cities = ["Vero Beach", "Jacksonville", "Melbourne", "St. Augustine"]

    all_props = []
    log = {}

    sources = {
        "Zillow (Browser)":        lambda: scrape_zillow(cities, max_price),
        "Realtor.com (Browser)":   lambda: scrape_realtor(cities, max_price),
        "HUD Homes":               lambda: scrape_hud(cities, max_price),
        "Fannie Mae HomePath":     lambda: scrape_homepath(cities, max_price),
    }

    for name, fn in sources.items():
        try:
            print(f"\n[{name}]")
            props = fn()
            seen, unique = set(), []
            for p in props:
                key = ((p.get("address") or "").lower().strip(), p.get("price", 0))
                if key[0] and key not in seen:
                    seen.add(key)
                    p["score"] = score_property(p, cities, max_price)
                    unique.append(p)
            all_props.extend(unique)
            log[name] = {"found": len(unique), "status": "ok"}
            print(f"  -> {len(unique)} propiedades validas")
        except Exception as e:
            log[name] = {"found": 0, "status": f"error: {e}"}
            print(f"  ERROR {name}: {e}")

    all_props.sort(key=lambda x: x.get("score", 0), reverse=True)
    # datetime.utcnow() is deprecated; this yields the same naive-UTC ISO
    # string, so consumers of "ran_at" see an unchanged format.
    ran_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "properties": all_props,
        "log": log,
        "cities_searched": cities,
        "max_price": max_price,
        "ran_at": ran_at,
    }