# BoatAndShip-Finder/scraper_test.py

"""
scraper_test.py — Prueba individual de scrapers sin levantar el servidor Flask.

Uso:
    python scraper_test.py                    # prueba los 5 scrapers con query por defecto
    python scraper_test.py "catalina 30"      # query personalizada
    python scraper_test.py "beneteau" ebay    # solo eBay
    python scraper_test.py "sailboat" yachtworld boattrader

Scrapers disponibles: ebay, yachtworld, boattrader, boats, hibid
"""

import sys
import re
import time
import random
import threading
import requests
import urllib3
from bs4 import BeautifulSoup

# Silence urllib3's InsecureRequestWarning: polite_pause() issues requests
# with verify=False, which would otherwise warn on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ── User-Agents ──────────────────────────────────────────────────────────────
# Pool of desktop browser User-Agent strings. get_headers() and every scraper
# pick one at random with random.choice().
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
]

# State used by polite_pause(): a counter that cycles through
# _interleave_sites, updated only while holding _interleave_lock.
_interleave_lock = threading.Lock()
_interleave_idx  = 0
_interleave_sites = ["https://miami.craigslist.org", "https://www.ebay.com"]


def get_headers(referer=None):
    """Build a browser-like set of HTTP request headers.

    A User-Agent is drawn at random from USER_AGENTS on every call. When
    *referer* is truthy it is appended as the ``Referer`` header; otherwise
    that header is omitted.
    """
    base_headers = dict([
        ('User-Agent', random.choice(USER_AGENTS)),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-US,en;q=0.9'),
        ('Accept-Encoding', 'gzip, deflate, br'),
        ('Connection', 'keep-alive'),
        ('Cache-Control', 'max-age=0'),
    ])
    if not referer:
        return base_headers
    return {**base_headers, 'Referer': referer}


def polite_pause(source_name: str):
    """Fetch one of the interleave sites, then sleep 2-4 seconds.

    Each call advances a shared counter (under ``_interleave_lock``) and
    requests the site at that position in ``_interleave_sites``, cycling
    through the list. The request is best-effort: any error is ignored.
    A message tagged with *source_name* is printed after the sleep.
    """
    global _interleave_idx
    with _interleave_lock:
        position = _interleave_idx
        _interleave_idx = position + 1
    target = _interleave_sites[position % len(_interleave_sites)]

    try:
        requests.get(target, headers=get_headers(), timeout=5, verify=False)
    except Exception:
        # The response is never used, so a failed request is not an error.
        pass

    delay = random.uniform(2.0, 4.0)
    time.sleep(delay)
    print(f"[{source_name}] Pausa cortés lista — continuando...")


def _extract_best_src(img_tag) -> str:
    candidates = [
        img_tag.get("src",""), img_tag.get("data-src",""),
        img_tag.get("data-lazy-src",""), img_tag.get("data-original",""),
        img_tag.get("data-lazy",""), img_tag.get("data-image",""),
    ]
    srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","")
    if srcset:
        parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()]
        candidates.extend(parts)
    for c in candidates:
        c = c.strip()
        if c and c.startswith("http") and not c.startswith("data:"):
            return c
    return ""


# ══════════════════════════════════════════════════════════════════════════════
# SCRAPERS
# ══════════════════════════════════════════════════════════════════════════════

def scrape_ebay(query: str, filters: dict = None) -> list:
    """Search eBay's boat listings for *query* with a headless Chromium page.

    Args:
        query: Free-text search terms. Repeated words are collapsed before
            the terms are URL-quoted.
        filters: Optional dict. Only ``filters["type"]`` is read; when it
            matches a key of the local category table, the ``_sacat``
            parameter of the search URL is replaced with that category id.
            A missing or ``None`` type means no override.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``price`` and
        ``source`` (always ``"eBay"``). Failures are printed rather than
        raised and yield whatever was collected so far.
    """
    filters = filters or {}
    search_url = "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10"
    results = []
    seen    = set()

    # dict.fromkeys drops repeated words while keeping their order.
    clean_q = " ".join(dict.fromkeys(query.strip().split()))
    url     = search_url.replace("{query}", requests.utils.quote(clean_q))

    # Category override by vessel type; tolerate filters["type"] being None.
    vtype = str(filters.get("type") or "").lower()
    ebay_cat = {
        "sailboat":"36431","sail":"36431","velero":"36431",
        "motor":"36432","motorboat":"36432","yacht":"36432",
        "fishing":"36432","tug":"36432","barge":"36432",
    }
    if vtype in ebay_cat:
        url = re.sub(r'_sacat=\d+', f'_sacat={ebay_cat[vtype]}', url)

    print(f"[eBay] URL: {url}")

    # One pattern for both layouts ("www." optional); the match drops any
    # tracking suffix so the URL can be used for de-duplication.
    item_url_re = re.compile(r'(https?://(?:www\.)?ebay\.com/itm/\d+)')

    try:
        from playwright.sync_api import sync_playwright
        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(1500,2500))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(800)
                    html = page.content()
                except Exception as e:
                    print(f"[eBay] Playwright error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Best-effort cleanup; the browser is closed below.
                        pass
            finally:
                # Close the browser even when context/page creation failed.
                browser.close()

        if not html:
            print("[eBay] Sin HTML")
            return results

        soup  = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("li", class_="s-card")
        print(f"[eBay] Tarjetas nuevo layout: {len(cards)}")

        if not cards:
            # Fall back to the old li.s-item layout.
            for item in soup.find_all("li", class_="s-item"):
                try:
                    link = item.find("a", class_="s-item__link")
                    if not link:
                        continue
                    m = item_url_re.search(link.get("href",""))
                    if not m:
                        continue
                    href = m.group(1)
                    if href in seen:
                        continue
                    seen.add(href)
                    title_el = (item.find("span", class_="BOLD") or
                                item.find("div",  class_="s-item__title") or
                                item.find("span", class_="s-item__title"))
                    title = (title_el or link).get_text(strip=True)
                    if not title or title.lower().startswith("shop on ebay"):
                        continue
                    price_el = item.find("span", class_="s-item__price")
                    price    = price_el.get_text(strip=True) if price_el else ""
                    results.append({"title":title,"url":href,"price":price,"source":"eBay"})
                except Exception:
                    # One malformed card must not abort the whole page.
                    continue
            print(f"[eBay] {len(results)} resultados (layout antiguo)")
            return results

        for card in cards:
            try:
                title_link = None
                for a in card.find_all("a", class_="s-card__link"):
                    # The image anchor shares the class but carries no title.
                    if "image-treatment" in (a.get("class") or []):
                        continue
                    t = a.get_text(strip=True)
                    if t and not t.lower().startswith("shop on ebay"):
                        title_link = a
                        break
                if not title_link:
                    continue
                href = title_link.get("href","")
                if "/itm/" not in href:
                    continue
                m = item_url_re.search(href)
                if not m:
                    continue
                href = m.group(1)
                if href in seen:
                    continue
                title = re.sub(r'\s*Opens in a new window or tab.*','',
                               title_link.get_text(strip=True), flags=re.I).strip()
                if not title:
                    # Nothing left once the accessibility suffix is removed.
                    continue
                seen.add(href)
                price_el = (card.find(class_="s-card__price") or
                            card.find(class_="s-item__price"))
                price = price_el.get_text(strip=True) if price_el else ""
                results.append({"title":title,"url":href,"price":price,"source":"eBay"})
            except Exception:
                continue

        print(f"[eBay] {len(results)} resultados")
    except Exception as e:
        print(f"[eBay] Error: {e}")

    return results


def scrape_yachtworld(query: str, filters: dict = None) -> list:
    """Collect listing links from a YachtWorld category page.

    Args:
        query: Accepted for signature parity with the other scrapers; it is
            not used to build the URL, so results are not filtered by it.
        filters: Optional dict. Only ``filters["type"]`` is read to choose
            between the sail and power category pages. A missing or ``None``
            type selects the sail-cruiser page.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``price`` (always
        ``""``) and ``source`` (``"YachtWorld"``), one per distinct listing
        URL. Failures are printed rather than raised.
    """
    filters     = filters or {}
    results     = []
    vessel_type = str(filters.get("type") or "").lower()
    sail_types  = ("sailboat", "sail", "velero", "")

    if vessel_type in sail_types:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"
    else:
        # "yacht" maps to the generic sail page; every other type to power.
        yw_type  = "sail" if vessel_type == "yacht" else "power"
        base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/"

    print(f"[YachtWorld] URL: {base_url}")

    try:
        from playwright.sync_api import sync_playwright
        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled','--no-sandbox']
            )
            try:
                context = browser.new_context(
                    viewport={'width':1920,'height':1080},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US', timezone_id='America/New_York',
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(base_url, timeout=30000, wait_until='domcontentloaded')
                    page.wait_for_timeout(random.randint(2000,4000))
                    for _ in range(3):
                        page.evaluate("window.scrollBy(0,400)")
                        page.wait_for_timeout(random.randint(400,800))
                    html = page.content()
                except Exception as e:
                    print(f"[YachtWorld] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Best-effort cleanup; the browser is closed below.
                        pass
            finally:
                # Close the browser even when context/page creation failed.
                browser.close()

        if not html:
            print("[YachtWorld] Sin HTML")
            return results

        soup   = BeautifulSoup(html,'html.parser')
        by_url = {}
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/boat-details/' not in href and '/yacht/' not in href:
                continue
            if not href.startswith('http'):
                href = 'https://www.yachtworld.com' + href
            title = a.get_text(strip=True)
            entry = by_url.get(href)
            if entry is None:
                entry = {"title":title,"url":href,"price":"","source":"YachtWorld"}
                by_url[href] = entry
                results.append(entry)
            elif title and not entry["title"]:
                # The first anchor for a listing may be a text-less image
                # link; take the title from a later anchor to the same URL.
                entry["title"] = title

        print(f"[YachtWorld] {len(results)} resultados")
    except Exception as e:
        print(f"[YachtWorld] Error: {e}")

    return results


def scrape_boattrader(query: str, filters: dict = None) -> list:
    """Search BoatTrader for *query* with a headless Chromium page.

    Args:
        query: Free-text search terms; stripped and URL-quoted.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``price`` (``"$"``
        plus digits and commas, or ``""``) and ``source`` (``"BoatTrader"``).
        Failures are printed rather than raised.
    """
    search_url = "https://www.boattrader.com/boats/?query={query}"
    results = []
    seen    = set()
    url     = search_url.replace("{query}", requests.utils.quote(query.strip()))
    print(f"[BoatTrader] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright
        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000,6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[BoatTrader] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Best-effort cleanup; the browser is closed below.
                        pass
            finally:
                # Close the browser even when context/page creation failed.
                browser.close()

        if not html:
            print("[BoatTrader] Sin HTML")
            return results

        soup  = BeautifulSoup(html,"html.parser")
        cards = soup.find_all("li", class_="lib-card")
        if not cards:
            # Fall back to any element whose class contains "lib-card".
            cards = soup.find_all(class_=re.compile(r'\blib-card\b'))
        print(f"[BoatTrader] Cards encontradas: {len(cards)}")

        # Hoisted out of the loop: these patterns do not change per card.
        detail_href_re = re.compile(r'^/boat/[\w-]+-\d+/$')
        title_class_re = re.compile(r'listingTitle',re.I)
        price_class_re = re.compile(r'listingPrice',re.I)

        for card in cards:
            try:
                link_tag = card.find("a", href=detail_href_re)
                if not link_tag:
                    continue
                href = "https://www.boattrader.com" + link_tag["href"]
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.find(class_=title_class_re)
                title = title_el.get_text(strip=True) if title_el else link_tag.get_text(strip=True)
                price_el = card.find(class_=price_class_re)
                price = ""
                if price_el:
                    # Keep only the first dollar amount in the price text.
                    pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True))
                    if pm:
                        price = f"${pm.group(1)}"
                results.append({"title":title,"url":href,"price":price,"source":"BoatTrader"})
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[BoatTrader] {len(results)} resultados")
    except Exception as e:
        print(f"[BoatTrader] Error: {e}")

    return results


def scrape_boats(query: str, filters: dict = None) -> list:
    """Search Boats.com for *query* with a headless Chromium page.

    Args:
        query: Free-text search terms; stripped and URL-quoted.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of dicts with the keys ``title`` (year plus model name when a
        year is present), ``url``, ``price`` (``"$"`` plus digits and commas,
        or ``""``) and ``source`` (``"Boats.com"``). Failures are printed
        rather than raised.
    """
    search_url = "https://www.boats.com/boats-for-sale/?query={query}"
    results = []
    seen    = set()
    url     = search_url.replace("{query}", requests.utils.quote(query.strip()))
    print(f"[Boats.com] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright
        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000,6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[Boats.com] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Best-effort cleanup; the browser is closed below.
                        pass
            finally:
                # Close the browser even when context/page creation failed.
                browser.close()

        if not html:
            print("[Boats.com] Sin HTML")
            return results

        soup = BeautifulSoup(html,"html.parser")

        # Site-relative paths only: "//host/path" is protocol-relative and
        # must not get the site origin prepended.
        relative_href_re = re.compile(r'^/(?!/)')

        def _extract_card(card):
            """Append one result for *card* unless it is unusable or a duplicate."""
            a = card.find("a", href=relative_href_re)
            if not a:
                return
            href = "https://www.boats.com" + a["href"]
            if href in seen:
                return
            seen.add(href)
            year_el = card.select_one("div.year")
            name_el = card.select_one("h2")
            year  = year_el.get_text(strip=True) if year_el else ""
            name  = name_el.get_text(strip=True) if name_el else ""
            title = f"{year} {name}".strip() if year else name
            if not title:
                return
            price_el = card.select_one("div.price")
            price = ""
            if price_el:
                # Keep only the first dollar amount in the price text.
                pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True))
                price = f"${pm.group(1)}" if pm else ""
            results.append({"title":title,"url":href,"price":price,"source":"Boats.com"})

        # Two card selectors are scanned; `seen` removes any overlap.
        for selector in ("li[data-listing-id]", "li.enhanced.oem"):
            for card in soup.select(selector):
                _extract_card(card)

        print(f"[Boats.com] {len(results)} resultados")
    except Exception as e:
        print(f"[Boats.com] Error: {e}")

    return results


def scrape_craigslist(query: str, filters: dict = None) -> list:
    """Search the boats section of three random Craigslist cities.

    Args:
        query: Free-text search terms; stripped and URL-quoted.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of dicts with the keys ``title`` (at most 120 characters),
        ``url``, ``price`` and ``source`` (``"Craigslist <city>"``),
        de-duplicated by URL across cities. Failures are printed rather
        than raised; a city that fails to load is skipped.
    """
    results = []
    seen    = set()
    cities  = ["miami", "tampa", "sfbay", "losangeles", "seattle",
               "boston", "newyork", "chicago", "sandiego"]
    qs = requests.utils.quote(query.strip())

    print(f"[Craigslist] Query: '{query}' - probando 3 ciudades al azar")

    try:
        from playwright.sync_api import sync_playwright
        all_html_parts = []
        cities_tested  = random.sample(cities, min(3, len(cities)))
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True,
                )
                for city in cities_tested:
                    city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel"
                    print(f"[Craigslist] >> {city_url}")
                    page = ctx.new_page()
                    try:
                        page.goto(city_url, timeout=25000, wait_until="domcontentloaded")
                        page.wait_for_timeout(2500)
                        all_html_parts.append((city, page.content()))
                    except Exception as e:
                        print(f"[Craigslist] {city} error: {e}")
                    finally:
                        try:
                            page.close()
                        except Exception:
                            # Best-effort cleanup; the browser is closed below.
                            pass
            finally:
                # Close the browser even when context/page creation failed.
                browser.close()

        for city, html in all_html_parts:
            soup  = BeautifulSoup(html, "html.parser")
            cards = soup.find_all(attrs={"data-pid": True})
            print(f"[Craigslist] {city}: {len(cards)} cards en HTML")
            for card in cards:
                try:
                    a_main = card.find("a", class_="main")
                    if not a_main:
                        continue
                    listing_url = a_main.get("href","")
                    if not listing_url or listing_url in seen:
                        continue
                    seen.add(listing_url)
                    # Prefer the card's title attribute, then its label span.
                    title = card.get("title","")
                    if not title:
                        span = card.find("span", class_="label")
                        title = span.get_text(strip=True) if span else ""
                    if not title:
                        continue
                    price_el = card.find("span", class_="priceinfo")
                    price    = price_el.get_text(strip=True) if price_el else ""
                    results.append({
                        "title": title[:120], "url": listing_url,
                        "price": price, "source": f"Craigslist {city}",
                    })
                except Exception:
                    # One malformed card must not abort the whole page.
                    continue

        print(f"[Craigslist] {len(results)} resultados totales")
    except Exception as e:
        print(f"[Craigslist] Error: {e}")

    return results


def scrape_hibid(query: str, filters: dict = None) -> list:
    """Search HiBid auction lots for *query* with a headless Chromium page.

    The word ``boat`` is appended to the query before it is URL-quoted.

    Args:
        query: Free-text search terms; may be empty.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of dicts with the keys ``title`` (at most 100 characters),
        ``url``, ``price`` and ``source`` (``"HiBid"``). Lots whose title is
        four characters or shorter are dropped. Failures are printed rather
        than raised.
    """
    results = []
    # Strip after joining so an empty query yields "boat", not " boat".
    q       = requests.utils.quote(f"{query.strip()} boat".strip())
    url     = f"https://www.hibid.com/lots?q={q}"
    print(f"[HiBid] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width":1280,"height":900},
                    locale="en-US", ignore_https_errors=True,
                )
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = ctx.new_page()
                try:
                    # A navigation error propagates to the outer handler.
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(4000)
                    html = page.content()
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Best-effort cleanup; the browser is closed below.
                        pass
            finally:
                # Close the browser on the error path as well.
                browser.close()

        soup = BeautifulSoup(html,"html.parser")
        seen = set()
        selectors = ".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"
        cards = soup.select(selectors)
        print(f"[HiBid] Cards encontradas: {len(cards)}")

        for card in cards:
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                href = a["href"]
                if not href.startswith("http"):
                    href = "https://www.hibid.com" + href
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price = price_el.get_text(strip=True) if price_el else ""
                if title and len(title) > 4:
                    results.append({"title":title,"url":href,"price":price,"source":"HiBid"})
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[HiBid] {len(results)} resultados")
    except Exception as e:
        print(f"[HiBid] Error: {e}")

    return results


# ══════════════════════════════════════════════════════════════════════════════
# RUNNER PRINCIPAL
# ══════════════════════════════════════════════════════════════════════════════

# Registry of command-line scraper names -> scraper functions. Each function
# is called by run_test() as fn(query, filters) and returns a list of result
# dicts. The dict order is the order used when no targets are given.
SCRAPER_MAP = {
    "ebay":        scrape_ebay,
    "yachtworld":  scrape_yachtworld,
    "boattrader":  scrape_boattrader,
    "boats":       scrape_boats,
    "hibid":       scrape_hibid,
    "craigslist":  scrape_craigslist,
}

def run_test(query: str, targets: list = None, filters: dict = None):
    """Run the selected scrapers in sequence and print a report.

    Args:
        query: Search terms passed unchanged to every scraper.
        targets: Scraper names (keys of ``SCRAPER_MAP``, matched
            case-insensitively). ``None`` or an empty list runs every
            registered scraper. Unknown names are reported and skipped.
        filters: Optional filter dict forwarded to every scraper.

    For each scraper the elapsed time and up to three sample results are
    printed, followed by a summary of the total count and of the scrapers
    that returned at least one result. Nothing is returned.
    """
    targets  = targets or list(SCRAPER_MAP.keys())
    filters  = filters or {}
    total    = 0
    all_ok   = []

    def safe(s):
        # Replace non-ASCII characters so printing cannot fail on consoles
        # with a limited output encoding.
        return s.encode('ascii','replace').decode('ascii')

    print("\n" + "="*60)
    print(f"  PRUEBA DE SCRAPERS  |  query: {query!r}")
    print("="*60 + "\n")

    for name in targets:
        fn = SCRAPER_MAP.get(name.lower())
        if not fn:
            print(f"[!] Scraper desconocido: {name}")
            continue
        print(f"\n{'-'*50}")
        print(f"  >> {name.upper()}")
        print(f"{'-'*50}")
        # perf_counter is monotonic, so system clock changes cannot skew it.
        t0 = time.perf_counter()
        try:
            results = fn(query, filters) or []
        except Exception as e:
            # One crashing scraper must not stop the remaining ones.
            print(f"[!!] {name}: error inesperado: {e}")
            results = []
        elapsed = time.perf_counter() - t0

        if results:
            all_ok.append(name)
            print(f"\n[OK] {name}: {len(results)} resultados en {elapsed:.1f}s")
            for i, r in enumerate(results[:3], 1):
                print(f"   {i}. {safe(r['title'][:70])}")
                if r.get('price'):
                    print(f"      $ {safe(r['price'])}")
                print(f"      > {safe(r['url'][:80])}")
        else:
            print(f"\n[!!] {name}: 0 resultados en {elapsed:.1f}s")

        total += len(results)

    print("\n" + "="*60)
    print(f"  RESUMEN: {total} resultados totales")
    print(f"  Funcionando: {', '.join(all_ok) if all_ok else 'ninguno'}")
    print("="*60)


if __name__ == "__main__":
    # Command line: [query] [scraper ...]
    args    = sys.argv[1:]
    query   = "sailboat velero"  # default
    targets = []

    if args:
        # If the first argument is a known scraper name, every argument is a
        # scraper name and the default query is kept. Otherwise the first
        # argument is the query and the rest are scraper names.
        if args[0].lower() in SCRAPER_MAP:
            candidates = args
        else:
            query      = args[0]
            candidates = args[1:]

        requested = [a.lower() for a in candidates]
        targets   = [a for a in requested if a in SCRAPER_MAP]
        ignored   = [a for a in requested if a not in SCRAPER_MAP]
        if ignored:
            # Say so explicitly: a typo would otherwise be dropped silently
            # and could end up running every scraper.
            print(f"[!] Scrapers desconocidos ignorados: {', '.join(ignored)}")

    # No valid target means "run every registered scraper".
    run_test(query, targets if targets else None)