""" scraper_test.py — Prueba individual de scrapers sin levantar el servidor Flask. Uso: python scraper_test.py # prueba los 5 scrapers con query por defecto python scraper_test.py "catalina 30" # query personalizada python scraper_test.py "beneteau" ebay # solo eBay python scraper_test.py "sailboat" yachtworld boattrader Scrapers disponibles: ebay, yachtworld, boattrader, boats, hibid """ import sys import re import time import random import threading import requests import urllib3 from bs4 import BeautifulSoup urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # ── User-Agents ────────────────────────────────────────────────────────────── USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0', ] _interleave_lock = threading.Lock() _interleave_idx = 0 _interleave_sites = ["https://miami.craigslist.org", "https://www.ebay.com"] def get_headers(referer=None): h = { 'User-Agent': random.choice(USER_AGENTS), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', } if referer: h['Referer'] = referer return h def polite_pause(source_name: str): global _interleave_idx with _interleave_lock: site = _interleave_sites[_interleave_idx % len(_interleave_sites)] _interleave_idx += 1 try: requests.get(site, headers=get_headers(), timeout=5, verify=False) except Exception: pass time.sleep(random.uniform(2.0, 4.0)) print(f"[{source_name}] Pausa cortés lista — continuando...") def _extract_best_src(img_tag) -> str: candidates = [ img_tag.get("src",""), img_tag.get("data-src",""), img_tag.get("data-lazy-src",""), img_tag.get("data-original",""), img_tag.get("data-lazy",""), img_tag.get("data-image",""), ] srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","") if srcset: parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()] candidates.extend(parts) for c in candidates: c = c.strip() if c and c.startswith("http") and not c.startswith("data:"): return c return "" # ══════════════════════════════════════════════════════════════════════════════ # SCRAPERS # ══════════════════════════════════════════════════════════════════════════════ def scrape_ebay(query: str, filters: dict = None) -> list: filters = filters or {} src = { "name": "eBay Marine", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", "type": "classifieds", "category": "Clasificados USA", } results = [] seen = set() raw_url = src["search_url"] clean_q = " ".join(dict.fromkeys(query.strip().split())) url = raw_url.replace("{query}", requests.utils.quote(clean_q)) # Ajuste de categoría por tipo de embarcación vtype = filters.get("type","").lower() EBAY_CAT = { "sailboat":"36431","sail":"36431","velero":"36431", "motor":"36432","motorboat":"36432","yacht":"36432", "fishing":"36432","tug":"36432","barge":"36432", } if vtype and vtype in EBAY_CAT: url = re.sub(r'_sacat=\d+', f'_sacat={EBAY_CAT[vtype]}', url) print(f"[eBay] URL: {url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled","--no-sandbox"] ) context = browser.new_context( viewport={"width":1280,"height":900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=30000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(1500,2500)) page.evaluate("window.scrollBy(0,600)") page.wait_for_timeout(800) html = page.content() except Exception as e: print(f"[eBay] Playwright error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: print("[eBay] Sin HTML") return [] soup = BeautifulSoup(html, "html.parser") cards = soup.find_all("li", class_="s-card") print(f"[eBay] Tarjetas nuevo layout: {len(cards)}") if not cards: # layout antiguo li.s-item for item in soup.find_all("li", class_="s-item"): try: link = item.find("a", class_="s-item__link") if not link: continue href = link.get("href","") m = re.search(r'(https?://www\.ebay\.com/itm/\d+)', href) if not m: continue href = m.group(1) if href in seen: continue seen.add(href) title_el = (item.find("span", class_="BOLD") or item.find("div", class_="s-item__title") or item.find("span", class_="s-item__title")) title = (title_el or link).get_text(strip=True) if not title or title.lower().startswith("shop on ebay"): continue price_el = item.find("span", class_="s-item__price") price = price_el.get_text(strip=True) if price_el else "" results.append({"title":title,"url":href,"price":price,"source":"eBay"}) except Exception: continue print(f"[eBay] {len(results)} resultados (layout antiguo)") return results for card in cards: try: title_link = None for a in card.find_all("a", class_="s-card__link"): if "image-treatment" in (a.get("class") or []): continue t = a.get_text(strip=True) if t and not t.lower().startswith("shop on ebay"): title_link = a; break if not title_link: continue href = title_link.get("href","") if "/itm/" not in href: continue m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href) if not m: continue href = m.group(1) if href in seen: continue seen.add(href) title = re.sub(r'\s*Opens in a new window or tab.*','', title_link.get_text(strip=True), flags=re.I).strip() price_el = (card.find(class_="s-card__price") or card.find(class_="s-item__price")) price = price_el.get_text(strip=True) if price_el else "" results.append({"title":title,"url":href,"price":price,"source":"eBay"}) except Exception: continue print(f"[eBay] {len(results)} resultados") except Exception as e: print(f"[eBay] Error: {e}") return results def scrape_yachtworld(query: str, filters: dict = None) -> list: filters = filters or {} results = [] seen = set() vessel_type = filters.get("type","").lower() yw_type = "sail" if vessel_type in ["sailboat","sail","velero","yacht",""] else "power" base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/" if vessel_type in ["sailboat","sail","velero",""]: base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/" print(f"[YachtWorld] URL: {base_url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=['--disable-blink-features=AutomationControlled','--no-sandbox'] ) context = browser.new_context( viewport={'width':1920,'height':1080}, user_agent=random.choice(USER_AGENTS), locale='en-US', timezone_id='America/New_York', ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(base_url, timeout=30000, wait_until='domcontentloaded') page.wait_for_timeout(random.randint(2000,4000)) for _ in range(3): page.evaluate("window.scrollBy(0,400)") page.wait_for_timeout(random.randint(400,800)) html = page.content() except Exception as e: print(f"[YachtWorld] Error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: print("[YachtWorld] Sin HTML") return [] soup = BeautifulSoup(html,'html.parser') page_count = 0 for a in soup.find_all('a', href=True): href = a['href'] if '/boat-details/' in href or '/yacht/' in href: if not href.startswith('http'): href = 'https://www.yachtworld.com' + href if href in seen: continue seen.add(href) title = a.get_text(strip=True) results.append({"title":title,"url":href,"price":"","source":"YachtWorld"}) page_count += 1 print(f"[YachtWorld] {page_count} resultados") except Exception as e: print(f"[YachtWorld] Error: {e}") return results def scrape_boattrader(query: str, filters: dict = None) -> list: filters = filters or {} src = { "name": "BoatTrader", "search_url": "https://www.boattrader.com/boats/?query={query}", "type": "broker", "category": "Venta Especializada", } results = [] seen = set() url = src["search_url"].replace("{query}", requests.utils.quote(query.strip())) print(f"[BoatTrader] URL: {url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled","--no-sandbox"] ) context = browser.new_context( viewport={"width":1280,"height":900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000,6000)) page.evaluate("window.scrollBy(0,600)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[BoatTrader] Error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: print("[BoatTrader] Sin HTML") return [] soup = BeautifulSoup(html,"html.parser") cards = soup.find_all("li", class_="lib-card") if not cards: cards = soup.find_all(class_=re.compile(r'\blib-card\b')) print(f"[BoatTrader] Cards encontradas: {len(cards)}") for card in cards: try: link_tag = card.find("a", href=re.compile(r'^/boat/[\w-]+-\d+/$')) if not link_tag: continue href = "https://www.boattrader.com" + link_tag["href"] if href in seen: continue seen.add(href) title_el = card.find(class_=re.compile(r'listingTitle',re.I)) title = title_el.get_text(strip=True) if title_el else link_tag.get_text(strip=True) price_el = card.find(class_=re.compile(r'listingPrice',re.I)) price = "" if price_el: pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True)) if pm: price = f"${pm.group(1)}" results.append({"title":title,"url":href,"price":price,"source":"BoatTrader"}) except Exception: continue print(f"[BoatTrader] {len(results)} resultados") except Exception as e: print(f"[BoatTrader] Error: {e}") return results def scrape_boats(query: str, filters: dict = None) -> list: filters = filters or {} src = { "name": "Boats.com", "search_url": "https://www.boats.com/boats-for-sale/?query={query}", "type": "broker", "category": "Venta Especializada", } results = [] seen = set() url = src["search_url"].replace("{query}", requests.utils.quote(query.strip())) print(f"[Boats.com] URL: {url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled","--no-sandbox"] ) context = browser.new_context( viewport={"width":1280,"height":900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000,6000)) page.evaluate("window.scrollBy(0,600)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[Boats.com] Error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: print("[Boats.com] Sin HTML") return [] soup = BeautifulSoup(html,"html.parser") def _extract_card(card): a = card.find("a", href=re.compile(r'^/')) if not a: return href = "https://www.boats.com" + a["href"] if href in seen: return seen.add(href) year_el = card.select_one("div.year") name_el = card.select_one("h2") year = year_el.get_text(strip=True) if year_el else "" name = name_el.get_text(strip=True) if name_el else "" title = f"{year} {name}".strip() if year else name if not title: return price_el = card.select_one("div.price") price = "" if price_el: pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True)) price = f"${pm.group(1)}" if pm else "" results.append({"title":title,"url":href,"price":price,"source":"Boats.com"}) for card in soup.select("li[data-listing-id]"): _extract_card(card) for card in soup.select("li.enhanced.oem"): _extract_card(card) print(f"[Boats.com] {len(results)} resultados") except Exception as e: print(f"[Boats.com] Error: {e}") return results def scrape_craigslist(query: str, filters: dict = None) -> list: filters = filters or {} results = [] seen = set() CITIES = ["miami", "tampa", "sfbay", "losangeles", "seattle", "boston", "newyork", "chicago", "sandiego"] qs = requests.utils.quote(query.strip()) print(f"[Craigslist] Query: '{query}' - probando 3 ciudades al azar") try: from playwright.sync_api import sync_playwright all_html_parts = [] cities_tested = random.sample(CITIES, min(3, len(CITIES))) with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True, ) for city in cities_tested: city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel" print(f"[Craigslist] >> {city_url}") page = ctx.new_page() try: page.goto(city_url, timeout=25000, wait_until="domcontentloaded") page.wait_for_timeout(2500) all_html_parts.append((city, page.content())) except Exception as e: print(f"[Craigslist] {city} error: {e}") finally: try: page.close() except: pass browser.close() for city, html in all_html_parts: soup = BeautifulSoup(html, "html.parser") cards = soup.find_all(attrs={"data-pid": True}) print(f"[Craigslist] {city}: {len(cards)} cards en HTML") for card in cards: try: a_main = card.find("a", class_="main") if not a_main: continue listing_url = a_main.get("href","") if not listing_url or listing_url in seen: continue seen.add(listing_url) title = card.get("title","") if not title: span = card.find("span", class_="label") title = span.get_text(strip=True) if span else "" if not title: continue price_el = card.find("span", class_="priceinfo") price = price_el.get_text(strip=True) if price_el else "" results.append({ "title": title[:120], "url": listing_url, "price": price, "source": f"Craigslist {city}", }) except Exception: continue print(f"[Craigslist] {len(results)} resultados totales") except Exception as e: print(f"[Craigslist] Error: {e}") return results def scrape_hibid(query: str, filters: dict = None) -> list: filters = filters or {} src = {"name":"HiBid","type":"auction","category":"Subastas USA"} results = [] q = requests.utils.quote((query.strip() + " boat")) url = f"https://www.hibid.com/lots?q={q}" print(f"[HiBid] URL: {url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width":1280,"height":900}, locale="en-US", ignore_https_errors=True, ) ctx.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" ) page = ctx.new_page() try: page.goto(url, timeout=30000, wait_until="domcontentloaded") page.wait_for_timeout(4000) html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html,"html.parser") seen = set() selectors = ".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]" cards = soup.select(selectors) print(f"[HiBid] Cards encontradas: {len(cards)}") for card in cards: try: a = card.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = "https://www.hibid.com" + href if href in seen: continue seen.add(href) title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]") price = price_el.get_text(strip=True) if price_el else "" if title and len(title) > 4: results.append({"title":title,"url":href,"price":price,"source":"HiBid"}) except Exception: continue print(f"[HiBid] {len(results)} resultados") except Exception as e: print(f"[HiBid] Error: {e}") return results # ══════════════════════════════════════════════════════════════════════════════ # RUNNER PRINCIPAL # ══════════════════════════════════════════════════════════════════════════════ SCRAPER_MAP = { "ebay": scrape_ebay, "yachtworld": scrape_yachtworld, "boattrader": scrape_boattrader, "boats": scrape_boats, "hibid": scrape_hibid, "craigslist": scrape_craigslist, } def run_test(query: str, targets: list = None, filters: dict = None): targets = targets or list(SCRAPER_MAP.keys()) filters = filters or {} total = 0 all_ok = [] print("\n" + "="*60) print(f" PRUEBA DE SCRAPERS | query: {query!r}") print("="*60 + "\n") for name in targets: fn = SCRAPER_MAP.get(name.lower()) if not fn: print(f"[!] Scraper desconocido: {name}") continue print(f"\n{'-'*50}") print(f" >> {name.upper()}") print(f"{'-'*50}") t0 = time.time() results = fn(query, filters) elapsed = time.time() - t0 def safe(s): return s.encode('ascii','replace').decode('ascii') if results: all_ok.append(name) print(f"\n[OK] {name}: {len(results)} resultados en {elapsed:.1f}s") for i, r in enumerate(results[:3], 1): print(f" {i}. {safe(r['title'][:70])}") if r.get('price'): print(f" $ {safe(r['price'])}") print(f" > {r['url'][:80]}") else: print(f"\n[!!] {name}: 0 resultados en {elapsed:.1f}s") total += len(results) print("\n" + "="*60) print(f" RESUMEN: {total} resultados totales") print(f" Funcionando: {', '.join(all_ok) if all_ok else 'ninguno'}") print("="*60) if __name__ == "__main__": args = sys.argv[1:] query = "sailboat velero" # default targets = [] if args: # El primer arg que NO empiece con letra de scraper es la query if args[0].lower() not in SCRAPER_MAP: query = args[0] targets = [a.lower() for a in args[1:] if a.lower() in SCRAPER_MAP] else: targets = [a.lower() for a in args if a.lower() in SCRAPER_MAP] run_test(query, targets if targets else None)