Files
alro65 235a9abbfe security: SECRET_KEY from env, CORS restricted to localhost
- Replace hardcoded secret_key with os.environ.get('SECRET_KEY')
- RuntimeError if SECRET_KEY not set (fail fast)
- Restrict CORS to localhost:8765 origins (was allow all with credentials)
- Add .gitignore excluding db, env, __pycache__, backups

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-03 12:55:19 -04:00

638 lines
25 KiB
Python

"""
scraper_test.py — Run individual scrapers without starting the Flask server.
Usage:
    python scraper_test.py                                    # run every registered scraper with the default query
    python scraper_test.py "catalina 30"                      # custom query
    python scraper_test.py "beneteau" ebay                    # eBay only
    python scraper_test.py "sailboat" yachtworld boattrader
Available scrapers: ebay, yachtworld, boattrader, boats, hibid, craigslist
"""
import sys
import re
import time
import random
import threading
import requests
import urllib3
from bs4 import BeautifulSoup
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ── User-Agents ──────────────────────────────────────────────────────────────
# Pool of desktop browser User-Agent strings. Callers pick one at random
# (random.choice(USER_AGENTS)) for each HTTP request or browser context.
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
]
# State used by polite_pause(): the lock guards the round-robin counter, and
# the counter selects which of the listed sites receives the next throwaway
# request.
_interleave_lock = threading.Lock()
_interleave_idx = 0
_interleave_sites = ["https://miami.craigslist.org", "https://www.ebay.com"]
def get_headers(referer=None):
    """Build a browser-like set of HTTP request headers.

    A User-Agent is drawn at random from USER_AGENTS on every call.

    Args:
        referer: Optional value for the ``Referer`` header; the header is
            omitted when this is falsy.

    Returns:
        A new dict of header names to values.
    """
    user_agent = random.choice(USER_AGENTS)
    headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    if not referer:
        return headers
    headers['Referer'] = referer
    return headers
def polite_pause(source_name: str):
    """Issue a throwaway request to a rotating site, then sleep 2-4 seconds.

    The target site is chosen round-robin from ``_interleave_sites``; the
    shared counter is advanced under ``_interleave_lock``. Any failure of the
    throwaway request is ignored.

    Args:
        source_name: Label used as the prefix of the progress message.
    """
    global _interleave_idx
    with _interleave_lock:
        slot = _interleave_idx % len(_interleave_sites)
        _interleave_idx = _interleave_idx + 1
    decoy_url = _interleave_sites[slot]
    try:
        requests.get(decoy_url, headers=get_headers(), timeout=5, verify=False)
    except Exception:
        # Best effort only: the response is never used.
        pass
    delay = random.uniform(2.0, 4.0)
    time.sleep(delay)
    print(f"[{source_name}] Pausa cortés lista — continuando...")
def _extract_best_src(img_tag) -> str:
candidates = [
img_tag.get("src",""), img_tag.get("data-src",""),
img_tag.get("data-lazy-src",""), img_tag.get("data-original",""),
img_tag.get("data-lazy",""), img_tag.get("data-image",""),
]
srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","")
if srcset:
parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()]
candidates.extend(parts)
for c in candidates:
c = c.strip()
if c and c.startswith("http") and not c.startswith("data:"):
return c
return ""
# ══════════════════════════════════════════════════════════════════════════════
# SCRAPERS
# ══════════════════════════════════════════════════════════════════════════════
# Search URL template; "{query}" is replaced with the URL-quoted query.
_EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10"

# Vessel-type filter value -> category id substituted into the `_sacat` parameter.
_EBAY_CATEGORIES = {
    "sailboat": "36431", "sail": "36431", "velero": "36431",
    "motor": "36432", "motorboat": "36432", "yacht": "36432",
    "fishing": "36432", "tug": "36432", "barge": "36432",
}

# Canonical listing URL (scheme, host, /itm/<digits>); tracking suffixes are
# dropped so the same listing is only reported once.
_EBAY_ITEM_URL_RE = re.compile(r'(https?://(?:www\.)?ebay\.com/itm/\d+)')


def _ebay_fetch_html(url: str) -> str:
    """Load *url* in headless Chromium and return its HTML ('' if navigation fails)."""
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled", "--no-sandbox"],
        )
        context = browser.new_context(
            viewport={"width": 1280, "height": 900},
            user_agent=random.choice(USER_AGENTS),
            locale="en-US", timezone_id="America/New_York",
            ignore_https_errors=True,
        )
        context.add_init_script(
            "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            "window.chrome={runtime:{}};"
        )
        page = context.new_page()
        try:
            page.goto(url, timeout=30000, wait_until="domcontentloaded")
            page.wait_for_timeout(random.randint(1500, 2500))
            page.evaluate("window.scrollBy(0,600)")
            page.wait_for_timeout(800)
            html = page.content()
        except Exception as e:
            print(f"[eBay] Playwright error: {e}")
            html = ""
        finally:
            try:
                page.close()
            except Exception:
                # Closing is best effort; the browser is closed right after.
                pass
            browser.close()
    return html


def _ebay_parse_legacy_items(soup) -> list:
    """Extract listings from the older ``li.s-item`` result markup."""
    results = []
    seen = set()
    for item in soup.find_all("li", class_="s-item"):
        try:
            link = item.find("a", class_="s-item__link")
            if not link:
                continue
            m = _EBAY_ITEM_URL_RE.search(link.get("href", ""))
            if not m:
                continue
            href = m.group(1)
            if href in seen:
                continue
            seen.add(href)
            title_el = (item.find("span", class_="BOLD") or
                        item.find("div", class_="s-item__title") or
                        item.find("span", class_="s-item__title"))
            title = (title_el or link).get_text(strip=True)
            # Skip untitled entries and the "Shop on eBay" placeholder tile.
            if not title or title.lower().startswith("shop on ebay"):
                continue
            price_el = item.find("span", class_="s-item__price")
            price = price_el.get_text(strip=True) if price_el else ""
            results.append({"title": title, "url": href, "price": price, "source": "eBay"})
        except Exception:
            # One malformed item must not abort the whole page.
            continue
    return results


def _ebay_parse_cards(cards) -> list:
    """Extract listings from the newer ``li.s-card`` result markup."""
    results = []
    seen = set()
    for card in cards:
        try:
            # The title link is the first s-card__link that is not the image
            # wrapper and carries real text.
            title_link = None
            for a in card.find_all("a", class_="s-card__link"):
                if "image-treatment" in (a.get("class") or []):
                    continue
                text = a.get_text(strip=True)
                if text and not text.lower().startswith("shop on ebay"):
                    title_link = a
                    break
            if not title_link:
                continue
            m = _EBAY_ITEM_URL_RE.search(title_link.get("href", ""))
            if not m:
                continue
            href = m.group(1)
            if href in seen:
                continue
            seen.add(href)
            # Remove the accessibility suffix appended to the link text.
            title = re.sub(r'\s*Opens in a new window or tab.*', '',
                           title_link.get_text(strip=True), flags=re.I).strip()
            price_el = (card.find(class_="s-card__price") or
                        card.find(class_="s-item__price"))
            price = price_el.get_text(strip=True) if price_el else ""
            results.append({"title": title, "url": href, "price": price, "source": "eBay"})
        except Exception:
            # One malformed card must not abort the whole page.
            continue
    return results


def scrape_ebay(query: str, filters: dict = None) -> list:
    """Search eBay for *query* and return the listings found on the first page.

    The page is rendered with Playwright (headless Chromium) and parsed with
    BeautifulSoup. The ``li.s-card`` layout is tried first; when the page
    contains no such cards the older ``li.s-item`` layout is parsed instead.

    Args:
        query: Free-text search; repeated words are collapsed before use.
        filters: Optional dict. Only the ``"type"`` key is read: when its
            lower-cased value is a key of ``_EBAY_CATEGORIES`` the search
            category is switched accordingly.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` and
        ``source``. Empty when the page could not be loaded or an error
        occurred (errors are printed, not raised).
    """
    filters = filters or {}
    # dict.fromkeys keeps first occurrences in order, removing repeated words.
    clean_q = " ".join(dict.fromkeys(query.strip().split()))
    url = _EBAY_SEARCH_URL.replace("{query}", requests.utils.quote(clean_q))

    # Category override by vessel type; tolerate an explicit None value.
    vtype = (filters.get("type") or "").lower()
    category = _EBAY_CATEGORIES.get(vtype)
    if category:
        url = re.sub(r'_sacat=\d+', f'_sacat={category}', url)
    print(f"[eBay] URL: {url}")

    results = []
    try:
        html = _ebay_fetch_html(url)
        if not html:
            print("[eBay] Sin HTML")
            return []
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("li", class_="s-card")
        print(f"[eBay] Tarjetas nuevo layout: {len(cards)}")
        if not cards:
            results = _ebay_parse_legacy_items(soup)
            print(f"[eBay] {len(results)} resultados (layout antiguo)")
            return results
        results = _ebay_parse_cards(cards)
        print(f"[eBay] {len(results)} resultados")
    except Exception as e:
        print(f"[eBay] Error: {e}")
    return results
def scrape_yachtworld(query: str, filters: dict = None) -> list:
    """Collect listing links from a YachtWorld category page.

    The page is rendered with Playwright (headless Chromium) and every anchor
    whose href contains ``/boat-details/`` or ``/yacht/`` is reported once.

    NOTE(review): *query* is not used when building the URL; only the vessel
    type selects the page. Confirm whether keyword search was intended.

    Args:
        query: Accepted for signature parity with the other scrapers.
        filters: Optional dict. Only ``"type"`` is read (case-insensitive):
            ``sailboat``/``sail``/``velero`` or no type -> sail cruiser page,
            ``yacht`` -> sail page, anything else -> power page.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` (always
        empty) and ``source``. Empty when the page could not be loaded;
        errors are printed, not raised.
    """
    filters = filters or {}
    results = []
    seen = set()

    # Tolerate an explicit None for the type filter.
    vessel_type = (filters.get("type") or "").lower()
    if vessel_type in ("sailboat", "sail", "velero", ""):
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"
    elif vessel_type == "yacht":
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/"
    else:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-power/"
    print(f"[YachtWorld] URL: {base_url}")

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled', '--no-sandbox'],
            )
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=random.choice(USER_AGENTS),
                locale='en-US', timezone_id='America/New_York',
                ignore_https_errors=True,
            )
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = context.new_page()
            try:
                page.goto(base_url, timeout=30000, wait_until='domcontentloaded')
                page.wait_for_timeout(random.randint(2000, 4000))
                # Scroll in a few steps with short random waits in between.
                for _ in range(3):
                    page.evaluate("window.scrollBy(0,400)")
                    page.wait_for_timeout(random.randint(400, 800))
                html = page.content()
            except Exception as e:
                print(f"[YachtWorld] Error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Closing is best effort; the browser is closed right after.
                    pass
                browser.close()

        if not html:
            print("[YachtWorld] Sin HTML")
            return []

        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/boat-details/' not in href and '/yacht/' not in href:
                continue
            if not href.startswith('http'):
                href = 'https://www.yachtworld.com' + href
            if href in seen:
                continue
            seen.add(href)
            title = a.get_text(strip=True)
            results.append({"title": title, "url": href, "price": "", "source": "YachtWorld"})
        print(f"[YachtWorld] {len(results)} resultados")
    except Exception as e:
        print(f"[YachtWorld] Error: {e}")
    return results
def scrape_boattrader(query: str, filters: dict = None) -> list:
    """Search BoatTrader for *query* and return the listings on the first page.

    The page is rendered with Playwright (headless Chromium) and parsed with
    BeautifulSoup; each ``lib-card`` element that links to ``/boat/...-<id>/``
    yields one result.

    Args:
        query: Free-text search, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity with the other scrapers;
            not used.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` and
        ``source``. Empty when the page could not be loaded; errors are
        printed, not raised.
    """
    results = []
    seen = set()
    search_url = "https://www.boattrader.com/boats/?query={query}"
    url = search_url.replace("{query}", requests.utils.quote(query.strip()))
    print(f"[BoatTrader] URL: {url}")

    # Loop-invariant patterns, compiled once.
    listing_href_re = re.compile(r'^/boat/[\w-]+-\d+/$')
    title_class_re = re.compile(r'listingTitle', re.I)
    price_class_re = re.compile(r'listingPrice', re.I)

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--no-sandbox"],
            )
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", timezone_id="America/New_York",
                ignore_https_errors=True,
            )
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                page.evaluate("window.scrollBy(0,600)")
                page.wait_for_timeout(1500)
                html = page.content()
            except Exception as e:
                print(f"[BoatTrader] Error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Closing is best effort; the browser is closed right after.
                    pass
                browser.close()

        if not html:
            print("[BoatTrader] Sin HTML")
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("li", class_="lib-card")
        if not cards:
            # Fall back to any element whose class contains "lib-card".
            cards = soup.find_all(class_=re.compile(r'\blib-card\b'))
        print(f"[BoatTrader] Cards encontradas: {len(cards)}")

        for card in cards:
            try:
                link_tag = card.find("a", href=listing_href_re)
                if not link_tag:
                    continue
                href = "https://www.boattrader.com" + link_tag["href"]
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.find(class_=title_class_re)
                title = title_el.get_text(strip=True) if title_el else link_tag.get_text(strip=True)
                price = ""
                price_el = card.find(class_=price_class_re)
                if price_el:
                    # Keep only the first "$<digits,commas>" amount.
                    pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ", strip=True))
                    if pm:
                        price = f"${pm.group(1)}"
                results.append({"title": title, "url": href, "price": price, "source": "BoatTrader"})
            except Exception:
                # One malformed card must not abort the whole page.
                continue
        print(f"[BoatTrader] {len(results)} resultados")
    except Exception as e:
        print(f"[BoatTrader] Error: {e}")
    return results
def scrape_boats(query: str, filters: dict = None) -> list:
    """Search Boats.com for *query* and return the listings on the first page.

    The page is rendered with Playwright (headless Chromium) and parsed with
    BeautifulSoup. Cards are taken from ``li[data-listing-id]`` first and then
    from ``li.enhanced.oem``; a listing URL is reported only once.

    Args:
        query: Free-text search, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity with the other scrapers;
            not used.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` and
        ``source``. Empty when the page could not be loaded; errors are
        printed, not raised.
    """
    results = []
    seen = set()
    search_url = "https://www.boats.com/boats-for-sale/?query={query}"
    url = search_url.replace("{query}", requests.utils.quote(query.strip()))
    print(f"[Boats.com] URL: {url}")

    # Loop-invariant pattern: hrefs that start with "/".
    rooted_href_re = re.compile(r'^/')

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--no-sandbox"],
            )
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", timezone_id="America/New_York",
                ignore_https_errors=True,
            )
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                page.evaluate("window.scrollBy(0,600)")
                page.wait_for_timeout(1500)
                html = page.content()
            except Exception as e:
                print(f"[Boats.com] Error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Closing is best effort; the browser is closed right after.
                    pass
                browser.close()

        if not html:
            print("[Boats.com] Sin HTML")
            return []

        soup = BeautifulSoup(html, "html.parser")

        def _extract_card(card):
            """Append one result for *card* unless it has no link, no title, or was seen."""
            a = card.find("a", href=rooted_href_re)
            if not a:
                return
            href = "https://www.boats.com" + a["href"]
            if href in seen:
                return
            seen.add(href)
            year_el = card.select_one("div.year")
            name_el = card.select_one("h2")
            year = year_el.get_text(strip=True) if year_el else ""
            name = name_el.get_text(strip=True) if name_el else ""
            title = f"{year} {name}".strip() if year else name
            if not title:
                return
            price = ""
            price_el = card.select_one("div.price")
            if price_el:
                # Keep only the first "$<digits,commas>" amount.
                pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ", strip=True))
                price = f"${pm.group(1)}" if pm else ""
            results.append({"title": title, "url": href, "price": price, "source": "Boats.com"})

        for selector in ("li[data-listing-id]", "li.enhanced.oem"):
            for card in soup.select(selector):
                _extract_card(card)
        print(f"[Boats.com] {len(results)} resultados")
    except Exception as e:
        print(f"[Boats.com] Error: {e}")
    return results
def scrape_craigslist(query: str, filters: dict = None) -> list:
    """Search the Craigslist boats section of three randomly chosen cities.

    Each city page is rendered with Playwright (headless Chromium) in a shared
    browser context; the collected HTML is parsed with BeautifulSoup once the
    browser is closed. A listing URL is reported only once across cities.

    Args:
        query: Free-text search, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity with the other scrapers;
            not used.

    Returns:
        A list of dicts with keys ``title`` (truncated to 120 characters),
        ``url``, ``price`` and ``source`` (``"Craigslist <city>"``). Errors
        are printed, not raised.
    """
    results = []
    seen = set()
    CITIES = ["miami", "tampa", "sfbay", "losangeles", "seattle",
              "boston", "newyork", "chicago", "sandiego"]
    qs = requests.utils.quote(query.strip())
    print(f"[Craigslist] Query: '{query}' - probando 3 ciudades al azar")

    try:
        from playwright.sync_api import sync_playwright

        all_html_parts = []
        cities_tested = random.sample(CITIES, min(3, len(CITIES)))
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True,
                )
                for city in cities_tested:
                    city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel"
                    print(f"[Craigslist] >> {city_url}")
                    page = ctx.new_page()
                    try:
                        page.goto(city_url, timeout=25000, wait_until="domcontentloaded")
                        page.wait_for_timeout(2500)
                        all_html_parts.append((city, page.content()))
                    except Exception as e:
                        # A failing city is reported and skipped.
                        print(f"[Craigslist] {city} error: {e}")
                    finally:
                        try:
                            page.close()
                        except Exception:
                            pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        for city, html in all_html_parts:
            soup = BeautifulSoup(html, "html.parser")
            cards = soup.find_all(attrs={"data-pid": True})
            print(f"[Craigslist] {city}: {len(cards)} cards en HTML")
            for card in cards:
                try:
                    a_main = card.find("a", class_="main")
                    if not a_main:
                        continue
                    listing_url = a_main.get("href", "")
                    if not listing_url or listing_url in seen:
                        continue
                    seen.add(listing_url)
                    # Prefer the card's title attribute, else the label span.
                    title = card.get("title", "")
                    if not title:
                        span = card.find("span", class_="label")
                        title = span.get_text(strip=True) if span else ""
                    if not title:
                        continue
                    price_el = card.find("span", class_="priceinfo")
                    price = price_el.get_text(strip=True) if price_el else ""
                    results.append({
                        "title": title[:120], "url": listing_url,
                        "price": price, "source": f"Craigslist {city}",
                    })
                except Exception:
                    # One malformed card must not abort the whole page.
                    continue
        print(f"[Craigslist] {len(results)} resultados totales")
    except Exception as e:
        print(f"[Craigslist] Error: {e}")
    return results
def scrape_hibid(query: str, filters: dict = None) -> list:
    """Search HiBid lots for *query* (with " boat" appended) and return the lots found.

    The page is rendered with Playwright (headless Chromium) and parsed with
    BeautifulSoup using several alternative lot-card selectors.

    Args:
        query: Free-text search; ``" boat"`` is appended before URL-quoting.
        filters: Accepted for signature parity with the other scrapers;
            not used.

    Returns:
        A list of dicts with keys ``title`` (truncated to 100 characters),
        ``url``, ``price`` and ``source``. Lots whose title is 4 characters
        or shorter are dropped. Errors are printed, not raised.
    """
    results = []
    q = requests.utils.quote(query.strip() + " boat")
    url = f"https://www.hibid.com/lots?q={q}"
    print(f"[HiBid] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US", ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            )
            page = ctx.new_page()
            try:
                # A navigation failure propagates to the outer handler below.
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(4000)
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    # Closing is best effort; the browser is closed right after.
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        selectors = ".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"
        cards = soup.select(selectors)
        print(f"[HiBid] Cards encontradas: {len(cards)}")

        for card in cards:
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                href = a["href"]
                if not href.startswith("http"):
                    href = "https://www.hibid.com" + href
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price = price_el.get_text(strip=True) if price_el else ""
                if title and len(title) > 4:
                    results.append({"title": title, "url": href, "price": price, "source": "HiBid"})
            except Exception:
                # One malformed card must not abort the whole page.
                continue
        print(f"[HiBid] {len(results)} resultados")
    except Exception as e:
        print(f"[HiBid] Error: {e}")
    return results
# ══════════════════════════════════════════════════════════════════════════════
# RUNNER PRINCIPAL
# ══════════════════════════════════════════════════════════════════════════════
# Registry of scraper name -> scraper function. run_test() iterates over all
# of these by default, and the command-line entry point only accepts target
# names that are keys of this dict.
SCRAPER_MAP = {
"ebay": scrape_ebay,
"yachtworld": scrape_yachtworld,
"boattrader": scrape_boattrader,
"boats": scrape_boats,
"hibid": scrape_hibid,
"craigslist": scrape_craigslist,
}
def run_test(query: str, targets: list = None, filters: dict = None):
    """Run the selected scrapers one after another and print a report.

    For every scraper the elapsed time, the number of results and the first
    three results are printed; a summary with the overall total and the names
    of the scrapers that returned something is printed at the end.

    Args:
        query: Search text handed to every scraper.
        targets: Scraper names (keys of ``SCRAPER_MAP``, case-insensitive).
            Unknown names are reported and skipped. Defaults to every
            registered scraper.
        filters: Optional filter dict handed to every scraper.
    """
    def safe(s):
        # Replace non-ASCII characters so printing cannot fail on consoles
        # with a limited encoding.
        return s.encode('ascii', 'replace').decode('ascii')

    targets = targets or list(SCRAPER_MAP.keys())
    filters = filters or {}
    total = 0
    all_ok = []
    print("\n" + "="*60)
    print(f" PRUEBA DE SCRAPERS | query: {query!r}")
    print("="*60 + "\n")
    for name in targets:
        fn = SCRAPER_MAP.get(name.lower())
        if not fn:
            print(f"[!] Scraper desconocido: {name}")
            continue
        print(f"\n{'-'*50}")
        print(f" >> {name.upper()}")
        print(f"{'-'*50}")
        t0 = time.time()
        try:
            results = fn(query, filters)
        except Exception as exc:
            # A scraper that raises must not prevent the remaining ones from running.
            print(f"[!!] {name}: error inesperado: {safe(str(exc))}")
            results = []
        elapsed = time.time() - t0
        if results:
            all_ok.append(name)
            print(f"\n[OK] {name}: {len(results)} resultados en {elapsed:.1f}s")
            for i, r in enumerate(results[:3], 1):
                print(f" {i}. {safe(r['title'][:70])}")
                if r.get('price'):
                    print(f" $ {safe(r['price'])}")
                print(f" > {safe(r['url'][:80])}")
        else:
            print(f"\n[!!] {name}: 0 resultados en {elapsed:.1f}s")
        total += len(results)
    print("\n" + "="*60)
    print(f" RESUMEN: {total} resultados totales")
    print(f" Funcionando: {', '.join(all_ok) if all_ok else 'ninguno'}")
    print("="*60)
if __name__ == "__main__":
    # Command line: [query] [scraper ...]
    # The first argument is taken as the query unless it is itself a scraper
    # name; every other argument that is a known scraper name becomes a
    # target. Unrecognised names are dropped.
    args = sys.argv[1:]
    query = "sailboat velero"  # default
    names = args
    if args and args[0].lower() not in SCRAPER_MAP:
        query, names = args[0], args[1:]
    targets = [n for n in (a.lower() for a in names) if n in SCRAPER_MAP]
    # No valid target means "run every scraper".
    run_test(query, targets or None)