235a9abbfe
- Replace hardcoded secret_key with os.environ.get('SECRET_KEY')
- RuntimeError if SECRET_KEY not set (fail fast)
- Restrict CORS to localhost:8765 origins (was allow all with credentials)
- Add .gitignore excluding db, env, __pycache__, backups
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
638 lines
25 KiB
Python
638 lines
25 KiB
Python
"""
|
|
scraper_test.py — Run individual scrapers without starting the Flask server.

Usage:
    python scraper_test.py                        # run every scraper with the default query
    python scraper_test.py "catalina 30"          # custom query
    python scraper_test.py "beneteau" ebay        # eBay only
    python scraper_test.py "sailboat" yachtworld boattrader

Available scrapers: ebay, yachtworld, boattrader, boats, hibid, craigslist
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import time
|
|
import random
|
|
import threading
|
|
import requests
|
|
import urllib3
|
|
from bs4 import BeautifulSoup
|
|
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
# ── User-Agents ──────────────────────────────────────────────────────────────
# Pool of desktop browser User-Agent strings; callers pick one at random.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) '
        'Gecko/20100101 Firefox/123.0'
    ),
]

# Shared state for polite_pause(): a lock protecting the round-robin counter
# that indexes into the list of sites below.
_interleave_lock = threading.Lock()
_interleave_idx = 0
_interleave_sites = ["https://miami.craigslist.org", "https://www.ebay.com"]
|
|
|
|
|
|
def get_headers(referer=None):
    """Build a dict of browser-like HTTP request headers.

    A User-Agent is drawn at random from ``USER_AGENTS`` on every call. When
    ``referer`` is truthy it is added as the ``Referer`` header; otherwise
    that header is left out.
    """
    base = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    if not referer:
        return base
    return {**base, 'Referer': referer}
|
|
|
|
|
|
def polite_pause(source_name: str):
    """Request the next interleave site, sleep 2-4 seconds, then log.

    The site is taken round-robin from ``_interleave_sites``; the shared
    counter is read and advanced under ``_interleave_lock``. The HTTP request
    is best-effort: any exception it raises is ignored.

    Args:
        source_name: Label used as the prefix of the log line.
    """
    global _interleave_idx

    # Only the counter bookkeeping happens under the lock.
    with _interleave_lock:
        target = _interleave_sites[_interleave_idx % len(_interleave_sites)]
        _interleave_idx += 1

    try:
        requests.get(target, headers=get_headers(), timeout=5, verify=False)
    except Exception:
        # Deliberately ignored: the request result is never used.
        pass

    delay = random.uniform(2.0, 4.0)
    time.sleep(delay)
    print(f"[{source_name}] Pausa cortés lista — continuando...")
|
|
|
|
|
|
def _extract_best_src(img_tag) -> str:
|
|
candidates = [
|
|
img_tag.get("src",""), img_tag.get("data-src",""),
|
|
img_tag.get("data-lazy-src",""), img_tag.get("data-original",""),
|
|
img_tag.get("data-lazy",""), img_tag.get("data-image",""),
|
|
]
|
|
srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","")
|
|
if srcset:
|
|
parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()]
|
|
candidates.extend(parts)
|
|
for c in candidates:
|
|
c = c.strip()
|
|
if c and c.startswith("http") and not c.startswith("data:"):
|
|
return c
|
|
return ""
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
# SCRAPERS
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Canonical eBay item URL; the "www." part is optional so both result-page
# layouts accept the same set of links.
_EBAY_ITEM_URL_RE = re.compile(r'(https?://(?:www\.)?ebay\.com/itm/\d+)')


def scrape_ebay(query: str, filters: dict = None) -> list:
    """Search eBay with a headless Chromium page and parse the listings.

    Args:
        query: Free-text search terms; repeated words are collapsed.
        filters: Optional dict. Only ``filters["type"]`` is read: a recognised
            vessel type replaces the default category id (26429) in the URL.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` and ``source``
        (always ``"eBay"``). Failures while fetching or parsing are printed,
        not raised; the function then returns what was collected so far
        (``[]`` when no HTML could be fetched).
    """
    filters = filters or {}
    results = []
    seen = set()

    # dict.fromkeys removes duplicate words while keeping their order.
    clean_q = " ".join(dict.fromkeys(query.strip().split()))

    # Category override by vessel type. ``or ""`` keeps a None value from
    # crashing on .lower().
    vtype = str(filters.get("type") or "").lower()
    EBAY_CAT = {
        "sailboat":"36431","sail":"36431","velero":"36431",
        "motor":"36432","motorboat":"36432","yacht":"36432",
        "fishing":"36432","tug":"36432","barge":"36432",
    }
    category = EBAY_CAT.get(vtype, "26429")
    url = (
        "https://www.ebay.com/sch/i.html"
        f"?_nkw={requests.utils.quote(clean_q)}&_sacat={category}&LH_BIN=1&_sop=10"
    )

    print(f"[eBay] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(1500,2500))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(800)
                    html = page.content()
                except Exception as e:
                    print(f"[eBay] Playwright error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best-effort; the browser is closed below.
                        pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        if not html:
            print("[eBay] Sin HTML")
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("li", class_="s-card")
        print(f"[eBay] Tarjetas nuevo layout: {len(cards)}")

        if not cards:
            # Fallback: older result layout built from li.s-item elements.
            for item in soup.find_all("li", class_="s-item"):
                try:
                    link = item.find("a", class_="s-item__link")
                    if not link:
                        continue
                    m = _EBAY_ITEM_URL_RE.search(link.get("href",""))
                    if not m:
                        continue
                    href = m.group(1)
                    if href in seen:
                        continue
                    seen.add(href)
                    title_el = (item.find("span", class_="BOLD") or
                                item.find("div", class_="s-item__title") or
                                item.find("span", class_="s-item__title"))
                    title = (title_el or link).get_text(strip=True)
                    if not title or title.lower().startswith("shop on ebay"):
                        continue
                    price_el = item.find("span", class_="s-item__price")
                    price = price_el.get_text(strip=True) if price_el else ""
                    results.append({"title":title,"url":href,"price":price,"source":"eBay"})
                except Exception:
                    # Skip a malformed item and keep parsing the rest.
                    continue
            print(f"[eBay] {len(results)} resultados (layout antiguo)")
            return results

        for card in cards:
            try:
                # First text-bearing card link that is not the image link.
                title_link = None
                for a in card.find_all("a", class_="s-card__link"):
                    if "image-treatment" in (a.get("class") or []):
                        continue
                    t = a.get_text(strip=True)
                    if t and not t.lower().startswith("shop on ebay"):
                        title_link = a
                        break
                if not title_link:
                    continue
                href = title_link.get("href","")
                if "/itm/" not in href:
                    continue
                m = _EBAY_ITEM_URL_RE.search(href)
                if not m:
                    continue
                href = m.group(1)
                if href in seen:
                    continue
                seen.add(href)
                title = re.sub(r'\s*Opens in a new window or tab.*','',
                               title_link.get_text(strip=True), flags=re.I).strip()
                if not title:
                    # Same rule as the legacy layout: no untitled results.
                    continue
                price_el = (card.find(class_="s-card__price") or
                            card.find(class_="s-item__price"))
                price = price_el.get_text(strip=True) if price_el else ""
                results.append({"title":title,"url":href,"price":price,"source":"eBay"})
            except Exception:
                # Skip a malformed card and keep parsing the rest.
                continue

        print(f"[eBay] {len(results)} resultados")
    except Exception as e:
        print(f"[eBay] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_yachtworld(query: str, filters: dict = None) -> list:
    """Load a YachtWorld listing page in headless Chromium and collect boat links.

    Args:
        query: Accepted for signature parity with the other scrapers.
            NOTE(review): it is not used to build the URL here — the page is
            chosen from ``filters["type"]`` only; confirm whether that is
            intended.
        filters: Optional dict. ``filters["type"]`` selects the sail or power
            listing page; a missing/empty type uses the sail-cruiser page.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` (always ``""``)
        and ``source`` (``"YachtWorld"``), one per distinct listing URL.
        Errors are printed, not raised.
    """
    from urllib.parse import urljoin

    filters = filters or {}
    results = []
    # ``or ""`` keeps a None value from crashing on .lower().
    vessel_type = str(filters.get("type") or "").lower()
    yw_type = "sail" if vessel_type in ["sailboat","sail","velero","yacht",""] else "power"

    base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/"
    if vessel_type in ["sailboat","sail","velero",""]:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"

    print(f"[YachtWorld] URL: {base_url}")

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled','--no-sandbox']
            )
            try:
                context = browser.new_context(
                    viewport={'width':1920,'height':1080},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US', timezone_id='America/New_York',
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(base_url, timeout=30000, wait_until='domcontentloaded')
                    page.wait_for_timeout(random.randint(2000,4000))
                    # Scroll in three steps before reading the DOM.
                    for _ in range(3):
                        page.evaluate("window.scrollBy(0,400)")
                        page.wait_for_timeout(random.randint(400,800))
                    html = page.content()
                except Exception as e:
                    print(f"[YachtWorld] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best-effort; the browser is closed below.
                        pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        if not html:
            print("[YachtWorld] Sin HTML")
            return []

        soup = BeautifulSoup(html,'html.parser')
        by_url = {}
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/boat-details/' not in href and '/yacht/' not in href:
                continue
            # urljoin leaves absolute URLs alone and resolves relative ones.
            href = urljoin('https://www.yachtworld.com', href)
            title = a.get_text(strip=True)
            entry = by_url.get(href)
            if entry is None:
                entry = {"title":title,"url":href,"price":"","source":"YachtWorld"}
                by_url[href] = entry
                results.append(entry)
            elif not entry["title"] and title:
                # The first anchor for a listing may carry no text (e.g. an
                # image link); take the title from a later anchor instead.
                entry["title"] = title

        print(f"[YachtWorld] {len(results)} resultados")
    except Exception as e:
        print(f"[YachtWorld] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_boattrader(query: str, filters: dict = None) -> list:
    """Search BoatTrader with a headless Chromium page and parse listing cards.

    Args:
        query: Free-text search terms, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity; not read by this scraper.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``price`` (``"$<digits>"``
        or ``""``) and ``source`` (``"BoatTrader"``). Errors are printed, not
        raised.
    """
    filters = filters or {}
    results = []
    seen = set()
    url = "https://www.boattrader.com/boats/?query={query}".replace(
        "{query}", requests.utils.quote(query.strip())
    )
    print(f"[BoatTrader] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000,6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[BoatTrader] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best-effort; the browser is closed below.
                        pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        if not html:
            print("[BoatTrader] Sin HTML")
            return []

        soup = BeautifulSoup(html,"html.parser")
        cards = soup.find_all("li", class_="lib-card")
        if not cards:
            # Fallback: any element whose class contains the word "lib-card".
            cards = soup.find_all(class_=re.compile(r'\blib-card\b'))
        print(f"[BoatTrader] Cards encontradas: {len(cards)}")

        # Compiled once; reused for every card.
        link_re = re.compile(r'^/boat/[\w-]+-\d+/$')
        title_re = re.compile(r'listingTitle',re.I)
        price_re = re.compile(r'listingPrice',re.I)

        for card in cards:
            try:
                link_tag = card.find("a", href=link_re)
                if not link_tag:
                    continue
                href = "https://www.boattrader.com" + link_tag["href"]
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.find(class_=title_re)
                title = title_el.get_text(strip=True) if title_el else link_tag.get_text(strip=True)
                price_el = card.find(class_=price_re)
                price = ""
                if price_el:
                    pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True))
                    if pm:
                        price = f"${pm.group(1)}"
                results.append({"title":title,"url":href,"price":price,"source":"BoatTrader"})
            except Exception:
                # Skip a malformed card and keep parsing the rest.
                continue

        print(f"[BoatTrader] {len(results)} resultados")
    except Exception as e:
        print(f"[BoatTrader] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_boats(query: str, filters: dict = None) -> list:
    """Search Boats.com with a headless Chromium page and parse listing cards.

    Args:
        query: Free-text search terms, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity; not read by this scraper.

    Returns:
        A list of dicts with keys ``title`` (``"<year> <name>"`` when a year
        is present), ``url``, ``price`` (``"$<digits>"`` or ``""``) and
        ``source`` (``"Boats.com"``). Errors are printed, not raised.
    """
    filters = filters or {}
    results = []
    seen = set()
    url = "https://www.boats.com/boats-for-sale/?query={query}".replace(
        "{query}", requests.utils.quote(query.strip())
    )
    print(f"[Boats.com] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled","--no-sandbox"]
            )
            try:
                context = browser.new_context(
                    viewport={"width":1280,"height":900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000,6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[Boats.com] Error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best-effort; the browser is closed below.
                        pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        if not html:
            print("[Boats.com] Sin HTML")
            return []

        soup = BeautifulSoup(html,"html.parser")

        # Site-relative links only: "/path" but not protocol-relative "//host".
        link_re = re.compile(r'^/(?!/)')

        def _extract_card(card):
            """Append one result for *card* unless it is unusable or a duplicate."""
            a = card.find("a", href=link_re)
            if not a:
                return
            href = "https://www.boats.com" + a["href"]
            if href in seen:
                return
            seen.add(href)
            year_el = card.select_one("div.year")
            name_el = card.select_one("h2")
            year = year_el.get_text(strip=True) if year_el else ""
            name = name_el.get_text(strip=True) if name_el else ""
            title = f"{year} {name}".strip() if year else name
            if not title:
                return
            price_el = card.select_one("div.price")
            price = ""
            if price_el:
                pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True))
                price = f"${pm.group(1)}" if pm else ""
            results.append({"title":title,"url":href,"price":price,"source":"Boats.com"})

        # Regular listings first, then the "enhanced oem" cards; ``seen``
        # de-duplicates cards matched by both selectors.
        for selector in ("li[data-listing-id]", "li.enhanced.oem"):
            for card in soup.select(selector):
                try:
                    _extract_card(card)
                except Exception:
                    # Skip a malformed card and keep parsing the rest, as the
                    # other scrapers in this file do.
                    continue

        print(f"[Boats.com] {len(results)} resultados")
    except Exception as e:
        print(f"[Boats.com] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_craigslist(query: str, filters: dict = None) -> list:
    """Search the boats section of three randomly chosen Craigslist cities.

    Args:
        query: Free-text search terms, URL-quoted into the ``query`` parameter.
        filters: Accepted for signature parity; not read by this scraper.

    Returns:
        A list of dicts with keys ``title`` (at most 120 characters), ``url``,
        ``price`` and ``source`` (``"Craigslist <city>"``), de-duplicated by
        URL across cities. Errors are printed, not raised.
    """
    filters = filters or {}
    results = []
    seen = set()
    CITIES = ["miami", "tampa", "sfbay", "losangeles", "seattle",
              "boston", "newyork", "chicago", "sandiego"]
    qs = requests.utils.quote(query.strip())

    print(f"[Craigslist] Query: '{query}' - probando 3 ciudades al azar")

    try:
        from playwright.sync_api import sync_playwright

        all_html_parts = []
        cities_tested = random.sample(CITIES, min(3, len(CITIES)))
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True,
                )
                for city in cities_tested:
                    city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel"
                    print(f"[Craigslist] >> {city_url}")
                    page = ctx.new_page()
                    try:
                        page.goto(city_url, timeout=25000, wait_until="domcontentloaded")
                        page.wait_for_timeout(2500)
                        all_html_parts.append((city, page.content()))
                    except Exception as e:
                        # One failing city must not stop the others.
                        print(f"[Craigslist] {city} error: {e}")
                    finally:
                        try:
                            page.close()
                        except Exception:
                            # Closing is best-effort; the browser is closed below.
                            pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        for city, html in all_html_parts:
            soup = BeautifulSoup(html, "html.parser")
            cards = soup.find_all(attrs={"data-pid": True})
            print(f"[Craigslist] {city}: {len(cards)} cards en HTML")
            for card in cards:
                try:
                    a_main = card.find("a", class_="main")
                    if not a_main:
                        continue
                    listing_url = a_main.get("href","")
                    if not listing_url or listing_url in seen:
                        continue
                    seen.add(listing_url)
                    # Prefer the card's title attribute; fall back to its label span.
                    title = card.get("title","")
                    if not title:
                        span = card.find("span", class_="label")
                        title = span.get_text(strip=True) if span else ""
                    if not title:
                        continue
                    price_el = card.find("span", class_="priceinfo")
                    price = price_el.get_text(strip=True) if price_el else ""
                    results.append({
                        "title": title[:120], "url": listing_url,
                        "price": price, "source": f"Craigslist {city}",
                    })
                except Exception:
                    # Skip a malformed card and keep parsing the rest.
                    continue

        print(f"[Craigslist] {len(results)} resultados totales")
    except Exception as e:
        print(f"[Craigslist] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_hibid(query: str, filters: dict = None) -> list:
    """Search HiBid lots with a headless Chromium page and parse lot cards.

    The search text is ``query`` with the word ``" boat"`` appended.

    Args:
        query: Free-text search terms.
        filters: Accepted for signature parity; not read by this scraper.

    Returns:
        A list of dicts with keys ``title`` (at most 100 characters, and longer
        than 4), ``url``, ``price`` and ``source`` (``"HiBid"``). Errors are
        printed, not raised.
    """
    from urllib.parse import urljoin

    filters = filters or {}
    results = []
    q = requests.utils.quote((query.strip() + " boat"))
    url = f"https://www.hibid.com/lots?q={q}"
    print(f"[HiBid] URL: {url}")

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width":1280,"height":900},
                    locale="en-US", ignore_https_errors=True,
                )
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = ctx.new_page()
                try:
                    # A navigation failure propagates to the outer handler,
                    # which prints it and returns the (empty) result list.
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(4000)
                    html = page.content()
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best-effort; the browser is closed below.
                        pass
            finally:
                # Runs even if context/page creation failed.
                browser.close()

        soup = BeautifulSoup(html,"html.parser")
        seen = set()
        selectors = ".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"
        cards = soup.select(selectors)
        print(f"[HiBid] Cards encontradas: {len(cards)}")

        for card in cards:
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # urljoin leaves absolute URLs alone and resolves relative ones.
                href = urljoin("https://www.hibid.com", a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price = price_el.get_text(strip=True) if price_el else ""
                # Very short titles are discarded.
                if title and len(title) > 4:
                    results.append({"title":title,"url":href,"price":price,"source":"HiBid"})
            except Exception:
                # Skip a malformed card and keep parsing the rest.
                continue

        print(f"[HiBid] {len(results)} resultados")
    except Exception as e:
        print(f"[HiBid] Error: {e}")

    return results
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
# RUNNER PRINCIPAL
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Registry mapping the scraper names accepted on the command line to the
# functions that implement them. Insertion order is the default run order.
SCRAPER_MAP = dict(
    ebay=scrape_ebay,
    yachtworld=scrape_yachtworld,
    boattrader=scrape_boattrader,
    boats=scrape_boats,
    hibid=scrape_hibid,
    craigslist=scrape_craigslist,
)
|
|
|
|
def run_test(query: str, targets: list = None, filters: dict = None):
    """Run the selected scrapers one after another and print a report.

    For each scraper the first three results (title, price, URL) and the
    elapsed time are printed, followed by a final summary.

    Args:
        query: Search text passed to every scraper.
        targets: Scraper names (keys of ``SCRAPER_MAP``, case-insensitive).
            ``None`` or an empty list runs every registered scraper. Unknown
            names are reported and skipped.
        filters: Optional filter dict forwarded unchanged to each scraper.
    """
    targets = targets or list(SCRAPER_MAP.keys())
    filters = filters or {}
    total = 0
    all_ok = []

    def safe(s):
        # Replace non-ASCII characters so printing cannot fail on consoles
        # with a limited encoding.
        return str(s).encode('ascii','replace').decode('ascii')

    print("\n" + "="*60)
    print(f" PRUEBA DE SCRAPERS | query: {query!r}")
    print("="*60 + "\n")

    for name in targets:
        fn = SCRAPER_MAP.get(name.lower())
        if not fn:
            print(f"[!] Scraper desconocido: {name}")
            continue
        print(f"\n{'-'*50}")
        print(f" >> {name.upper()}")
        print(f"{'-'*50}")
        t0 = time.perf_counter()
        try:
            results = fn(query, filters) or []
        except Exception as e:
            # One scraper failing must not abort the remaining ones.
            print(f"[!!] {name}: error inesperado: {e}")
            results = []
        elapsed = time.perf_counter() - t0

        if results:
            all_ok.append(name)
            print(f"\n[OK] {name}: {len(results)} resultados en {elapsed:.1f}s")
            for i, r in enumerate(results[:3], 1):
                print(f" {i}. {safe(r['title'][:70])}")
                if r.get('price'):
                    print(f" $ {safe(r['price'])}")
                print(f" > {safe(r['url'][:80])}")
        else:
            print(f"\n[!!] {name}: 0 resultados en {elapsed:.1f}s")

        total += len(results)

    print("\n" + "="*60)
    print(f" RESUMEN: {total} resultados totales")
    print(f" Funcionando: {', '.join(all_ok) if all_ok else 'ninguno'}")
    print("="*60)
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI: [query] [scraper ...]
    # The first argument is the query unless it is itself a scraper name, in
    # which case the default query is used and every argument names a scraper.
    args = sys.argv[1:]
    query = "sailboat velero" # default
    targets = []

    if args:
        if args[0].lower() not in SCRAPER_MAP:
            query = args[0]
            names = args[1:]
        else:
            names = args
        # Unknown names are passed through so that run_test reports them; a
        # misspelled scraper name no longer silently runs every scraper.
        targets = [a.lower() for a in names]

    run_test(query, targets if targets else None)
|