From 235a9abbfe989981efdc5e4db15c15acddb86ce1 Mon Sep 17 00:00:00 2001 From: alro1965 Date: Fri, 3 Jul 2026 12:55:19 -0400 Subject: [PATCH] security: SECRET_KEY from env, CORS restricted to localhost - Replace hardcoded secret_key with os.environ.get('SECRET_KEY') - RuntimeError if SECRET_KEY not set (fail fast) - Restrict CORS to localhost:8765 origins (was allow all with credentials) - Add .gitignore excluding db, env, __pycache__, backups Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 21 + INICIAR.bat | 99 + LEEME.md | 151 ++ _sbl_patch.py | 1 + scraper_test.py | 637 ++++++ server.py | 5402 +++++++++++++++++++++++++++++++++++++++++++++ static/index.html | 1344 +++++++++++ stop.bat | 15 + 8 files changed, 7670 insertions(+) create mode 100644 .gitignore create mode 100644 INICIAR.bat create mode 100644 LEEME.md create mode 100644 _sbl_patch.py create mode 100644 scraper_test.py create mode 100644 server.py create mode 100644 static/index.html create mode 100644 stop.bat diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..afefba1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ + +# Database (local data) +*.db +*.sqlite + +# Environment secrets +.env +*.env + +# Backup archives +Backup/ +*.zip + +# OS +.DS_Store +Thumbs.db diff --git a/INICIAR.bat b/INICIAR.bat new file mode 100644 index 0000000..e67930a --- /dev/null +++ b/INICIAR.bat @@ -0,0 +1,99 @@ +@echo off +chcp 65001 >nul +title Boat^&Ship-Finder + +echo. +echo =================================================== +echo Boat^&Ship-Finder - Broker Tool +echo =================================================== +echo. 
+ +:: Buscar Python +set PYTHON= +where python >nul 2>&1 && set PYTHON=python +if not defined PYTHON ( + where python3 >nul 2>&1 && set PYTHON=python3 +) +if not defined PYTHON ( + if exist "%LOCALAPPDATA%\Programs\Python\Python312\python.exe" set PYTHON=%LOCALAPPDATA%\Programs\Python\Python312\python.exe +) +if not defined PYTHON ( + if exist "%LOCALAPPDATA%\Programs\Python\Python311\python.exe" set PYTHON=%LOCALAPPDATA%\Programs\Python\Python311\python.exe +) +if not defined PYTHON ( + if exist "%LOCALAPPDATA%\Programs\Python\Python310\python.exe" set PYTHON=%LOCALAPPDATA%\Programs\Python\Python310\python.exe +) +if not defined PYTHON ( + if exist "C:\Python312\python.exe" set PYTHON=C:\Python312\python.exe +) +if not defined PYTHON ( + if exist "C:\Python311\python.exe" set PYTHON=C:\Python311\python.exe +) +if not defined PYTHON ( + echo [ERROR] No se encontro Python. + echo. + echo Descargalo de: https://www.python.org/downloads/ + echo Durante la instalacion marca: "Add Python to PATH" + echo. + pause & exit /b 1 +) +echo [OK] Python: %PYTHON% + +:: Verificar/instalar Flask +%PYTHON% -c "import flask" >nul 2>&1 +if %errorlevel% neq 0 ( + echo [INSTALANDO] Flask y dependencias... + %PYTHON% -m pip install flask flask-cors requests beautifulsoup4 --quiet + echo [OK] Dependencias instaladas. +) + +:: Verificar Ollama +curl -s http://localhost:11434/api/tags >nul 2>&1 +if %errorlevel% neq 0 ( + echo. + echo [AVISO] Ollama no esta corriendo. + echo Abre Ollama Desktop desde la barra de tareas. + echo Luego presiona cualquier tecla aqui. + echo. + pause >nul +) +echo [OK] Ollama activo. + +:: Puerto fijo +set PORT=8765 +set MARINE_PORT=8765 +echo [OK] Puerto: %PORT% + +:: Obtener IP de Tailscale +set TSIP= +for /f "tokens=*" %%i in ('tailscale ip -4 2^>nul') do set TSIP=%%i +if not defined TSIP ( + for /f "tokens=2 delims=:" %%a in ('ipconfig ^| findstr /i "tailscale" 2^>nul') do set TSIP=%%a +) + +echo. 
+echo =================================================== +echo Corriendo en puerto %PORT% +echo. +echo Desde esta PC: http://localhost:%PORT% +if defined TSIP ( +echo Desde tu celular: http://%TSIP%:%PORT% +) else ( +echo Tailscale: no detectado +) +echo. +echo Presiona Ctrl+C para detener +echo =================================================== +echo. + +:: Go to the folder with server.py - inside double quotes the ampersand is literal, so it must not be caret-escaped +if exist "server.py" goto :run +if exist "Boat&Ship-Finder\server.py" cd /d "Boat&Ship-Finder" + +:run +:: Open the browser after 5 seconds +start "" cmd /c "timeout /t 5 /nobreak >nul & start http://localhost:8765" +:: server.py refuses to start without SECRET_KEY - keep the one from the environment, otherwise generate a random key for this run +if not defined SECRET_KEY for /f %%k in ('%PYTHON% -c "import secrets; print(secrets.token_hex(32))"') do set SECRET_KEY=%%k +%PYTHON% server.py +pause diff --git a/LEEME.md b/LEEME.md new file mode 100644 index 0000000..c30e8c2 --- /dev/null +++ b/LEEME.md @@ -0,0 +1,151 @@ +# Boat&Ship-Finder — Guía de Instalación +## Broker Intelligence Platform + Ollama (Windows + Tailscale) + +--- + +## LO QUE TIENES + +``` +Boat&Ship-Finder/ +├── INSTALAR.bat ← Ejecuta esto PRIMERO (una sola vez) +├── INICIAR.bat ← Ejecuta esto cada vez que quieras usar la app +├── server.py ← Backend Python (Flask + Ollama) +└── static/ + └── index.html ← Frontend (dashboard completo) +``` + +--- + +## PASO 1 — Verificar que Ollama está corriendo + +1. Abre **Ollama Desktop** desde tu barra de tareas +2. Verifica en el navegador: http://localhost:11434 + - Debes ver: `Ollama is running` + +Los modelos que usará el sistema: +- **qwen2.5:32b** → Extracción y análisis principal +- **llama3.1:8b** → Clasificación rápida +- **nomic-embed-text** → Deduplicación semántica + +--- + +## PASO 2 — Instalar (solo la primera vez) + +1. Pon todos los archivos en una carpeta (ej: `C:\Boat&Ship-Finder\`) +2. Doble clic en **INSTALAR.bat** +3. Espera que termine (instala Flask y dependencias Python) + +--- + +## PASO 3 — Iniciar el servidor + +1. Doble clic en **INICIAR.bat** +2. Verás en pantalla: + ``` + Acceso local: http://localhost:8765 + Acceso Tailscale: http://100.x.x.x:8765 + ``` +3. 
Abre esa URL en tu navegador o celular + +--- + +## ACCESO DESDE CELULAR (Tailscale) + +1. Instala Tailscale en tu celular (App Store / Play Store) +2. Inicia sesión con la misma cuenta que tu Windows +3. Abre en el celular: `http://100.x.x.x:8765` + (usa la IP que muestra INICIAR.bat) + +--- + +## CÓMO USAR LA APP + +### Buscar con IA +- Clic en **"⚡ Buscar con IA Local"** (barra de búsqueda) +- Escribe en lenguaje natural: + - `"remolcador acero más de 30 metros en subasta"` + - `"fishing vessel noruego buen precio"` + - `"offshore support vessel government surplus Florida"` +- La IA consulta +60 fuentes y extrae resultados + +### Analizar un anuncio +- Clic en **"📋 Analizar"** +- Pega el texto de cualquier anuncio (periódico, email, clasificado) +- La IA extrae automáticamente todos los datos técnicos + +### Guardar favoritas +- Clic en ☆ en cualquier tarjeta +- Ver en la pestaña **"★ Guardadas"** + +### Crear alertas +- Pestaña **"🔔 Alertas"** +- Define criterios (tipo, precio, estado) +- El sistema notifica cuando encuentra coincidencias + +--- + +## FUENTES MONITOREADAS (+60) + +### Subastas USA +GovPlanet, GovDeals, PropertyRoom, PublicSurplus, AuctionTime, IronPlanet, HiBid, Copart, BidSpotter + +### Subastas Internacionales +Ritchie Bros, Euro Auctions, Troostwijk, Surplex, BVA, Catawiki, ShipXchange + +### Venta Especializada +YachtWorld, Boats.com, BoatTrader, Apollo Duck, Rightboat, Boat24, Seaboats, NauticExpo + +### Clasificados Globales +Craigslist, eBay Marine, Facebook Marketplace, Kijiji, Gumtree, Subito.it, LeBonCoin, Wallapop, MercadoLibre, OLX + +### Salvage & Wrecks +Salvex, MarineWrecks, BoatBreakers, NavAuctions, Barnacle Marine + +### Revistas & Noticias +Boat International, Superyacht Times, The Triton, WorkBoat, Lloyd's List, TradeWinds, Maritime Executive, Splash247 + +### Registros +USCG, UK Ship Register, Panama Registry, Marshall Islands, Liberian Registry, Bahamas Maritime + +--- + +## CAMBIAR EL MODELO DE IA + +Edita `server.py`, 
sección `MODELS`: + +```python +MODELS = { + 'extract': 'qwen2.5:32b', # Cambia por cualquier modelo que tengas + 'classify': 'llama3.1:8b', + 'embed': 'nomic-embed-text:latest', + 'parse': 'qwen3-coder:latest' +} +``` + +Para ver tus modelos disponibles: http://localhost:11434/api/tags + +--- + +## SOLUCIÓN DE PROBLEMAS + +| Problema | Solución | +|----------|----------| +| "Servidor desconectado" | Ejecuta INICIAR.bat | +| "Ollama no responde" | Abre Ollama Desktop | +| Sin resultados en búsqueda | Verifica que qwen2.5:32b está descargado | +| No accede desde celular | Verifica que Tailscale está activo en ambos dispositivos | +| Puerto 8000 ocupado | Cambia `port=8000` a `port=8001` en server.py | + +--- + +## AGREGAR MÁS FUENTES + +En `server.py`, sección `SOURCES`, agrega: + +```python +"Mi categoría": [ + {"name": "NombreSitio", "url": "https://sitio.com", "type": "auction"}, +], +``` + +Tipos disponibles: `auction`, `broker`, `classifieds`, `salvage`, `news`, `magazine`, `registry`, `commercial` diff --git a/_sbl_patch.py b/_sbl_patch.py new file mode 100644 index 0000000..f5011c4 --- /dev/null +++ b/_sbl_patch.py @@ -0,0 +1 @@ +# Patch marker — not used, just to verify write access diff --git a/scraper_test.py b/scraper_test.py new file mode 100644 index 0000000..1b8bb00 --- /dev/null +++ b/scraper_test.py @@ -0,0 +1,637 @@ +""" +scraper_test.py — Prueba individual de scrapers sin levantar el servidor Flask. 
+ +Usage: + python scraper_test.py # run all 6 scrapers with the default query + python scraper_test.py "catalina 30" # custom query + python scraper_test.py "beneteau" ebay # eBay only + python scraper_test.py "sailboat" yachtworld boattrader + +Available scrapers: ebay, yachtworld, boattrader, boats, hibid, craigslist +""" + +import sys +import re +import time +import random +import threading +import requests +import urllib3 +from bs4 import BeautifulSoup + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +# ── User-Agents ────────────────────────────────────────────────────────────── +USER_AGENTS = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0', +] + +_interleave_lock = threading.Lock() # guards _interleave_idx +_interleave_idx = 0 # round-robin position into _interleave_sites +_interleave_sites = ["https://miami.craigslist.org", "https://www.ebay.com"] + + +def get_headers(referer=None): # browser-like request headers with a random User-Agent; Referer is added only when given + h = { + 'User-Agent': random.choice(USER_AGENTS), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Cache-Control': 'max-age=0', + } + if referer: + h['Referer'] = referer + return h + + +def polite_pause(source_name: str): # requests the next site in _interleave_sites (round-robin, failures ignored), then sleeps 2-4 s + global _interleave_idx + with _interleave_lock: + site = _interleave_sites[_interleave_idx % len(_interleave_sites)] + _interleave_idx += 1 + try: + requests.get(site, headers=get_headers(), timeout=5, verify=False) + except Exception: + pass + time.sleep(random.uniform(2.0, 4.0)) + print(f"[{source_name}] Pausa cortés lista — continuando...") + + +def 
_extract_best_src(img_tag) -> str: + candidates = [ + img_tag.get("src",""), img_tag.get("data-src",""), + img_tag.get("data-lazy-src",""), img_tag.get("data-original",""), + img_tag.get("data-lazy",""), img_tag.get("data-image",""), + ] + srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","") + if srcset: + parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()] + candidates.extend(parts) + for c in candidates: + c = c.strip() + if c and c.startswith("http") and not c.startswith("data:"): + return c + return "" + + +# ══════════════════════════════════════════════════════════════════════════════ +# SCRAPERS +# ══════════════════════════════════════════════════════════════════════════════ + +def scrape_ebay(query: str, filters: dict = None) -> list: + filters = filters or {} + src = { + "name": "eBay Marine", + "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", + "type": "classifieds", + "category": "Clasificados USA", + } + results = [] + seen = set() + + raw_url = src["search_url"] + clean_q = " ".join(dict.fromkeys(query.strip().split())) + url = raw_url.replace("{query}", requests.utils.quote(clean_q)) + + # Ajuste de categoría por tipo de embarcación + vtype = filters.get("type","").lower() + EBAY_CAT = { + "sailboat":"36431","sail":"36431","velero":"36431", + "motor":"36432","motorboat":"36432","yacht":"36432", + "fishing":"36432","tug":"36432","barge":"36432", + } + if vtype and vtype in EBAY_CAT: + url = re.sub(r'_sacat=\d+', f'_sacat={EBAY_CAT[vtype]}', url) + + print(f"[eBay] URL: {url}") + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled","--no-sandbox"] + ) + context = browser.new_context( + viewport={"width":1280,"height":900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", timezone_id="America/New_York", + ignore_https_errors=True, + 
) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(url, timeout=30000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(1500,2500)) + page.evaluate("window.scrollBy(0,600)") + page.wait_for_timeout(800) + html = page.content() + except Exception as e: + print(f"[eBay] Playwright error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + print("[eBay] Sin HTML") + return [] + + soup = BeautifulSoup(html, "html.parser") + cards = soup.find_all("li", class_="s-card") + print(f"[eBay] Tarjetas nuevo layout: {len(cards)}") + + if not cards: + # layout antiguo li.s-item + for item in soup.find_all("li", class_="s-item"): + try: + link = item.find("a", class_="s-item__link") + if not link: continue + href = link.get("href","") + m = re.search(r'(https?://www\.ebay\.com/itm/\d+)', href) + if not m: continue + href = m.group(1) + if href in seen: continue + seen.add(href) + title_el = (item.find("span", class_="BOLD") or + item.find("div", class_="s-item__title") or + item.find("span", class_="s-item__title")) + title = (title_el or link).get_text(strip=True) + if not title or title.lower().startswith("shop on ebay"): continue + price_el = item.find("span", class_="s-item__price") + price = price_el.get_text(strip=True) if price_el else "" + results.append({"title":title,"url":href,"price":price,"source":"eBay"}) + except Exception: + continue + print(f"[eBay] {len(results)} resultados (layout antiguo)") + return results + + for card in cards: + try: + title_link = None + for a in card.find_all("a", class_="s-card__link"): + if "image-treatment" in (a.get("class") or []): continue + t = a.get_text(strip=True) + if t and not t.lower().startswith("shop on ebay"): + title_link = a; break + if not title_link: continue + href = title_link.get("href","") + if "/itm/" not in href: 
continue + m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href) + if not m: continue + href = m.group(1) + if href in seen: continue + seen.add(href) + title = re.sub(r'\s*Opens in a new window or tab.*','', + title_link.get_text(strip=True), flags=re.I).strip() + price_el = (card.find(class_="s-card__price") or + card.find(class_="s-item__price")) + price = price_el.get_text(strip=True) if price_el else "" + results.append({"title":title,"url":href,"price":price,"source":"eBay"}) + except Exception: + continue + + print(f"[eBay] {len(results)} resultados") + except Exception as e: + print(f"[eBay] Error: {e}") + + return results + + +def scrape_yachtworld(query: str, filters: dict = None) -> list: + filters = filters or {} + results = [] + seen = set() + vessel_type = filters.get("type","").lower() + yw_type = "sail" if vessel_type in ["sailboat","sail","velero","yacht",""] else "power" + + base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/" + if vessel_type in ["sailboat","sail","velero",""]: + base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/" + + print(f"[YachtWorld] URL: {base_url}") + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=['--disable-blink-features=AutomationControlled','--no-sandbox'] + ) + context = browser.new_context( + viewport={'width':1920,'height':1080}, + user_agent=random.choice(USER_AGENTS), + locale='en-US', timezone_id='America/New_York', + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(base_url, timeout=30000, wait_until='domcontentloaded') + page.wait_for_timeout(random.randint(2000,4000)) + for _ in range(3): + page.evaluate("window.scrollBy(0,400)") + page.wait_for_timeout(random.randint(400,800)) + html = 
page.content() + except Exception as e: + print(f"[YachtWorld] Error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + print("[YachtWorld] Sin HTML") + return [] + + soup = BeautifulSoup(html,'html.parser') + page_count = 0 + for a in soup.find_all('a', href=True): + href = a['href'] + if '/boat-details/' in href or '/yacht/' in href: + if not href.startswith('http'): + href = 'https://www.yachtworld.com' + href + if href in seen: continue + seen.add(href) + title = a.get_text(strip=True) + results.append({"title":title,"url":href,"price":"","source":"YachtWorld"}) + page_count += 1 + + print(f"[YachtWorld] {page_count} resultados") + except Exception as e: + print(f"[YachtWorld] Error: {e}") + + return results + + +def scrape_boattrader(query: str, filters: dict = None) -> list: + filters = filters or {} + src = { + "name": "BoatTrader", + "search_url": "https://www.boattrader.com/boats/?query={query}", + "type": "broker", + "category": "Venta Especializada", + } + results = [] + seen = set() + url = src["search_url"].replace("{query}", requests.utils.quote(query.strip())) + print(f"[BoatTrader] URL: {url}") + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled","--no-sandbox"] + ) + context = browser.new_context( + viewport={"width":1280,"height":900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", timezone_id="America/New_York", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(url, timeout=35000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(4000,6000)) + page.evaluate("window.scrollBy(0,600)") + page.wait_for_timeout(1500) + html = page.content() + except Exception as e: + 
print(f"[BoatTrader] Error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + print("[BoatTrader] Sin HTML") + return [] + + soup = BeautifulSoup(html,"html.parser") + cards = soup.find_all("li", class_="lib-card") + if not cards: + cards = soup.find_all(class_=re.compile(r'\blib-card\b')) + print(f"[BoatTrader] Cards encontradas: {len(cards)}") + + for card in cards: + try: + link_tag = card.find("a", href=re.compile(r'^/boat/[\w-]+-\d+/$')) + if not link_tag: continue + href = "https://www.boattrader.com" + link_tag["href"] + if href in seen: continue + seen.add(href) + title_el = card.find(class_=re.compile(r'listingTitle',re.I)) + title = title_el.get_text(strip=True) if title_el else link_tag.get_text(strip=True) + price_el = card.find(class_=re.compile(r'listingPrice',re.I)) + price = "" + if price_el: + pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True)) + if pm: price = f"${pm.group(1)}" + results.append({"title":title,"url":href,"price":price,"source":"BoatTrader"}) + except Exception: + continue + + print(f"[BoatTrader] {len(results)} resultados") + except Exception as e: + print(f"[BoatTrader] Error: {e}") + + return results + + +def scrape_boats(query: str, filters: dict = None) -> list: + filters = filters or {} + src = { + "name": "Boats.com", + "search_url": "https://www.boats.com/boats-for-sale/?query={query}", + "type": "broker", + "category": "Venta Especializada", + } + results = [] + seen = set() + url = src["search_url"].replace("{query}", requests.utils.quote(query.strip())) + print(f"[Boats.com] URL: {url}") + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled","--no-sandbox"] + ) + context = browser.new_context( + viewport={"width":1280,"height":900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", 
timezone_id="America/New_York", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(url, timeout=35000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(4000,6000)) + page.evaluate("window.scrollBy(0,600)") + page.wait_for_timeout(1500) + html = page.content() + except Exception as e: + print(f"[Boats.com] Error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + print("[Boats.com] Sin HTML") + return [] + + soup = BeautifulSoup(html,"html.parser") + + def _extract_card(card): + a = card.find("a", href=re.compile(r'^/')) + if not a: return + href = "https://www.boats.com" + a["href"] + if href in seen: return + seen.add(href) + year_el = card.select_one("div.year") + name_el = card.select_one("h2") + year = year_el.get_text(strip=True) if year_el else "" + name = name_el.get_text(strip=True) if name_el else "" + title = f"{year} {name}".strip() if year else name + if not title: return + price_el = card.select_one("div.price") + price = "" + if price_el: + pm = re.search(r'\$\s*([\d,]+)', price_el.get_text(" ",strip=True)) + price = f"${pm.group(1)}" if pm else "" + results.append({"title":title,"url":href,"price":price,"source":"Boats.com"}) + + for card in soup.select("li[data-listing-id]"): + _extract_card(card) + for card in soup.select("li.enhanced.oem"): + _extract_card(card) + + print(f"[Boats.com] {len(results)} resultados") + except Exception as e: + print(f"[Boats.com] Error: {e}") + + return results + + +def scrape_craigslist(query: str, filters: dict = None) -> list: + filters = filters or {} + results = [] + seen = set() + CITIES = ["miami", "tampa", "sfbay", "losangeles", "seattle", + "boston", "newyork", "chicago", "sandiego"] + qs = requests.utils.quote(query.strip()) + + print(f"[Craigslist] Query: '{query}' - probando 3 
ciudades al azar") + + try: + from playwright.sync_api import sync_playwright + all_html_parts = [] + cities_tested = random.sample(CITIES, min(3, len(CITIES))) + with sync_playwright() as p: + browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context( + user_agent=random.choice(USER_AGENTS), + locale="en-US", ignore_https_errors=True, + ) + for city in cities_tested: + city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel" + print(f"[Craigslist] >> {city_url}") + page = ctx.new_page() + try: + page.goto(city_url, timeout=25000, wait_until="domcontentloaded") + page.wait_for_timeout(2500) + all_html_parts.append((city, page.content())) + except Exception as e: + print(f"[Craigslist] {city} error: {e}") + finally: + try: page.close() + except: pass + browser.close() + + for city, html in all_html_parts: + soup = BeautifulSoup(html, "html.parser") + cards = soup.find_all(attrs={"data-pid": True}) + print(f"[Craigslist] {city}: {len(cards)} cards en HTML") + for card in cards: + try: + a_main = card.find("a", class_="main") + if not a_main: continue + listing_url = a_main.get("href","") + if not listing_url or listing_url in seen: continue + seen.add(listing_url) + title = card.get("title","") + if not title: + span = card.find("span", class_="label") + title = span.get_text(strip=True) if span else "" + if not title: continue + price_el = card.find("span", class_="priceinfo") + price = price_el.get_text(strip=True) if price_el else "" + results.append({ + "title": title[:120], "url": listing_url, + "price": price, "source": f"Craigslist {city}", + }) + except Exception: + continue + + print(f"[Craigslist] {len(results)} resultados totales") + except Exception as e: + print(f"[Craigslist] Error: {e}") + + return results + + +def scrape_hibid(query: str, filters: dict = None) -> list: + filters = filters or {} + src = {"name":"HiBid","type":"auction","category":"Subastas USA"} + results = [] + q = 
requests.utils.quote((query.strip() + " boat")) + url = f"https://www.hibid.com/lots?q={q}" + print(f"[HiBid] URL: {url}") + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context( + user_agent=random.choice(USER_AGENTS), + viewport={"width":1280,"height":900}, + locale="en-US", ignore_https_errors=True, + ) + ctx.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + ) + page = ctx.new_page() + try: + page.goto(url, timeout=30000, wait_until="domcontentloaded") + page.wait_for_timeout(4000) + html = page.content() + finally: + try: page.close() + except: pass + browser.close() + + soup = BeautifulSoup(html,"html.parser") + seen = set() + selectors = ".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]" + cards = soup.select(selectors) + print(f"[HiBid] Cards encontradas: {len(cards)}") + + for card in cards: + try: + a = card.find("a", href=True) + if not a: continue + href = a["href"] + if not href.startswith("http"): + href = "https://www.hibid.com" + href + if href in seen: continue + seen.add(href) + title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]") + title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] + price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]") + price = price_el.get_text(strip=True) if price_el else "" + if title and len(title) > 4: + results.append({"title":title,"url":href,"price":price,"source":"HiBid"}) + except Exception: + continue + + print(f"[HiBid] {len(results)} resultados") + except Exception as e: + print(f"[HiBid] Error: {e}") + + return results + + +# ══════════════════════════════════════════════════════════════════════════════ +# RUNNER PRINCIPAL +# ══════════════════════════════════════════════════════════════════════════════ + +SCRAPER_MAP = { + "ebay": 
scrape_ebay, + "yachtworld": scrape_yachtworld, + "boattrader": scrape_boattrader, + "boats": scrape_boats, + "hibid": scrape_hibid, + "craigslist": scrape_craigslist, +} + +def run_test(query: str, targets: list = None, filters: dict = None): + targets = targets or list(SCRAPER_MAP.keys()) + filters = filters or {} + total = 0 + all_ok = [] + + print("\n" + "="*60) + print(f" PRUEBA DE SCRAPERS | query: {query!r}") + print("="*60 + "\n") + + for name in targets: + fn = SCRAPER_MAP.get(name.lower()) + if not fn: + print(f"[!] Scraper desconocido: {name}") + continue + print(f"\n{'-'*50}") + print(f" >> {name.upper()}") + print(f"{'-'*50}") + t0 = time.time() + results = fn(query, filters) + elapsed = time.time() - t0 + + def safe(s): return s.encode('ascii','replace').decode('ascii') + if results: + all_ok.append(name) + print(f"\n[OK] {name}: {len(results)} resultados en {elapsed:.1f}s") + for i, r in enumerate(results[:3], 1): + print(f" {i}. {safe(r['title'][:70])}") + if r.get('price'): + print(f" $ {safe(r['price'])}") + print(f" > {r['url'][:80]}") + else: + print(f"\n[!!] 
{name}: 0 resultados en {elapsed:.1f}s") + + total += len(results) + + print("\n" + "="*60) + print(f" RESUMEN: {total} resultados totales") + print(f" Funcionando: {', '.join(all_ok) if all_ok else 'ninguno'}") + print("="*60) + + +if __name__ == "__main__": + args = sys.argv[1:] + query = "sailboat velero" # default + targets = [] + + if args: + # El primer arg que NO empiece con letra de scraper es la query + if args[0].lower() not in SCRAPER_MAP: + query = args[0] + targets = [a.lower() for a in args[1:] if a.lower() in SCRAPER_MAP] + else: + targets = [a.lower() for a in args if a.lower() in SCRAPER_MAP] + + run_test(query, targets if targets else None) diff --git a/server.py b/server.py new file mode 100644 index 0000000..b315e0a --- /dev/null +++ b/server.py @@ -0,0 +1,5402 @@ +""" +Boat&Ship-Finder - Backend Server +Requiere: pip install flask flask-cors requests beautifulsoup4 playwright +""" + +from flask import Flask, jsonify, request, send_from_directory, session +import hashlib as _hashlib +from flask_cors import CORS +import requests +import json +import sqlite3 +import os +import re +import time +import hashlib +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed +from bs4 import BeautifulSoup +import threading +import urllib3 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +app = Flask(__name__, static_folder='static') +_secret = os.environ.get('SECRET_KEY') +if not _secret: + raise RuntimeError("SECRET_KEY not set — add SECRET_KEY= to your environment") +app.secret_key = _secret +CORS(app, + origins=["http://localhost:8765", "http://127.0.0.1:8765"], + supports_credentials=True) + +DB_PATH = 'marine.db' +OLLAMA_URL = 'http://localhost:11434/api/generate' + +# ── Modelos Ollama por tarea ────────────────────────────────────────────────── +MODELS = { + 'extract': 'qwen2.5:32b', # Extracción de specs (más rápido que 72b, igual de preciso) + 'classify': 'llama3.1:8b', # Clasificación 
rápida + 'embed': 'nomic-embed-text:latest', # Embeddings para dedup + 'parse': 'qwen3-coder:latest' # Parsing estructurado +} + +# ── Fuentes globales por categoría ──────────────────────────────────────────── +SOURCES = { + "Subastas USA": [ + {"name": "GovPlanet", "url": "https://www.govplanet.com/boats", "type": "auction"}, + {"name": "GovDeals", "url": "https://www.govdeals.com", "type": "auction"}, + {"name": "PropertyRoom", "url": "https://www.propertyroom.com/boats", "type": "auction"}, + {"name": "PublicSurplus", "url": "https://www.publicsurplus.com", "type": "auction"}, + {"name": "AuctionTime", "url": "https://www.auctiontime.com/boats", "type": "auction"}, + {"name": "IronPlanet", "url": "https://www.ironplanet.com/boats", "type": "auction"}, + {"name": "HiBid", "url": "https://www.hibid.com/boats", "type": "auction"}, + {"name": "Copart Marine", "url": "https://www.copart.com/boats", "type": "auction"}, + {"name": "BidSpotter", "url": "https://www.bidspotter.com/boats", "type": "auction"}, + {"name": "32auctions", "url": "https://www.32auctions.com", "type": "auction"}, + ], + "Subastas Internacionales": [ + {"name": "Ritchie Bros", "url": "https://www.rbauction.com/boats", "type": "auction"}, + {"name": "Euro Auctions", "url": "https://www.euroauctions.com", "type": "auction"}, + {"name": "Troostwijk", "url": "https://www.troostwijkauctions.com", "type": "auction"}, + {"name": "Surplex", "url": "https://www.surplex.com/marine", "type": "auction"}, + {"name": "BVA Auctions", "url": "https://www.bva-auctions.com", "type": "auction"}, + {"name": "Catawiki Marine", "url": "https://www.catawiki.com/boats", "type": "auction"}, + {"name": "Barnebys", "url": "https://www.barnebys.com/boats", "type": "auction"}, + {"name": "ShipXchange", "url": "https://www.shipxchange.com", "type": "auction"}, + ], + "Venta Especializada": [ + {"name": "YachtWorld", "url": "https://www.yachtworld.com", "type": "broker"}, + {"name": "Boats.com", "url": 
"https://www.boats.com", "type": "broker", + "search_url": "https://www.boats.com/boats-for-sale/?query={query}"}, + {"name": "BoatTrader", "url": "https://www.boattrader.com", "type": "broker", + "search_url": "https://www.boattrader.com/boats/?query={query}"}, + {"name": "Apollo Duck", "url": "https://www.apolloduck.com", "type": "broker", + "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1"}, + {"name": "Rightboat", "url": "https://www.rightboat.com", "type": "broker", + "search_url": "https://www.rightboat.com/boats-for-sale/?q={query}"}, + {"name": "Boat24", "url": "https://www.boat24.com", "type": "broker", + "search_url": "https://www.boat24.com/en/usedboats/"}, + {"name": "Inautia", "url": "https://www.inautia.com", "type": "broker", + "search_url": "https://www.inautia.com/boats/?q={query}"}, + # ── US Brokers ──────────────────────────────────────────────────────── + {"name": "HMY Yachts", "url": "https://hmy.com", "type": "broker", + "search_url": "https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", + "category": "Brokers USA"}, + {"name": "Denison Yachting","url": "https://www.denisonyachtsales.com", "type": "broker", + "search_url": "https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", + "category": "Brokers USA"}, + {"name": "United Yacht", "url": "https://www.unitedyacht.com", "type": "broker", + "search_url": "https://www.unitedyacht.com/yachts-for-sale/", + "category": "Brokers USA"}, + {"name": "Northrop & Johnson","url": "https://www.n-j.com", "type": "broker", + "search_url": "https://www.n-j.com/yachts-for-sale/", + "category": "Brokers USA"}, + {"name": "Worth Ave Yachts","url": "https://www.worthavenueyachts.com", "type": "broker", + "search_url": "https://www.worthavenueyachts.com/yachts-for-sale/", + "category": "Brokers USA"}, + {"name": "Bluewater Yachting","url": "https://www.bluewateryachting.com", "type": "broker", + "category": "Brokers USA"}, + {"name": "Galati Yachts", "url": 
"https://www.galatiyachts.com", "type": "broker", + "search_url": "https://www.galatiyachts.com/boat-search/?q={query}", + "category": "Brokers USA"}, + {"name": "Fraser Yachts", "url": "https://www.fraseryachts.com", "type": "broker", + "search_url": "https://www.fraseryachts.com/en/yachts-for-sale/?search={query}", + "category": "Brokers INT"}, + {"name": "Burgess Yachts", "url": "https://www.burgessyachts.com", "type": "broker", + "search_url": "https://www.burgessyachts.com/en/yacht-sale?q={query}", + "category": "Brokers INT"}, + {"name": "Ocean Alexander", "url": "https://www.oceanalexander.com", "type": "broker", + "search_url": "https://www.oceanalexander.com/find-a-boat/?q={query}", + "category": "Brokers USA"}, + {"name": "Merle Wood", "url": "https://www.merlewood.com", "type": "broker", + "search_url": "https://www.merlewood.com/yachts-for-sale/", + "category": "Brokers INT"}, + # ── Other ───────────────────────────────────────────────────────────── + {"name": "NauticExpo", "url": "https://www.nauticexpo.com", "type": "broker"}, + {"name": "Seaboats", "url": "https://www.seaboats.net", "type": "broker"}, + {"name": "YachtBroker", "url": "https://www.yachtbroker.com", "type": "broker"}, + ], + "Comercial / Industrial": [ + {"name": "WorkBoat", "url": "https://www.workboat.com/classifieds", "type": "commercial"}, + {"name": "TradeABoat", "url": "https://www.tradeaboat.com.au", "type": "broker"}, + {"name": "Boatpoint", "url": "https://www.boatpoint.com.au", "type": "broker"}, + {"name": "Boats & Outboards","url": "https://www.boatsandoutboards.co.uk", "type": "broker"}, + {"name": "Commercial Vessel","url": "https://www.commercialvessel.com", "type": "commercial"}, + {"name": "ShipServ", "url": "https://www.shipserv.com", "type": "commercial"}, + {"name": "Marine Classifieds","url": "https://www.marineclassifieds.com", "type": "classifieds"}, + {"name": "Barcos.net", "url": "https://www.barcos.net", "type": "broker"}, + # ── Offshore / DP / OSV 
─────────────────────────────────────────────── + {"name": "Offshore Vessel Exchange","url": "https://www.offshorevessel.exchange","type": "commercial", + "search_url": "https://www.offshorevessel.exchange/?s={query}", + "category": "Offshore / DP"}, + {"name": "MarineTraffic Vessels For Sale","url": "https://www.marinetraffic.com/en/ads/p/list","type": "commercial", + "search_url": "https://www.marinetraffic.com/en/ads/p/list?search={query}", + "category": "Offshore / DP"}, + {"name": "YachtWorld Commercial","url": "https://www.yachtworld.com","type": "commercial", + "search_url": "https://www.yachtworld.com/boats-for-sale/type-commercial/?query={query}", + "category": "Offshore / DP"}, + {"name": "Apollo Duck Workboats","url": "https://www.apolloduck.com","type": "commercial", + "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", + "category": "Offshore / DP"}, + {"name": "Seawork Classifieds","url": "https://www.seawork.com","type": "commercial", + "search_url": "https://www.seawork.com/classifieds/", + "category": "Offshore / DP"}, + {"name": "ShipXchange OSV", "url": "https://www.shipxchange.com", "type": "commercial", + "search_url": "https://www.shipxchange.com/en/vessel-types/offshore-support-vessel", + "category": "Offshore / DP"}, + {"name": "Vessel Sales & Acquisitions","url": "https://www.vsl.no", "type": "commercial", + "search_url": "https://www.vsl.no/vessels-for-sale/", + "category": "Offshore / DP"}, + ], + "Clasificados Generales": [ + {"name": "Craigslist Boats", "url": "https://www.craigslist.org/search/boa", "type": "classifieds"}, + {"name": "eBay Motors Marine","url": "https://www.ebay.com/b/Boats/26429", "type": "classifieds", + "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10"}, + {"name": "Facebook Marketplace","url": "https://www.facebook.com/marketplace/boats","type": "classifieds"}, + {"name": "BoatCrazy", "url": "https://boatcrazy.com", "type": "classifieds", + 
"search_url": "https://boatcrazy.com/boats?q={query}", + "category": "Clasificados USA"}, + {"name": "Kijiji Marine", "url": "https://www.kijiji.ca/b-boats", "type": "classifieds"}, + {"name": "Gumtree Boats", "url": "https://www.gumtree.com/boats", "type": "classifieds"}, + {"name": "Subito.it Barche", "url": "https://www.subito.it/barche", "type": "classifieds"}, + {"name": "LeBonCoin Bateaux","url": "https://www.leboncoin.fr/bateaux", "type": "classifieds"}, + {"name": "Wallapop Barcos", "url": "https://es.wallapop.com/barcos", "type": "classifieds"}, + {"name": "MercadoLibre", "url": "https://www.mercadolibre.com/barcos", "type": "classifieds"}, + {"name": "OLX Marine", "url": "https://www.olx.com/boats", "type": "classifieds"}, + ], + "Salvage & Wrecks": [ + {"name": "Cooper Capital Salvage", "url": "https://www.cooperss.com", "type": "salvage", + "search_url": "https://www.cooperss.com/", + "category": "Salvage USA"}, + {"name": "Salvex", "url": "https://www.salvex.com", "type": "salvage", + "search_url": "https://www.salvex.com/search/?q={query}&cat=30", + "category": "Salvage USA"}, + {"name": "Copart Marine", "url": "https://www.copart.com", "type": "salvage", + "search_url": "https://www.copart.com/public/data/lotSearchResults/?query={query}&vehicleType=BOAT", + "category": "Salvage USA"}, + {"name": "IAA Watercraft", "url": "https://www.iaai.com", "type": "salvage", + "search_url": "https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", + "category": "Salvage USA"}, + {"name": "Ritchie Bros Marine","url": "https://www.rbauction.com", "type": "auction", + "search_url": "https://www.rbauction.com/used-equipment?q={query}&searchType=MODEL&equipmentCategory=marine", + "category": "Salvage USA"}, + {"name": "NavAuctions", "url": "https://www.navauctions.com", "type": "salvage"}, + {"name": "MarineWrecks", "url": "https://www.marinewrecks.com", "type": "salvage"}, + {"name": "BoatBreakers", "url": "https://www.boatbreakers.com", "type": 
"salvage"}, + {"name": "Barnacle Marine", "url": "https://www.barnaclemarine.com", "type": "salvage"}, + {"name": "Boat Breakers AU","url": "https://www.boatbreakersnz.com", "type": "salvage"}, + ], + "Revistas & Noticias": [ + {"name": "Trade Only Today","url": "https://www.tradeonlytoday.com", "type": "news"}, + {"name": "Nautical News", "url": "https://www.nauticalnews.com", "type": "news"}, + {"name": "Boat International","url": "https://www.boatinternational.com/yachts","type": "magazine"}, + {"name": "Superyacht Times","url": "https://www.superyachttimes.com", "type": "magazine"}, + {"name": "The Triton", "url": "https://www.the-triton.com/classifieds", "type": "magazine"}, + {"name": "Passagemaker", "url": "https://www.passagemaker.com", "type": "magazine"}, + {"name": "WorkBoat Mag", "url": "https://www.workboat.com", "type": "magazine"}, + {"name": "Lloyd's List", "url": "https://lloydslist.maritimeintelligence.informa.com", "type": "news"}, + {"name": "Tradewinds", "url": "https://www.tradewindsnews.com", "type": "news"}, + {"name": "Maritime Executive","url": "https://www.maritime-executive.com", "type": "news"}, + {"name": "Splash247", "url": "https://splash247.com", "type": "news"}, + {"name": "Bairdmaritime", "url": "https://www.bairdmaritime.com", "type": "news"}, + ], + "Registros & Gobierno": [ + {"name": "USCG Docs", "url": "https://www.dco.uscg.mil/nvdc", "type": "registry"}, + {"name": "UK Ship Register","url": "https://www.ukshipregister.co.uk", "type": "registry"}, + {"name": "Panama Registry", "url": "https://www.segumar.com", "type": "registry"}, + {"name": "Marshall Islands","url": "https://www.register-iri.com", "type": "registry"}, + {"name": "Liberian Registry","url": "https://www.liscr.com", "type": "registry"}, + {"name": "Bahamas Maritime","url": "https://www.bahamasmaritime.com", "type": "registry"}, + {"name": "IHS Sea-web", "url": "https://maritime.ihs.com", "type": "registry"}, + ], +} + +# ── Database 
def init_db():
    """Create the SQLite schema at ``DB_PATH`` if it does not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is safe.
    The connection is closed even when schema creation fails, so a broken or
    locked database file does not leak a handle.

    NOTE(review): the schema declares ``REFERENCES`` constraints, but SQLite
    only enforces them when ``PRAGMA foreign_keys = ON`` is issued on the
    connection; nothing in this block does so — confirm whether that is
    intended.
    NOTE(review): ``users.password`` is plain ``TEXT`` — confirm that callers
    store a password hash, not the clear-text password.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.executescript("""
        CREATE TABLE IF NOT EXISTS vessels (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            vessel_type TEXT,
            loa_m REAL,
            beam_m REAL,
            draft_m REAL,
            year_built INTEGER,
            hull TEXT,
            propulsion TEXT,
            status TEXT,
            price_usd REAL,
            currency TEXT DEFAULT 'USD',
            location TEXT,
            country TEXT,
            source_name TEXT,
            source_url TEXT,
            description TEXT,
            images TEXT,
            flags TEXT,
            score REAL DEFAULT 0,
            fingerprint TEXT UNIQUE,
            raw_data TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS saved_vessels (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            vessel_id INTEGER REFERENCES vessels(id),
            notes TEXT,
            saved_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS search_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query TEXT,
            filters TEXT,
            results INTEGER,
            searched_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS custom_sources (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            category TEXT DEFAULT 'Custom',
            search_url TEXT NOT NULL,
            source_type TEXT DEFAULT 'broker',
            active INTEGER DEFAULT 1,
            added_by TEXT,
            last_status TEXT DEFAULT 'unknown',
            created_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT UNIQUE NOT NULL,
            password TEXT NOT NULL,
            role TEXT DEFAULT 'user',
            created_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS collections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            description TEXT,
            color TEXT DEFAULT '#00b4ff',
            icon TEXT DEFAULT '📁',
            created_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS collection_vessels (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            collection_id INTEGER REFERENCES collections(id),
            vessel_id INTEGER REFERENCES vessels(id),
            notes TEXT,
            added_at TEXT DEFAULT (datetime('now')),
            UNIQUE(collection_id, vessel_id)
        );
        CREATE TABLE IF NOT EXISTS alerts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            filters TEXT,
            last_match INTEGER DEFAULT 0,
            active INTEGER DEFAULT 1,
            created_at TEXT DEFAULT (datetime('now'))
        );
        CREATE INDEX IF NOT EXISTS idx_vessels_type ON vessels(vessel_type);
        CREATE INDEX IF NOT EXISTS idx_vessels_status ON vessels(status);
        CREATE INDEX IF NOT EXISTS idx_vessels_price ON vessels(price_usd);
        CREATE INDEX IF NOT EXISTS idx_vessels_score ON vessels(score DESC);
        """)
        conn.commit()
    finally:
        # Release the handle on both the success and the failure path.
        conn.close()


def get_db():
    """Open a new connection to ``DB_PATH`` whose rows are ``sqlite3.Row``.

    ``sqlite3.Row`` lets callers read columns by name (``row["name"]``).
    The caller owns the returned connection and must close it.
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    return conn


# ── Ollama helpers ─────────────────────────────────────────────────────────────
_ollama_sem = threading.Semaphore(3)  # max 3 concurrent Ollama calls


def ollama_generate(prompt: str, model: str = None, json_mode: bool = False) -> str:
    """Run one non-streaming generation against the Ollama endpoint.

    Args:
        prompt: Full prompt text sent to the model.
        model: Model name; defaults to ``MODELS['classify']`` when falsy.
        json_mode: When true, asks Ollama for JSON output (``format: "json"``).

    Returns:
        The ``response`` field of the reply, or ``""`` on any failure
        (network error, HTTP error status, malformed body). Failures are
        printed, never raised, so callers can treat this as best-effort.
    """
    model = model or MODELS['classify']
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048}
    }
    if json_mode:
        payload["format"] = "json"
    # The semaphore is held for the whole request so that at most three
    # generations are in flight at any time.
    with _ollama_sem:
        try:
            r = requests.post(OLLAMA_URL, json=payload, timeout=120)
            r.raise_for_status()
            # `or ""` keeps the declared `str` return type even when the API
            # answers with an explicit `"response": null`.
            return r.json().get("response", "") or ""
        except Exception as e:
            print(f"[Ollama] Error: {e}")
            return ""


def ollama_models() -> list:
    """Return the names of the models known to the local Ollama server.

    Returns an empty list when the server cannot be reached or answers with
    an unexpected payload.
    """
    try:
        r = requests.get("http://localhost:11434/api/tags", timeout=5)
        return [m["name"] for m in r.json().get("models", [])]
    except Exception:
        # `Exception` instead of a bare `except:` so that Ctrl-C and
        # interpreter shutdown are not swallowed.
        return []


def _vessel_dict_or_none(raw):
    """Parse ``raw`` as JSON and return it only if it is a JSON object."""
    try:
        data = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return None
    return data if isinstance(data, dict) else None


def extract_vessel_from_text(text: str, source: str) -> dict:
    """Use Ollama to extract structured vessel data from raw listing text.

    Args:
        text: Raw listing text; only the first 3000 characters are sent.
        source: Name of the site the text came from (included in the prompt).

    Returns:
        The parsed JSON object produced by the model, or ``{}`` when the text
        is empty, the model flags it with ``"skip"``, or no JSON object can be
        recovered from the reply.
    """
    if not text or not text.strip():
        # Nothing to analyse: skip the (slow) model call entirely.
        return {}

    prompt = f"""Eres un experto en inteligencia de mercado marítimo.
Analiza este texto de un anuncio de embarcación y extrae los datos disponibles.
Fuente: {source}

TEXTO:
{text[:3000]}

Responde SOLO con JSON válido. Si el texto NO es un listing de embarcación específica responde {{"skip": true}}.

{{
  "skip": false,
  "name": "nombre del barco o descripción corta",
  "vessel_type": "Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Salvage|Other",
  "loa_m": número o null,
  "beam_m": número o null,
  "draft_m": número o null,
  "year_built": número o null,
  "hull": "Fiberglass|Steel|Aluminum|Wood|Composite|Unknown",
  "propulsion": "Diesel|Gasoline|Electric|Sail|None|Unknown",
  "status": "active|auction|salvage|abandoned|sold",
  "price_usd": número o null,
  "currency": "USD|EUR|GBP|CAD|AUD|etc",
  "location": "ciudad, país",
  "country": "código ISO 2 letras",
  "description": "resumen en español máximo 200 caracteres",
  "flags": ["below_market","rare","auction","salvage_value","motivated_seller","commercial","government_surplus"],
  "score": número del 0 al 100 según oportunidad para un broker
}}"""

    response = ollama_generate(prompt, model=MODELS['extract'], json_mode=True)

    data = _vessel_dict_or_none(response)
    if data is None:
        # The model sometimes wraps the object in prose (or in a list): fall
        # back to the outermost {...} span of the reply.
        match = re.search(r'\{.*\}', response or "", re.DOTALL)
        data = _vessel_dict_or_none(match.group()) if match else None

    if not data or data.get("skip"):
        return {}
    return data


# ── Direct source scrapers — no search engine middleman ──────────────────

import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
]


def get_headers(referer=None):
    """Build browser-like request headers with a randomly chosen User-Agent.

    Args:
        referer: Optional URL sent as the ``Referer`` header.

    Returns:
        A fresh ``dict`` of HTTP headers; callers may mutate it freely.
    """
    ua = random.choice(USER_AGENTS)
    h = {
        'User-Agent': ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,fr;q=0.7',
        # Do not advertise `br`: requests only decodes Brotli when the optional
        # brotli/brotlicffi package is installed, and the launcher script does
        # not install it. A server honouring `br` would hand back a body that
        # `response.text` cannot decode.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0',
    }
    if referer:
        h['Referer'] = referer
    return h


# Module-level default headers: built once at import, so the User-Agent picked
# here stays fixed for the life of the process.
HEADERS = get_headers()


# Each source has its own search URL pattern and CSS selectors
# Sites we scrape directly (confirmed working)
#
# NOTE(review): some entries share an identical search_url ("Craigslist" and
# "Craigslist SF"; "eBay Salvage Boats" and "eBay Salvage"; "Apollo Duck" and
# "Apollo Duck Workboats"), and "Craigslist Fort Laud" points at the miami
# subdomain. They are left as-is because the names may be referenced
# elsewhere — confirm whether results are de-duplicated downstream.
DIRECT_SOURCES = [
    # ── Craigslist ─────────────────────────────────────────────────────────────
    # Single multi-city entry (uses scrape_craigslist internally — Playwright, 3+ cities)
    {"name": "Craigslist", "category": "Clasificados USA", "search_url": "https://sfbay.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    # Individual cities — each makes one targeted request via scrape_direct_source
    {"name": "Craigslist Miami", "category": "Clasificados USA", "search_url": "https://miami.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Tampa", "category": "Clasificados USA", "search_url": "https://tampa.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Fort Laud", "category": "Clasificados USA", "search_url": "https://miami.craigslist.org/search/boa?query={query}&sort=date", "type": "classifieds"},
    {"name": "Craigslist New Orleans", "category": "Clasificados USA", "search_url": "https://neworleans.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Houston", "category": "Clasificados USA", "search_url": "https://houston.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Seattle", "category": "Clasificados USA", "search_url": "https://seattle.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist LA", "category": "Clasificados USA", "search_url": "https://losangeles.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist SF", "category": "Clasificados USA", "search_url": "https://sfbay.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Jacksonville", "category": "Clasificados USA", "search_url": "https://jacksonville.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Sarasota", "category": "Clasificados USA", "search_url": "https://sarasota.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Chicago", "category": "Clasificados USA", "search_url": "https://chicago.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Boston", "category": "Clasificados USA", "search_url": "https://boston.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Atlanta", "category": "Clasificados USA", "search_url": "https://atlanta.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Baltimore", "category": "Clasificados USA", "search_url": "https://baltimore.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Norfolk", "category": "Clasificados USA", "search_url": "https://norfolk.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist San Diego", "category": "Clasificados USA", "search_url": "https://sandiego.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Portland OR", "category": "Clasificados USA", "search_url": "https://portland.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Minneapolis", "category": "Clasificados USA", "search_url": "https://minneapolis.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Detroit", "category": "Clasificados USA", "search_url": "https://detroit.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Cleveland", "category": "Clasificados USA", "search_url": "https://cleveland.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Charlotte", "category": "Clasificados USA", "search_url": "https://charlotte.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Denver", "category": "Clasificados USA", "search_url": "https://denver.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Phoenix", "category": "Clasificados USA", "search_url": "https://phoenix.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Annapolis", "category": "Clasificados USA", "search_url": "https://annapolis.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist New Jersey", "category": "Clasificados USA", "search_url": "https://newjersey.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Galveston", "category": "Clasificados USA", "search_url": "https://galveston.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Pensacola", "category": "Clasificados USA", "search_url": "https://pensacola.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Mobile AL", "category": "Clasificados USA", "search_url": "https://mobile.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Key West", "category": "Clasificados USA", "search_url": "https://keys.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Corpus", "category": "Clasificados USA", "search_url": "https://corpuschristi.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Beaumont", "category": "Clasificados USA", "search_url": "https://beaumont.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    {"name": "Craigslist Baton Rouge", "category": "Clasificados USA", "search_url": "https://batonrouge.craigslist.org/search/boa?query={query}", "type": "classifieds"},
    # NOTE: gulfcoast.craigslist.org (Biloxi) no longer exists — replaced with Mobile AL

    # ── eBay ──────────────────────────────────────────────────────────────────
    {"name": "eBay Marine", "category": "Clasificados USA", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", "type": "classifieds"},
    {"name": "eBay Auction", "category": "Subastas USA", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_Auction=1", "type": "auction"},
    {"name": "eBay Motors Sail", "category": "Clasificados USA", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=36431&LH_BIN=1&_sop=10", "type": "classifieds"},
    {"name": "eBay Boats Complete", "category": "Clasificados USA", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}+boat&_sacat=26429&LH_BIN=1&_sop=15", "type": "classifieds"},
    {"name": "eBay Salvage Boats", "category": "Salvage / Subastas", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type": "salvage"},

    # ── Government auctions ───────────────────────────────────────────────────
    {"name": "GovDeals", "category": "Subastas Gobierno", "search_url": "https://www.govdeals.com/index.cfm?fa=Main.AdvSearchResultsNew&kWord={query}&category=70", "type": "auction"},
    {"name": "PublicSurplus", "category": "Subastas Gobierno", "search_url": "https://www.publicsurplus.com/sms/browse/home?search={query}", "type": "auction"},
    {"name": "PropertyRoom", "category": "Subastas Gobierno", "search_url": "https://www.propertyroom.com/s?q={query}+boat", "type": "auction"},
    # GovPlanet: correct URL confirmed working (Recreational Marine category)
    {"name": "GovPlanet", "category": "Subastas Gobierno", "search_url": "https://www.govplanet.com/Recreational+Marine", "type": "auction"},
    # IronPlanet: correct URL confirmed working (Commercial Marine Vessels)
    {"name": "IronPlanet", "category": "Subastas Gobierno", "search_url": "https://www.ironplanet.com/Commercial+Marine+Vessels", "type": "auction"},
    # HiBid: React SPA — scrape_hibid uses Playwright
    {"name": "HiBid", "category": "Subastas USA", "search_url": "https://www.hibid.com/lots?q={query}+boat", "type": "auction"},
    {"name": "AuctionTime", "category": "Subastas USA", "search_url": "https://www.auctiontime.com/listings/search?q={query}+boat", "type": "auction"},
    {"name": "BidSpotter", "category": "Subastas USA", "search_url": "https://www.bidspotter.com/en-us/auction-catalogues?q={query}+boat", "type": "auction"},
    # Copart: Playwright scraper handles JS-rendered lots
    {"name": "Copart Marine", "category": "Subastas USA", "search_url": "https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT", "type": "auction"},

    # ── Salvage ───────────────────────────────────────────────────────────────
    {"name": "Salvex Marine", "category": "Salvage / Subastas", "search_url": "https://salvex.com/listings/?q={query}&cat=marine", "type": "salvage"},
    {"name": "Barnacle Marine", "category": "Salvage / Subastas", "search_url": "https://www.barnaclemarine.com/?s={query}", "type": "salvage"},
    {"name": "eBay Salvage", "category": "Salvage / Subastas", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type": "salvage"},
    {"name": "Cooper Capital Salvage", "category": "Salvage USA", "search_url": "https://www.cooperss.com/", "type": "salvage"},
    {"name": "IAA Watercraft", "category": "Salvage USA", "search_url": "https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", "type": "salvage"},

    # ── Specialised sales — main sites ────────────────────────────────────────
    {"name": "YachtWorld", "category": "Venta Especializada", "search_url": "https://www.yachtworld.com/boats-for-sale/", "type": "broker"},
    {"name": "BoatTrader", "category": "Venta Especializada", "search_url": "https://www.boattrader.com/boats/?query={query}", "type": "broker"},
    {"name": "Boats.com", "category": "Venta Especializada", "search_url": "https://www.boats.com/boats-for-sale/?query={query}", "type": "broker"},
    {"name": "Apollo Duck", "category": "Venta Especializada", "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type": "broker"},
    {"name": "Rightboat", "category": "Venta Especializada", "search_url": "https://www.rightboat.com/boats-for-sale/?q={query}", "type": "broker"},
    # Boat24: 403 on requests — scrape_eu_broker uses Playwright
    {"name": "Boat24", "category": "Venta Especializada", "search_url": "https://www.boat24.com/en/boats/?q={query}", "type": "broker"},
    # YachtMarket: uses scrape_eu_broker (Playwright) in case of blocks
    {"name": "YachtMarket", "category": "Venta Especializada", "search_url": "https://www.yachtmarket.com/boats-for-sale/?q={query}", "type": "broker"},

    # ── SailboatListings (dedicated thread also runs in parallel) ────────────
    {"name": "SailboatListings", "category": "Veleros Global", "search_url": "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=&material=&hull=&state=&keyword={query}&view_records=+Show+Matching+Boats+", "type": "broker", "supports_filters": True},
    {"name": "SailboatListings View", "category": "Veleros Global", "search_url": "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=Sail&material=&hull=&state=&keyword=&view_records=+Show+Matching+Boats+", "type": "broker", "supports_filters": True},
    # Forums: Playwright scraper handles vBulletin/XenForo FS sections
    {"name": "TheHullTruth", "category": "Veleros Global", "search_url": "https://www.thehulltruth.com/boating-forum/search.php?do=process&query={query}&prefixid=FS&type=post", "type": "classifieds"},
    {"name": "Cruisers Forum", "category": "Veleros Global", "search_url": "https://www.cruisersforum.com/forums/f152/", "type": "classifieds"},

    # ── Commercial / Offshore ─────────────────────────────────────────────────
    {"name": "WorkBoat Classifieds", "category": "Comercial Offshore", "search_url": "https://www.workboat.com/classifieds/?keywords={query}", "type": "commercial"},
    {"name": "Commercial Vessel", "category": "Comercial Offshore", "search_url": "https://www.commercialvessel.com/search?keywords={query}", "type": "commercial"},
    {"name": "OSV Broker", "category": "Comercial Offshore", "search_url": "https://www.osvbroker.com/?s={query}", "type": "commercial"},
    {"name": "Marine Classifieds", "category": "Comercial Offshore", "search_url": "https://www.marineclassifieds.com/search.php?search={query}", "type": "commercial"},
    {"name": "Seaboats", "category": "Comercial Global", "search_url": "https://www.seaboats.net/search.php?q={query}&cat=0", "type": "commercial"},
    {"name": "Seaboats Offshore", "category": "Comercial Offshore", "search_url": "https://www.seaboats.net/search.php?q={query}&cat=offshore+support+vessels", "type": "commercial"},
    {"name": "Seaboats Tug", "category": "Comercial Offshore", "search_url": "https://www.seaboats.net/search.php?q={query}&cat=tugs+%26+pushboats", "type": "commercial"},
    {"name": "Seaboats Barge", "category": "Comercial Offshore", "search_url": "https://www.seaboats.net/search.php?q={query}&cat=barges+%26+lighters", "type": "commercial"},
    {"name": "Seaboats Fishing", "category": "Comercial Offshore", "search_url": "https://www.seaboats.net/search.php?q={query}&cat=fishing+vessels", "type": "commercial"},
    {"name": "Apollo Duck Workboats", "category": "Comercial Offshore", "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type": "commercial"},
    {"name": "YachtWorld Commercial", "category": "Comercial Offshore", "search_url": "https://www.yachtworld.com/boats-for-sale/type-commercial/", "type": "commercial"},

    # ── Australia / Pacific ───────────────────────────────────────────────────
    # Trade a Boat AU: server-rendered, correct URL confirmed working
    {"name": "Trade a Boat AU", "category": "Australia / Pacifico", "search_url": "https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}", "type": "broker"},
    # Boatsales.com.au (Boatpoint redirects here): scrape_eu_broker via Playwright
    {"name": "Boatsales AU", "category": "Australia / Pacifico", "search_url": "https://www.boatsales.com.au/boats-for-sale/?q={query}", "type": "broker"},

    # ── United Kingdom ────────────────────────────────────────────────────────
    # Boats & Outboards UK: 403 on requests — scrape_eu_broker uses Playwright
    {"name": "Boats & Outboards UK", "category": "Reino Unido", "search_url": "https://www.boatsandoutboards.co.uk/boats-for-sale/?q={query}", "type": "broker"},
    # Apollo Duck UK: use same apolloduck.com (no separate UK subdomain)
    {"name": "Apollo Duck UK", "category": "Reino Unido", "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1&country=GB", "type": "broker"},

    # ── France ────────────────────────────────────────────────────────────────
    # Annonces Bateau: 403 on requests — scrape_eu_broker uses Playwright
    {"name": "Annonces Bateau", "category": "Francia", "search_url": "https://www.annoncesbateau.com/bateaux/annonces-bateaux?keyword={query}", "type": "broker"},

    # ── Spain / Mediterranean ─────────────────────────────────────────────────
    # Inautia ES: 403 on requests — scrape_eu_broker uses Playwright
    {"name": "Inautia ES", "category": "Espana / Global", "search_url": "https://www.inautia.es/barca?q={query}", "type": "broker"},
    {"name": "Barcos.net", "category": "Espana / Global", "search_url": "https://www.barcos.net/busqueda/?q={query}", "type": "broker"},

    # ── Europe / Global ───────────────────────────────────────────────────────
    # YachtAll: 403 on requests — scrape_eu_broker uses Playwright
    {"name": "YachtAll", "category": "Clasificados EU", "search_url": "https://yachtall.com/yachts/?search={query}", "type": "broker"},

    # ── Brokers USA ───────────────────────────────────────────────────────────
    {"name": "HMY Yachts", "category": "Brokers USA", "search_url": "https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", "type": "broker"},
    {"name": "Denison Yachting", "category": "Brokers USA", "search_url": "https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", "type": "broker"},
    {"name": "BoatCrazy", "category": "Brokers USA", "search_url": "https://boatcrazy.com/boats?q={query}", "type": "classifieds"},
    # Galati Yachts: server-rendered WP site — scrape_galati uses requests
    {"name": "Galati Yachts", "category": "Brokers USA", "search_url": "https://www.galatiyachts.com/yachts-for-sale/?keywords={query}", "type": "broker"},
    {"name": "United Yacht Sales", "category": "Brokers USA", "search_url": "https://www.unitedyacht.com/yachts-for-sale/", "type": "broker"},
    # Worth Ave Yachts: hybrid server-rendered — scrape_luxury_broker uses Playwright
    {"name": "Worth Ave Yachts", "category": "Brokers USA", "search_url": "https://www.worthavenueyachts.com/yachts-for-sale/", "type": "broker"},

    # ── International brokers ─────────────────────────────────────────────────
    # Fraser Yachts: Vue/JS SPA — scrape_luxury_broker uses Playwright
    {"name": "Fraser Yachts", "category": "Brokers Internacional", "search_url": "https://www.fraseryachts.com/en/yachts-for-sale/", "type": "broker"},
    # Burgess Yachts: JS-loaded — scrape_luxury_broker uses Playwright
    {"name": "Burgess Yachts", "category": "Brokers Internacional", "search_url": "https://www.burgessyachts.com/en/yachts/sale/", "type": "broker"},
    # Northrop & Johnson: JS-loaded — scrape_luxury_broker uses Playwright
    {"name": "Northrop & Johnson", "category": "Brokers Internacional", "search_url": "https://www.njcharters.com/yachts-for-sale/", "type": "broker"},
    {"name": "Merle Wood", "category": "Brokers Internacional", "search_url": "https://www.merlewood.com/yachts-for-sale/", "type": "broker"},

    # ── Canada ────────────────────────────────────────────────────────────────
    {"name": "Kijiji Boats CA", "category": "Canada", "search_url": "https://www.kijiji.ca/b-boats/{query}/k0c132", "type": "classifieds"},
]

# Web search queries — finds listings on ANY site including blocked ones
# DuckDuckGo returns results from YachtWorld, Boats.com, Apollo Duck, etc.
# Base web search templates — {query} is replaced at runtime
# Dynamic templates also get price/loa filters appended when available
WEB_SEARCH_TEMPLATES = [
    '"{query}" boat for sale',
    '"{query}" sailboat for sale',
    '"{query}" vessel for sale',
    '"{query}" yacht for sale',
    '"{query}" barco venta',
    '"{query}" bateau vendre occasion',
    'site:yachtworld.com {query} for sale sail cruiser',
    'site:boats.com {query} sailboat for sale',
    'site:apolloduck.com {query} for sale',
    'site:rightboat.com {query} for sale',
    'site:boat24.com {query} for sale',
    'site:yachtall.com {query} sailboat',
    'site:annoncesbateau.com {query} voilier',
    'site:cruisersforum.com {query} for sale',
    'site:thehulltruth.com {query} for sale fs',
    'site:govplanet.com {query} vessel',
    'site:ironplanet.com {query} boat vessel',
    'site:govdeals.com {query} vessel boat',
    'site:publicsurplus.com {query} vessel',
    'site:hibid.com {query} boat',
    'site:copart.com {query} boat vessel',
    'site:rbauction.com {query} boat',
    '"{query}" boat auction government surplus',
    '"{query}" vessel auction salvage',
    # Salvage specific
    'site:salvex.com {query} marine vessel',
    'site:copart.com {query} boat salvage',
    'site:iaai.com {query} boat',
    'site:boatbreakers.com {query}',
    '"{query}" salvage boat for sale',
    '"{query}" insurance total loss boat',
    '"{query}" wrecked boat for sale parts',
    '"{query}" boat salvage title for sale',
    'site:seaboats.net {query}',
    'site:workboat.com {query} for sale',
    'site:commercialvessel.com {query}',
    # Offshore / commercial
    'site:osvbroker.com {query}',
    'site:marineclassifieds.com {query} for sale',
    'site:apolloduck.com {query} offshore tug barge',
    '"{query}" offshore supply vessel for sale',
    '"{query}" OSV for sale broker',
    '"{query}" crew boat for sale',
    '"{query}" workboat for sale',
    '"{query}" tug for sale',
    '"{query}" barge for sale',
    '"{query}" supply vessel for sale',
    '"{query}" fishing vessel for sale',
    '"{query}" commercial vessel for sale',
    # Australia / Pacific
    'site:tradeaboat.com.au {query} for sale',
    'site:boatpoint.com.au {query} for sale',
    # Europe classifieds
    'site:boatsandoutboards.co.uk {query} for sale',
    'site:annoncesbateau.com {query} voilier',
    'site:inautia.com {query} barco venta',
]

# Matches a price with the currency either before ("$45,000", "€ 12.500")
# or after ("45,000 USD", "12500€") the number.
_PRICE_RE = re.compile(
    r'(?:US\$|\$|€|£)\s*\d[\d,]*(?:\.\d+)?'
    r'|\d[\d,]*(?:\.\d+)?\s*(?:USD|EUR|GBP|CAD|\$|€|£)'
)


def _is_junk_image(src: str, skip_words) -> bool:
    """Return True when an image URL looks like site chrome (logo, ad, pixel...).

    Words longer than two characters are matched as substrings. Very short
    words such as "ad" are matched only as whole URL tokens (optionally
    pluralised) so that paths like "/uploads/" or "grady-white.jpg" are not
    rejected by accident.
    """
    src_l = src.lower()
    tokens = None
    for word in skip_words:
        if len(word) > 2:
            if word in src_l:
                return True
        else:
            if tokens is None:
                tokens = set(re.split(r'[^a-z0-9]+', src_l))
            if word in tokens or (word + "s") in tokens:
                return True
    return False


def _parse_feet(val):
    """Parse the leading number of a feet value like "30'" or "5.25'"; None if absent."""
    if not val:
        return None
    m = re.match(r'\s*(\d+(?:\.\d+)?)', val)
    return float(m.group(1)) if m else None


def _sbl_image_url(img_tag, sid: str) -> str:
    """Build a full-size SailboatListings image URL for listing `sid`.

    Uses the tag's src/data-src when present (made absolute and upgraded from
    the thumbnail/medium folder to the full-size folder); otherwise falls back
    to the conventional photo1.jpg path for the listing.
    """
    src = ""
    if img_tag is not None:
        src = img_tag.get("src", "") or img_tag.get("data-src", "") or ""
        if src and not src.startswith("http"):
            src = "https://www.sailboatlistings.com" + ("" if src.startswith("/") else "/") + src
        # Upgrade /sailimg/t/ (thumbnail) or /sailimg/m/ (medium) → /sailimg/ (full)
        for thumb in ("/sailimg/t/", "/sailimg/m/"):
            if thumb in src:
                src = src.replace(thumb, "/sailimg/")
                break
    if not src:
        src = f"https://www.sailboatlistings.com/sailimg/{sid}/photo1.jpg"
    return src


def build_web_queries(base_query: str, filters: dict) -> list:
    """Expand WEB_SEARCH_TEMPLATES into concrete search-engine queries.

    Templates are dropped when they are irrelevant to the search:
      * salvage templates unless the status filter or query mentions salvage,
      * offshore/commercial templates for clearly sail-only searches,
      * sailboat templates for offshore/salvage searches that are not sail.

    Free-text (non ``site:``) queries get " <ft>ft+" and " under $<price>"
    appended when ``min_loa`` (metres) / ``max_price`` filters are set.

    Args:
        base_query: user search text substituted for ``{query}``.
        filters: filter dict (keys read: max_price, min_loa, type, status);
            ``None`` is treated as no filters.

    Returns:
        Ordered list of unique query strings.
    """
    filters = filters or {}
    price_ctx = ""
    loa_ctx = ""
    if filters.get("max_price"):
        price_ctx = f" under ${filters['max_price']}"
    if filters.get("min_loa"):
        ft = int(float(filters["min_loa"]) / 0.3048)  # metres → feet
        loa_ctx = f" {ft}ft+"

    vtype = (filters.get("type", "") or "").lower()
    status = (filters.get("status", "") or "").lower()
    query_l = base_query.lower()

    # Categorize templates so we only include relevant ones
    SALVAGE_KWORDS = {"salvage", "copart", "iaai", "boatbreakers", "insurance",
                      "total loss", "wrecked", "salvage title"}
    OFFSHORE_KWORDS = {"workboat", "commercial", "osvbroker", "offshore", "osv",
                       "crew boat", "supply vessel", "tug", "barge", "fishing vessel"}
    SAIL_KWORDS = {"sailboat", "yachtall", "annoncesbateau", "voilier",
                   "cruisersforum", "sail cruiser"}

    is_salvage = status == "salvage" or "salvage" in query_l
    is_offshore = (vtype in {"offshore", "tug", "barge", "ferry", "fishing", "commercial"}
                   or any(k in query_l for k in ("tug", "barge", "osv", "crew boat", "workboat")))
    is_sail = (vtype in {"sailboat", "velero", "sail"}
               or any(k in query_l for k in ("sail", "velero", "ketch", "sloop")))

    queries = []
    emitted = set()  # the template list contains repeats; never search twice
    for tmpl in WEB_SEARCH_TEMPLATES:
        tmpl_l = tmpl.lower()
        # Skip salvage templates for non-salvage searches
        if any(k in tmpl_l for k in SALVAGE_KWORDS) and not is_salvage:
            continue
        # Skip offshore templates for clearly non-offshore searches (sailboat/velero)
        if any(k in tmpl_l for k in OFFSHORE_KWORDS) and is_sail and not is_offshore:
            continue
        # Skip sailboat templates for offshore/salvage searches
        if any(k in tmpl_l for k in SAIL_KWORDS) and (is_offshore or is_salvage) and not is_sail:
            continue

        q = tmpl.replace("{query}", base_query)
        if not q.startswith("site:") and (price_ctx or loa_ctx):
            q += loa_ctx + price_ctx
        if q in emitted:
            continue
        emitted.add(q)
        queries.append(q)
    return queries


# Search engines queried by web_search: URL template plus the CSS selectors
# for result links and their snippets.
SEARCH_ENGINES = [
    {
        "name": "DuckDuckGo",
        "url": "https://html.duckduckgo.com/html/?q={query}",
        "link_sel": "a.result__a",
        "snippet_sel": "a.result__snippet",
    },
    {
        "name": "Bing",
        "url": "https://www.bing.com/search?q={query}&count=20",
        "link_sel": "h2 a",
        "snippet_sel": ".b_caption p",
    },
]


def web_search(query: str, max_results: int = 8) -> list[dict]:
    """Search the configured web engines and return up to `max_results` hits.

    Engines are tried in SEARCH_ENGINES order until enough results are
    collected. Each hit is a dict with url/title/snippet plus empty
    price/img/location fields; `source` is the hit's host name.
    An engine that fails is logged and skipped (best effort).
    """
    results = []
    seen = set()
    skip = ["google.", "bing.", "duckduckgo.", "yahoo.", "wikipedia.", "youtube.",
            "facebook.com/login", "instagram.", "twitter.", "linkedin.",
            "pinterest.", "reddit.com/r/", ".pdf", "amazon.com/s?"]

    for engine in SEARCH_ENGINES:
        try:
            url = engine["url"].format(query=requests.utils.quote(query))
            time.sleep(1.0)
            # NOTE(review): verify=False disables TLS certificate checks — confirm this is intended.
            r = requests.get(url, headers=get_headers(), timeout=20, verify=False)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            links = soup.select(engine["link_sel"])
            snippets = soup.select(engine["snippet_sel"])

            for i, link in enumerate(links[:max_results * 2]):
                href = link.get("href", "")
                # Clean DDG redirect
                if "duckduckgo.com" in href:
                    m = re.search(r'uddg=([^&]+)', href)
                    if m:
                        href = requests.utils.unquote(m.group(1))
                if not href.startswith("http"):
                    continue
                if any(s in href for s in skip):
                    continue
                if href in seen:
                    continue
                seen.add(href)
                title = link.get_text(strip=True)
                # Snippets are paired with links by position in the result page.
                snippet = snippets[i].get_text(strip=True) if i < len(snippets) else ""
                try:
                    source = href.split("/")[2].replace("www.", "")
                except IndexError:
                    source = "web"
                results.append({
                    "url": href, "title": title, "snippet": snippet,
                    "price_text": "", "img_url": "",
                    "location": "", "source": source,
                    "source_type": "broker", "category": "Web Search"
                })
                if len(results) >= max_results:
                    break
        except Exception as e:
            # Best effort: one failing engine must not abort the search.
            print(f"[WebSearch] {engine.get('name', '?')} error: {e}")
        if len(results) >= max_results:
            break
    return results



def scrape_direct_source(source: dict, query: str, filters: dict = None) -> list[dict]:
    """Heuristic scraper for a source's search page — no per-site CSS selectors.

    Fetches ``source["search_url"]`` with the query substituted, collects the
    first anchors on the page, scores each one by how much it looks like a
    vessel listing (price nearby, numeric ID, year/size in title, ...) and
    returns the best candidates.

    Args:
        source: source definition (keys read: search_url, name, type,
            category, supports_filters).
        query: search text; repeated words are removed before use.
        filters: optional filter dict (min_loa in metres, min_price, max_price).

    Returns:
        List of result dicts (at most 20); empty on HTTP failure or error.
    """
    from urllib.parse import urljoin

    if filters is None:
        filters = {}
    results = []
    try:
        # Build URL — expand filter placeholders if source supports them
        raw_url = source["search_url"]
        if source.get("supports_filters"):
            min_loa_m = float(filters.get("min_loa") or 0)
            max_price = filters.get("max_price") or ""
            min_price = filters.get("min_price") or ""
            loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else ""
            loa_max_ft = ""  # no max LOA filter in current UI
            raw_url = raw_url.replace("{loa_min_ft}", str(loa_min_ft))
            raw_url = raw_url.replace("{loa_max_ft}", str(loa_max_ft))
            raw_url = raw_url.replace("{price_min}", str(min_price))
            raw_url = raw_url.replace("{price_max}", str(max_price))
        # Blank any filter placeholder still present so it never reaches the site literally.
        raw_url = re.sub(r'\{(?:loa_min_ft|loa_max_ft|price_min|price_max)\}', '', raw_url)

        # Clean query — drop repeated words ("for sale for sale" → "for sale")
        clean_q = " ".join(dict.fromkeys(query.split()))
        # Plain replace (not str.format) so other braces in the URL cannot raise.
        url = raw_url.replace("{query}", requests.utils.quote(clean_q))
        time.sleep(1.0)
        domain = url.split('/')[2]
        headers = get_headers(referer=f"https://{domain}/")
        r = requests.get(url, headers=headers, timeout=25, verify=False)

        # Retry with different UA if blocked
        if r.status_code in [403, 429, 503]:
            time.sleep(2)
            headers = get_headers()
            r = requests.get(url, headers=headers, timeout=25, verify=False)

        if r.status_code not in [200, 206]:
            print(f"[{source['name']}] HTTP {r.status_code}")
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "meta", "link"]):
            tag.decompose()

        base_url = "/".join(url.split("/")[:3])
        raw_links = []
        skip_words = ["login", "register", "signup", "about", "contact", "help",
                      "privacy", "terms", "facebook.com", "twitter.com", "instagram.com"]

        for a in soup.find_all("a", href=True)[:80]:
            href = a["href"].strip()
            if not href or href.startswith("#") or href.startswith("javascript"):
                continue
            if not href.startswith("http"):
                # Resolves "/path", "path" and protocol-relative "//host/path" correctly.
                href = urljoin(url, href)
                if not href.startswith("http"):
                    continue  # mailto:, tel:, ...
            if any(s in href.lower() for s in skip_words):
                continue
            text = a.get_text(strip=True)[:150]
            parent = a.find_parent()
            price = ""
            img = ""
            if parent:
                ptxt = parent.get_text(" ", strip=True)
                pm = _PRICE_RE.search(ptxt)
                if pm:
                    price = pm.group()
                # Traverse up to 4 levels to find a thumbnail image
                node = parent
                for _ in range(4):
                    if node is None:
                        break
                    im = node.find("img")
                    if im:
                        src = _extract_best_src(im)
                        if src:
                            # Convert relative to absolute
                            if src.startswith("//"):
                                src = "https:" + src
                            elif src.startswith("/"):
                                src = base_url + src
                            if src.startswith("http") and len(src) > 20:
                                img = src
                                break
                    node = node.parent
            if text and len(text) > 8:
                raw_links.append({"url": href, "title": text, "price": price, "img": img})

        if not raw_links:
            print(f"[{source['name']}] No links found")
            return []

        seen = set()
        unique = []
        for lnk in raw_links:
            if lnk["url"] not in seen:
                seen.add(lnk["url"])
                unique.append(lnk)

        # ── Heuristic listing filter (no AI needed) ──────────────────────────
        # Score each link — higher = more likely to be an actual vessel listing
        BOAT_KW = ["boat", "yacht", "vessel", "sail", "ketch", "sloop", "cutter", "schooner",
                   "yawl", "catamaran", "trimaran", "motor", "tug", "barge", "cruiser", "skiff",
                   "fishing", "trawler", "offshore", "cabin", "dinghy", "pontoon", "runabout"]

        def listing_score(lnk):
            url_l = lnk["url"].lower()
            title_l = lnk["title"].lower()
            sc = 0
            if lnk["price"]: sc += 4  # price is strong signal
            if lnk["img"]: sc += 1  # has photo
            if re.search(r'/\d{5,}', url_l): sc += 3  # 5+ digit ID
            if re.search(r'/(view|detail|listing|item|vessel|boat|ship|for-sale)[-/]', url_l): sc += 2
            if re.search(r'-for-sale[/-]?$', url_l): sc += 2
            if re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title_l): sc += 3  # year in title
            if re.search(r'\d{2,3}\s*(?:\'|ft|feet|meter)', title_l): sc += 2  # size
            if any(k in title_l for k in BOAT_KW): sc += 1
            if re.search(r'\b(for sale|en vente|vendre|en venta)\b', title_l): sc += 1
            if len(lnk["title"]) > 15: sc += 1  # nav links are short
            return sc

        scored = [(listing_score(lnk), lnk) for lnk in unique[:30]]
        scored.sort(key=lambda x: x[0], reverse=True)

        # Keep links with score >= 3, or fall back to top-5 if nothing qualifies
        good = [lnk for sc, lnk in scored if sc >= 3]
        if not good:
            good = [lnk for _, lnk in scored[:5]]  # best guesses from this source

        for lnk in good[:20]:
            results.append({
                "url": lnk["url"],
                "title": lnk["title"],
                "snippet": f"Price: {lnk['price']}",
                "price_text": lnk["price"],
                "img_url": lnk["img"],
                "location": "",
                "source": source["name"],
                "source_type": source["type"],
                "category": source["category"],
            })

        print(f"[{source['name']}] {len(results)} listings found")
    except Exception as e:
        print(f"[{source['name']}] Error: {e}")
    return results


# Interleave queue for polite scraping
_interleave_lock = threading.Lock()
_interleave_sites = [
    "https://miami.craigslist.org",
    "https://www.seaboats.net",
    "https://www.barcos.net",
    "https://www.ebay.com",
    "https://boston.craigslist.org",
    "https://seattle.craigslist.org",
]
_interleave_idx = 0

def polite_pause(source_name: str):
    """
    Pause between pages of the same site.

    Picks the next site from _interleave_sites (round-robin, under
    _interleave_lock), issues one GET to it (errors ignored), then sleeps a
    random 2-5 seconds before returning.
    """
    global _interleave_idx
    with _interleave_lock:
        site = _interleave_sites[_interleave_idx % len(_interleave_sites)]
        _interleave_idx += 1
    try:
        requests.get(site, headers=get_headers(), timeout=5, verify=False)
    except Exception:
        pass  # deliberate best effort: the interleaved request's outcome is irrelevant
    # Random human-like delay: 2-5 seconds
    time.sleep(random.uniform(2.0, 5.0))
    print(f"[{source_name}] Polite pause done — continuing...")

def scrape_sailboatlistings(query: str, filters: dict, max_pages: int = 8) -> list[dict]:
    """
    Multi-page scraper for SailboatListings.com.
    Captures MAIN listings (sailboat=XXXXX) with full structured data,
    plus SIDEBAR featured listings (/view/XXXXX) as bonus.

    Args:
        query: keyword sent to the site search.
        filters: filter dict (keys read: min_loa/max_loa in metres, max_price,
            type, hull, year_min, year_max). Missing or None values are
            treated as "no filter".
        max_pages: maximum number of result pages to request.

    Returns:
        List of result dicts; main listings carry their label/value pairs
        under "fields", sidebar listings carry an empty "fields".
    """
    results = []
    seen_urls = set()

    min_loa_m = float(filters.get("min_loa") or 0)
    max_loa_m = float(filters.get("max_loa") or 0)
    max_price = filters.get("max_price") or ""
    loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else ""
    loa_max_ft = int(max_loa_m / 0.3048) if max_loa_m else ""

    vessel_type = (filters.get("type", "") or "").lower()
    sbl_type_map = {
        "sailboat": "Sail", "sail": "Sail",
        "yacht": "cruiser",
        "motor": "powerboat", "motorboat": "powerboat",
        "fishing": "fishing",
        "tug": "", "barge": "", "offshore": "", "ferry": "", "commercial": "",
    }
    # Default "" → search ALL types on SailboatListings
    sbl_type = sbl_type_map.get(vessel_type, "")
    hull = (filters.get("hull", "") or "").lower()
    sbl_hull_map = {
        "fiberglass": "fiberglass", "steel": "steel",
        "aluminum": "aluminum", "wood": "wood",
    }
    sbl_material = sbl_hull_map.get(hull, "")

    # "---" is sent when no year bound is set
    year_min = filters.get("year_min") or "---"
    year_max = filters.get("year_max") or "---"

    base_url = (
        "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi"
        "?db=default&uid=default&sb=33&so=descend&websearch=1"
        f"&manufacturer=&model="
        f"&length-gt={loa_min_ft}&length-lt={loa_max_ft}"
        f"&year-lt={year_max}&year-gt={year_min}&price-lt={max_price}"
        f"&type={sbl_type}&material={sbl_material}&hull=&state="
        f"&keyword={requests.utils.quote(query)}"
        f"&view_records=+Show+Matching+Boats+"
    )

    for page in range(1, max_pages + 1):
        if page > 1:
            polite_pause("SailboatListings")

        try:
            url = base_url if page == 1 else base_url + f"&nh={page}"
            r = requests.get(url, headers=get_headers(), timeout=25, verify=False)

            if r.status_code == 429:
                print(f"[SailboatListings] Rate limited on page {page} — stopping")
                break
            if r.status_code != 200:
                print(f"[SailboatListings] Page {page} HTTP {r.status_code}")
                break

            soup = BeautifulSoup(r.text, "html.parser")
            body_text = soup.get_text()

            if "no records" in body_text.lower() or "0 matches" in body_text.lower():
                print(f"[SailboatListings] No more results at page {page}")
                break

            page_results = 0

            # ── MAIN LISTINGS (sailboat=XXXXX) — full structured data ──
            for header_link in soup.find_all("a", class_="sailheader"):
                href = header_link.get("href", "")
                m = re.search(r'sailboat=(\d+)', href)
                if not m:
                    continue
                sid = m.group(1)
                canonical = f"https://www.sailboatlistings.com/view/{sid}"
                if canonical in seen_urls:
                    continue
                seen_urls.add(canonical)

                title = header_link.get_text(strip=True)

                # Parent table contains all structured sailvb/sailvk spans
                listing_table = header_link.find_parent("table")
                if not listing_table:
                    continue

                # Extract structured fields (label span → following value span)
                fields = {}
                for label_span in listing_table.find_all("span", class_="sailvb"):
                    label = label_span.get_text(strip=True).rstrip(":").strip()
                    value_span = label_span.find_next("span", class_="sailvk")
                    if value_span:
                        fields[label] = value_span.get_text(strip=True)

                price_text = fields.get("Asking", "")
                location = fields.get("Location", "")

                # Build context string from structured fields
                context = " | ".join(f"{k}: {v}" for k, v in fields.items())

                # Extract image — upgrade thumbnail to full-size
                img_src = _sbl_image_url(listing_table.find("img"), sid)

                results.append({
                    "url": canonical,
                    "title": title or context[:80],
                    "snippet": context,
                    "price_text": price_text,
                    "img_url": img_src,
                    "location": location,
                    "source": "SailboatListings",
                    "source_type": "broker",
                    "category": "Veleros Global",
                    "fields": fields,  # pass structured fields for direct extraction
                })
                page_results += 1

            # ── SIDEBAR FEATURED (/view/XXXXX) — less data but more listings ──
            for a in soup.find_all("a", class_="featured"):
                href = a.get("href", "")
                view_m = re.search(r'/view/(\d+)', href)
                if not view_m:
                    continue
                sid = view_m.group(1)
                canonical = f"https://www.sailboatlistings.com/view/{sid}"
                if canonical in seen_urls:
                    continue
                seen_urls.add(canonical)

                link_text = a.get_text(" ", strip=True)
                # Extract price from link text: "45' Alden 45 Falmouth, Maine Asking $355,000"
                price_m = re.search(r'Asking\s*\$([\d,]+)', link_text)
                price_text = f"${price_m.group(1)}" if price_m else ""

                # Extract location from featurespec span
                spec_span = a.find("span", class_="featurespec")
                location = ""
                if spec_span:
                    spec_text = spec_span.get_text(" ", strip=True)
                    # Location is before "Asking"
                    loc_m = re.search(r'^(.+?)\s*Asking', spec_text)
                    if loc_m:
                        location = loc_m.group(1).strip()

                img_src = _sbl_image_url(a.find("img"), sid)

                results.append({
                    "url": canonical,
                    "title": link_text.split("Asking")[0].strip() if "Asking" in link_text else link_text,
                    "snippet": link_text,
                    "price_text": price_text,
                    "img_url": img_src,
                    "location": location,
                    "source": "SailboatListings",
                    "source_type": "broker",
                    "category": "Veleros Global",
                    "fields": {},  # no structured fields for sidebar listings
                })
                page_results += 1

            print(f"[SailboatListings] Page {page}: {page_results} listings (total: {len(results)})")
            if page_results == 0:
                break

        except Exception as e:
            print(f"[SailboatListings] Error page {page}: {e}")
            break

    print(f"[SailboatListings] Done — {len(results)} listings total")
    return results

def scrape_and_extract_sailboatlistings(query: str, filters: dict, search_id: str, max_pages: int = 8):
    """
    Runs SailboatListings scraping + field extraction inline.
    Saves each vessel via save_vessel() as soon as it is parsed and updates
    search_state ('found' counter and 'log').

    Extraction is purely rule-based: structured fields when the listing has
    them, regexes over snippet/title otherwise. Stops early when search_state
    shows a different search_id or the 'cancelled' flag.
    """
    print(f"[SBL-Thread] Starting SailboatListings extraction...")
    raw_results = scrape_sailboatlistings(query, filters, max_pages)

    if not raw_results:
        print("[SBL-Thread] No results from SailboatListings")
        return

    sbl_min_loa = float(filters.get("min_loa") or 0)
    sbl_max_price = float(filters.get("max_price") or 0)
    saved = 0
    num = r'(\d+(?:\.\d+)?)'  # one decimal number; never matches a bare "."

    for raw in raw_results:
        if search_state.get('search_id') != search_id or search_state.get('cancelled'):
            print("[SBL-Thread] Search cancelled — stopping")
            return

        try:
            snippet = raw.get("snippet", "")
            title = raw.get("title", "")
            fields = raw.get("fields", {})  # structured fields from main listings
            src = snippet + " " + title

            # ── Use structured fields directly when available (main listings) ──
            if fields:
                loa_ft = _parse_feet(fields.get("Length"))
                beam_ft = _parse_feet(fields.get("Beam"))
                draft_ft = _parse_feet(fields.get("Draft"))
                year_m = re.search(r'(\d{4})', fields.get("Year", ""))
                price_r = re.search(r'\$\s*([\d,]{3,})', fields.get("Asking", ""))
                location = fields.get("Location", "")
                hull_val = fields.get("Hull", "").lower()
            else:
                # Fallback: regex for sidebar/featured listings
                length_r = re.search(r'Length:\s*' + num, src, re.IGNORECASE)
                beam_r = re.search(r'Beam:\s*' + num, src, re.IGNORECASE)
                draft_r = re.search(r'Draft:\s*' + num, src, re.IGNORECASE)
                year_m = re.search(r'Year:\s*(\d{4})', src, re.IGNORECASE)
                price_r = re.search(r'(?:Asking|Price):?\s*\$\s*([\d,]{3,})', src, re.IGNORECASE)
                if not price_r:
                    price_r = re.search(r'\$\s*([\d,]{4,})', src)
                loa_ft = float(length_r.group(1)) if length_r else None
                beam_ft = float(beam_r.group(1)) if beam_r else None
                draft_ft = float(draft_r.group(1)) if draft_r else None
                location = raw.get("location", "")
                hull_val = ""
                loc_r = re.search(r'Location:\s*([^\n\r]{3,60}?)(?:\s{2,}|$)', src, re.IGNORECASE)
                if loc_r:
                    location = loc_r.group(1).strip()
                hull_r2 = re.search(r'Hull:\s*([^\n\r]{3,50}?)(?:\s{2,}|$)', src, re.IGNORECASE)
                if hull_r2:
                    hull_val = hull_r2.group(1).lower()

            # Fallback: extract LOA from title e.g. "35' Pearson 35"
            if not loa_ft:
                tm = re.search(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', title, re.IGNORECASE)
                if tm:
                    loa_ft = float(tm.group(1))
            loa_m = round(loa_ft * 0.3048, 1) if loa_ft else None
            beam_m = round(beam_ft * 0.3048, 1) if beam_ft else None
            draft_m = round(draft_ft * 0.3048, 1) if draft_ft else None
            year = int(year_m.group(1)) if year_m else None
            location = location or raw.get("location", "")

            price_usd = None
            if price_r:
                try:
                    price_usd = float(price_r.group(1).replace(",", ""))
                except ValueError:
                    pass  # e.g. ",,," — treat as no price
            if not price_usd and raw.get("price_text"):
                pm = re.search(r'[\d,]+', raw["price_text"].replace("$", ""))
                if pm:
                    try:
                        price_usd = float(pm.group().replace(",", ""))
                    except ValueError:
                        pass

            # Skip only if absolutely no data
            if not loa_m and not year and not price_usd:
                continue

            # Apply filters (small tolerances absorb ft↔m rounding)
            if sbl_min_loa and loa_m and loa_m < (sbl_min_loa - 0.15):
                continue
            if sbl_max_price and price_usd and price_usd > sbl_max_price * 1.01:
                continue

            # Hull normalisation
            hull_txt = hull_val
            hull = ("Fiberglass" if "fiber" in hull_txt or "glass" in hull_txt else
                    "Steel" if "steel" in hull_txt else
                    "Aluminum" if "alum" in hull_txt else
                    "Wood" if "wood" in hull_txt else
                    "Composite" if "comp" in hull_txt else "Unknown")

            # Algorithmic score (fast, no AI)
            score = 50
            if loa_m:
                score += min(15, int((loa_m - 13) * 1.5)) if loa_m >= 13 else 0
            if year:
                score += min(10, max(0, (year - 1980) // 3))
            price_per_ft = (price_usd / (loa_m / 0.3048)) if (price_usd and loa_m) else None
            if price_per_ft is not None:
                if price_per_ft < 500: score += 15
                elif price_per_ft < 1000: score += 8

            flags = []
            if price_per_ft is not None and price_per_ft < 600:
                flags.append("below_market")

            # Only mention what is actually known (no "LOA: Noneft").
            desc_parts = [f"Velero {title}"]
            if loa_ft:
                desc_parts.append(f"LOA: {loa_ft}ft")
            if location:
                desc_parts.append(location)

            data = {
                "name": title or "SailboatListings boat",
                "vessel_type": "Sailboat",
                "loa_m": loa_m,
                "beam_m": beam_m,
                "draft_m": draft_m,
                "year_built": year,
                "hull": hull,
                "propulsion": "Sail",
                "status": "active",
                "price_usd": price_usd,
                "currency": "USD",
                "location": location,
                "country": "US",
                "description": ". ".join(desc_parts),
                "flags": flags,
                "score": min(100, score),
                "images": [raw["img_url"]] if raw.get("img_url") else [],
                "source_url": raw["url"],
                "source_name": "SailboatListings",
            }

            vid = save_vessel(data)
            if vid > 0:
                search_state['found'] += 1
                saved += 1
                details = []
                if loa_ft:
                    details.append(f"{loa_ft}ft")
                if price_usd:
                    details.append(f"${price_usd:,.0f}")
                suffix = f" ({', '.join(details)})" if details else ""
                msg = f"✓ {title}{suffix} — SailboatListings"
                print(f"[SBL-Thread] {msg}")
                search_state['log'].append(msg)

        except Exception as e:
            print(f"[SBL-Thread] Error on {raw.get('title','?')}: {e}")

    print(f"[SBL-Thread] Done — {saved}/{len(raw_results)} vessels saved")

def stealth_fetch(url: str, max_chars: int = 3000) -> tuple:
    """
    Fetch a page with headless Chromium (Playwright) instead of plain requests.
    Returns (text, [image_urls]); ("", []) on any error.

    Steps: open the page with a desktop viewport and a random user agent,
    wait, click a cookie-accept button if one is found, scroll twice, then
    parse the rendered HTML. At most 12 images are returned and text is
    truncated to `max_chars`.
    """
    text = ""
    images = []
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            # NOTE(review): --disable-web-security / --no-sandbox weaken browser
            # isolation while loading third-party pages — confirm these are required.
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                ]
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1366, 'height': 768},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    java_script_enabled=True,
                    ignore_https_errors=True,
                    extra_http_headers={
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                    }
                )
                # Remove webdriver property
                context.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                    window.chrome = {runtime: {}};
                """)
                page = context.new_page()

                # Navigate with realistic timeout
                page.goto(url, timeout=30000, wait_until='domcontentloaded')

                # Random delay like a human reading
                page.wait_for_timeout(random.randint(1500, 3000))

                # Accept cookies if button exists
                for selector in ['button:has-text("Accept")', 'button:has-text("Accept All")',
                                 '#onetrust-accept-btn-handler', '.cookie-accept']:
                    try:
                        page.click(selector, timeout=1000)
                        page.wait_for_timeout(500)
                        break
                    except Exception:
                        pass  # selector not present on this page — try the next one

                # Scroll down naturally
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(500, 1200))
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(300, 800))

                # Get content
                html = page.content()
            finally:
                # Close the browser on every path, including navigation timeouts.
                browser.close()

        from bs4 import BeautifulSoup as BS
        soup = BS(html, 'html.parser')

        # Extract images — check all lazy-load attributes
        skip_words = ['logo', 'icon', 'avatar', 'banner', 'pixel', 'sprite', 'ad',
                      'placeholder', 'blank', 'loading', 'spacer', '1x1']
        seen_imgs = set()
        for img in soup.find_all('img'):
            src = _extract_best_src(img)
            if src and src not in seen_imgs:
                if not _is_junk_image(src, skip_words):
                    seen_imgs.add(src)
                    images.append(src)
                    if len(images) >= 12:
                        break

        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            tag.decompose()
        text = ' '.join(soup.get_text(' ', strip=True).split())[:max_chars]

    except Exception as e:
        print(f"[Stealth] Error: {e}")
    return text, images


# Sites that need stealth scraping (Cloudflare protected)
STEALTH_REQUIRED = [
    'yachtworld.com', 'boats.com', 'boattrader.com',
    'rightboat.com', 'boat24.com', 'yachtall.com',
    'botentekoop.nl', 'leboncoin.fr', 'annoncesbateau.com',
    'thehulltruth.com', 'cruisersforum.com',
]

def smart_fetch(url: str, max_chars: int = 3000) -> tuple:
    """Use stealth for protected sites, regular fetch for others.

    A URL needs stealth when its host is one of STEALTH_REQUIRED or a
    subdomain of one. Returns (text, [image_urls]); `max_chars` caps the
    text length on both paths.
    """
    domain = url.split('/')[2].replace('www.', '') if '//' in url else ''
    # Exact host or subdomain match — a plain substring test would also hit
    # unrelated hosts such as "sailboats.com" for "boats.com".
    needs_stealth = any(domain == s or domain.endswith('.' + s) for s in STEALTH_REQUIRED)
    if needs_stealth:
        print(f"[Fetch] Using stealth for {domain}")
        return stealth_fetch(url, max_chars)
    else:
        return fetch_page_with_images(url, max_chars)


def scrape_yachtworld(query: str, filters: dict, max_pages: int = 5) -> list:
    """
    Dedicated YachtWorld scraper (Playwright).
    Builds a filtered browse URL from `filters` (type, min_loa in metres,
    max_price) and collects listing links from up to `max_pages` pages.

    `query` is not used: results come from the filtered browse URL only.
    Stops early when search_state['cancelled'] is set, when a page yields no
    listings, or on the first page error.
    """
    results = []
    seen = set()

    # Build YachtWorld filtered URL
    vessel_type = (filters.get("type", "") or "").lower()
    yw_type = "sail" if vessel_type in ["sailboat", "sail", "velero", "yacht", ""] else "power"
    min_loa = filters.get("min_loa", "")
    max_price = filters.get("max_price", "")

    base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/"
    if vessel_type in ["sailboat", "sail", "velero", ""]:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"
    if min_loa:
        ft = int(float(min_loa) / 0.3048)
        base_url += f"length-{ft}/"
    if max_price:
        base_url += f"price-0,{max_price}/"

    print(f"[YachtWorld] Scraping: {base_url}")

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled', '--no-sandbox']
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
                    "window.chrome = {runtime: {}};"
                )

                for page_num in range(1, max_pages + 1):
                    if search_state.get('cancelled'):
                        break

                    page_url = base_url if page_num == 1 else base_url + f"?page={page_num}"
                    page = context.new_page()
                    try:
                        page.goto(page_url, timeout=30000, wait_until='domcontentloaded')
                        page.wait_for_timeout(random.randint(2000, 4000))

                        # Scroll to load lazy content
                        for _ in range(3):
                            page.evaluate("window.scrollBy(0, 400)")
                            page.wait_for_timeout(random.randint(400, 800))

                        html = page.content()
                        page.close()

                        from bs4 import BeautifulSoup as BS
                        soup = BS(html, 'html.parser')

                        # YachtWorld listing cards
                        page_count = 0
                        for a in soup.find_all('a', href=True):
                            href = a['href']
                            if '/boat-details/' in href or '/yacht/' in href:
                                if not href.startswith('http'):
                                    href = 'https://www.yachtworld.com' + href
                                if href in seen:
                                    continue
                                seen.add(href)
                                title = a.get_text(strip=True)
                                parent = a.find_parent() or a
                                ctx = parent.get_text(' ', strip=True)[:300]
                                img = ""
                                for im in parent.find_all('img'):
                                    src = im.get('src') or im.get('data-src', '')
                                    if src and 'rendered_yacht' in src:
                                        img = src
                                        break
                                results.append({
                                    "url": href, "title": title,
                                    "snippet": ctx, "price_text": "",
                                    "img_url": img, "location": "",
                                    "source": "YachtWorld",
                                    "source_type": "broker",
                                    "category": "Brokers Especializados",
                                })
                                page_count += 1

                        print(f"[YachtWorld] Page {page_num}: {page_count} listings")
                        if page_count == 0:
                            break

                        # Polite pause between pages
                        if page_num < max_pages:
                            polite_pause("YachtWorld")

                    except Exception as e:
                        print(f"[YachtWorld] Page {page_num} error: {e}")
                        try:
                            page.close()
                        except Exception:
                            pass  # page may already be closed
                        break
            finally:
                # Close the browser even if context creation or a page fails.
                browser.close()
    except Exception as e:
        print(f"[YachtWorld] Fatal error: {e}")

    print(f"[YachtWorld] Total: {len(results)} listings")
    return results

def fetch_page_text(url: str, max_chars: int = 2000) -> str:
    """Fetch a page and return its visible text, whitespace-collapsed and
    truncated to `max_chars`. Returns "" on a non-200 response or any error."""
    try:
        r = requests.get(url, headers=get_headers(), timeout=15, verify=False)
        if r.status_code != 200:
            return ""
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
            tag.decompose()
        return " ".join(soup.get_text(" ", strip=True).split())[:max_chars]
    except Exception:
        return ""

def _extract_best_src(img_tag) -> str:
    """Extract the best image URL from an <img> tag, handling lazy-load patterns.

    Candidates are tried in order: src, the common data-* lazy-load
    attributes, then srcset/data-srcset entries (largest declared width or
    density first). The first absolute http(s) URL wins; protocol-relative
    "//host/..." values are returned as "https://host/...".
    Returns "" when no usable URL is found.
    """
    candidates = [
        img_tag.get("src", ""),
        img_tag.get("data-src", ""),
        img_tag.get("data-lazy-src", ""),
        img_tag.get("data-original", ""),
        img_tag.get("data-lazy", ""),
        img_tag.get("data-image", ""),
        img_tag.get("data-full", ""),
        img_tag.get("data-url", ""),
        img_tag.get("data-hi-res-src", ""),
    ]
    # Also check srcset — take the largest variant
    srcset = img_tag.get("srcset", "") or img_tag.get("data-srcset", "")
    if srcset:
        entries = []
        for part in srcset.split(","):
            bits = part.strip().split()
            if not bits:
                continue
            # Descriptor is "<n>w" (width) or "<n>x" (density); missing → 0
            dm = re.match(r'(\d+(?:\.\d+)?)[wx]$', bits[1]) if len(bits) > 1 else None
            entries.append((float(dm.group(1)) if dm else 0.0, bits[0]))
        entries.sort(key=lambda e: e[0], reverse=True)  # stable: ties keep page order
        candidates.extend(u for _, u in entries)
    for c in candidates:
        if not isinstance(c, str):
            continue
        c = c.strip()
        if c.startswith("//"):
            c = "https:" + c
        if c.startswith("http"):
            return c
    return ""

def fetch_page_with_images(url: str, max_chars: int = 3000) -> tuple:
    """Fetch page text AND images. Returns (text, [image_urls])

    At most 10 images are returned; images whose URL looks like site chrome
    or whose width attribute is under 100 are skipped. Text is truncated to
    `max_chars`. On a non-200 response or any error, falls back to
    fetch_page_text(url) with no images.
    """
    text = ""
    images = []
    base_url = "/".join(url.split("/")[:3])
    try:
        r = requests.get(url, headers=get_headers(referer=url), timeout=18, verify=False)
        if r.status_code != 200:
            return fetch_page_text(url), []
        soup = BeautifulSoup(r.text, "html.parser")
        # Extract images before stripping tags
        skip_words = ["logo", "icon", "avatar", "banner", "pixel", "track", "ad", "sprite", "button",
                      "placeholder", "blank", "loading", "spacer", "1x1", "transparent"]
        seen_imgs = set()
        for img in soup.find_all("img"):
            src = _extract_best_src(img)
            if not src:
                continue
            # Normalise relative URLs
            if src.startswith("//"):
                src = "https:" + src
            elif src.startswith("/"):
                src = base_url + src
            if not src.startswith("http"):
                continue
            if _is_junk_image(src, skip_words):
                continue
            if src in seen_imgs:
                continue
            try:
                w = int(str(img.get("width", "0")).replace("px", "") or 0)
                if 0 < w < 100:
                    continue
            except ValueError:
                pass  # non-numeric width (e.g. "100%") — keep the image
            seen_imgs.add(src)
            images.append(src)
            if len(images) >= 10:
                break
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
            tag.decompose()
        text = " ".join(soup.get_text(" ", strip=True).split())[:max_chars]
    except Exception:
        text = fetch_page_text(url)
    return text, images

# ══════════════════════════════════════════════════════════════════════════════
# DEDICATED SOURCE SCRAPERS
# Each function handles one site's quirks. scrape_source_router dispatches here.
+# ══════════════════════════════════════════════════════════════════════════════ + +def scrape_ebay(src: dict, query: str, filters: dict) -> list[dict]: + """ + eBay Marine scraper — uses Playwright (Akamai blocks plain requests). + Handles all eBay entries: Marine, Auction, Sail, Salvage, etc. + + New eBay layout (2024+) uses: + - for item links + - Text title in nearby spans/divs + - with i.ebayimg.com CDN URLs (s-l500 quality) + """ + results = [] + seen = set() + + raw_url = src.get("search_url", "") + if not raw_url: + return [] + + clean_q = " ".join(dict.fromkeys(query.strip().split())) + url = raw_url.replace("{query}", requests.utils.quote(clean_q)) + + # ── Adjust eBay category based on vessel type filter ────────────────────── + # 26429=All Boats 36431=Sailboats 36432=Powerboats 26430=PWC 63613=Kayaks + vtype = filters.get("type","").lower() if filters else "" + EBAY_CAT = { + "sailboat": "36431", "sail": "36431", "velero": "36431", + "motor": "36432", "motorboat": "36432", "yacht": "36432", + "fishing": "36432", "tug": "36432", "barge": "36432", + "offshore": "36432", "ferry": "36432", + } + if vtype and vtype in EBAY_CAT: + url = re.sub(r'_sacat=\d+', f'_sacat={EBAY_CAT[vtype]}', url) + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled", + "--no-sandbox", "--disable-dev-shm-usage"] + ) + context = browser.new_context( + viewport={"width": 1280, "height": 900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", + timezone_id="America/New_York", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(url, timeout=30000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(1500, 2500)) + # Scroll a bit to trigger lazy images + 
page.evaluate("window.scrollBy(0,600)") + page.wait_for_timeout(800) + + html = page.content() + except Exception as e: + print(f"[{src['name']}] Playwright nav error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + return [] + + soup = BeautifulSoup(html, "html.parser") + + # ── New layout (2024+): li.s-card ───────────────────────────────────── + cards = soup.find_all("li", class_="s-card") + + # ── Old layout fallback: li.s-item ──────────────────────────────────── + if not cards: + return _parse_ebay_old_layout(soup, src) + + for card in cards: + try: + # Title + URL — a.s-card__link WITHOUT image-treatment class + title_link = None + for a in card.find_all("a", class_="s-card__link"): + if "image-treatment" in (a.get("class") or []): + continue + t = a.get_text(strip=True) + if t and not t.lower().startswith("shop on ebay"): + title_link = a + break + if not title_link: + continue + + href = title_link.get("href", "") + if "/itm/" not in href: + continue + m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href) + if not m: + continue + href = m.group(1) + if href in seen: + continue + seen.add(href) + + # Clean title — strip eBay UI noise appended to link text + title = title_link.get_text(strip=True) + title = re.sub(r'\s*Opens in a new window or tab.*', '', + title, flags=re.IGNORECASE).strip() + + # Price ── .s-card__price + price_tag = (card.find(class_="s-card__price") or + card.find(class_="s-item__price")) + price = price_tag.get_text(strip=True) if price_tag else "" + + # Image ── img inside a.s-card__link.image-treatment + img = "" + img_link = card.find("a", class_="image-treatment") + if img_link: + im = img_link.find("img") + if im: + raw = (_extract_best_src(im) or + im.get("src","") or im.get("data-src","")) + if raw: + img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', + r's-l500.\1', raw) + # Fallback: any ebayimg.com src in the card + if not img: + for im in card.find_all("img"): + raw = 
(_extract_best_src(im) or im.get("src","")) + if raw and "ebayimg.com" in raw: + img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', + r's-l500.\1', raw) + break + + # Location ── "Located in: XXX" — stop before "Delivery" + location = "" + card_text = card.get_text(" ", strip=True) + lm = re.search( + r'[Ll]ocated in[:\s]+([A-Za-z][^,\|•\n$\d]{2,30})', + card_text) + if lm: + loc_raw = lm.group(1).strip() + # Trim trailing noise like "Delivery or pickup..." + loc_raw = re.split(r'\s+[Dd]elivery|\s+[Ss]hipping', + loc_raw)[0].strip() + location = loc_raw + + results.append({ + "url": href, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "eBay"), + "source_type": src.get("type", "classifieds"), + "category": src.get("category", "Clasificados USA"), + }) + except Exception: + continue + + print(f"[{src['name']}] {len(results)} listings (new layout)") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def _parse_ebay_old_layout(soup, src: dict) -> list[dict]: + """Fallback for the classic eBay li.s-item layout.""" + results = [] + seen = set() + for item in soup.find_all("li", class_="s-item"): + try: + link_tag = item.find("a", class_="s-item__link") + if not link_tag: continue + href = link_tag.get("href","") + if "/itm/" not in href: continue + m = re.search(r'(https?://www\.ebay\.com/itm/\d+)', href) + if m: href = m.group(1) + if href in seen: continue + seen.add(href) + + title_tag = (item.find("span", class_="BOLD") or + item.find("div", class_="s-item__title") or + item.find("span", class_="s-item__title")) + title = (title_tag or link_tag).get_text(strip=True) + if not title or title.lower().startswith("shop on ebay"): continue + + price_tag = item.find("span", class_="s-item__price") + price = price_tag.get_text(strip=True) if price_tag else "" + + img = "" + img_tag = item.find("img") + if img_tag: + img = 
(_extract_best_src(img_tag) or img_tag.get("src","")) + if img: img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', img) + + loc_tag = (item.find("span", class_="s-item__location") or + item.find("span", class_="s-item__itemLocation")) + location = "" + if loc_tag: + location = (loc_tag.get_text(strip=True) + .replace("Located in: ","").strip()) + + results.append({ + "url": href, "title": title, "snippet": f"{price} {location}".strip(), + "price_text": price, "img_url": img, "location": location, + "source": src.get("name","eBay"), "source_type": src.get("type","classifieds"), + "category": src.get("category","Clasificados USA"), + }) + except Exception: + continue + print(f"[{src.get('name','eBay')}] {len(results)} listings (old layout)") + return results + + +def scrape_boattrader(src: dict, query: str, filters: dict) -> list[dict]: + """ + BoatTrader scraper — uses Playwright (Cloudflare Turnstile on plain requests). + + Card structure (stable classes): + li.lib-card — card root + a[href^="/boat/...-/"] — listing URL + [class*=listingTitle] — title element + [class*=listingPrice] — price element + img — photo + city, STATE ZIP pattern in text — location + """ + results = [] + seen = set() + + raw_url = src.get("search_url", "") + if not raw_url: + return [] + + clean_q = " ".join(dict.fromkeys(query.strip().split())) + url = raw_url.replace("{query}", requests.utils.quote(clean_q)) + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled", + "--no-sandbox", "--disable-dev-shm-usage"] + ) + context = browser.new_context( + viewport={"width": 1280, "height": 900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", + timezone_id="America/New_York", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = 
context.new_page() + try: + page.goto(url, timeout=35000, wait_until="domcontentloaded") + # BoatTrader needs time to hydrate React and load listing cards + page.wait_for_timeout(random.randint(4000, 6000)) + page.evaluate("window.scrollBy(0, 600)") + page.wait_for_timeout(1500) + html = page.content() + except Exception as e: + print(f"[{src['name']}] Playwright nav error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + return [] + + soup = BeautifulSoup(html, "html.parser") + + # ── Card root: li.lib-card ───────────────────────────────────────────── + cards = soup.find_all("li", class_="lib-card") + if not cards: + # Fallback: any element with lib-card class + cards = soup.find_all(class_=re.compile(r'\blib-card\b')) + + for card in cards: + try: + # Link ── /boat/YEAR-MAKE-...-ID/ + link_tag = card.find( + "a", href=re.compile(r'^/boat/[\w-]+-\d+/$')) + if not link_tag: + continue + href = "https://www.boattrader.com" + link_tag["href"] + if href in seen: + continue + seen.add(href) + + # Title ── element whose class contains 'listingTitle' + title_el = card.find( + class_=re.compile(r'listingTitle', re.I)) + if title_el: + title = title_el.get_text(strip=True) + else: + # Fallback: build from URL slug (2026-catalina-34-123 → 2026 Catalina 34) + slug = link_tag["href"].strip("/").split("/")[-1] + parts = slug.rsplit("-", 1)[0].replace("-", " ").title() + title = parts + if not title: + continue + + # Price ── element whose class contains 'listingPrice' + price_el = card.find( + class_=re.compile(r'listingPrice', re.I)) + price = "" + if price_el: + raw_price = price_el.get_text(" ", strip=True) + # Extract only the first dollar amount — ignore "/mo*" noise + pm = re.search(r'\$\s*([\d,]+)', raw_price) + if pm: + price = f"${pm.group(1)}" + + # Image ── first with a boatsgroup or boattrader CDN src + img = "" + for im in card.find_all("img"): + raw = (_extract_best_src(im) or + im.get("src","") or 
im.get("data-src","")) + if raw and raw.startswith("http") and not raw.endswith(".svg"): + img = raw + break + + # Location ── "City, ST ZIP" pattern in card text + # Use listingCaption element if available (more precise) + location = "" + caption_el = card.find(class_=re.compile(r'listingCaption|listingLocation', re.I)) + search_text = caption_el.get_text(" ", strip=True) if caption_el else card.get_text(" ", strip=True) + lm = re.search( + r'\b([A-Z][a-zA-Z\s]{2,20},\s+[A-Z]{2}(?:\s+\d{5})?)', + search_text) + if lm: + location = lm.group(1).strip() + + results.append({ + "url": href, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "BoatTrader"), + "source_type": src.get("type", "broker"), + "category": src.get("category", "Venta Especializada"), + }) + except Exception: + continue + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_apolloduck(src: dict, query: str, filters: dict) -> list[dict]: + """ + Apollo Duck scraper — plain requests + BS4 (no JS needed). + + Two card types on the listing page: + Sidebar cards: div.eastSDFPPanel → a.SidebarTitle, a.SidebarPrice, img + Featured cards: div._FeatureAdPanel → a._FeatureTitle, span._FeaturePrice, + img, td._PanelSpecData (location) + + Listing URL pattern: https://www.apolloduck.com/boat/{slug}/{id} + """ + results = [] + seen = set() + + # Use Apollo Duck keyword search — returns results filtered by query. + # Strip trailing "for sale" / "en venta" / "a vendre" since Apollo Duck + # searches listing titles and those phrases rarely appear there. 
+ stripped_q = re.sub( + r'\s*(for\s+sale|en\s+venta|à\s+vendre|zu\s+verkaufen)\s*$', + '', query.strip(), flags=re.I).strip() + clean_q = requests.utils.quote(stripped_q or query.strip()) + if clean_q: + url = f"https://www.apolloduck.com/search.phtml?search={clean_q}&sr=1&q=1" + else: + raw_url = src.get("search_url", "") or "https://www.apolloduck.com/boats/used-boats-for-sale" + url = raw_url.replace("{query}", clean_q) + is_search = bool(clean_q) # only featured cards are query-filtered + + try: + headers = { + "User-Agent": random.choice(USER_AGENTS), + "Accept-Language": "en-US,en;q=0.9", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + } + resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True) + resp.raise_for_status() + resp.encoding = resp.apparent_encoding or "utf-8" + soup = BeautifulSoup(resp.text, "html.parser") + + def _parse_card(card, title_sel, price_sel, is_featured=False): + """Common extraction for both card types.""" + title_el = card.select_one(title_sel) + if not title_el: + return + title = title_el.get_text(strip=True) + if not title: + return + + # URL — from title link or image link + href = title_el.get("href", "") + if not href: + a = card.find("a", href=re.compile(r'/boat/')) + href = a["href"] if a else "" + if not href: + return + full_url = ("https://www.apolloduck.com" + href + if href.startswith("/") else href) + if full_url in seen: + return + seen.add(full_url) + + # Price + price_el = card.select_one(price_sel) + price = price_el.get_text(strip=True) if price_el else "" + + # Image + img = "" + for im in card.find_all("img"): + raw = (im.get("src") or im.get("data-src") or + im.get("data-lazy-src") or "") + if raw and raw.startswith("http") and not raw.endswith(".svg"): + img = raw + break + # srcset fallback + ss = im.get("srcset","") + if ss: + img = ss.split()[0] + break + + # Location — only featured cards have it + location = "" + if is_featured: + for lbl in 
card.select("td._PanelSpecLabel"): + if "location" in lbl.get_text(strip=True).lower(): + loc_td = lbl.find_next_sibling("td") + if loc_td: + location = loc_td.get_text(strip=True) + break + + results.append({ + "url": full_url, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "Apollo Duck"), + "source_type": src.get("type", "broker"), + "category": src.get("category", "Venta Especializada"), + }) + + # Featured cards — always query-filtered on search results (~60-100/page) + for card in soup.select("div._FeatureAdPanel"): + _parse_card(card, "a._FeatureTitle", "span._FeaturePrice", + is_featured=True) + + # Sidebar cards — only when browsing a category (NOT on keyword search, + # because sidebar is always the same 101 generic listings regardless of query) + if not is_search: + for card in soup.select("div.eastSDFPPanel"): + _parse_card(card, "a.SidebarTitle", "a.SidebarPrice") + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_boatsdotcom(src: dict, query: str, filters: dict) -> list[dict]: + """ + Boats.com scraper — uses Playwright (same Boats Group infrastructure as BoatTrader). 
+ + Two card types: + Sponsored/real: li[data-listing-id] → h2+div.year, div.price, + div.img-container img, div.country + OEM specs: li.enhanced.oem → h2+div.year, div.price, + div.img-container img (no location) + + Listing URL pattern: https://www.boats.com/{type}/{year}-{make}-{id}/ + """ + results = [] + seen = set() + + raw_url = src.get("search_url", "") or "https://www.boats.com/boats-for-sale/?query={query}" + clean_q = requests.utils.quote(query.strip()) + url = raw_url.replace("{query}", clean_q) + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled", + "--no-sandbox", "--disable-dev-shm-usage"] + ) + context = browser.new_context( + viewport={"width": 1280, "height": 900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", + timezone_id="America/New_York", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + "window.chrome={runtime:{}};" + ) + page = context.new_page() + try: + page.goto(url, timeout=35000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(4000, 6000)) + page.evaluate("window.scrollBy(0, 600)") + page.wait_for_timeout(1500) + html = page.content() + except Exception as e: + print(f"[{src['name']}] Playwright nav error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + return [] + + soup = BeautifulSoup(html, "html.parser") + + def _extract_card(card, has_location=True): + # URL + a = card.find("a", href=re.compile(r'^/')) + if not a: + return + href = "https://www.boats.com" + a["href"] + if href in seen: + return + seen.add(href) + + # Title = year + model name + year_el = card.select_one("div.year") + name_el = card.select_one("h2") + year = year_el.get_text(strip=True) if year_el else "" + name = name_el.get_text(strip=True) if name_el else "" 
+ title = f"{year} {name}".strip() if year else name + if not title: + return + + # Price + price_el = card.select_one("div.price") + price = "" + if price_el: + raw_p = price_el.get_text(" ", strip=True) + pm = re.search(r'\$\s*([\d,]+)', raw_p) + price = f"${pm.group(1)}" if pm else raw_p[:30] + + # Image + img = "" + img_container = card.select_one("div.img-container") + if img_container: + im = img_container.find("img") + if im: + img = (_extract_best_src(im) or im.get("src","") + or im.get("data-src","")) + + # Location + location = "" + if has_location: + loc_el = card.select_one("div.country") + if loc_el: + location = loc_el.get_text(strip=True) + + results.append({ + "url": href, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "Boats.com"), + "source_type": src.get("type", "broker"), + "category": src.get("category", "Venta Especializada"), + }) + + # Sponsored/real marketplace listings + for card in soup.select("li[data-listing-id]"): + _extract_card(card, has_location=True) + + # OEM spec sheets + for card in soup.select("li.enhanced.oem"): + _extract_card(card, has_location=False) + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_craigslist(src: dict, query: str, filters: dict) -> list[dict]: + """ + Craigslist boats scraper — plain requests + BS4. 
+ + Card root : div[data-pid] (class="cl-search-result") + Title : a.posting-title span.label + URL : a.main[href] (full absolute URL with regional subdomain) + Price : span.priceinfo + Location : span.result-location + Image : img[data-image-index="0"] inside div.cl-gallery + """ + results = [] + seen = set() + + # Craigslist has no national search — scrape several major coastal cities + CITIES = ["sfbay", "losangeles", "seattle", "miami", "boston", + "newyork", "chicago", "houston", "dallas", "denver", + "phoenix", "atlanta", "portland", "sandiego", "tampa", + "minneapolis", "stlouis", "nashville", "raleigh", "saltlakecity"] + qs = requests.utils.quote(query.strip()) + + try: + from playwright.sync_api import sync_playwright + all_html_parts = [] + with sync_playwright() as p: + browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context( + user_agent=random.choice(USER_AGENTS), + locale="en-US", + ignore_https_errors=True, + ) + # Fetch 3 random cities to keep runtime reasonable + for city in random.sample(CITIES, min(3, len(CITIES))): + city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel" + page = ctx.new_page() + try: + page.goto(city_url, timeout=25000, wait_until="domcontentloaded") + page.wait_for_timeout(2500) + all_html_parts.append(page.content()) + except Exception: + pass + finally: + try: page.close() + except: pass + browser.close() + + if not all_html_parts: + return [] + + # Parse all city HTMLs + for html in all_html_parts: + soup = BeautifulSoup(html, "html.parser") + for card in soup.find_all(attrs={"data-pid": True}): + try: + # URL — from the main image link (absolute) + a_main = card.find("a", class_="main") + if not a_main: + continue + listing_url = a_main.get("href", "") + if not listing_url or listing_url in seen: + continue + seen.add(listing_url) + + # Title — from card title attr or span.label + title = card.get("title", "") + if not title: + span = card.find("span", 
class_="label") + title = span.get_text(strip=True) if span else "" + if not title: + continue + + # Price + price_el = card.find("span", class_="priceinfo") + price = price_el.get_text(strip=True) if price_el else "" + + # Location + loc_el = card.find("span", class_="result-location") + location = loc_el.get_text(strip=True) if loc_el else "" + + # Image — first img with data-image-index="0" + img = "" + im = card.find("img", attrs={"data-image-index": "0"}) + if im: + img = im.get("src", "") or im.get("data-src", "") + if not img: + im = card.find("img") + if im: + img = im.get("src", "") or im.get("data-src", "") + + results.append({ + "url": listing_url, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "Craigslist Boats"), + "source_type": src.get("type", "classifieds"), + "category": src.get("category", "Clasificados Generales"), + }) + except Exception: + continue + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_rightboat(src: dict, query: str, filters: dict) -> list[dict]: + """ + Rightboat scraper — Playwright (JS-rendered, Tailwind CSS). 
+ + Card root : div[data-tracking-bound="true"] + Image : img.object-cover (first inside card) + Title : first with href containing /boats-for-sale/ that has text + Price : element containing fa-tag icon's sibling text + Location : element containing fa-location-pin icon's sibling text + """ + results = [] + seen = set() + + raw_url = (src.get("search_url", "") + or "https://www.rightboat.com/boats-for-sale/?q={query}") + clean_q = requests.utils.quote(query.strip()) + url = raw_url.replace("{query}", clean_q) + + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled", + "--no-sandbox", "--disable-dev-shm-usage"] + ) + context = browser.new_context( + viewport={"width": 1280, "height": 900}, + user_agent=random.choice(USER_AGENTS), + locale="en-US", + ignore_https_errors=True, + ) + context.add_init_script( + "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" + ) + page = context.new_page() + try: + page.goto(url, timeout=35000, wait_until="domcontentloaded") + page.wait_for_timeout(random.randint(5000, 7000)) + page.evaluate("window.scrollBy(0, 800)") + page.wait_for_timeout(1500) + html = page.content() + except Exception as e: + print(f"[{src['name']}] Playwright nav error: {e}") + html = "" + finally: + try: page.close() + except: pass + browser.close() + + if not html: + return [] + + soup = BeautifulSoup(html, "html.parser") + + # Cards are div[data-tracking-bound="true"] + cards = soup.find_all(attrs={"data-tracking-bound": "true"}) + + for card in cards: + try: + # URL — the card ITSELF is the element + href = card.get("href", "") + if not href or "/boats-for-sale/" not in href: + continue + listing_url = ("https://www.rightboat.com" + href + if href.startswith("/") else href) + if listing_url in seen: + continue + seen.add(listing_url) + + # Image — first object-cover img (main photo) + img = "" + im = 
card.find("img", class_=re.compile(r'object-cover')) + if im: + img = im.get("src", "") or im.get("data-src", "") + + # Title — from img alt attribute (most reliable) or heading + title = "" + if im: + title = im.get("alt", "").strip() + if not title: + h_el = card.find(re.compile(r'^h[1-4]$')) + title = h_el.get_text(strip=True) if h_el else "" + if not title: + # Build from URL slug: /boats-for-sale/make/model/rbXXX + parts = href.strip("/").split("/") + if len(parts) >= 3: + title = " ".join(parts[1:-1]).replace("-", " ").title() + if not title: + continue + + # Price —

or regex fallback + price = "" + price_el = card.find("p", class_=re.compile(r'font-bold')) + if price_el: + pt = price_el.get_text(strip=True) + if re.search(r'[\$£€]', pt): + price = pt + if not price: + pm = re.search(r'[\$£€]\s*[\d,]+', card.get_text()) + if pm: + price = pm.group(0) + + # Location — text inside same div as fa-location-pin icon + location = "" + pin_icon = card.find("i", class_=re.compile(r'fa-location')) + if pin_icon: + # Typically:

"City, State"
+ row = pin_icon.find_parent() + if row: + location = row.get_text(" ", strip=True).strip() + + results.append({ + "url": listing_url, + "title": title[:120], + "snippet": f"{price} {location}".strip(), + "price_text": price, + "img_url": img, + "location": location, + "source": src.get("name", "Rightboat"), + "source_type": src.get("type", "broker"), + "category": src.get("category", "Venta Especializada"), + }) + except Exception: + continue + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_cooperss(src: dict, query: str, filters: dict) -> list[dict]: + """ + Cooper Capital Specialty Salvage (cooperss.com). + Salvage / insurance-loss vessels. + + Structure (paired divs, same index): + div.listing-thumb — image + link (assets/detail/?name=marine&id=N) + div.listing-detail — h5.blue (name) + table (Year,Size,Location,Min Bid…) + """ + results = [] + seen = set() + base = "https://www.cooperss.com" + + try: + headers = {"User-Agent": random.choice(USER_AGENTS), + "Accept-Language": "en-US,en;q=0.9"} + resp = requests.get(base + "/", headers=headers, timeout=20) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + thumbs = [el for el in soup.find_all(class_="listing-thumb") + if "slick-cloned" not in (el.get("class") or [])] + details = [el for el in soup.find_all(class_="listing-detail") + if "slick-cloned" not in (el.get("class") or [])] + + for thumb, detail in zip(thumbs, details): + try: + # URL + a = thumb.find("a", href=True) + if not a: + continue + href = a["href"] + if not href.startswith("http"): + href = base + "/" + href.lstrip("/") + if href in seen: + continue + seen.add(href) + + # Image + img_tag = thumb.find("img") + img = img_tag.get("src", "") if img_tag else "" + if img and not img.startswith("http"): + img = base + "/" + img.lstrip("/") + + # Title — h5.blue (vessel name) + h5 = detail.find("h5", class_="blue") + 
title = h5.get_text(strip=True).split("\n")[0].strip() if h5 else "" + # Remove video-button text artifact + for tag in (h5.find_all("a") if h5 else []): + tag.decompose() + title = h5.get_text(strip=True) if h5 else title + if not title: + continue + + # Parse the detail table + rows = {td.get_text(strip=True): tds[1].get_text(strip=True) + for tr in detail.find_all("tr") + if len(tds := tr.find_all("td")) == 2 + for td in [tds[0]]} + year = rows.get("Year", "") + size = rows.get("Size", "") + location = rows.get("Location", "") + min_bid = rows.get("Minimum Bid", "") + loss_type= rows.get("Type of Loss", "") + deadline = rows.get("Bid Deadline", "") + + if year: + title = f"{year} {title}".strip() + price = f"Min Bid ${min_bid}" if min_bid else "" + snippet_parts = [p for p in [price, loss_type, location, f"Deadline: {deadline}" if deadline else ""] if p] + + results.append({ + "url": href, + "title": title[:120], + "snippet": " | ".join(snippet_parts), + "price_text": price, + "img_url": img, + "location": location, + "size_m": size, + "source": src.get("name", "Cooper Salvage"), + "source_type": "salvage", + "category": src.get("category", "Salvage & Wrecks"), + }) + except Exception: + continue + + print(f"[{src['name']}] {len(results)} listings") + + except Exception as e: + print(f"[{src['name']}] Error: {e}") + + return results + + +def scrape_inautia(src: dict, query: str, filters: dict) -> list[dict]: + """ + iNautia scraper — same Boats Group platform as BoatTrader/Boats.com. 
def scrape_inautia(src: dict, query: str, filters: dict) -> list[dict]:
    """
    iNautia scraper — same Boats Group platform as BoatTrader/Boats.com.

    Renders the search page in headless Chromium (Playwright) and parses the
    resulting HTML with BeautifulSoup.

    Card:     div[data-grid-index]
    Link:     a.grid-listing-link[href]  → /boat/YEAR-MAKE-MODEL-ID/
    Title:    [class*=listingTitle]
    Price:    data-ssr-meta="make|type|len||price_eur"  (5th field)
    Location: [class*=listingBody]
    Image:    first CDN img in card

    Args:
        src: source config; reads "search_url" (may contain "{query}"),
            "name", "type" and "category".
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts. Empty when navigation fails; any other error
        is logged and whatever was collected so far is returned.
    """
    results = []
    seen = set()
    # Resolve the label once: indexing src['name'] inside the error handler
    # would raise KeyError for a source dict without a name.
    name = src.get("name", "iNautia")

    raw_url = (src.get("search_url", "")
               or "https://www.inautia.com/boats/?q={query}")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};")
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                page.evaluate("window.scrollBy(0,600)")
                page.wait_for_timeout(1500)
                html = page.content()
            except Exception as e:
                print(f"[{name}] nav error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(attrs={"data-grid-index": True})

        for card in cards:
            try:
                link_tag = card.find("a", class_=re.compile(r'grid-listing-link'))
                if not link_tag:
                    continue
                href = link_tag.get("href", "")
                if not href:
                    continue
                full_url = ("https://www.inautia.com" + href
                            if href.startswith("/") else href)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — fall back to the URL slug minus its trailing ID.
                title_el = card.find(class_=re.compile(r'listingTitle', re.I))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    slug = href.strip("/").split("/")[-1]
                    title = slug.rsplit("-", 1)[0].replace("-", " ").title()
                if not title:
                    continue

                # Price from data-ssr-meta (make|type|length||price_eur)
                price = ""
                meta = link_tag.get("data-ssr-meta", "")
                if meta:
                    parts = meta.split("|")
                    if len(parts) >= 5 and parts[4]:
                        try:
                            price = f"€{int(float(parts[4])):,}"
                        except (ValueError, OverflowError):
                            # Non-numeric or infinite value: use the visible price.
                            pass
                if not price:
                    price_el = card.find(class_=re.compile(r'listingPrice', re.I))
                    if price_el:
                        raw_p = price_el.get_text(" ", strip=True)
                        pm = re.search(r'[\$€£]\s*[\d,]+', raw_p)
                        price = pm.group(0) if pm else ""

                # Location — listingBody contains "Broker | City, Country"
                loc_el = card.find(class_=re.compile(r'listingBody', re.I))
                location = loc_el.get_text(" ", strip=True) if loc_el else ""

                # Image — first absolute, non-SVG source in the card.
                img = ""
                for im in card.find_all("img"):
                    raw = (_extract_best_src(im) or im.get("src", "") or im.get("data-src", ""))
                    if raw and raw.startswith("http") and not raw.endswith(".svg"):
                        img = raw
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": name,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
def scrape_boat24(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Boat24 scraper — European marketplace, rendered with Playwright.

    Card:     div.blurb.blurb--strip
    Link:     data-link attr  (base64 → ROT13 → URL)
    Title:    h3.blurb__title
    Price:    p.blurb__price
    Location: p.blurb__location
    Image:    lazy via slider — extract from li.slider__slide img[src] or data-src

    Args:
        src: source config; reads "search_url" (may contain "{query}"),
            "name", "type" and "category".
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts. Empty when navigation fails; any other error
        is logged and whatever was collected so far is returned.
    """
    results = []
    seen = set()
    BASE = "https://www.boat24.com"
    # Resolve the label once so log lines cannot raise KeyError.
    name = src.get("name", "Boat24")

    raw_url = (src.get("search_url", "")
               or "https://www.boat24.com/en/usedboats/")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    _rot13 = str.maketrans(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        "NOPQRSTUVWXYZABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm")

    def _decode_link(encoded: str) -> str:
        """Decode a data-link value (base64, then ROT13); "" on failure."""
        try:
            import base64
            rot = base64.b64decode(encoded).decode("utf-8", errors="ignore")
            return rot.translate(_rot13)
        except Exception:
            return ""

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});")
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                html = page.content()
            except Exception as e:
                print(f"[{name}] nav error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("div", class_=re.compile(r'\bblurb\b'))
        for card in cards:
            try:
                encoded = card.get("data-link", "")
                if not encoded:
                    continue
                listing_url = _decode_link(encoded)
                if not listing_url or not listing_url.startswith("http"):
                    # Try building from title link
                    a = card.find("a", href=re.compile(r'/en/'))
                    if a:
                        listing_url = (BASE + a["href"] if a["href"].startswith("/")
                                       else a["href"])
                    else:
                        continue
                if listing_url in seen:
                    continue
                seen.add(listing_url)

                title_el = card.select_one("h3.blurb__title, h2.blurb__title")
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    continue

                price_el = card.select_one("p.blurb__price")
                price = price_el.get_text(strip=True) if price_el else ""

                loc_el = card.select_one("p.blurb__location")
                location = ""
                if loc_el:
                    location = re.sub(r'\s+', ' ',
                                      loc_el.get_text(" ", strip=True)).strip()

                # Image — try slider slides or first img
                img = ""
                for im in card.find_all("img"):
                    # An <img> without srcset used to raise IndexError on
                    # "".split()[0], which discarded the entire listing.
                    srcset_parts = im.get("srcset", "").split()
                    raw = (im.get("data-src") or im.get("data-lazy")
                           or (srcset_parts[0] if srcset_parts else "")
                           or im.get("src", ""))
                    if raw and raw.startswith("http") and "/alpha.gif" not in raw:
                        img = raw
                        break

                results.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": name,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
def scrape_facebook_marketplace(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Facebook Marketplace scraper.

    Requires a saved session file: fb_session.json (cookies from a logged-in session).
    If not found, returns a single instructional result.

    Setup: POST /api/fb-setup → launches a visible browser for the user to log in.
    Session file is saved automatically after login.

    Args:
        src: source config; only "category" is read.
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts, or a one-element list whose "source_type" is
        "setup_required" when the session file is missing. Empty when
        navigation fails; other errors are logged and partial results returned.
    """
    import json as _json
    results = []
    seen = set()

    SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json")

    # Check the session first: nothing below is needed without it.
    if not os.path.exists(SESSION_FILE):
        return [{
            "url": "https://www.facebook.com/marketplace/",
            "title": "⚠ Facebook Marketplace — Configuración requerida",
            "snippet": ("Para habilitar Facebook Marketplace, ve a Fuentes y "
                        "haz clic en 'Configurar FB'. Solo se necesita una vez."),
            "price_text": "",
            "img_url": "",
            "location": "",
            "source": "Facebook Marketplace",
            "source_type": "setup_required",
            "category": src.get("category", "Clasificados Generales"),
        }]

    SEARCH_URL = ("https://www.facebook.com/marketplace/search/"
                  f"?query={requests.utils.quote(query.strip())}"
                  "&deliveryMethod=local_pick_up")

    try:
        from playwright.sync_api import sync_playwright
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # is not guaranteed to decode a JSON file correctly.
        with open(SESSION_FILE, encoding="utf-8") as f:
            cookies = _json.load(f)

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_cookies(cookies)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};")
            page = context.new_page()
            try:
                page.goto(SEARCH_URL, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(5000, 7000))
                page.evaluate("window.scrollBy(0,800)")
                page.wait_for_timeout(2000)
                html = page.content()
            except Exception as e:
                print(f"[Facebook Marketplace] nav error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        # FB Marketplace listing cards — data-testid or aria-label patterns
        # Each listing is usually an <a> with href /marketplace/item/ID/
        listing_links = soup.find_all(
            "a", href=re.compile(r'/marketplace/item/\d+'))

        for a in listing_links:
            try:
                href = a.get("href", "")
                full_url = ("https://www.facebook.com" + href
                            if href.startswith("/") else href)
                # Normalize: remove query params after item ID
                full_url = re.sub(r'(/marketplace/item/\d+/).*', r'\1', full_url)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — span or div with listing title
                title_el = (a.find("span", style=re.compile(r'line-clamp'))
                            or a.find("span", class_=re.compile(r'x1lliihq|xt0psk2'))
                            or a.find("div", class_=re.compile(r'x1lliihq')))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    # Try aria-label on the card
                    title = a.get("aria-label", "")
                if not title:
                    continue

                # Price — first span whose text starts with a currency amount
                price = ""
                for span in a.find_all("span"):
                    t = span.get_text(strip=True)
                    if re.match(r'[\$£€][\d,]+', t):
                        price = t
                        break

                # Image
                img = ""
                im = a.find("img")
                if im:
                    img = im.get("src", "") or im.get("data-src", "")

                # Location — usually a second span below price
                location = ""
                spans = [s.get_text(strip=True) for s in a.find_all("span")
                         if s.get_text(strip=True) and s.get_text(strip=True) != title]
                for s in spans:
                    if re.search(r'[A-Z][a-z]+,\s+[A-Z]{2}', s) or (
                            not re.match(r'[\$£€\d]', s) and len(s) > 3 and s != price):
                        location = s
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": "Facebook Marketplace",
                    "source_type": "classifieds",
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[Facebook Marketplace] {len(results)} listings")

    except Exception as e:
        print(f"[Facebook Marketplace] Error: {e}")

    return results
def scrape_hmy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    HMY Yachts — queries Algolia directly (app ECN3QX1VBL).
    Fast, no Playwright needed.

    Args:
        src: source config; reads "name", "type" and "category".
        query: free-text search terms, sent as the Algolia "query" parameter.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts built from the first result set's hits.
        Request or parsing errors are logged and an empty or partial list
        is returned.
    """
    results = []
    seen = set()
    name = src.get("name", "HMY")

    ALGOLIA_URL = "https://ecn3qx1vbl-dsn.algolia.net/1/indexes/*/queries"
    # NOTE(review): the API key is committed in source; presumably the site's
    # public search-only key — confirm before publishing the repository.
    ALGOLIA_HEADERS = {
        "x-algolia-application-id": "ECN3QX1VBL",
        "x-algolia-api-key": "d86ccdd9ac0292ba76ee4755693d0c10",
        "content-type": "application/json",
        "referer": "https://www.hmy.com/",
        "user-agent": random.choice(USER_AGENTS),
    }

    import urllib.parse
    params_str = urllib.parse.urlencode({
        "filters": "SalesStatus:Active",
        "facetFilters": '[["SaleClassCode:used"]]',
        "query": query,
        "hitsPerPage": 40,
        "page": 0,
    })

    payload = {
        "requests": [{
            "indexName": "production_oceanelite_yachts",
            "params": params_str,
        }]
    }

    try:
        resp = requests.post(ALGOLIA_URL, json=payload, headers=ALGOLIA_HEADERS, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        # An empty "results" list used to raise IndexError and be reported
        # as an error instead of "0 listings".
        result_sets = data.get("results") or [{}]
        hits = result_sets[0].get("hits", [])

        for h in hits:
            try:
                slug = h.get("Slug", "")
                url = h.get("URL") or (f"https://www.hmy.com/yachts-for-sale/{slug}" if slug else "")
                if not url or url in seen:
                    continue
                seen.add(url)

                # "or ''" maps JSON nulls to empty strings instead of "None".
                year = h.get("ModelYear") or ""
                make = h.get("MakeStringExact") or ""
                model = h.get("ModelExact") or ""
                boat_name = h.get("BoatName") or ""
                # Collapse the double spaces left by a missing field.
                title = " ".join(f"{year} {make} {model}".split())
                if boat_name:
                    title = f'{title} "{boat_name}"'.strip()
                if not title:
                    # The other scrapers in this file also skip untitled rows.
                    continue

                price_raw = h.get("NormPrice") or 0
                try:
                    # float() first so "1250000.0" does not discard the hit.
                    price_text = f"${int(float(price_raw)):,}" if price_raw else ""
                except (TypeError, ValueError, OverflowError):
                    price_text = ""

                length = h.get("NominalLengthNormalized") or ""
                country = h.get("country") or "USA"
                location = f"{length}ft · {country}" if length else country

                img = h.get("mainImage") or ""

                results.append({
                    "url": url,
                    "title": title[:120],
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": country,
                    "source": src.get("name", "HMY Yachts"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # One malformed hit must not abort the whole response.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
def scrape_boatcrazy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    BoatCrazy — US aggregator with 105+ listings per page.

    Card:    div.boat-list-item
    Link:    a[href*="/boat-for-sale/"]
    Image:   div.item-img img  or  div.list-itemimg img
    Details: div.item-details
    URL pattern: /boat-for-sale/YEAR-MAKE-LOCATION-id

    Args:
        src: source config; reads "search_url" (may contain "{query}"),
            "name", "type" and "category".
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts. Empty when navigation fails; any other error
        is logged and whatever was collected so far is returned.
    """
    results = []
    seen = set()
    # Resolve the label once so log lines cannot raise KeyError.
    name = src.get("name", "BoatCrazy")

    raw_url = src.get("search_url", "") or "https://boatcrazy.com/boats?q={query}"
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--no-sandbox"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};")
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                html = page.content()
            except Exception as e:
                print(f"[{name}] nav error: {e}")
                html = ""
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(class_="boat-list-item")
        if not cards:
            # fallback: find by link pattern
            cards = []
            for a in soup.find_all("a", href=re.compile(r'/boat-for-sale/')):
                parent = a.find_parent(class_=re.compile(r'boat|list|item|card'))
                if parent and parent not in cards:
                    cards.append(parent)

        for card in cards:
            try:
                a = card.find("a", href=re.compile(r'/boat-for-sale/'))
                if not a:
                    continue
                href = a["href"]
                full_url = href if href.startswith("http") else "https://boatcrazy.com" + href
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — prefer h3, then aria-label, then slug
                title = ""
                h3 = card.find("h3")
                if h3:
                    title = h3.get_text(strip=True)[:80]
                if not title:
                    al = card.find(attrs={"aria-label": True})
                    if al:
                        title = al["aria-label"][:80]
                if not title:
                    slug = href.rstrip("/").split("/")[-1]
                    slug_clean = re.sub(r'-id[-\w]*$', '', slug).replace("-", " ")
                    title = slug_clean.title()[:80]
                if not title:
                    continue

                # Price — dedicated element first, then any $ amount in the card
                price = ""
                price_el = card.find(class_=re.compile(r'\bprice\b'))
                if price_el:
                    pm = re.search(r'\$[\d,]+', price_el.get_text())
                    if pm:
                        price = pm.group(0)
                if not price:
                    pm = re.search(r'\$[\d,]+', card.get_text(" ", strip=True))
                    if pm:
                        price = pm.group(0)

                # Location — dedicated element first, then a "City, ST" pattern
                location = ""
                loc_el = card.find(class_="location")
                if loc_el:
                    location = loc_el.get_text(strip=True)[:60]
                if not location:
                    lm = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)?,\s*[A-Z]{2})',
                                   card.get_text(" ", strip=True))
                    if lm:
                        location = lm.group(1)

                # Image
                img = ""
                img_div = card.find(class_=re.compile(r'item.?img|list.?item.?img'))
                if img_div:
                    im = img_div.find("img")
                    if im:
                        img = (_extract_best_src(im) or im.get("src", "") or im.get("data-src", ""))
                if not img:
                    im = card.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")

                results.append({
                    "url": full_url,
                    "title": title,
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": name,
                    "source_type": src.get("type", "classifieds"),
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
def scrape_denison(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Denison Yachting — static HTML, 30 cards per page.

    Card:     div.boat-item
    URL:      a[href*=/yachts-for-sale/SLUG]  (non-dashboard link)
    Title:    boat_length + make/model + year + name
    Price:    h4.boat_price[data-price] + [data-default_currency]
    Location: h3 text  |  Image: div.news_pic img
    Search:   ?search={query}

    Args:
        src: source config; reads "name", "type" and "category".
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts. Request or parsing errors are logged and an
        empty or partial list is returned.
    """
    results = []
    seen = set()
    name = src.get("name", "Denison")

    base = "https://www.denisonyachtsales.com/yachts-for-sale/"
    url = f"{base}?search={requests.utils.quote(query.strip())}"

    LISTING_RE = re.compile(r'/yachts-for-sale/[a-z][a-z0-9-]{4,}$', re.I)
    CURRENCY_SYMBOLS = {"USD": "$", "EUR": "€", "GBP": "£", "AUD": "A$"}

    try:
        # NOTE(review): verify=False disables TLS certificate checks — confirm
        # this is still required for this host.
        resp = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)},
                            timeout=20, verify=False)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        for card in soup.find_all(class_="boat-item"):
            try:
                a = card.find("a", href=LISTING_RE)
                if not a:
                    continue
                href = a["href"]
                full_url = href if href.startswith("http") else "https://www.denisonyachtsales.com" + href
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title: length + make/model year + "name"
                h2 = card.find("h2")
                if h2:
                    # The length and name elements are detached from the h2 so
                    # the remaining text is only the make/model part.
                    length_el = h2.find(class_="boat_length")
                    length_txt = length_el.get_text(strip=True) if length_el else ""
                    if length_el:
                        length_el.extract()
                    name_el = h2.find("span")
                    name_txt = name_el.get_text(strip=True) if name_el else ""
                    if name_el:
                        name_el.extract()
                    rest = " ".join(h2.get_text(" ", strip=True).split())
                    parts = [p for p in [length_txt, rest, f'"{name_txt}"' if name_txt else ""] if p]
                    title = " ".join(parts)[:100]
                else:
                    title = (a.get("title", "") or "")[:100]
                if not title:
                    continue

                # Price
                price_text = ""
                price_el = card.find(class_="boat_price")
                if price_el:
                    raw_price = price_el.get("data-price", "")
                    currency = price_el.get("data-default_currency", "USD")
                    sym = CURRENCY_SYMBOLS.get(currency, currency + " ")
                    if raw_price:
                        try:
                            # float() first: int("1250000.00") raises ValueError.
                            price_text = f"{sym}{int(float(raw_price)):,}"
                        except (ValueError, OverflowError):
                            # Not numeric: show the element's visible text.
                            price_text = price_el.get_text(strip=True)[:30]

                # Location
                location = ""
                h3 = card.find("h3")
                if h3:
                    location = h3.get_text(strip=True)[:80]

                # Image
                img = ""
                pic_div = card.find(class_="news_pic")
                if pic_div:
                    im = pic_div.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")

                results.append({
                    "url": full_url,
                    "title": title,
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "Denison Yachting"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Brokers USA"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
# =============================================================================
# SCRAPER: GovPlanet + IronPlanet  (Ritchie Bros family — same HTML .sr_lot)
# =============================================================================
def scrape_govplanet(src: dict, query: str, filters: dict) -> list[dict]:
    """
    GovPlanet (recreational marine) and IronPlanet (commercial marine).
    Both share Ritchie Bros HTML: listing cards use .sr_lot selector.
    GovPlanet:  https://www.govplanet.com/Recreational+Marine
    IronPlanet: https://www.ironplanet.com/Commercial+Marine+Vessels

    Args:
        src: source config; "search_url" is required, "name", "type" and
            "category" are optional. A "{query}" placeholder in the URL is
            substituted if present.
        query: free-text search terms; only used for the placeholder.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts. Empty on a non-200/206 response; other
        errors are logged and whatever was collected so far is returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolve the label once: indexing src["name"] inside the error handler
    # would raise KeyError for a source dict without a name.
    name = src.get("name", "GovPlanet")
    try:
        url = src["search_url"].replace("{query}", requests.utils.quote(query.strip()))
        base = "https://" + url.split("/")[2]
        headers = get_headers(referer=base + "/")
        time.sleep(1.0)
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[{name}] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        for card in soup.select(".sr_lot, .lot-tile, article.lot, [class*=srItem]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # urljoin resolves root-relative, path-relative and
                # protocol-relative ("//host/…") links; plain concatenation
                # only handled the first form.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title = a.get_text(strip=True)[:100] or card.get_text(" ", strip=True)[:80]
                price_el = card.select_one(".price, .lot-price, span[class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # One malformed card must not abort the whole page.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: HiBid  (React SPA — Playwright required)
# =============================================================================
def scrape_hibid(src: dict, query: str, filters: dict) -> list[dict]:
    """
    HiBid online auction platform — React SPA requires Playwright.
    URL:   https://www.hibid.com/lots?q={query}+boat
    Cards: .lot-tile   Title: h3/.lot-title   Price: .high-bid/.lot-price

    Args:
        src: source config; reads "name", "type" and "category".
        query: free-text search terms; " boat" is appended before quoting.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of lot dicts. Errors are logged and an empty or partial list
        is returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolve the label once so log lines cannot raise KeyError.
    name = src.get("name", "HiBid")
    try:
        q = requests.utils.quote((query.strip() + " boat"))
        url = f"https://www.hibid.com/lots?q={q}"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(4000)
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for card in soup.select(".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # Same result as the old prefixing for "/lot/…" links, but
                # also correct for path- and protocol-relative ones.
                href = urljoin("https://www.hibid.com/", a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # One malformed card must not abort the whole page.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Copart salvage boats  (heavy JS SPA — Playwright)
# =============================================================================
def scrape_copart(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Copart salvage/insurance lots for watercraft.
    URL: https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT
    Lots render in a React table after JS executes.

    Args:
        src: source config; reads "name" and "category".
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of lot dicts with "source_type" fixed to "salvage". Errors are
        logged and an empty or partial list is returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolve the label once so log lines cannot raise KeyError.
    name = src.get("name", "Copart")
    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.copart.com/vehicleFinderSection/?searchStr={q}&vehicleType=BOAT"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"]
            )
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(5000)
                try:
                    page.wait_for_selector(
                        ".lot-row, tr[data-lot], .lot-details, [class*=lottile], [class*=lot-card]",
                        timeout=8000
                    )
                except Exception:
                    # No lot rows appeared in time: parse whatever rendered.
                    pass
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for row in soup.select(
            "tr[data-lot], .lot-row, [class*=lot-card], [class*=lottile], [class*=lot-item]"
        ):
            try:
                a = row.find("a", href=re.compile(r"/lot/"))
                if not a:
                    continue
                # Same result as the old prefixing for "/lot/…" links, but
                # also correct for path- and protocol-relative ones.
                href = urljoin("https://www.copart.com/", a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = row.select_one("[class*=title], [class*=desc], td.des")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = row.select_one("[class*=bid], [class*=price], td.bid")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = row.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": row.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": "salvage",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # One malformed row must not abort the whole page.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Trade a Boat AU  (server-rendered Material-UI)
# =============================================================================
def scrape_tradeaboat(src: dict, query: str, filters: dict) -> list[dict]:
    """
    TradeABoat Australia — server-rendered with Material-UI CSS classes.
    Cards use jss* dynamic class names; fallback to /details/ link detection.
    URL: https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}

    Args:
        src: source config; only "category" is read.
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts with "location" fixed to "Australia". Empty on
        a non-200/206 response; other errors are logged and partial results
        returned.
    """
    from urllib.parse import urljoin

    results = []
    try:
        q = requests.utils.quote(query.strip())
        # NOTE(review): the search is pinned to category=Sail — confirm that
        # excluding other boat categories is intended.
        url = f"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={q}"
        headers = get_headers(referer="https://www.tradeaboat.com.au/")
        time.sleep(1.0)
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[Trade a Boat AU] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        base = "https://www.tradeaboat.com.au"
        seen = set()
        # MUI class names are dynamic (jss77, jss78 …) — find cards via /details/ links
        detail_links = soup.find_all("a", href=re.compile(r"/details/"))
        visited_parents = set()
        for a in detail_links:
            try:
                # urljoin also resolves path- and protocol-relative links,
                # which plain "base + href" concatenation mangled.
                href = urljoin(base + "/", a["href"])
                if href in seen:
                    continue
                seen.add(href)
                # Walk up to find card container
                card = a.find_parent("div") or a
                card_id = id(card)
                if card_id in visited_parents:
                    # A second link inside a container that was already emitted.
                    continue
                visited_parents.add(card_id)
                title_el = card.select_one("h2, h3, [class*=title]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one("[class*=price], [class*=Price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    # "//cdn.host/x.jpg" used to become "https://site//cdn.host/x.jpg".
                    img = urljoin(base + "/", img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "Australia",
                        "img_url": img,
                        "source": "Trade a Boat AU",
                        "source_type": "broker",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # One malformed link must not abort the whole page.
                continue
        print(f"[Trade a Boat AU] {len(results)} listings")
    except Exception as e:
        print(f"[Trade a Boat AU] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Galati Yachts  (requests, WordPress / YSP plugin)
# =============================================================================
def scrape_galati(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Galati Yachts — server-rendered WordPress with YachtSalesPlugin.
    URL: https://www.galatiyachts.com/yachts-for-sale/?keywords={query}

    Listing cards are looked up by CSS selector first; when none match, every
    anchor whose href contains "/yachts/" is treated as a listing and its
    nearest enclosing <div> as the card.

    Args:
        src: source config; only "category" is read.
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        A list of listing dicts with "location" fixed to "USA". Empty on a
        non-200/206 response; other errors are logged and partial results
        returned.
    """
    from urllib.parse import urljoin

    results = []
    base = "https://www.galatiyachts.com"

    def _row(card, a, href):
        """Build one result dict from a card; None when the title is too short."""
        title_el = card.select_one("h2, h3, [class*=title]")
        title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
        price_el = card.select_one("[class*=price], .price")
        price_txt = price_el.get_text(strip=True) if price_el else ""
        img_el = card.find("img")
        img = _extract_best_src(img_el) if img_el else ""
        if img:
            # Also resolves protocol-relative ("//cdn…") sources.
            img = urljoin(base + "/", img)
        if not (title and len(title) > 4):
            return None
        return {
            "title": title, "url": href,
            "snippet": card.get_text(" ", strip=True)[:200],
            "price_text": price_txt, "location": "USA",
            "img_url": img, "source": "Galati Yachts",
            "source_type": "broker", "category": src.get("category", ""),
        }

    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.galatiyachts.com/yachts-for-sale/?keywords={q}"
        headers = get_headers(referer="https://www.galatiyachts.com/")
        time.sleep(1.0)
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[Galati Yachts] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        # YSP listing cards — try common selectors, fallback to /yachts/ links
        cards = soup.select(".ysp-listing, .listing-card, .yacht-card, [class*=yacht-listing]")
        if not cards:
            # fallback: group by /yachts/details/ anchor
            for a in soup.find_all("a", href=re.compile(r"/yachts/")):
                try:
                    href = urljoin(base + "/", a["href"])
                    if href in seen or "galatiyachts.com" not in href:
                        continue
                    if href.count("/") < 4:
                        # Too shallow to be a detail page.
                        continue
                    seen.add(href)
                    row = _row(a.find_parent("div") or a, a, href)
                    if row:
                        results.append(row)
                except Exception:
                    # Per-item guard, as in the selector branch: one bad
                    # anchor no longer aborts the whole scrape.
                    continue
        else:
            for card in cards:
                try:
                    a = card.find("a", href=True)
                    if not a:
                        continue
                    href = urljoin(base + "/", a["href"])
                    if href in seen:
                        continue
                    seen.add(href)
                    row = _row(card, a, href)
                    if row:
                        results.append(row)
                except Exception:
                    continue
        print(f"[Galati Yachts] {len(results)} listings")
    except Exception as e:
        print(f"[Galati Yachts] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Luxury brokers  (Fraser, Burgess, Worth Ave, Merle Wood, N&J)
# Playwright — JS-heavy sites that won't render with plain requests
# =============================================================================
def scrape_luxury_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for luxury yacht broker sites.
    Covers: Fraser Yachts, Worth Ave Yachts, Merle Wood, Burgess, N&J.
    Follows internal links with /yacht/, /vessel/, /boat/, /listing/ in path.

    Args:
        src: source config; "search_url" is required (may contain "{query}"),
            "name", "type" and "category" are optional.
        query: free-text search terms, URL-quoted into the search URL.
        filters: part of the common scraper signature; not read here.

    Returns:
        At most 30 listing dicts. Errors are logged and an empty or partial
        list is returned.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"]
            )
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
                page.wait_for_timeout(1500)
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    # Best effort: the browser is closed right below.
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        LISTING_RE = re.compile(
            r'/(yacht[s]?|vessel[s]?|boat[s]?|listing[s]?|detail[s]?|sale|for-sale)/',
            re.I
        )
        for a in soup.find_all("a", href=LISTING_RE):
            try:
                # urljoin also resolves path- and protocol-relative links,
                # which plain "base + href" concatenation mangled.
                href = urljoin(url, a["href"])
                if href in seen or len(href) < 25:
                    continue
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    # Site root or a top-level page, not a listing.
                    continue
                seen.add(href)
                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue
                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = re.search(r'[\$€£]\s*[\d,\.]+(?:\s*[Mm]illion|M)?', ctx_txt)
                price_txt = pm.group() if pm else ""
                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # One malformed link must not abort the whole page.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
def scrape_eu_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for EU/AU/UK broker sites that block plain
    requests (403/ECONNREFUSED). Navigates with real browser, extracts listings.

    Args:
        src: Source config. "search_url" is required (an optional "{query}"
            placeholder is substituted); "name", "type" and "category" are
            optional and copied into every result.
        query: Free-text search; stripped and URL-quoted into the search URL.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        At most 30 dicts with the keys title, url, snippet, price_text,
        location, img_url, source, source_type, category. Any failure is
        reported with print() and whatever was collected so far is returned.
    """
    # Function-scope import, same style as the Playwright import below.
    from urllib.parse import urljoin, urlsplit

    results = []
    name = src.get("name", "EU Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))
        domain = urlsplit(url).netloc
        if not domain:
            raise ValueError(f"search_url has no host: {url!r}")

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = ctx.new_page()
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                html = page.content()
            finally:
                # Always release the browser, even when context/page creation
                # or navigation fails; closing it also closes its pages.
                browser.close()

        # URL fragments that mark navigation/utility links rather than listings.
        skip_tokens = (
            "login", "register", "contact", "about", "help", "privacy",
            "sitemap", "category", "search", "tag", "page=", "lang=",
        )

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for a in soup.find_all("a", href=True):
            try:
                # urljoin resolves "/x", "x/y" and "//host/x" correctly and
                # leaves absolute URLs untouched.
                href = urljoin(url, a["href"].strip())
                if not href.startswith("http"):
                    continue  # javascript:, mailto:, tel:, ...
                if domain not in href or href in seen:
                    continue
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    continue  # site root — no path component, not a listing
                href_lower = href.lower()
                if any(tok in href_lower for tok in skip_tokens):
                    continue
                seen.add(href)

                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue

                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = re.search(r'[\$€£]\s*[\d,\.]+', ctx_txt)
                price_txt = pm.group() if pm else ""

                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)

                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # Best effort: one malformed anchor must not abort the page.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
def scrape_forum_fs(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrapes For-Sale classified threads from boating forums (Playwright).
    TheHullTruth:   /boating-forum/search.php?do=process&query={query}&prefixid=FS
    Cruisers Forum: /forums/f152/ (Classifieds subforum)

    Args:
        src: Source config. "search_url" is required (an optional "{query}"
            placeholder is substituted); "name" and "category" are optional.
        query: Free-text search; stripped and URL-quoted into the search URL.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        One dict per distinct thread link with the keys title, url, snippet,
        price_text, location, img_url, source, source_type ("classifieds")
        and category. Any failure is reported with print() and whatever was
        collected so far is returned.
    """
    # Function-scope import, same style as the Playwright import below.
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Forum")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                page = ctx.new_page()
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(2000)
                html = page.content()
            finally:
                # Always release the browser, even when context/page creation
                # or navigation fails; closing it also closes its pages.
                browser.close()

        thread_href_re = re.compile(
            r'showthread|/thread[s]?/|/t/\d|/post', re.I
        )

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        # vBulletin/XenForo thread rows
        for row in soup.select(
            "li.threadbit, div.threadbit, .thread-item, "
            "tr.odd, tr.even, .search-result, [class*=thread], "
            ".js-threadListItem, li[id*=thread]"
        ):
            try:
                # Prefer a link that looks like a thread; otherwise fall back
                # to the first link in the row.
                a = row.find("a", href=thread_href_re)
                if not a:
                    a = row.find("a", href=True)
                if not a:
                    continue
                # Forum links are often directory-relative
                # ("showthread.php?t=1"); urljoin resolves them against the
                # page URL instead of gluing them onto the host name.
                href = urljoin(url, a["href"].strip())
                if not href.startswith("http"):
                    continue  # javascript:, mailto:, ...
                if href in seen:
                    continue
                seen.add(href)

                title = a.get_text(strip=True)[:100]
                ctx_txt = row.get_text(" ", strip=True)[:200]
                pm = re.search(r'\$\s*[\d,]{3,}', ctx_txt)
                price_txt = pm.group() if pm else ""
                if title and len(title) > 5:
                    results.append({
                        "title": title, "url": href,
                        "snippet": ctx_txt, "price_text": price_txt,
                        "location": "", "img_url": "",
                        "source": name, "source_type": "classifieds",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: one malformed row must not abort the page.
                continue
        print(f"[{name}] {len(results)} threads")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
def scrape_source_router(src: dict, query: str, filters: dict, page: int = 1):
    """
    Central dispatcher — routes each source to its dedicated scraper.

    The scraper is chosen from ``src["name"]``: first by exact name, then by
    the "eBay" / "Craigslist " prefixes, and finally the generic HTML scraper
    is used. ``page`` is part of the signature but is not read here.
    """
    name = src.get("name", "")

    # (names, thunk) pairs. The thunks are lazy, so only the scraper that is
    # actually selected gets looked up and called.
    exact_routes = (
        # YachtWorld scrapers take no src argument.
        (("YachtWorld", "YachtWorld Commercial"),
         lambda: scrape_yachtworld(query, filters, max_pages=1)),
        (("BoatTrader",),
         lambda: scrape_boattrader(src, query, filters)),
        (("Apollo Duck", "Apollo Duck Workboats"),
         lambda: scrape_apolloduck(src, query, filters)),
        (("Boats.com",),
         lambda: scrape_boatsdotcom(src, query, filters)),
        # Single multi-city Craigslist entry.
        (("Craigslist",),
         lambda: scrape_craigslist(src, query, filters)),
        (("GovPlanet", "GovPlanet Recreational",
          "IronPlanet", "IronPlanet Marine"),
         lambda: scrape_govplanet(src, query, filters)),
        (("HiBid",),
         lambda: scrape_hibid(src, query, filters)),
        (("Copart Marine", "Copart Boats", "Copart Watercraft"),
         lambda: scrape_copart(src, query, filters)),
        (("Trade a Boat AU",),
         lambda: scrape_tradeaboat(src, query, filters)),
        (("Galati Yachts",),
         lambda: scrape_galati(src, query, filters)),
        (("Fraser Yachts", "Burgess Yachts", "Northrop & Johnson",
          "Worth Ave Yachts"),
         lambda: scrape_luxury_broker(src, query, filters)),
        # Plain "Boat24" and "Inautia" have their own scrapers further down.
        (("Boat24 EU", "YachtAll", "Annonces Bateau",
          "Annonces Bateau FR", "Inautia ES", "Boats & Outboards UK",
          "Boats Outboards UK", "Apollo Duck UK",
          "Boatsales AU", "YachtMarket", "Boatpoint AU"),
         lambda: scrape_eu_broker(src, query, filters)),
        (("TheHullTruth", "Cruisers Forum"),
         lambda: scrape_forum_fs(src, query, filters)),
        (("Rightboat",),
         lambda: scrape_rightboat(src, query, filters)),
        (("Cooper Salvage", "Cooper Capital Salvage"),
         lambda: scrape_cooperss(src, query, filters)),
        (("Inautia",),
         lambda: scrape_inautia(src, query, filters)),
        (("Boat24",),
         lambda: scrape_boat24(src, query, filters)),
        (("Facebook Marketplace",),
         lambda: scrape_facebook_marketplace(src, query, filters)),
        (("HMY Yachts",),
         lambda: scrape_hmy(src, query, filters)),
        (("BoatCrazy",),
         lambda: scrape_boatcrazy(src, query, filters)),
        (("Denison Yachting",),
         lambda: scrape_denison(src, query, filters)),
    )
    for names, run in exact_routes:
        if name in names:
            return run()

    # Prefix routes: every "eBay…" entry shares one scraper.
    if name.startswith("eBay"):
        return scrape_ebay(src, query, filters)

    # Individual city entries ("Craigslist <city>") — one request each.
    if name.startswith("Craigslist "):
        return scrape_direct_source(src, query, filters)

    # Generic HTML scraper (fallback).
    return scrape_direct_source(src, query, filters)
+ """ + title = (raw.get("title") or "").strip() + snippet = (raw.get("snippet") or "") + price_text = (raw.get("price_text") or "") + location = (raw.get("location") or "") + src_name = (raw.get("source") or "").lower() + src_type = (raw.get("source_type") or "") + category = (raw.get("category") or "").lower() + + if not title or len(title) < 5: + return None + + combined = f"{title} {snippet} {price_text}" + + # ── Price ──────────────────────────────────────────────────────────────── + price_usd = None + currency_out = "USD" + for txt in [price_text, snippet, title]: + # USD + m = re.search(r'\$\s*([\d,]{3,})', txt) + if m: + try: + v = float(m.group(1).replace(",","")) + if 500 < v < 50_000_000: + price_usd = v; currency_out = "USD"; break + except: pass + # GBP + m = re.search(r'£\s*([\d,]{3,})', txt) + if m: + try: + v = float(m.group(1).replace(",","")) * 1.27 + if 500 < v < 50_000_000: + price_usd = round(v); currency_out = "GBP"; break + except: pass + # EUR + m = re.search(r'€\s*([\d,]{3,})', txt) + if m: + try: + v = float(m.group(1).replace(",","")) * 1.09 + if 500 < v < 50_000_000: + price_usd = round(v); currency_out = "EUR"; break + except: pass + # plain number + currency word + m = re.search(r'([\d,]{4,})\s*(?:USD|usd|GBP|gbp|EUR|eur)', txt) + if m: + try: + v = float(m.group(1).replace(",","")) + if 500 < v < 50_000_000: + price_usd = round(v); break + except: pass + + # ── LOA ────────────────────────────────────────────────────────────────── + loa_m = None + for pat, in_meters in [ + (r'(?:loa|length)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', False), + (r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', False), # starts with size + (r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet)\b', False), + (r"(\d{2,3}(?:\.\d)?)'", False), + (r'(?:loa|length)[:\s]+([\d.]+)\s*m\b', True), + ]: + m = re.search(pat, combined, re.IGNORECASE) + if m: + try: + v = float(m.group(1)) + if in_meters: + if 5 < v < 200: loa_m = round(v, 1); break + else: + if 10 < v < 500: loa_m = round(v * 0.3048, 
1); break + except: pass + + # ── Year ───────────────────────────────────────────────────────────────── + year = None + ym = re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title) + if ym: year = int(ym.group(1)) + + # ── Vessel type ────────────────────────────────────────────────────────── + cl = combined.lower() + if any(k in src_name for k in ["sailboat","sail"]) or "veleros" in category: + vtype = "Sailboat" + elif any(k in src_name for k in ["workboat","commercial","osv","offshore"]): + vtype = "Offshore" + elif "tug" in src_name: vtype = "Tug" + elif "barge" in src_name: vtype = "Barge" + elif any(k in cl for k in ["sailboat","sailing","velero","ketch","sloop","schooner", + "yawl","cutter","catamaran","trimaran","voilier"]): + vtype = "Sailboat" + elif any(k in cl for k in ["tugboat","tug boat","remolcador"]): vtype = "Tug" + elif "barge" in cl or "barcaza" in cl: vtype = "Barge" + elif any(k in cl for k in ["offshore","osv","supply vessel","crew boat"]): vtype = "Offshore" + elif any(k in cl for k in ["fishing","trawler","seiner","pesquero"]): vtype = "Fishing" + elif any(k in cl for k in ["yacht","motor yacht","motoryacht"]): vtype = "Yacht" + else: vtype = "Motor" + + status = ("auction" if src_type == "auction" else + "salvage" if src_type == "salvage" else "active") + + # Infer location from source name when missing (e.g. "Craigslist Houston" → "Houston") + if not location and raw.get("source"): + src_full = raw["source"] + if re.search(r'[Cc]raigslist', src_full): + city = re.sub(r'[Cc]raigslist\s*', '', src_full).strip() + if city: location = city + elif "Kijiji" in src_full: location = "Canada" + elif "Gumtree" in src_full: location = "Australia" + elif "LeBonCoin" in src_full: location = "France" + elif "Subito" in src_full: location = "Italy" + + # For trusted marketplace sources keep the result even with partial data. + # For web-search results require at least one data point to avoid garbage. 
+ is_trusted = src_type in ("broker", "classifieds", "salvage", "commercial", "auction") + if not is_trusted and not (price_usd or loa_m or year or location): + return None + + score = 50 + if loa_m: + score += min(10, int(loa_m - 10)) + if year and year > 1990: + score += min(10, (year - 1990) // 3) + if price_usd and loa_m: + pft = price_usd / max(loa_m / 0.3048, 1) + if pft < 600: score += 15 + elif pft < 1200: score += 8 + score = min(100, max(0, score)) + + return { + "_fast": True, # flag: skip unit-conversion block downstream + "skip": False, + "name": title[:100], + "vessel_type": vtype, + "loa_m": loa_m, + "beam_m": None, + "draft_m": None, + "year_built": year, + "hull": "Unknown", + "propulsion": "Sail" if vtype == "Sailboat" else "Diesel", + "status": status, + "price_usd": price_usd, + "currency": currency_out, + "location": location, + "country": None, + "description": f"{title[:140]}", + "flags": [], + "score": score, + } + + +def search_with_ai(query: str, filters: dict) -> list: + """ + Hybrid search: direct scraping of open sources + web search to reach + blocked sites (YachtWorld, Boats.com, Apollo Duck, etc.) 
+ """ + vessel_type = filters.get("type", "") + region = filters.get("region", "").lower() + + base = query + if vessel_type and vessel_type.lower() not in query.lower(): + base = f"{vessel_type} {base}" + + # Filter sources by region if specified + # Load custom sources from DB and merge with built-in + try: + conn = get_db() + custom = [dict(r) for r in conn.execute( + "SELECT * FROM custom_sources WHERE active=1").fetchall()] + conn.close() + all_sources = DIRECT_SOURCES + [{ + "name": c["name"], + "category": c["category"], + "search_url": c["search_url"], + "result_sel": "a[href]", + "price_sel": "", + "img_sel": "img", + "loc_sel": "", + "type": c["source_type"], + } for c in custom] + except: + all_sources = DIRECT_SOURCES + + sources_to_use = all_sources + if region and region not in ["global", "todo", "all", ""]: + region_map = { + "usa": ["USA", "Clasificados USA", "Subastas Gobierno USA", "Subastas USA", "Subastas Gobierno", "Comercial Offshore"], + "europa": ["Europa", "Brokers Europa", "Francia", "Italia", "Reino Unido", "España", "España / Global"], + "caribe": ["Latinoamérica", "Latinoamérica / España", "España / Global"], + "latin": ["Latinoamérica", "Latinoamérica / España", "España", "España / Global"], + "asia": ["Australia / Pacífico"], + "australia": ["Australia / Pacífico"], + } + allowed_cats = None + for key, cats in region_map.items(): + if key in region: + allowed_cats = cats + break + if allowed_cats: + sources_to_use = [s for s in all_sources if any(c in s["category"] for c in allowed_cats)] + if not sources_to_use: + sources_to_use = all_sources + + # Filter by status + status = filters.get("status", "") + if status == "auction": + sources_to_use = [s for s in sources_to_use if s["type"] in ["auction", "salvage"]] or sources_to_use + elif status == "salvage": + sources_to_use = [s for s in sources_to_use if s["type"] == "salvage"] or sources_to_use + elif status not in ("salvage",): + # Exclude salvage-only sources unless explicitly 
searching for salvage + sources_to_use = [s for s in sources_to_use if s["type"] != "salvage"] or sources_to_use + + # Vessel-type-aware source prioritization + OFFSHORE_TYPES = {"offshore", "tug", "barge", "ferry", "fishing", "commercial", "salvage"} + SAILBOAT_TYPES = {"sailboat", "sail", "velero", "ketch", "sloop", "cutter", "schooner"} + COMMERCIAL_ONLY_SOURCES = { + "Seaboats Tug", "Seaboats Barge", "Seaboats Offshore", "Seaboats Fishing", + "OSV Broker", "OSVBroker", "WorkBoat Classifieds", "VT Halter Marine", + "Maritime Connector", "ShipXchange", "Commercial Vessel", + } + SAILBOAT_ONLY_SOURCES = {"SailboatListings", "SailboatListings View", "Cruisers Forum", "Sailboat Listing"} + vessel_type_lower = vessel_type.lower() if vessel_type else "" + + if vessel_type_lower in OFFSHORE_TYPES: + # Skip sailboat-only sources, float commercial ones to front + sources_to_use = [s for s in sources_to_use if s["name"] not in SAILBOAT_ONLY_SOURCES] + commercial = [s for s in sources_to_use if s["type"] in ("commercial", "salvage", "auction")] + rest = [s for s in sources_to_use if s["type"] not in ("commercial", "salvage", "auction")] + sources_to_use = commercial + rest + elif vessel_type_lower in SAILBOAT_TYPES or "sail" in base.lower() or "velero" in base.lower(): + # Skip commercial-only offshore sources for sailboat searches + sources_to_use = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES] + elif not vessel_type_lower: + # Generic search: keep all but put commercial sources after general ones + commercial = [s for s in sources_to_use if s["name"] in COMMERCIAL_ONLY_SOURCES] + rest = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES] + sources_to_use = rest + commercial + + print(f"[Search] Querying {len(sources_to_use)} sources for: {base}") + search_state['total_sources'] = len(sources_to_use) + search_state['log'].append(f"Consultando {len(sources_to_use)} fuentes...") + + def get_query_for_source(src): + """Match 
query language to source region.""" + cat = src.get("category","").lower() + if any(x in cat for x in ["france","franc","veleros franc"]): + return base + elif any(x in cat for x in ["spain","españa","espana","mexico","colombia","latin"]): + return base + else: + return f"{base} for sale" if "for sale" not in base.lower() else base + + # Build web search queries targeting specific sites + web_queries = build_web_queries(base, filters) + + total = len(sources_to_use) + len(web_queries) + search_state['total_sources'] = total + search_state['log'].append(f"Consultando {len(sources_to_use)} sitios directos + {len(web_queries)} búsquedas web...") + print(f"[Search] {len(sources_to_use)} direct + {len(web_queries)} web searches for: {base}") + + # Run BOTH direct scraping AND web searches in parallel + all_raw = [] + + # ── SailboatListings: dedicated parallel thread (handles its own AI extraction) ── + # Only for sailboat/velero or generic searches, not for offshore/tug/barge/etc. + sbl_thread = None + if vessel_type_lower not in OFFSHORE_TYPES and vessel_type_lower not in {"motor", "motorboat"}: + sbl_thread = threading.Thread( + target=scrape_and_extract_sailboatlistings, + args=(query, filters, search_state.get('search_id', ''), 8), + daemon=True, + ) + sbl_thread.start() + search_state['log'].append("SailboatListings: iniciado en paralelo (hilo dedicado)...") + print("[Search] SailboatListings dedicated thread started") + + # ── Breadth-First Search across all sources ────────────────────────────── + # Round 1: page 1 of all sources simultaneously + # Round 2: page 2 of sources that had results + # Round 3: page 3, etc. 
+ # Between rounds, a natural pause occurs as we process results + # This avoids hammering any single source with consecutive requests + + MAX_ROUNDS = 6 # max pages per source + active_srcs = {src["name"]: {"src": src, "page": 1, "has_more": True} + for src in sources_to_use} + + # Web searches only run once (no pagination) + web_done = False + + for round_num in range(1, MAX_ROUNDS + 1): + if search_state.get("cancelled"): + break + + round_sources = {name: info for name, info in active_srcs.items() + if info["has_more"]} + if not round_sources: + break + + search_state['log'].append(f"Ronda {round_num}: consultando {len(round_sources)} fuentes...") + print(f"[Search] Round {round_num}: {len(round_sources)} active sources") + + round_raw = [] + with ThreadPoolExecutor(max_workers=12) as executor: + futures = {} + + # Submit page N of all active sources + for name, info in round_sources.items(): + src = info["src"] + q = get_query_for_source(src) + # Add page parameter to URL if supported and page > 1 + src_with_page = dict(src) + if round_num > 1: + url = src["search_url"] + # Common pagination patterns + if "craigslist.org" in url: + src_with_page["search_url"] = url + f"&s={round_num * 25 - 25}" + elif "ebay.com" in url: + src_with_page["search_url"] = url + f"&_pgn={round_num}" + elif "seaboats.net" in url: + src_with_page["search_url"] = url + f"&page={round_num}" + elif "kijiji.ca" in url: + src_with_page["search_url"] = url.rstrip('/') + f"/page-{round_num}/" + else: + # Most sites don't support pagination via URL params we know + # Mark as done after page 1 + active_srcs[name]["has_more"] = False + continue + futures[executor.submit(scrape_source_router, src_with_page, q, filters, round_num)] = name + + # Web searches on round 1 only + if round_num == 1 and not web_done: + for wq in web_queries: + futures[executor.submit(web_search, wq, 6)] = f"Web:{wq[:20]}" + web_done = True + + # Collect results for this round + for future in as_completed(futures, 
timeout=90): + name = futures[future] + try: + results = future.result() + count = len(results) + round_raw.extend(results) + search_state['sources_done'] += 1 + + if name.startswith("Web:"): + if count: + search_state['log'].append(f"🌐 Web: {count} resultados") + else: + if count: + search_state['log'].append(f"✓ {name} p{round_num}: {count}") + print(f"[Round {round_num}] {name}: {count} listings") + else: + # No results this round — remove from future rounds + if name in active_srcs: + active_srcs[name]["has_more"] = False + except Exception as e: + search_state['sources_done'] += 1 + if name in active_srcs: + active_srcs[name]["has_more"] = False + + all_raw.extend(round_raw) + print(f"[Search] Round {round_num} complete: {len(round_raw)} new results (total: {len(all_raw)})") + + # Small pause between rounds — natural break + if round_num < MAX_ROUNDS and not search_state.get("cancelled"): + polite_pause("BFS-round") + + print(f"[Search] Got {len(all_raw)} raw results, extracting vessel data...") + + if not all_raw: + return [] + + # Extract vessel data — parallel with dedup and real-time save + vessels = [] + lock = threading.Lock() + max_price = float(filters.get("max_price") or 0) + min_loa = float(filters.get("min_loa") or 0) + query_words = [w.lower() for w in query.split() if len(w) > 2] + + # Deduplicate raw results by URL + seen_urls = set() + unique_raw = [] + for r in all_raw: + if r["url"] not in seen_urls: + seen_urls.add(r["url"]) + unique_raw.append(r) + + print(f"[Extract] Processing {len(unique_raw)} unique URLs...") + + SYNONYMS = { + "sailboat":["sail","velero","vela","ketch","sloop","schooner","yawl","voilier"], + "velero": ["sail","sailboat","vela","ketch","sloop"], + "tug": ["tugboat","remolcador","tug boat","schlepper"], + "barge": ["barcaza","chaland","ponton","landing craft","lct"], + "fishing": ["pesquero","trawler","seiner","longliner","fisher"], + "offshore":["osv","supply vessel","supply boat","platform"], + "yacht": 
["yate","motoryacht","m/y"], + "motor": ["motorboat","lancha","speedboat","cruiser"], + } + NON_VESSELS = ["outboard motor","engine only","motor only","parts only", + "trailer only","propeller","honda bf","yamaha f","suzuki df", + "life jacket","anchor","marine insurance","boat storage", + # Land vehicles — never boats + "ford expedition","ford explorer","ford f-1","ford ranger", + "ford bronco","ford mustang","ford escape","ford transit", + "chevy silverado","chevy tahoe","chevy suburban","chevy colorado", + "chevrolet silverado","chevrolet tahoe","chevrolet suburban", + "gmc sierra","gmc yukon","gmc terrain","gmc canyon", + "dodge ram","ram 1500","ram 2500","ram 3500", + "jeep wrangler","jeep cherokee","jeep grand","jeep gladiator", + "toyota camry","toyota tacoma","toyota tundra","toyota 4runner", + "toyota highlander","toyota rav4","toyota sienna", + "subaru outback","subaru forester","subaru crosstrek", + "honda cr-v","honda pilot","honda accord","honda civic","honda odyssey", + "tesla model","bmw x","mercedes benz","audi q","volkswagen jetta", + "cadillac escalade","cadillac xt","buick enclave","buick encore", + # Non-vessel services + "sailing lesson","sailing partner","sailing school","sailing class", + "sailing instruction","boating lesson","boat lesson","boating class", + "sailing instructor","boat rental","kayak rental","canoe rental", + ] + + def expand_query(words): + expanded = set(words) + for w in words: + for key, syns in SYNONYMS.items(): + if w == key or w in syns: + expanded.add(key) + expanded.update(syns) + return expanded + + expanded_query = expand_query(query_words) + + GENERIC_NAMES = { + "sailboat","velero","barco","yacht","boat","vessel","embarcación", + "sailboat for sale","velero en venta","boat for sale","barco en venta", + "motor boat","motorboat","fishing boat","tug boat","tugboat", + "within25 mi","within 25 mi","results","listing","listings", + } + + def process_one(raw): + try: + if search_state.get("cancelled"): + return + + # 
Quick title pre-check + title_lower = raw["title"].lower() + if any(kw in title_lower for kw in NON_VESSELS): + return + + src_type = raw.get("source_type", "") + all_images = [] + data = None + + # ── FAST PATH: known boat marketplace → pure regex, no AI ──────── + if src_type in ("broker","classifieds","auction","salvage","commercial"): + data = extract_vessel_fast(raw) + if data: + img = raw.get("img_url","") + if img: + all_images = [img] + else: + # Derive thumbnail from URL (no page fetch needed) + listing_url = raw.get("url","") + ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url) + if ebay_m: + all_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"] + cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url) + if cl_m: + all_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"] + + # ── Fast path: validate the listing is actually a boat ────────────── + if data and data.get("_fast"): + combined_text = (raw.get("title","") + " " + raw.get("snippet","")).lower() + url_l = raw.get("url","").lower() + + # URLs that are guaranteed to be boat listings (trusted sections) + BOAT_URLS = ("/boa","/boat","/sail","sailboatlistings","yachtworld", + "boattrader","seaboats","apolloduck","rightboat","boat24", + "annonces-bateau","barcos.net","tradeaboat","marinetraffic") + is_boat_url = any(k in url_l for k in BOAT_URLS) + + # General auction sites (sell everything) need a boat keyword in the text + BOAT_WORDS = ["boat","sail","yacht","vessel","ketch","sloop","catamaran", + "trimaran","mast","hull","marina","keel","watercraft","cruiser", + "trawler","dinghy","skiff","pontoon","motorboat","powerboat", + "sailboat","barge","tugboat","outboard","inboard","nautical", + "marine","stern","bow","aft","draft","beam","knot","starboard"] + has_boat_word = any(k in combined_text for k in BOAT_WORDS) + + if not is_boat_url and not has_boat_word: + return # Cars, furniture, etc. 
from general auction sites — skip + + # ── SLOW PATH: web-search results → fetch page + AI ────────────── + if not data: + page_text, page_images = "", [] + try: + fut = ThreadPoolExecutor(max_workers=1).submit(fetch_page_with_images, raw["url"]) + page_text, page_images = fut.result(timeout=12) + except Exception: + page_text = (f"Title: {raw['title']} " + f"| Location: {raw.get('location','')} | {raw.get('snippet','')}") + + if not page_images and raw.get("img_url"): + page_images = [raw["img_url"]] + if not page_images: + listing_url = raw.get("url", "") + ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url) + if ebay_m: + page_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"] + cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url) + if cl_m: + page_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"] + all_images = page_images + + status = ("auction" if src_type == "auction" + else "salvage" if src_type == "salvage" + else "active") + + context = ("URL: " + raw["url"] + "\nTitle: " + raw["title"] + + "\nPrice: " + raw.get("price_text","") + "\n" + page_text[:1500]) + + prompt = ( + "Analyze this boat listing from " + str(raw.get('source','')) + + ". Search was: " + query + "\n" + "TEXT: " + context + "\n\n" + "If NOT a boat for sale respond {skip:true}. " + "If IS a boat respond JSON with: skip=false, name, vessel_type " + "(Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Other), " + "loa_m, beam_m, draft_m (ALWAYS in METERS — detect unit from text; " + "if feet multiply by 0.3048, e.g. 45ft=13.7m, 60ft=18.3m, 100ft=30.5m), " + "year_built, hull, propulsion, " + "status=" + status + ", price_usd, currency, location, country, " + "description (Spanish max 150 chars), flags=[], score 0-100." 
+ ) + + response = ollama_generate(prompt, model=MODELS['classify'], json_mode=True) + m = re.search(r'\{.*\}', response or '', re.DOTALL) + if not m: + return + data = json.loads(m.group()) + if data.get("skip") or not data.get("name"): + return + + # Override AI loa_m with regex (AI misses feet→m conversion) + loa_from_ctx = None + for pat in [ + r'(?:length|loa|eslora)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', + r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet|\')', + r'^(\d{2,3}(?:\.\d)?)\s*\'', + ]: + lm = re.search(pat, context, re.IGNORECASE) + if not lm: + lm = re.search(pat, raw.get("title",""), re.IGNORECASE) + if lm: + try: + ft = float(lm.group(1)) + if 10 < ft < 500: + loa_from_ctx = round(ft * 0.3048, 1) + break + except: pass + if loa_from_ctx and not data.get("loa_m"): + data["loa_m"] = loa_from_ctx + elif loa_from_ctx and data.get("loa_m") and data["loa_m"] > 25: + data["loa_m"] = round(data["loa_m"] * 0.3048, 1) + + # AI unit conversion guard (only needed for AI output) + ctx_lower = (page_text + " " + raw.get("title","")).lower() + has_feet = bool(re.search(r"\d+\s*(?:ft|feet|')\b|loa[:\s]+\d+\s*(?:ft|')", ctx_lower)) + vtype_lower = data.get("vessel_type","").lower() + MAX_M = {"sailboat":25,"yacht":35,"motor":30,"fishing":30, + "tug":60,"barge":120,"offshore":90,"ferry":100,"other":50} + max_reasonable = MAX_M.get(vtype_lower, 50) + for dim in ["loa_m","beam_m","draft_m"]: + val = data.get(dim) + if not val or not isinstance(val,(int,float)): + continue + convert = False + if dim == "loa_m" and (val > 100 or val > max_reasonable or (val > 25 and has_feet)): convert = True + elif dim == "beam_m" and (val > 30 or (val > 8 and has_feet)): convert = True + elif dim == "draft_m"and (val > 15 or (val > 5 and has_feet)): convert = True + if convert: + data[dim] = round(val * 0.3048, 1) + + # ── Shared post-processing (fast path + AI path) ────────────────── + if not data or not data.get("name"): + return + + # Query match check + combined = (data.get("name","") + " " + 
data.get("description","") + + " " + data.get("vessel_type","") + " " + + raw.get("title","") + " " + raw.get("url","")).lower() + if query_words: + if not any(qw in combined for qw in expanded_query): + # Skip query-match filter for results from direct scrapers (not web search). + # Web search results have category="Web Search" and may return off-topic pages. + # Direct scraper results already passed through a relevant search query. + is_web_search = raw.get("category","").lower() == "web search" + if is_web_search: + source_lower = raw.get("source","").lower() + if not any(kw in source_lower for kw in + ["sailboat","yacht","workboat","offshore","tug","commercial", + "boats","boattrader","apolloduck","rightboat","seaboats", + "yachtworld","govplanet","govdeals","hibid","copart","ebay", + "salvex","kijiji","craigslist","denison","galati","hmy"]): + return + + # Non-vessel + generic name check + if any(kw in data.get("name","").lower() for kw in NON_VESSELS): + return + if data.get("name","").lower().strip() in GENERIC_NAMES: + return + + # Filters (price + LOA) + if max_price and data.get("price_usd") and data["price_usd"] > max_price * 1.01: + return + if min_loa and data.get("loa_m") and data["loa_m"] < (min_loa - 0.15): + return + + data["images"] = all_images[:8] + data["source_url"] = raw["url"] + data["source_name"] = raw["source"] + + vid = save_vessel(data) + if vid > 0: + with lock: + search_state["found"] += 1 + vessels.append(data) + tag = "[Fast]" if data.get("_fast") else "[AI]" + msg = f"✓ {data.get('name','?')} — {raw['source']}" + print(f"{tag} {msg}") + search_state["log"].append(msg) + except Exception as e: + print(f"[Extract] Error: {e}") + + # Fast path: more workers + more URLs since most results skip AI now + with ThreadPoolExecutor(max_workers=16) as ex: + futs = [ex.submit(process_one, r) for r in unique_raw[:300]] + for f in as_completed(futs, timeout=180): + if search_state.get("cancelled"): + break + try: + f.result() + except Exception: 
+ pass + + print(f"[Search] Done — {len(vessels)} vessels found") + return vessels + + return vessels + + + +# ── Fingerprint ─────────────────────────────────────────────────────────────── +def fingerprint(v: dict) -> str: + raw = f"{v.get('name','').lower().strip()}|{round(v.get('loa_m') or 0)}|{v.get('year_built',0)}|{v.get('vessel_type','')}" + return hashlib.sha256(raw.encode()).hexdigest()[:16] + +def save_vessel(v: dict) -> int: + # Reject pure shells — need at least name + 1 real data field + if not v.get("name") or v["name"].strip() in ("", "Unknown"): + return -1 + data_points = sum(1 for f in ['price_usd', 'loa_m', 'year_built', 'location'] if v.get(f)) + if data_points < 1: + return -1 + + fp = fingerprint(v) + conn = get_db() + c = conn.cursor() + existing = c.execute("SELECT id FROM vessels WHERE fingerprint=?", (fp,)).fetchone() + if existing: + conn.close() + return existing['id'] + try: + c.execute("""INSERT INTO vessels + (name,vessel_type,loa_m,beam_m,draft_m,year_built,hull,propulsion, + status,price_usd,currency,location,country,source_name,source_url, + description,images,flags,score,fingerprint,raw_data) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + (v.get('name'), v.get('vessel_type'), v.get('loa_m'), + v.get('beam_m'), v.get('draft_m'), v.get('year_built'), + v.get('hull'), v.get('propulsion'), v.get('status','active'), + v.get('price_usd'), v.get('currency','USD'), + v.get('location'), v.get('country'), + v.get('source_name'), v.get('source_url'), + v.get('description'), json.dumps(v.get('images',[])), + json.dumps(v.get('flags',[])), v.get('score',50), + fp, json.dumps(v))) + vid = c.lastrowid + conn.commit() + except Exception as e: + print(f"[DB] Error: {e}") + vid = -1 + finally: + conn.close() + return vid + +# ── API Routes ──────────────────────────────────────────────────────────────── + +def hash_pw(pw): + return _hashlib.sha256(pw.encode()).hexdigest() + +def seed_admin(): + conn = get_db() + existing = 
conn.execute("SELECT id FROM users WHERE username='admin'").fetchone() + if not existing: + conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)", + ('admin', hash_pw('admin123'), 'admin')) + conn.commit() + print("[Auth] Default user created: admin / admin123") + conn.close() + +@app.route('/api/login', methods=['POST']) +def login(): + body = request.json or {} + username = body.get('username','').strip() + password = body.get('password','') + conn = get_db() + user = conn.execute("SELECT * FROM users WHERE username=? AND password=?", + (username, hash_pw(password))).fetchone() + conn.close() + if user: + session['user_id'] = user['id'] + session['username'] = user['username'] + session['role'] = user['role'] + return jsonify({'ok': True, 'username': user['username'], 'role': user['role']}) + return jsonify({'ok': False, 'error': 'Usuario o contraseña incorrectos'}), 401 + +@app.route('/api/logout', methods=['POST']) +def logout(): + session.clear() + return jsonify({'ok': True}) + +@app.route('/api/me') +def me(): + if 'user_id' not in session: + return jsonify({'logged_in': False}), 401 + return jsonify({'logged_in': True, 'username': session.get('username'), 'role': session.get('role')}) + +@app.route('/api/users', methods=['GET']) +def list_users(): + if session.get('role') != 'admin': + return jsonify({'error': 'forbidden'}), 403 + conn = get_db() + rows = [dict(r) for r in conn.execute("SELECT id,username,role,created_at FROM users").fetchall()] + conn.close() + return jsonify({'users': rows}) + +@app.route('/api/users', methods=['POST']) +def create_user(): + if session.get('role') != 'admin': + return jsonify({'error': 'forbidden'}), 403 + body = request.json or {} + username = body.get('username','').strip() + password = body.get('password','') + role = body.get('role','user') + if not username or not password: + return jsonify({'error': 'username and password required'}), 400 + conn = get_db() + try: + conn.execute("INSERT INTO users 
(username,password,role) VALUES (?,?,?)",
+                     (username, hash_pw(password), role))
+        conn.commit()
+        conn.close()
+        return jsonify({'ok': True})
+    except:
+        conn.close()
+        return jsonify({'error': 'username already exists'}), 400
+
+@app.route('/api/change_password', methods=['POST'])
+def change_password():
+    """Change the logged-in user's password after verifying the current one.
+
+    Expects a JSON body with ``old_password`` and ``new_password``.
+    Returns 401 without a session, 400 when the new password is missing or
+    the current password does not match, and ``{'ok': True}`` on success.
+    """
+    if 'user_id' not in session:
+        return jsonify({'error': 'not logged in'}), 401
+    body = request.json or {}
+    old_pw = body.get('old_password','')
+    new_pw = body.get('new_password','')
+    # hash_pw() calls .encode(), so non-string values would raise; an empty
+    # new password would leave the account with a blank credential.
+    if not isinstance(old_pw, str) or not isinstance(new_pw, str) or not new_pw:
+        return jsonify({'error': 'new_password required'}), 400
+    conn = get_db()
+    try:
+        user = conn.execute("SELECT * FROM users WHERE id=? AND password=?",
+                            (session['user_id'], hash_pw(old_pw))).fetchone()
+        if not user:
+            return jsonify({'error': 'Contraseña actual incorrecta'}), 400
+        conn.execute("UPDATE users SET password=? WHERE id=?", (hash_pw(new_pw), session['user_id']))
+        conn.commit()
+    finally:
+        conn.close()
+    return jsonify({'ok': True})
+
+@app.route('/')
+def index():
+    """Serve the single-page front end from the ``static`` directory."""
+    return send_from_directory('static', 'index.html')
+
+@app.route('/api/status')
+def status():
+    """Report server health: Ollama models, row counts and source categories."""
+    models = ollama_models()
+    conn = get_db()
+    try:
+        counts = {
+            'vessels': conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0],
+            'saved': conn.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0],
+            'alerts': conn.execute("SELECT COUNT(*) FROM alerts WHERE active=1").fetchone()[0],
+        }
+    finally:
+        conn.close()
+    return jsonify({
+        'ok': True,
+        'ollama_models': models,
+        'active_model': MODELS['extract'],
+        'db_counts': counts,
+        'sources_count': len(DIRECT_SOURCES),
+        'categories': list(set(s['category'] for s in DIRECT_SOURCES)),
+    })
+
+@app.route('/api/vessels')
+def list_vessels():
+    """List stored vessels, filtered and sorted by query-string parameters.
+
+    Query parameters (all optional):
+      type, status, hull      exact match on the corresponding column
+      max_price               upper bound on ``price_usd``
+      min_loa                 lower bound on ``loa_m`` (0.15 m tolerance)
+      year_min, year_max      bounds on ``year_built``
+      sort                    one of the keys of ``sorts`` (default ``score``)
+      limit                   row cap, clamped to 0..500 (default 200)
+
+    Malformed numeric values are ignored instead of failing the request.
+    Returns ``{'vessels': [...], 'count': n}`` with ``flags`` and ``images``
+    decoded from their stored JSON text.
+    """
+    def _num(name, cast):
+        # Optional numeric parameter: missing, empty or unparsable -> None,
+        # so the caller adds neither a placeholder nor a bound value.
+        raw = request.args.get(name)
+        if not raw:
+            return None
+        try:
+            return cast(raw)
+        except (TypeError, ValueError):
+            return None
+
+    q = "SELECT * FROM vessels WHERE 1=1"
+    params = []
+    if t := request.args.get('type'):
+        q += " AND vessel_type=?"; params.append(t)
+    if s := request.args.get('status'):
+        q += " AND status=?"; params.append(s)
+    if h := request.args.get('hull'):
+        q += " AND hull=?"; params.append(h)
+    if (mp := _num('max_price', float)) is not None:
+        q += " AND price_usd <= ?"; params.append(mp)
+    if (ml := _num('min_loa', float)) is not None:
+        q += " AND loa_m IS NOT NULL AND loa_m >= ?"; params.append(round(ml - 0.15, 2))
+    # Parse first, then extend the SQL: the placeholder and its value must
+    # always be added together or the statement has an unbound parameter.
+    if (yr_min := _num('year_min', int)) is not None:
+        q += " AND year_built >= ?"; params.append(yr_min)
+    if (yr_max := _num('year_max', int)) is not None:
+        q += " AND year_built <= ?"; params.append(yr_max)
+    sort = request.args.get('sort', 'score')
+    sorts = {
+        'score':'score DESC', 'price_asc':'price_usd ASC',
+        'price_desc':'price_usd DESC', 'loa':'loa_m DESC',
+        'year':'year_built DESC', 'newest':'created_at DESC'
+    }
+    limit = _num('limit', int)
+    if limit is None:
+        limit = 200
+    # SQLite treats a negative LIMIT as "no limit", so clamp at both ends.
+    limit = max(0, min(limit, 500))
+    # ORDER BY comes from the whitelist above; LIMIT is bound as a parameter.
+    q += f" ORDER BY {sorts.get(sort,'score DESC')}"
+    q += " LIMIT ?"; params.append(limit)
+    conn = get_db()
+    try:
+        rows = [dict(r) for r in conn.execute(q, params).fetchall()]
+    finally:
+        conn.close()
+    for r in rows:
+        r['flags'] = json.loads(r.get('flags') or '[]')
+        r['images'] = json.loads(r.get('images') or '[]')
+    return jsonify({'vessels': rows, 'count': len(rows)})
+
+_PROXY_ALLOWED = [
+    'sailboatlistings.com', 'yachtworld.com', 'boattrader.com',
+    'apolloduck.com', 'rightboat.com', 'boat24.com', 'seaboats.net',
+    'boats.com', 'iboats.com', 'yachtworld.co.uk',
+]
+
+@app.route('/api/img_proxy')
+def img_proxy():
+    """Fetch an image from an allow-listed listing site and relay it.
+
+    The ``url`` query parameter must be an http(s) URL whose host is one of
+    ``_PROXY_ALLOWED`` or a subdomain of one. Returns 404 when ``url`` is
+    missing, 403 when it is malformed or not allowed, the upstream status
+    when it is not 200, and 502 when the fetch itself fails.
+    """
+    url = request.args.get('url', '')
+    if not url:
+        return '', 404
+    from urllib.parse import urlparse
+    try:
+        parsed = urlparse(url)
+        host = (parsed.hostname or '').lower()
+    except ValueError:
+        # urlparse raises on malformed netlocs such as an unclosed "[".
+        return '', 403
+    if parsed.scheme not in ('http', 'https'):
+        return '', 403
+    # Match the exact domain or a dot-separated subdomain. A substring test
+    # would also accept "boats.com.attacker.net" or "notboats.com".
+    if not any(host == d or host.endswith('.' + d) for d in _PROXY_ALLOWED):
+        return '', 403
+    try:
+        # NOTE(review): requests follows redirects by default, so an allowed
+        # host can still redirect elsewhere — confirm whether that is wanted.
+        resp = requests.get(url, timeout=10, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Referer': f'https://{host}/',
+            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
+        })
+        if resp.status_code == 200:
+            ct = resp.headers.get('content-type', 'image/jpeg')
+            return Response(resp.content, content_type=ct,
+                            headers={'Cache-Control': 'public, max-age=86400'})
+        return '', resp.status_code
+    except Exception as e:
+        app.logger.debug(f"img_proxy error: {e}")
+        return '', 502
+
+
+# Global
search state +search_state = { + 'running': False, + 'cancelled': False, + 'query': '', + 'found': 0, + 'total_sources': 0, + 'sources_done': 0, + 'log': [], +} + +@app.route('/api/search', methods=['POST']) +def search(): + body = request.json or {} + query = body.get('query', '') + filters = body.get('filters', {}) + if not query: + return jsonify({'error': 'query requerido'}), 400 + + # Clear previous results immediately + conn = get_db() + conn.execute("DELETE FROM vessels") + conn.execute("DELETE FROM saved_vessels") + conn.execute("INSERT INTO search_history (query,filters) VALUES (?,?)", + (query, json.dumps(filters))) + conn.commit() + conn.close() + + # Reset state + search_state['running'] = True + search_state['cancelled'] = False + search_state['query'] = query + search_state['found'] = 0 + search_state['sources_done'] = 0 + search_state['total_sources'] = len(DIRECT_SOURCES) + search_state['log'] = [f"Iniciando búsqueda: {query}"] + + # Tag this search with a unique ID so old threads don't pollute new searches + import uuid + search_id = str(uuid.uuid4()) + search_state['search_id'] = search_id + + # Run search in background thread + def run_bg(sid): + try: + search_with_ai(query, filters) + except Exception as e: + search_state['log'].append(f"Error: {e}") + print(f"[BG] Error: {e}") + finally: + if search_state.get('search_id') == sid: + search_state['running'] = False + total = search_state['found'] + msg = f"✓ Búsqueda completa — {total} embarcaciones encontradas" + search_state['log'].append(msg) + print(f"[BG] {msg}") + + t = threading.Thread(target=run_bg, args=(search_id,), daemon=True) + t.start() + + return jsonify({'ok': True, 'message': 'Búsqueda iniciada en background'}) + +@app.route('/api/search/status') +def search_status(): + return jsonify(search_state) + +@app.route('/api/search/cancel', methods=['POST']) +def cancel_search(): + import uuid + search_state['cancelled'] = True + search_state['running'] = False + 
search_state['search_id'] = str(uuid.uuid4()) # invalidate any running thread + search_state['log'].append('⏹ Búsqueda cancelada por el usuario') + return jsonify({'ok': True}) + +@app.route('/api/fb-status') +def fb_status(): + SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json") + return jsonify({"active": os.path.exists(SESSION_FILE)}) + + +@app.route('/api/fb-setup', methods=['POST']) +def fb_setup(): + """ + Launch a visible Chromium window so the user can log in to Facebook. + After login is detected (marketplace URL is accessible), saves cookies to fb_session.json. + """ + SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json") + import json as _json + try: + from playwright.sync_api import sync_playwright + result = {"ok": False, "msg": ""} + with sync_playwright() as p: + browser = p.chromium.launch( + headless=False, + args=["--disable-blink-features=AutomationControlled"]) + context = browser.new_context( + viewport={"width": 1100, "height": 800}, + user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/122.0.0.0 Safari/537.36")) + page = context.new_page() + page.goto("https://www.facebook.com/login", timeout=30000, + wait_until="domcontentloaded") + # Wait up to 3 minutes for user to log in and reach marketplace + try: + page.wait_for_url( + re.compile(r'facebook\.com/(marketplace|home|feed)'), + timeout=180000) + # Give extra time to fully load + page.wait_for_timeout(3000) + cookies = context.cookies() + with open(SESSION_FILE, "w") as f: + _json.dump(cookies, f) + result = {"ok": True, + "msg": f"Sesión guardada ({len(cookies)} cookies). 
" + "Facebook Marketplace activado."} + except Exception as e: + result = {"ok": False, "msg": f"Tiempo agotado o error: {e}"} + finally: + try: page.close() + except: pass + browser.close() + return jsonify(result) + except Exception as e: + return jsonify({"ok": False, "msg": str(e)}), 500 + + +@app.route('/api/vessels/', methods=['GET']) +def get_vessel(vid): + conn = get_db() + row = conn.execute("SELECT * FROM vessels WHERE id=?", (vid,)).fetchone() + conn.close() + if not row: + return jsonify({'error': 'not found'}), 404 + v = dict(row) + v['flags'] = json.loads(v.get('flags') or '[]') + v['images'] = json.loads(v.get('images') or '[]') + return jsonify(v) + +@app.route('/api/vessels', methods=['POST']) +def add_vessel(): + v = request.json or {} + v['source_name'] = v.get('source_name', 'Manual') + vid = save_vessel(v) + return jsonify({'id': vid, 'ok': True}) + +@app.route('/api/vessels/', methods=['PUT']) +def update_vessel(vid): + body = request.json or {} + conn = get_db() + fields = ['name','vessel_type','loa_m','beam_m','draft_m','year_built', + 'hull','propulsion','status','price_usd','location','description','score'] + updates = {k: body[k] for k in fields if k in body} + if updates: + set_clause = ', '.join(f"{k}=?" 
for k in updates) + conn.execute(f"UPDATE vessels SET {set_clause}, updated_at=datetime('now') WHERE id=?", + [*updates.values(), vid]) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/vessels/', methods=['DELETE']) +def delete_vessel(vid): + conn = get_db() + conn.execute("DELETE FROM vessels WHERE id=?", (vid,)) + conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/saved', methods=['GET']) +def list_saved(): + conn = get_db() + rows = conn.execute(""" + SELECT v.*, s.notes, s.saved_at + FROM vessels v JOIN saved_vessels s ON v.id=s.vessel_id + ORDER BY s.saved_at DESC + """).fetchall() + result = [] + for r in rows: + v = dict(r) + v['flags'] = json.loads(v.get('flags') or '[]') + v['images'] = json.loads(v.get('images') or '[]') + result.append(v) + conn.close() + return jsonify({'vessels': result, 'count': len(result)}) + +@app.route('/api/saved/', methods=['POST']) +def save_vessel_fav(vid): + notes = (request.json or {}).get('notes', '') + conn = get_db() + existing = conn.execute("SELECT id FROM saved_vessels WHERE vessel_id=?", (vid,)).fetchone() + if not existing: + conn.execute("INSERT INTO saved_vessels (vessel_id, notes) VALUES (?,?)", (vid, notes)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/saved/', methods=['DELETE']) +def unsave_vessel(vid): + conn = get_db() + conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/alerts', methods=['GET']) +def list_alerts(): + conn = get_db() + rows = [dict(r) for r in conn.execute("SELECT * FROM alerts WHERE active=1").fetchall()] + conn.close() + return jsonify({'alerts': rows}) + +@app.route('/api/alerts', methods=['POST']) +def create_alert(): + body = request.json or {} + conn = get_db() + conn.execute("INSERT INTO alerts (name, filters) VALUES 
(?,?)", + (body.get('name','Alerta'), json.dumps(body.get('filters',{})))) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/alerts/', methods=['DELETE']) +def delete_alert(aid): + conn = get_db() + conn.execute("UPDATE alerts SET active=0 WHERE id=?", (aid,)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/sources') +def list_sources(): + by_cat = {} + for s in DIRECT_SOURCES: + cat = s['category'] + if cat not in by_cat: + by_cat[cat] = [] + by_cat[cat].append({'name': s['name'], 'url': s['search_url'].split('?')[0], 'type': s['type'], 'builtin': True}) + # Add custom sources + try: + conn = get_db() + custom = [dict(r) for r in conn.execute("SELECT * FROM custom_sources ORDER BY category").fetchall()] + conn.close() + for c in custom: + cat = c['category'] or 'Custom' + if cat not in by_cat: + by_cat[cat] = [] + by_cat[cat].append({ + 'name': c['name'], 'url': c['search_url'].split('?')[0], + 'type': c['source_type'], 'builtin': False, + 'id': c['id'], 'active': bool(c['active']) + }) + except: + pass + return jsonify({'sources': by_cat, 'total': sum(len(v) for v in by_cat.values())}) + +@app.route('/api/history') +def search_history(): + conn = get_db() + rows = [dict(r) for r in conn.execute( + "SELECT * FROM search_history ORDER BY searched_at DESC LIMIT 50").fetchall()] + conn.close() + return jsonify({'history': rows}) + +@app.route('/api/analyze', methods=['POST']) +def analyze_text(): + body = request.json or {} + text = body.get('text', '') + source = body.get('source', 'Manual') + if not text: + return jsonify({'error': 'text requerido'}), 400 + result = extract_vessel_from_text(text, source) + if result: + vid = save_vessel({**result, 'source_name': source}) + result['id'] = vid + return jsonify(result) + +@app.route('/api/collections', methods=['GET']) +def list_collections(): + conn = get_db() + cols = [dict(r) for r in conn.execute( + "SELECT c.*, COUNT(cv.vessel_id) as vessel_count 
FROM collections c " + "LEFT JOIN collection_vessels cv ON c.id=cv.collection_id " + "GROUP BY c.id ORDER BY c.created_at DESC").fetchall()] + conn.close() + return jsonify({'collections': cols}) + +@app.route('/api/collections', methods=['POST']) +def create_collection(): + body = request.json or {} + name = body.get('name','').strip() + if not name: + return jsonify({'error': 'name required'}), 400 + conn = get_db() + conn.execute("INSERT INTO collections (name,description,color,icon) VALUES (?,?,?,?)", + (name, body.get('description',''), body.get('color','#00b4ff'), body.get('icon','📁'))) + conn.commit() + cid = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.close() + return jsonify({'ok': True, 'id': cid}) + +@app.route('/api/collections/', methods=['DELETE']) +def delete_collection(cid): + conn = get_db() + conn.execute("DELETE FROM collection_vessels WHERE collection_id=?", (cid,)) + conn.execute("DELETE FROM collections WHERE id=?", (cid,)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/collections//vessels', methods=['GET']) +def collection_vessels(cid): + conn = get_db() + rows = conn.execute(""" + SELECT v.*, cv.notes, cv.added_at FROM vessels v + JOIN collection_vessels cv ON v.id=cv.vessel_id + WHERE cv.collection_id=? 
ORDER BY cv.added_at DESC""", (cid,)).fetchall()
+    result = []
+    for r in rows:
+        v = dict(r)
+        v['flags'] = json.loads(v.get('flags') or '[]')
+        v['images'] = json.loads(v.get('images') or '[]')
+        result.append(v)
+    conn.close()
+    return jsonify({'vessels': result, 'count': len(result)})
+
+@app.route('/api/collections/<int:cid>/vessels', methods=['POST'])
+def add_to_collection(cid):
+    """Add vessels to collection ``cid``.
+
+    Expects JSON ``{'vessel_ids': [...], 'notes': '...'}``. Returns 400 when
+    ``vessel_ids`` is not a list. ``added`` in the response counts only rows
+    that were actually inserted; ids already in the collection are skipped
+    by ``INSERT OR IGNORE`` and are not counted.
+    """
+    body = request.json or {}
+    vessel_ids = body.get('vessel_ids', [])
+    notes = body.get('notes', '')
+    # A bare string would otherwise be iterated character by character.
+    if not isinstance(vessel_ids, list):
+        return jsonify({'error': 'vessel_ids must be a list'}), 400
+    conn = get_db()
+    added = 0
+    try:
+        for vid in vessel_ids:
+            try:
+                cur = conn.execute("INSERT OR IGNORE INTO collection_vessels (collection_id,vessel_id,notes) VALUES (?,?,?)",
+                                   (cid, vid, notes))
+            except Exception as e:
+                # Best effort: one bad id must not abort the whole batch.
+                print(f"[DB] Error: {e}")
+                continue
+            # rowcount is 0 when the row already existed and was ignored.
+            added += max(cur.rowcount, 0)
+        conn.commit()
+    finally:
+        conn.close()
+    return jsonify({'ok': True, 'added': added})
+
+@app.route('/api/collections/<int:cid>/vessels/<int:vid>', methods=['DELETE'])
+def remove_from_collection(cid, vid):
+    """Remove vessel ``vid`` from collection ``cid``; always reports ok."""
+    conn = get_db()
+    try:
+        conn.execute("DELETE FROM collection_vessels WHERE collection_id=? AND vessel_id=?", (cid, vid))
+        conn.commit()
+    finally:
+        conn.close()
+    return jsonify({'ok': True})
+
+@app.route('/api/custom_sources', methods=['GET'])
+def get_custom_sources():
+    """Return every row of ``custom_sources``, newest first."""
+    conn = get_db()
+    try:
+        rows = [dict(r) for r in conn.execute(
+            "SELECT * FROM custom_sources ORDER BY created_at DESC").fetchall()]
+    finally:
+        conn.close()
+    return jsonify({'sources': rows})
+
+@app.route('/api/custom_sources', methods=['POST'])
+def add_custom_source():
+    body = request.json or {}
+    name = body.get('name','').strip()
+    url = body.get('search_url','').strip()
+    if not name or not url:
+        return jsonify({'error': 'name and search_url required'}), 400
+    # Ensure URL has {query} placeholder; join with '&' when the URL already
+    # carries a query string so it does not end up with two '?'.
+    if '{query}' not in url:
+        url = url.rstrip('/') + ('&' if '?' in url else '?') + 'q={query}'
+    conn = get_db()
+    conn.execute("""INSERT INTO custom_sources (name,category,search_url,source_type,added_by)
+        VALUES (?,?,?,?,?)""",
+        (name, body.get('category','Custom'),
+         url, body.get('source_type','broker'),
+         session.get('username','admin')))
+    conn.commit()
+    sid =
conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.close() + return jsonify({'ok': True, 'id': sid}) + +@app.route('/api/custom_sources/', methods=['PUT']) +def update_custom_source(sid): + body = request.json or {} + conn = get_db() + fields = ['name','category','search_url','source_type','active'] + updates = {k: body[k] for k in fields if k in body} + if updates: + set_clause = ', '.join(f"{k}=?" for k in updates) + conn.execute(f"UPDATE custom_sources SET {set_clause} WHERE id=?", + [*updates.values(), sid]) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/custom_sources/', methods=['DELETE']) +def delete_custom_source(sid): + conn = get_db() + conn.execute("DELETE FROM custom_sources WHERE id=?", (sid,)) + conn.commit() + conn.close() + return jsonify({'ok': True}) + +@app.route('/api/stats') +def stats(): + conn = get_db() + c = conn.cursor() + data = { + 'total': c.execute("SELECT COUNT(*) FROM vessels").fetchone()[0], + 'saved': c.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0], + 'by_type': dict(c.execute("SELECT vessel_type, COUNT(*) FROM vessels GROUP BY vessel_type").fetchall()), + 'by_status': dict(c.execute("SELECT status, COUNT(*) FROM vessels GROUP BY status").fetchall()), + 'by_country':dict((k or 'Unknown', v) for k,v in c.execute("SELECT country, COUNT(*) FROM vessels WHERE country IS NOT NULL GROUP BY country ORDER BY COUNT(*) DESC LIMIT 10").fetchall()), + 'avg_score': c.execute("SELECT AVG(score) FROM vessels").fetchone()[0] or 0, + 'avg_price': c.execute("SELECT AVG(price_usd) FROM vessels WHERE price_usd > 0").fetchone()[0] or 0, + 'top_opportunities': [dict(r) for r in c.execute( + "SELECT id,name,vessel_type,price_usd,score,location FROM vessels ORDER BY score DESC LIMIT 5").fetchall()], + } + conn.close() + return jsonify(data) + +# ── Seed sample data ────────────────────────────────────────────────────────── +def seed_sample_data(): + samples = [ + {"name":"M/Y Stella 
Maris","vessel_type":"Yacht","loa_m":28.4,"beam_m":6.8,"draft_m":1.9,"year_built":2008,"hull":"Fiberglass","propulsion":"Diesel","status":"active","price_usd":189000,"location":"Fort Lauderdale, FL","country":"US","source_name":"YachtWorld","source_url":"https://yachtworld.com","description":"Yate motor bien mantenido, twin Volvo IPS, refit 2022.","flags":["below_market","motivated_seller"],"score":87}, + {"name":"F/V Cape Hatteras","vessel_type":"Fishing","loa_m":19.2,"beam_m":5.1,"draft_m":1.4,"year_built":1997,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":22000,"location":"Gloucester, MA","country":"US","source_name":"GovDeals","source_url":"https://govdeals.com","description":"Ex buque NOAA, motor operativo, casco requiere trabajo.","flags":["rare","salvage_value","below_market"],"score":94}, + {"name":"TUG Bravo Eagle","vessel_type":"Tug","loa_m":32.0,"beam_m":9.4,"draft_m":3.8,"year_built":1989,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":310000,"location":"New Orleans, LA","country":"US","source_name":"AuctionTime","source_url":"https://auctiontime.com","description":"Remolcador 2400HP, clase ABS, listo para operación comercial.","flags":["rare","auction","motivated_seller"],"score":91}, + {"name":"OSV Pacific Ranger","vessel_type":"Offshore","loa_m":52.0,"beam_m":13.2,"draft_m":4.1,"year_built":2005,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":890000,"location":"Port Fourchon, LA","country":"US","source_name":"GovPlanet","source_url":"https://govplanet.com","description":"Buque apoyo offshore DP1, 400T carga, documentación completa.","flags":["rare","auction","government_surplus"],"score":79}, + {"name":"Barge RJ-440","vessel_type":"Barge","loa_m":44.0,"beam_m":12.0,"draft_m":1.8,"year_built":1978,"hull":"Steel","propulsion":"None","status":"active","price_usd":55000,"location":"Houston, TX","country":"US","source_name":"WorkBoat 
Classifieds","source_url":"https://workboat.com","description":"Barcaza cubierta, capacidad 800T, buen estado estructural.","flags":["below_market","rare"],"score":73},
+        {"name":"LCT Endeavour","vessel_type":"Barge","loa_m":61.0,"beam_m":14.6,"draft_m":1.5,"year_built":1968,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":38000,"location":"Manila, Filipinas","country":"PH","source_name":"Salvex","source_url":"https://salvex.com","description":"Landing craft, estructura sólida, motores requieren overhaul.","flags":["salvage_value","rare","below_market"],"score":82},
+    ]
+    for s in samples:
+        save_vessel(s)
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+# Startup sequence: resolve a previous instance via the PID file, record our
+# own PID, install shutdown handlers, pick a port, initialise the database
+# and start the Flask server.
+if __name__ == '__main__':
+    import socket, signal, atexit, sys
+
+    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+    PID_FILE = os.path.join(BASE_DIR, ".server.pid")
+    IS_WINDOWS = os.name == 'nt'
+
+    # Win32 constants used by the process helpers below.
+    PROCESS_TERMINATE = 0x0001
+    PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
+    STILL_ACTIVE = 259
+    ERROR_ACCESS_DENIED = 5
+
+    def _win_kernel32():
+        """Load kernel32 with explicit prototypes so HANDLEs are not truncated."""
+        import ctypes
+        from ctypes import wintypes
+        k32 = ctypes.WinDLL('kernel32', use_last_error=True)
+        k32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
+        k32.OpenProcess.restype = wintypes.HANDLE
+        k32.TerminateProcess.argtypes = (wintypes.HANDLE, wintypes.UINT)
+        k32.TerminateProcess.restype = wintypes.BOOL
+        k32.GetExitCodeProcess.argtypes = (wintypes.HANDLE, ctypes.POINTER(wintypes.DWORD))
+        k32.GetExitCodeProcess.restype = wintypes.BOOL
+        k32.CloseHandle.argtypes = (wintypes.HANDLE,)
+        k32.CloseHandle.restype = wintypes.BOOL
+        return k32
+
+    # ── Handle existing instance ───────────────────────────────────────────────
+    def kill_pid(pid):
+        """Forcefully terminate ``pid``; True only if the OS accepted the request."""
+        if IS_WINDOWS:
+            try:
+                k32 = _win_kernel32()
+                handle = k32.OpenProcess(PROCESS_TERMINATE, False, pid)
+                if not handle:
+                    # No such process or no permission: nothing was killed.
+                    return False
+                try:
+                    return bool(k32.TerminateProcess(handle, 1))
+                finally:
+                    k32.CloseHandle(handle)
+            except Exception:
+                pass  # ctypes unavailable — fall back to os.kill below
+        try:
+            os.kill(pid, 9)
+            return True
+        except OSError:
+            return False
+
+    def pid_running(pid):
+        """Return True if a process with this PID currently exists.
+
+        The check never signals the target. On Windows ``os.kill(pid, 0)``
+        is not an existence probe (0 is CTRL_C_EVENT there), so the process
+        is queried through the Win32 API instead.
+        """
+        if pid <= 0:
+            return False
+        if IS_WINDOWS:
+            try:
+                import ctypes
+                from ctypes import wintypes
+                k32 = _win_kernel32()
+                handle = k32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
+                if not handle:
+                    # Access denied still means the process exists.
+                    return ctypes.get_last_error() == ERROR_ACCESS_DENIED
+                try:
+                    code = wintypes.DWORD()
+                    if not k32.GetExitCodeProcess(handle, ctypes.byref(code)):
+                        return False
+                    return code.value == STILL_ACTIVE
+                finally:
+                    k32.CloseHandle(handle)
+            except Exception:
+                return False
+        try:
+            os.kill(pid, 0)
+        except PermissionError:
+            return True  # exists, but owned by another user
+        except OSError:
+            return False
+        return True
+
+    if os.path.exists(PID_FILE):
+        try:
+            with open(PID_FILE) as pf:
+                old_pid = int(pf.read().strip())
+            if pid_running(old_pid):
+                print(f"\n ⚠️ Ya hay una instancia corriendo (PID {old_pid})")
+                resp = input(" ¿Cerrar la instancia anterior y continuar? [S/n]: ").strip().lower()
+                if resp in ("", "s", "si", "sí", "y", "yes"):
+                    if kill_pid(old_pid):
+                        print(f" ✓ Instancia anterior (PID {old_pid}) cerrada.")
+                        import time; time.sleep(1)
+                    else:
+                        print(f" ✗ No se pudo cerrar. Ciérrala manualmente y vuelve a intentar.")
+                        sys.exit(1)
+                else:
+                    print(" Saliendo sin cambios.")
+                    sys.exit(0)
+        except (ValueError, IOError):
+            pass # PID file corrupted — ignore
+
+    # ── Write PID file ─────────────────────────────────────────────────────────
+    with open(PID_FILE, "w") as f:
+        f.write(str(os.getpid()))
+
+    def cleanup_pid():
+        """Remove the PID file; a missing file is not an error."""
+        try: os.remove(PID_FILE)
+        except OSError: pass
+
+    atexit.register(cleanup_pid)
+
+    def handle_signal(sig, frame):
+        """SIGINT/SIGTERM handler: say goodbye, drop the PID file, exit 0."""
+        print("\n\n 👋 Cerrando Boat&Ship-Finder...")
+        cleanup_pid()
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, handle_signal)
+    signal.signal(signal.SIGTERM, handle_signal)
+
+    # ── Port selection ─────────────────────────────────────────────────────────
+    def port_free(p):
+        """Return True if TCP port ``p`` can be bound on all interfaces."""
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            if hasattr(socket, 'SO_EXCLUSIVEADDRUSE'):
+                # Windows: SO_REUSEADDR would let the bind succeed on a port
+                # that is already in use, so request exclusive use instead.
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)
+            else:
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            try:
+                s.bind(("0.0.0.0", p))
+                return True
+            except (OSError, OverflowError):
+                return False
+
+    try:
+        desired = int(os.environ.get('MARINE_PORT', 8765))
+    except ValueError:
+        desired = 8765  # unparsable MARINE_PORT — use the documented default
+    port = desired
+    if not port_free(desired):
+        for p in range(desired + 1, desired + 20):
+            if port_free(p):
+                port = p
+                break
+        else:
+            # Nothing free in the scanned range: stop here instead of
+            # announcing a port that app.run() cannot bind.
+            print(f"\n ✗ Puerto {desired} ocupado y no hay puertos libres hasta {desired + 19}.")
+            sys.exit(1)
+        print(f"\n ⚠️ Puerto {desired} ocupado — usando {port}")
+
+    # ── DB init ────────────────────────────────────────────────────────────────
+    print("\n" + "="*55)
+    print(" Boat&Ship-Finder — Iniciando...")
+    print("="*55)
+    init_db()
+    seed_admin()
+    conn = get_db()
+    count = conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0]
+    conn.close()
+    if count == 0:
+        print("[DB] Base de datos vacía — lista para búsquedas reales")
+    else:
+        print(f"[DB] {count} embarcaciones en caché de sesión anterior")
+
+    print(f"\n Local: http://localhost:{port}")
+    print(f" Tailscale: http://:{port}")
+    print(f" Fuentes directas: {len(DIRECT_SOURCES)}")
+    print(f" Modelos Ollama: {list(MODELS.values())}")
+    print(f" PID: {os.getpid()} (guardado en .server.pid)")
+    print("\n [Ctrl+C para detener]\n")
+    app.run(host='0.0.0.0', port=port, debug=False)
diff --git a/static/index.html
b/static/index.html new file mode 100644 index 0000000..69d3448 --- /dev/null +++ b/static/index.html @@ -0,0 +1,1344 @@ + + + + + +Boat&Ship-Finder — Global Vessel Intelligence + + + + + +
+ +
+ + +
+
+ + + + + + +
+
+ + +
+ + + + + + + +
+ + + + + +
+ + + + + + + + +
+ + +
+
Total
+
Score prom.
+
Precio prom.
+
Subastas
+
Salvage
+
Guardadas
+
+ + +
+ + +
+
+
+ Conectando… +
+
+ + +
+
+ +
+ + +
+
+
⚡ Búsqueda Global Real-Time
+

+ Busca en todo internet simultáneamente — subastas, clasificados, astilleros, periódicos, revistas náuticas, registros de buques y más. La IA extrae datos reales de cada resultado. +

+
+ +
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+
+ +
+
+ +
+ +
+
+
+ + +
+
+
📋 Analizar Anuncio
+

Pega cualquier texto — anuncio de periódico, email, clasificado, descripción de subasta. La IA extrae todos los datos técnicos.

+
+ +
+
+ +
+ +
+
+
+ + +
+
+
+ Agregar Embarcación
+
+
+
+ +
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+
+
+
+
+ +
+
+ + +
+
+
🌐 Fuentes Globales
+ + +
+
+ 📘 +
+
Facebook Marketplace
+
Verificando sesión…
+
+ +
+
+ ⚠ Usa una cuenta secundaria, no tu cuenta personal. Al hacer clic, se abrirá un navegador para que inicies sesión manualmente — solo se hace una vez. +
+
+ + +
+
+ Agregar nueva fuente
+
+ + +
+
+ +
+
+ + +
+
+ 💡 Si la URL ya tiene búsqueda: pega la URL completa con {query} donde va el texto.
+ Ejemplo: https://www.sitio.com/buscar?texto={query}&tipo=barco +
+
+ +
+
+
+ + +
+
+
🔔 Nueva Alerta
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/stop.bat b/stop.bat new file mode 100644 index 0000000..fccf5a0 --- /dev/null +++ b/stop.bat @@ -0,0 +1,15 @@ +@echo off +set PID_FILE=%~dp0.server.pid + +if not exist "%PID_FILE%" ( + echo No hay ninguna instancia de Boat^&Ship-Finder corriendo. + pause + exit /b 0 +) + +set /p PID=<"%PID_FILE%" +echo Cerrando Boat^&Ship-Finder (PID %PID%)... +taskkill /F /PID %PID% >nul 2>&1 +del "%PID_FILE%" >nul 2>&1 +echo Servidor cerrado. +pause