Files
alro65 235a9abbfe security: SECRET_KEY from env, CORS restricted to localhost
- Replace hardcoded secret_key with os.environ.get('SECRET_KEY')
- RuntimeError if SECRET_KEY not set (fail fast)
- Restrict CORS to localhost:8765 origins (was allow all with credentials)
- Add .gitignore excluding db, env, __pycache__, backups

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-03 12:55:19 -04:00

5403 lines
247 KiB
Python

"""
Boat&Ship-Finder - Backend Server
Requiere: pip install flask flask-cors requests beautifulsoup4 playwright
"""
from flask import Flask, jsonify, request, send_from_directory, session
import hashlib as _hashlib
from flask_cors import CORS
import requests
import json
import sqlite3
import os
import re
import time
import hashlib
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import threading
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Flask application object; static assets are served from the "static" folder.
app = Flask(__name__, static_folder='static')

# The Flask secret key must be supplied through the environment.
# Refuse to start at all when it is missing or empty (fail fast).
_secret = os.environ.get('SECRET_KEY')
if _secret:
    app.secret_key = _secret
else:
    raise RuntimeError("SECRET_KEY not set — add SECRET_KEY=<random> to your environment")

# Credentialed cross-origin requests are accepted only from these two local origins.
CORS(
    app,
    supports_credentials=True,
    origins=["http://localhost:8765", "http://127.0.0.1:8765"],
)

# SQLite database file (relative path) and the Ollama text-generation endpoint.
DB_PATH = 'marine.db'
OLLAMA_URL = 'http://localhost:11434/api/generate'
# ── Ollama models per task ────────────────────────────────────────────────────
# Maps a task name to the Ollama model tag used for that task.
MODELS = dict(
    extract='qwen2.5:32b',            # spec extraction (faster than 72b, just as accurate)
    classify='llama3.1:8b',           # fast classification
    embed='nomic-embed-text:latest',  # embeddings for dedup
    parse='qwen3-coder:latest',       # structured parsing
)
# ── Global sources by category ────────────────────────────────────────────────
# SOURCES maps a display category to a list of source descriptors.
# Every descriptor carries "name", "url" and "type"; some additionally carry
#   "search_url" – a search endpoint, usually containing a "{query}" placeholder
#   "category"   – a finer-grained label than the enclosing dict key
SOURCES = {
    "Subastas USA": [
        {"name": "GovPlanet", "url": "https://www.govplanet.com/boats", "type": "auction"},
        {"name": "GovDeals", "url": "https://www.govdeals.com", "type": "auction"},
        {"name": "PropertyRoom", "url": "https://www.propertyroom.com/boats", "type": "auction"},
        {"name": "PublicSurplus", "url": "https://www.publicsurplus.com", "type": "auction"},
        {"name": "AuctionTime", "url": "https://www.auctiontime.com/boats", "type": "auction"},
        {"name": "IronPlanet", "url": "https://www.ironplanet.com/boats", "type": "auction"},
        {"name": "HiBid", "url": "https://www.hibid.com/boats", "type": "auction"},
        {"name": "Copart Marine", "url": "https://www.copart.com/boats", "type": "auction"},
        {"name": "BidSpotter", "url": "https://www.bidspotter.com/boats", "type": "auction"},
        {"name": "32auctions", "url": "https://www.32auctions.com", "type": "auction"},
    ],
    "Subastas Internacionales": [
        {"name": "Ritchie Bros", "url": "https://www.rbauction.com/boats", "type": "auction"},
        {"name": "Euro Auctions", "url": "https://www.euroauctions.com", "type": "auction"},
        {"name": "Troostwijk", "url": "https://www.troostwijkauctions.com", "type": "auction"},
        {"name": "Surplex", "url": "https://www.surplex.com/marine", "type": "auction"},
        {"name": "BVA Auctions", "url": "https://www.bva-auctions.com", "type": "auction"},
        {"name": "Catawiki Marine", "url": "https://www.catawiki.com/boats", "type": "auction"},
        {"name": "Barnebys", "url": "https://www.barnebys.com/boats", "type": "auction"},
        {"name": "ShipXchange", "url": "https://www.shipxchange.com", "type": "auction"},
    ],
    "Venta Especializada": [
        {"name": "YachtWorld", "url": "https://www.yachtworld.com", "type": "broker"},
        {
            "name": "Boats.com",
            "url": "https://www.boats.com",
            "type": "broker",
            "search_url": "https://www.boats.com/boats-for-sale/?query={query}",
        },
        {
            "name": "BoatTrader",
            "url": "https://www.boattrader.com",
            "type": "broker",
            "search_url": "https://www.boattrader.com/boats/?query={query}",
        },
        {
            "name": "Apollo Duck",
            "url": "https://www.apolloduck.com",
            "type": "broker",
            "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1",
        },
        {
            "name": "Rightboat",
            "url": "https://www.rightboat.com",
            "type": "broker",
            "search_url": "https://www.rightboat.com/boats-for-sale/?q={query}",
        },
        {
            "name": "Boat24",
            "url": "https://www.boat24.com",
            "type": "broker",
            "search_url": "https://www.boat24.com/en/usedboats/",
        },
        {
            "name": "Inautia",
            "url": "https://www.inautia.com",
            "type": "broker",
            "search_url": "https://www.inautia.com/boats/?q={query}",
        },
        # ── US Brokers ────────────────────────────────────────────────────────
        {
            "name": "HMY Yachts",
            "url": "https://hmy.com",
            "type": "broker",
            "search_url": "https://www.hmy.com/yachts-for-sale/?SaleClassCode=used",
            "category": "Brokers USA",
        },
        {
            "name": "Denison Yachting",
            "url": "https://www.denisonyachtsales.com",
            "type": "broker",
            "search_url": "https://www.denisonyachtsales.com/yachts-for-sale/?search={query}",
            "category": "Brokers USA",
        },
        {
            "name": "United Yacht",
            "url": "https://www.unitedyacht.com",
            "type": "broker",
            "search_url": "https://www.unitedyacht.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Northrop & Johnson",
            "url": "https://www.n-j.com",
            "type": "broker",
            "search_url": "https://www.n-j.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Worth Ave Yachts",
            "url": "https://www.worthavenueyachts.com",
            "type": "broker",
            "search_url": "https://www.worthavenueyachts.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Bluewater Yachting",
            "url": "https://www.bluewateryachting.com",
            "type": "broker",
            "category": "Brokers USA",
        },
        {
            "name": "Galati Yachts",
            "url": "https://www.galatiyachts.com",
            "type": "broker",
            "search_url": "https://www.galatiyachts.com/boat-search/?q={query}",
            "category": "Brokers USA",
        },
        {
            "name": "Fraser Yachts",
            "url": "https://www.fraseryachts.com",
            "type": "broker",
            "search_url": "https://www.fraseryachts.com/en/yachts-for-sale/?search={query}",
            "category": "Brokers INT",
        },
        {
            "name": "Burgess Yachts",
            "url": "https://www.burgessyachts.com",
            "type": "broker",
            "search_url": "https://www.burgessyachts.com/en/yacht-sale?q={query}",
            "category": "Brokers INT",
        },
        {
            "name": "Ocean Alexander",
            "url": "https://www.oceanalexander.com",
            "type": "broker",
            "search_url": "https://www.oceanalexander.com/find-a-boat/?q={query}",
            "category": "Brokers USA",
        },
        {
            "name": "Merle Wood",
            "url": "https://www.merlewood.com",
            "type": "broker",
            "search_url": "https://www.merlewood.com/yachts-for-sale/",
            "category": "Brokers INT",
        },
        # ── Other ─────────────────────────────────────────────────────────────
        {"name": "NauticExpo", "url": "https://www.nauticexpo.com", "type": "broker"},
        {"name": "Seaboats", "url": "https://www.seaboats.net", "type": "broker"},
        {"name": "YachtBroker", "url": "https://www.yachtbroker.com", "type": "broker"},
    ],
    "Comercial / Industrial": [
        {"name": "WorkBoat", "url": "https://www.workboat.com/classifieds", "type": "commercial"},
        {"name": "TradeABoat", "url": "https://www.tradeaboat.com.au", "type": "broker"},
        {"name": "Boatpoint", "url": "https://www.boatpoint.com.au", "type": "broker"},
        {"name": "Boats & Outboards", "url": "https://www.boatsandoutboards.co.uk", "type": "broker"},
        {"name": "Commercial Vessel", "url": "https://www.commercialvessel.com", "type": "commercial"},
        {"name": "ShipServ", "url": "https://www.shipserv.com", "type": "commercial"},
        {"name": "Marine Classifieds", "url": "https://www.marineclassifieds.com", "type": "classifieds"},
        {"name": "Barcos.net", "url": "https://www.barcos.net", "type": "broker"},
        # ── Offshore / DP / OSV ───────────────────────────────────────────────
        {
            "name": "Offshore Vessel Exchange",
            "url": "https://www.offshorevessel.exchange",
            "type": "commercial",
            "search_url": "https://www.offshorevessel.exchange/?s={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "MarineTraffic Vessels For Sale",
            "url": "https://www.marinetraffic.com/en/ads/p/list",
            "type": "commercial",
            "search_url": "https://www.marinetraffic.com/en/ads/p/list?search={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "YachtWorld Commercial",
            "url": "https://www.yachtworld.com",
            "type": "commercial",
            "search_url": "https://www.yachtworld.com/boats-for-sale/type-commercial/?query={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "Apollo Duck Workboats",
            "url": "https://www.apolloduck.com",
            "type": "commercial",
            "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1",
            "category": "Offshore / DP",
        },
        {
            "name": "Seawork Classifieds",
            "url": "https://www.seawork.com",
            "type": "commercial",
            "search_url": "https://www.seawork.com/classifieds/",
            "category": "Offshore / DP",
        },
        {
            "name": "ShipXchange OSV",
            "url": "https://www.shipxchange.com",
            "type": "commercial",
            "search_url": "https://www.shipxchange.com/en/vessel-types/offshore-support-vessel",
            "category": "Offshore / DP",
        },
        {
            "name": "Vessel Sales & Acquisitions",
            "url": "https://www.vsl.no",
            "type": "commercial",
            "search_url": "https://www.vsl.no/vessels-for-sale/",
            "category": "Offshore / DP",
        },
    ],
    "Clasificados Generales": [
        {"name": "Craigslist Boats", "url": "https://www.craigslist.org/search/boa", "type": "classifieds"},
        {
            "name": "eBay Motors Marine",
            "url": "https://www.ebay.com/b/Boats/26429",
            "type": "classifieds",
            "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10",
        },
        {"name": "Facebook Marketplace", "url": "https://www.facebook.com/marketplace/boats", "type": "classifieds"},
        {
            "name": "BoatCrazy",
            "url": "https://boatcrazy.com",
            "type": "classifieds",
            "search_url": "https://boatcrazy.com/boats?q={query}",
            "category": "Clasificados USA",
        },
        {"name": "Kijiji Marine", "url": "https://www.kijiji.ca/b-boats", "type": "classifieds"},
        {"name": "Gumtree Boats", "url": "https://www.gumtree.com/boats", "type": "classifieds"},
        {"name": "Subito.it Barche", "url": "https://www.subito.it/barche", "type": "classifieds"},
        {"name": "LeBonCoin Bateaux", "url": "https://www.leboncoin.fr/bateaux", "type": "classifieds"},
        {"name": "Wallapop Barcos", "url": "https://es.wallapop.com/barcos", "type": "classifieds"},
        {"name": "MercadoLibre", "url": "https://www.mercadolibre.com/barcos", "type": "classifieds"},
        {"name": "OLX Marine", "url": "https://www.olx.com/boats", "type": "classifieds"},
    ],
    "Salvage & Wrecks": [
        {
            "name": "Cooper Capital Salvage",
            "url": "https://www.cooperss.com",
            "type": "salvage",
            "search_url": "https://www.cooperss.com/",
            "category": "Salvage USA",
        },
        {
            "name": "Salvex",
            "url": "https://www.salvex.com",
            "type": "salvage",
            "search_url": "https://www.salvex.com/search/?q={query}&cat=30",
            "category": "Salvage USA",
        },
        {
            "name": "Copart Marine",
            "url": "https://www.copart.com",
            "type": "salvage",
            "search_url": "https://www.copart.com/public/data/lotSearchResults/?query={query}&vehicleType=BOAT",
            "category": "Salvage USA",
        },
        {
            "name": "IAA Watercraft",
            "url": "https://www.iaai.com",
            "type": "salvage",
            "search_url": "https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft",
            "category": "Salvage USA",
        },
        {
            "name": "Ritchie Bros Marine",
            "url": "https://www.rbauction.com",
            "type": "auction",
            "search_url": "https://www.rbauction.com/used-equipment?q={query}&searchType=MODEL&equipmentCategory=marine",
            "category": "Salvage USA",
        },
        {"name": "NavAuctions", "url": "https://www.navauctions.com", "type": "salvage"},
        {"name": "MarineWrecks", "url": "https://www.marinewrecks.com", "type": "salvage"},
        {"name": "BoatBreakers", "url": "https://www.boatbreakers.com", "type": "salvage"},
        {"name": "Barnacle Marine", "url": "https://www.barnaclemarine.com", "type": "salvage"},
        {"name": "Boat Breakers AU", "url": "https://www.boatbreakersnz.com", "type": "salvage"},
    ],
    "Revistas & Noticias": [
        {"name": "Trade Only Today", "url": "https://www.tradeonlytoday.com", "type": "news"},
        {"name": "Nautical News", "url": "https://www.nauticalnews.com", "type": "news"},
        {"name": "Boat International", "url": "https://www.boatinternational.com/yachts", "type": "magazine"},
        {"name": "Superyacht Times", "url": "https://www.superyachttimes.com", "type": "magazine"},
        {"name": "The Triton", "url": "https://www.the-triton.com/classifieds", "type": "magazine"},
        {"name": "Passagemaker", "url": "https://www.passagemaker.com", "type": "magazine"},
        {"name": "WorkBoat Mag", "url": "https://www.workboat.com", "type": "magazine"},
        {"name": "Lloyd's List", "url": "https://lloydslist.maritimeintelligence.informa.com", "type": "news"},
        {"name": "Tradewinds", "url": "https://www.tradewindsnews.com", "type": "news"},
        {"name": "Maritime Executive", "url": "https://www.maritime-executive.com", "type": "news"},
        {"name": "Splash247", "url": "https://splash247.com", "type": "news"},
        {"name": "Bairdmaritime", "url": "https://www.bairdmaritime.com", "type": "news"},
    ],
    "Registros & Gobierno": [
        {"name": "USCG Docs", "url": "https://www.dco.uscg.mil/nvdc", "type": "registry"},
        {"name": "UK Ship Register", "url": "https://www.ukshipregister.co.uk", "type": "registry"},
        {"name": "Panama Registry", "url": "https://www.segumar.com", "type": "registry"},
        {"name": "Marshall Islands", "url": "https://www.register-iri.com", "type": "registry"},
        {"name": "Liberian Registry", "url": "https://www.liscr.com", "type": "registry"},
        {"name": "Bahamas Maritime", "url": "https://www.bahamasmaritime.com", "type": "registry"},
        {"name": "IHS Sea-web", "url": "https://maritime.ihs.com", "type": "registry"},
    ],
}
# ── Database ──────────────────────────────────────────────────────────────────
def init_db(db_path=None):
    """Create the application's SQLite tables and indexes if they do not exist.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly against
    the same database is harmless.

    Args:
        db_path: Path of the SQLite file to initialise. When omitted (or
            falsy), the module-level ``DB_PATH`` is used.

    Raises:
        sqlite3.Error: if the database cannot be opened or the schema script
            fails. The connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path or DB_PATH)
    try:
        # The schema text below is kept flush-left so the SQL sent to SQLite is
        # exactly the script this module has always executed.
        conn.executescript("""
CREATE TABLE IF NOT EXISTS vessels (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT,
vessel_type TEXT,
loa_m REAL,
beam_m REAL,
draft_m REAL,
year_built INTEGER,
hull TEXT,
propulsion TEXT,
status TEXT,
price_usd REAL,
currency TEXT DEFAULT 'USD',
location TEXT,
country TEXT,
source_name TEXT,
source_url TEXT,
description TEXT,
images TEXT,
flags TEXT,
score REAL DEFAULT 0,
fingerprint TEXT UNIQUE,
raw_data TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS saved_vessels (
id INTEGER PRIMARY KEY AUTOINCREMENT,
vessel_id INTEGER REFERENCES vessels(id),
notes TEXT,
saved_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS search_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query TEXT,
filters TEXT,
results INTEGER,
searched_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS custom_sources (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
category TEXT DEFAULT 'Custom',
search_url TEXT NOT NULL,
source_type TEXT DEFAULT 'broker',
active INTEGER DEFAULT 1,
added_by TEXT,
last_status TEXT DEFAULT 'unknown',
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY AUTOINCREMENT,
username TEXT UNIQUE NOT NULL,
password TEXT NOT NULL,
role TEXT DEFAULT 'user',
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS collections (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
description TEXT,
color TEXT DEFAULT '#00b4ff',
icon TEXT DEFAULT '📁',
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS collection_vessels (
id INTEGER PRIMARY KEY AUTOINCREMENT,
collection_id INTEGER REFERENCES collections(id),
vessel_id INTEGER REFERENCES vessels(id),
notes TEXT,
added_at TEXT DEFAULT (datetime('now')),
UNIQUE(collection_id, vessel_id)
);
CREATE TABLE IF NOT EXISTS alerts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT,
filters TEXT,
last_match INTEGER DEFAULT 0,
active INTEGER DEFAULT 1,
created_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_vessels_type ON vessels(vessel_type);
CREATE INDEX IF NOT EXISTS idx_vessels_status ON vessels(status);
CREATE INDEX IF NOT EXISTS idx_vessels_price ON vessels(price_usd);
CREATE INDEX IF NOT EXISTS idx_vessels_score ON vessels(score DESC);
""")
        conn.commit()
    finally:
        # Release the file handle even when the schema script raises.
        conn.close()
def get_db(db_path=None):
    """Open a new SQLite connection whose rows can be read by column name.

    Args:
        db_path: Path of the SQLite file to open. When omitted (or falsy), the
            module-level ``DB_PATH`` is used.

    Returns:
        A fresh ``sqlite3.Connection`` with ``row_factory`` set to
        ``sqlite3.Row``. The caller is responsible for closing it.
    """
    conn = sqlite3.connect(db_path or DB_PATH)
    # sqlite3.Row allows both row["column"] and positional access.
    conn.row_factory = sqlite3.Row
    return conn
# ── Ollama helpers ─────────────────────────────────────────────────────────────
# Caps the number of simultaneous Ollama generate calls at three.
_ollama_sem = threading.Semaphore(3)


def ollama_generate(prompt: str, model: str = None, json_mode: bool = False) -> str:
    """Send *prompt* to the Ollama generate endpoint and return the reply text.

    Args:
        prompt: Prompt text to submit.
        model: Ollama model tag; falls back to ``MODELS['classify']`` when falsy.
        json_mode: When true, asks Ollama for JSON-formatted output.

    Returns:
        The ``"response"`` field of the reply, or ``""`` if the field is
        missing or the request fails for any reason (the error is printed).
    """
    request_body = {
        "model": model if model else MODELS['classify'],
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    if json_mode:
        request_body["format"] = "json"

    # Hold a semaphore slot for the whole round trip, including error reporting.
    with _ollama_sem:
        try:
            reply = requests.post(OLLAMA_URL, json=request_body, timeout=120)
            reply.raise_for_status()
            generated = reply.json().get("response", "")
        except Exception as exc:
            # Best effort: report the failure and hand back an empty string.
            print(f"[Ollama] Error: {exc}")
            generated = ""
    return generated
def ollama_models() -> list:
    """Return the names of the models reported by the local Ollama server.

    Queries the ``/api/tags`` endpoint with a 5-second timeout.

    Returns:
        A list with the ``"name"`` of each entry under ``"models"``, or an
        empty list when the server is unreachable or the reply cannot be
        interpreted.
    """
    try:
        r = requests.get("http://localhost:11434/api/tags", timeout=5)
        return [m["name"] for m in r.json().get("models", [])]
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        return []
def _load_json_object(raw):
    """Return the JSON object encoded in *raw*, or None if it is not one."""
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_vessel_from_text(text: str, source: str) -> dict:
    """Use Ollama to extract structured vessel data from raw listing text.

    Only the first 3000 characters of *text* are sent to the model. The
    (Spanish) prompt asks for a single JSON object describing the vessel, or
    ``{"skip": true}`` when the text is not a specific vessel listing.

    Args:
        text: Raw listing text; ``None`` is treated as empty.
        source: Name of the source the text came from; embedded in the prompt.

    Returns:
        The dict decoded from the model reply, or ``{}`` when the reply is
        empty, is not a JSON object, or carries a truthy ``"skip"`` flag.
    """
    snippet = (text or "")[:3000]
    prompt = f"""Eres un experto en inteligencia de mercado marítimo.
Analiza este texto de un anuncio de embarcación y extrae los datos disponibles.
Fuente: {source}
TEXTO:
{snippet}
Responde SOLO con JSON válido. Si el texto NO es un listing de embarcación específica responde {{"skip": true}}.
{{
"skip": false,
"name": "nombre del barco o descripción corta",
"vessel_type": "Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Salvage|Other",
"loa_m": número o null,
"beam_m": número o null,
"draft_m": número o null,
"year_built": número o null,
"hull": "Fiberglass|Steel|Aluminum|Wood|Composite|Unknown",
"propulsion": "Diesel|Gasoline|Electric|Sail|None|Unknown",
"status": "active|auction|salvage|abandoned|sold",
"price_usd": número o null,
"currency": "USD|EUR|GBP|CAD|AUD|etc",
"location": "ciudad, país",
"country": "código ISO 2 letras",
"description": "resumen en español máximo 200 caracteres",
"flags": ["below_market","rare","auction","salvage_value","motivated_seller","commercial","government_surplus"],
"score": número del 0 al 100 según oportunidad para un broker
}}"""
    response = ollama_generate(prompt, model=MODELS['extract'], json_mode=True)

    # First try the reply as-is; models in JSON mode normally return a bare object.
    data = _load_json_object(response)
    if data is None:
        # Fallback: the reply wrapped the object in other text (or in a list),
        # so pull out the outermost {...} span and decode that instead.
        match = re.search(r'\{.*\}', response or "", re.DOTALL)
        if match:
            data = _load_json_object(match.group())

    if not data or data.get("skip"):
        return {}
    return data
# ── Direct source scrapers — no search engine middleman ──────────────────
import random
# Pool of desktop browser User-Agent strings; one is picked at random per header set.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
]


def get_headers(referer=None):
    """Build a fresh dict of browser-like HTTP request headers.

    Args:
        referer: Optional URL; when truthy it is added as the ``Referer`` header.

    Returns:
        A new dict with a randomly chosen ``User-Agent`` from ``USER_AGENTS``
        followed by a fixed set of navigation headers.
    """
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,fr;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0',
    })
    if not referer:
        return headers
    headers['Referer'] = referer
    return headers


# Header set built once at import time, with no Referer.
HEADERS = get_headers()
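# Each source has its own search URL pattern: "{query}" is filled in at runtime, and
# entries flagged "supports_filters" also take "{loa_min_ft}", "{loa_max_ft}" and "{price_max}".
# No per-site CSS selectors are stored — scrape_direct_source scores page links heuristically.
# Sites we scrape directly (confirmed working)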
DIRECT_SOURCES = [
# ── Craigslist ─────────────────────────────────────────────────────────────
# Single multi-city entry (uses scrape_craigslist internally — Playwright, 3+ cities)
{"name":"Craigslist", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"},
# Individual cities — each makes one targeted request via scrape_direct_source
{"name":"Craigslist Miami", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Tampa", "category":"Clasificados USA", "search_url":"https://tampa.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Fort Laud", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}&sort=date", "type":"classifieds"},
{"name":"Craigslist New Orleans","category":"Clasificados USA", "search_url":"https://neworleans.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Houston", "category":"Clasificados USA", "search_url":"https://houston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Seattle", "category":"Clasificados USA", "search_url":"https://seattle.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist LA", "category":"Clasificados USA", "search_url":"https://losangeles.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist SF", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Jacksonville","category":"Clasificados USA", "search_url":"https://jacksonville.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Sarasota", "category":"Clasificados USA", "search_url":"https://sarasota.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Chicago", "category":"Clasificados USA", "search_url":"https://chicago.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Boston", "category":"Clasificados USA", "search_url":"https://boston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Atlanta", "category":"Clasificados USA", "search_url":"https://atlanta.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Baltimore", "category":"Clasificados USA", "search_url":"https://baltimore.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Norfolk", "category":"Clasificados USA", "search_url":"https://norfolk.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist San Diego", "category":"Clasificados USA", "search_url":"https://sandiego.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Portland OR","category":"Clasificados USA", "search_url":"https://portland.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Minneapolis","category":"Clasificados USA", "search_url":"https://minneapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Detroit", "category":"Clasificados USA", "search_url":"https://detroit.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Cleveland", "category":"Clasificados USA", "search_url":"https://cleveland.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Charlotte", "category":"Clasificados USA", "search_url":"https://charlotte.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Denver", "category":"Clasificados USA", "search_url":"https://denver.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Phoenix", "category":"Clasificados USA", "search_url":"https://phoenix.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Annapolis", "category":"Clasificados USA", "search_url":"https://annapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist New Jersey", "category":"Clasificados USA", "search_url":"https://newjersey.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Galveston", "category":"Clasificados USA", "search_url":"https://galveston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Pensacola", "category":"Clasificados USA", "search_url":"https://pensacola.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Mobile AL", "category":"Clasificados USA", "search_url":"https://mobile.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Key West", "category":"Clasificados USA", "search_url":"https://keys.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Corpus", "category":"Clasificados USA", "search_url":"https://corpuschristi.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Beaumont", "category":"Clasificados USA", "search_url":"https://beaumont.craigslist.org/search/boa?query={query}", "type":"classifieds"},
{"name":"Craigslist Baton Rouge","category":"Clasificados USA", "search_url":"https://batonrouge.craigslist.org/search/boa?query={query}", "type":"classifieds"},
# NOTE: gulfcoast.craigslist.org (Biloxi) no longer exists — replaced with Mobile AL
# ── eBay ──────────────────────────────────────────────────────────────────
{"name":"eBay Marine", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", "type":"classifieds"},
{"name":"eBay Auction", "category":"Subastas USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_Auction=1", "type":"auction"},
{"name":"eBay Motors Sail", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=36431&LH_BIN=1&_sop=10", "type":"classifieds"},
{"name":"eBay Boats Complete", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+boat&_sacat=26429&LH_BIN=1&_sop=15", "type":"classifieds"},
{"name":"eBay Salvage Boats", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"},
# ── Subastas Gobierno ─────────────────────────────────────────────────────
{"name":"GovDeals", "category":"Subastas Gobierno", "search_url":"https://www.govdeals.com/index.cfm?fa=Main.AdvSearchResultsNew&kWord={query}&category=70", "type":"auction"},
{"name":"PublicSurplus", "category":"Subastas Gobierno", "search_url":"https://www.publicsurplus.com/sms/browse/home?search={query}", "type":"auction"},
{"name":"PropertyRoom", "category":"Subastas Gobierno", "search_url":"https://www.propertyroom.com/s?q={query}+boat", "type":"auction"},
# GovPlanet: correct URL confirmed working (Recreational Marine category)
{"name":"GovPlanet", "category":"Subastas Gobierno", "search_url":"https://www.govplanet.com/Recreational+Marine", "type":"auction"},
# IronPlanet: correct URL confirmed working (Commercial Marine Vessels)
{"name":"IronPlanet", "category":"Subastas Gobierno", "search_url":"https://www.ironplanet.com/Commercial+Marine+Vessels", "type":"auction"},
# HiBid: React SPA — scrape_hibid uses Playwright
{"name":"HiBid", "category":"Subastas USA", "search_url":"https://www.hibid.com/lots?q={query}+boat", "type":"auction"},
{"name":"AuctionTime", "category":"Subastas USA", "search_url":"https://www.auctiontime.com/listings/search?q={query}+boat", "type":"auction"},
{"name":"BidSpotter", "category":"Subastas USA", "search_url":"https://www.bidspotter.com/en-us/auction-catalogues?q={query}+boat", "type":"auction"},
# Copart: Playwright scraper handles JS-rendered lots
{"name":"Copart Marine", "category":"Subastas USA", "search_url":"https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT", "type":"auction"},
# ── Salvage ───────────────────────────────────────────────────────────────
{"name":"Salvex Marine", "category":"Salvage / Subastas", "search_url":"https://salvex.com/listings/?q={query}&cat=marine", "type":"salvage"},
{"name":"Barnacle Marine", "category":"Salvage / Subastas", "search_url":"https://www.barnaclemarine.com/?s={query}", "type":"salvage"},
{"name":"eBay Salvage", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"},
{"name":"Cooper Capital Salvage","category":"Salvage USA", "search_url":"https://www.cooperss.com/", "type":"salvage"},
{"name":"IAA Watercraft", "category":"Salvage USA", "search_url":"https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", "type":"salvage"},
# ── Venta Especializada — principales ────────────────────────────────────
{"name":"YachtWorld", "category":"Venta Especializada", "search_url":"https://www.yachtworld.com/boats-for-sale/", "type":"broker"},
{"name":"BoatTrader", "category":"Venta Especializada", "search_url":"https://www.boattrader.com/boats/?query={query}", "type":"broker"},
{"name":"Boats.com", "category":"Venta Especializada", "search_url":"https://www.boats.com/boats-for-sale/?query={query}", "type":"broker"},
{"name":"Apollo Duck", "category":"Venta Especializada", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"broker"},
{"name":"Rightboat", "category":"Venta Especializada", "search_url":"https://www.rightboat.com/boats-for-sale/?q={query}", "type":"broker"},
# Boat24: 403 on requests — scrape_eu_broker uses Playwright
{"name":"Boat24", "category":"Venta Especializada", "search_url":"https://www.boat24.com/en/boats/?q={query}", "type":"broker"},
# YachtMarket: uses scrape_eu_broker (Playwright) in case of blocks
{"name":"YachtMarket", "category":"Venta Especializada", "search_url":"https://www.yachtmarket.com/boats-for-sale/?q={query}", "type":"broker"},
# ── SailboatListings (dedicated thread also runs in parallel) ────────────
{"name":"SailboatListings", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=&material=&hull=&state=&keyword={query}&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True},
{"name":"SailboatListings View", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=Sail&material=&hull=&state=&keyword=&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True},
# Forums: Playwright scraper handles vBulletin/XenForo FS sections
{"name":"TheHullTruth", "category":"Veleros Global", "search_url":"https://www.thehulltruth.com/boating-forum/search.php?do=process&query={query}&prefixid=FS&type=post", "type":"classifieds"},
{"name":"Cruisers Forum", "category":"Veleros Global", "search_url":"https://www.cruisersforum.com/forums/f152/", "type":"classifieds"},
# ── Comercial / Offshore ──────────────────────────────────────────────────
{"name":"WorkBoat Classifieds", "category":"Comercial Offshore", "search_url":"https://www.workboat.com/classifieds/?keywords={query}", "type":"commercial"},
{"name":"Commercial Vessel", "category":"Comercial Offshore", "search_url":"https://www.commercialvessel.com/search?keywords={query}", "type":"commercial"},
{"name":"OSV Broker", "category":"Comercial Offshore", "search_url":"https://www.osvbroker.com/?s={query}", "type":"commercial"},
{"name":"Marine Classifieds", "category":"Comercial Offshore", "search_url":"https://www.marineclassifieds.com/search.php?search={query}", "type":"commercial"},
{"name":"Seaboats", "category":"Comercial Global", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=0", "type":"commercial"},
{"name":"Seaboats Offshore", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=offshore+support+vessels", "type":"commercial"},
{"name":"Seaboats Tug", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=tugs+%26+pushboats", "type":"commercial"},
{"name":"Seaboats Barge", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=barges+%26+lighters", "type":"commercial"},
{"name":"Seaboats Fishing", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=fishing+vessels", "type":"commercial"},
{"name":"Apollo Duck Workboats", "category":"Comercial Offshore", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"commercial"},
{"name":"YachtWorld Commercial", "category":"Comercial Offshore", "search_url":"https://www.yachtworld.com/boats-for-sale/type-commercial/", "type":"commercial"},
# ── Australia / Pacífico ─────────────────────────────────────────────────
# Trade a Boat AU: server-rendered, correct URL confirmed working
{"name":"Trade a Boat AU", "category":"Australia / Pacifico","search_url":"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}", "type":"broker"},
# Boatsales.com.au (Boatpoint redirects here): scrape_eu_broker via Playwright
{"name":"Boatsales AU", "category":"Australia / Pacifico","search_url":"https://www.boatsales.com.au/boats-for-sale/?q={query}", "type":"broker"},
# ── Reino Unido ───────────────────────────────────────────────────────────
# Boats & Outboards UK: 403 on requests — scrape_eu_broker uses Playwright
{"name":"Boats & Outboards UK", "category":"Reino Unido", "search_url":"https://www.boatsandoutboards.co.uk/boats-for-sale/?q={query}", "type":"broker"},
# Apollo Duck UK: use same apolloduck.com (no separate UK subdomain)
{"name":"Apollo Duck UK", "category":"Reino Unido", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1&country=GB", "type":"broker"},
# ── Francia ───────────────────────────────────────────────────────────────
# Annonces Bateau: 403 on requests — scrape_eu_broker uses Playwright
{"name":"Annonces Bateau", "category":"Francia", "search_url":"https://www.annoncesbateau.com/bateaux/annonces-bateaux?keyword={query}", "type":"broker"},
# ── España / Mediterráneo ────────────────────────────────────────────────
# Inautia ES: 403 on requests — scrape_eu_broker uses Playwright
{"name":"Inautia ES", "category":"Espana / Global", "search_url":"https://www.inautia.es/barca?q={query}", "type":"broker"},
{"name":"Barcos.net", "category":"Espana / Global", "search_url":"https://www.barcos.net/busqueda/?q={query}", "type":"broker"},
# ── Europa / Global ───────────────────────────────────────────────────────
# YachtAll: 403 on requests — scrape_eu_broker uses Playwright
{"name":"YachtAll", "category":"Clasificados EU", "search_url":"https://yachtall.com/yachts/?search={query}", "type":"broker"},
# ── Brokers USA ───────────────────────────────────────────────────────────
{"name":"HMY Yachts", "category":"Brokers USA", "search_url":"https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", "type":"broker"},
{"name":"Denison Yachting", "category":"Brokers USA", "search_url":"https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", "type":"broker"},
{"name":"BoatCrazy", "category":"Brokers USA", "search_url":"https://boatcrazy.com/boats?q={query}", "type":"classifieds"},
# Galati Yachts: server-rendered WP site — scrape_galati uses requests
{"name":"Galati Yachts", "category":"Brokers USA", "search_url":"https://www.galatiyachts.com/yachts-for-sale/?keywords={query}", "type":"broker"},
{"name":"United Yacht Sales", "category":"Brokers USA", "search_url":"https://www.unitedyacht.com/yachts-for-sale/", "type":"broker"},
# Worth Ave Yachts: hybrid server-rendered — scrape_luxury_broker uses Playwright
{"name":"Worth Ave Yachts", "category":"Brokers USA", "search_url":"https://www.worthavenueyachts.com/yachts-for-sale/", "type":"broker"},
# ── Brokers Internacionales ───────────────────────────────────────────────
# Fraser Yachts: Vue/JS SPA — scrape_luxury_broker uses Playwright
{"name":"Fraser Yachts", "category":"Brokers Internacional","search_url":"https://www.fraseryachts.com/en/yachts-for-sale/", "type":"broker"},
# Burgess Yachts: JS-loaded — scrape_luxury_broker uses Playwright
{"name":"Burgess Yachts", "category":"Brokers Internacional","search_url":"https://www.burgessyachts.com/en/yachts/sale/", "type":"broker"},
# Northrop & Johnson: JS-loaded — scrape_luxury_broker uses Playwright
{"name":"Northrop & Johnson", "category":"Brokers Internacional","search_url":"https://www.njcharters.com/yachts-for-sale/", "type":"broker"},
{"name":"Merle Wood", "category":"Brokers Internacional","search_url":"https://www.merlewood.com/yachts-for-sale/", "type":"broker"},
# ── Canada ────────────────────────────────────────────────────────────────
{"name":"Kijiji Boats CA", "category":"Canada", "search_url":"https://www.kijiji.ca/b-boats/{query}/k0c132", "type":"classifieds"},
]
# Web search queries — find listings on ANY site, including ones that block
# direct scraping (the search engine returns results from YachtWorld,
# Boats.com, Apollo Duck, etc.).
# "{query}" is replaced at runtime; non-"site:" templates also get LOA/price
# context appended by build_web_queries when those filters are present.
WEB_SEARCH_TEMPLATES = [
    '"{query}" boat for sale',
    '"{query}" sailboat for sale',
    '"{query}" vessel for sale',
    '"{query}" yacht for sale',
    '"{query}" barco venta',
    '"{query}" bateau vendre occasion',
    'site:yachtworld.com {query} for sale sail cruiser',
    'site:boats.com {query} sailboat for sale',
    'site:apolloduck.com {query} for sale',
    'site:rightboat.com {query} for sale',
    'site:boat24.com {query} for sale',
    'site:yachtall.com {query} sailboat',
    'site:annoncesbateau.com {query} voilier',
    'site:cruisersforum.com {query} for sale',
    'site:thehulltruth.com {query} for sale fs',
    'site:govplanet.com {query} vessel',
    'site:ironplanet.com {query} boat vessel',
    'site:govdeals.com {query} vessel boat',
    'site:publicsurplus.com {query} vessel',
    'site:hibid.com {query} boat',
    'site:copart.com {query} boat vessel',
    'site:rbauction.com {query} boat',
    '"{query}" boat auction government surplus',
    '"{query}" vessel auction salvage',
    # Salvage specific
    'site:salvex.com {query} marine vessel',
    'site:copart.com {query} boat salvage',
    'site:iaai.com {query} boat',
    'site:boatbreakers.com {query}',
    '"{query}" salvage boat for sale',
    '"{query}" insurance total loss boat',
    '"{query}" wrecked boat for sale parts',
    '"{query}" boat salvage title for sale',
    'site:seaboats.net {query}',
    'site:workboat.com {query} for sale',
    'site:commercialvessel.com {query}',
    # Offshore / commercial
    'site:osvbroker.com {query}',
    'site:marineclassifieds.com {query} for sale',
    'site:apolloduck.com {query} offshore tug barge',
    '"{query}" offshore supply vessel for sale',
    '"{query}" OSV for sale broker',
    '"{query}" crew boat for sale',
    '"{query}" workboat for sale',
    '"{query}" tug for sale',
    '"{query}" barge for sale',
    '"{query}" supply vessel for sale',
    '"{query}" fishing vessel for sale',
    '"{query}" commercial vessel for sale',
    # Australia / Pacific
    'site:tradeaboat.com.au {query} for sale',
    'site:boatpoint.com.au {query} for sale',
    # Europe classifieds (the annoncesbateau template is already listed above)
    'site:boatsandoutboards.co.uk {query} for sale',
    'site:inautia.com {query} barco venta',
]


def build_web_queries(base_query: str, filters: dict) -> list:
    """Expand WEB_SEARCH_TEMPLATES into search strings relevant to this search.

    Templates are classified by the keywords they contain (salvage, offshore,
    sail) and dropped when they clearly do not fit the requested vessel type or
    status. Non-``site:`` queries get the LOA / price context appended.

    Args:
        base_query: Free-text search terms substituted for ``{query}``.
        filters: Search filters; ``max_price``, ``min_loa`` (metres), ``type``
            and ``status`` are read. ``None`` is treated as no filters.

    Returns:
        Ordered list of unique query strings.
    """
    filters = filters or {}
    query_l = base_query.lower()

    price_ctx = f" under ${filters['max_price']}" if filters.get("max_price") else ""
    loa_ctx = ""
    if filters.get("min_loa"):
        try:
            ft = int(float(filters["min_loa"]) / 0.3048)  # metres -> feet
        except (TypeError, ValueError, OverflowError):
            ft = 0  # unusable filter value: search without a length hint
        if ft > 0:
            loa_ctx = f" {ft}ft+"

    vtype = (filters.get("type", "") or "").lower()
    status = (filters.get("status", "") or "").lower()

    # Substrings that mark a template as belonging to a category.
    salvage_kwords = ("salvage", "copart", "iaai", "boatbreakers", "insurance",
                      "total loss", "wrecked", "salvage title")
    offshore_kwords = ("workboat", "commercial", "osvbroker", "offshore", "osv",
                       "crew boat", "supply vessel", "tug", "barge", "fishing vessel")
    sail_kwords = ("sailboat", "yachtall", "annoncesbateau", "voilier",
                   "cruisersforum", "sail cruiser")

    is_salvage = status == "salvage" or "salvage" in query_l
    # Match offshore terms at a word start so that e.g. "Portugal" is not read
    # as "tug", while "tugboat" still counts.
    is_offshore = (
        vtype in {"offshore", "tug", "barge", "ferry", "fishing", "commercial"}
        or re.search(r"\b(?:tug|barge|osv|crew boat|workboat)", query_l) is not None
    )
    is_sail = (
        vtype in {"sailboat", "velero", "sail"}
        or any(k in query_l for k in ("sail", "velero", "ketch", "sloop"))
    )

    queries = []
    for tmpl in WEB_SEARCH_TEMPLATES:
        tmpl_l = tmpl.lower()
        # Salvage templates only for salvage searches
        if not is_salvage and any(k in tmpl_l for k in salvage_kwords):
            continue
        # Offshore templates are noise for a clearly sail-only search
        if is_sail and not is_offshore and any(k in tmpl_l for k in offshore_kwords):
            continue
        # Sailboat templates are noise for offshore/salvage searches
        if (is_offshore or is_salvage) and not is_sail and any(k in tmpl_l for k in sail_kwords):
            continue
        q = tmpl.replace("{query}", base_query)
        if not q.startswith("site:") and (price_ctx or loa_ctx):
            q += loa_ctx + price_ctx
        queries.append(q)
    # Guard against duplicate templates producing duplicate searches.
    return list(dict.fromkeys(queries))
# Search engines queried in order; "link_sel" / "snippet_sel" are CSS selectors
# for result links and their snippets on the engine's HTML result page.
SEARCH_ENGINES = [
    {
        "name": "DuckDuckGo",
        "url": "https://html.duckduckgo.com/html/?q={query}",
        "link_sel": "a.result__a",
        "snippet_sel": "a.result__snippet",
    },
    {
        "name": "Bing",
        "url": "https://www.bing.com/search?q={query}&count=20",
        "link_sel": "h2 a",
        "snippet_sel": ".b_caption p",
    },
]


def web_search(query: str, max_results: int = 8) -> list[dict]:
    """Search the configured web engines and return candidate listing links.

    Engines in SEARCH_ENGINES are tried in order until ``max_results`` unique
    links are collected. A failing engine is logged and skipped, so this
    function is best-effort and does not raise for network/parse errors.

    Args:
        query: Search string, URL-quoted into the engine URL.
        max_results: Maximum number of results; values <= 0 return [] without
            touching the network.

    Returns:
        List of dicts with the keys url, title, snippet, price_text, img_url,
        location, source, source_type and category.
    """
    if max_results <= 0:
        return []
    results = []
    seen = set()
    # Substrings of URLs that are never listings (engines, social, docs).
    skip = ("google.", "bing.", "duckduckgo.", "yahoo.", "wikipedia.", "youtube.",
            "facebook.com/login", "instagram.", "twitter.", "linkedin.",
            "pinterest.", "reddit.com/r/", ".pdf", "amazon.com/s?")
    for engine in SEARCH_ENGINES:
        try:
            url = engine["url"].format(query=requests.utils.quote(query))
            time.sleep(1.0)  # throttle between engine requests
            r = requests.get(url, headers=get_headers(), timeout=20, verify=False)
            if r.status_code != 200:
                print(f"[{engine.get('name', '?')}] HTTP {r.status_code}")
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            links = soup.select(engine["link_sel"])
            snippets = soup.select(engine["snippet_sel"])
            # Look at twice as many links as needed: some are filtered out below.
            for i, link in enumerate(links[:max_results * 2]):
                href = link.get("href", "")
                # DuckDuckGo wraps targets in a redirect; the real URL is in "uddg".
                if "duckduckgo.com" in href:
                    m = re.search(r'uddg=([^&]+)', href)
                    if m:
                        href = requests.utils.unquote(m.group(1))
                if not href.startswith("http"):
                    continue
                if any(s in href for s in skip):
                    continue
                if href in seen:
                    continue
                seen.add(href)
                title = link.get_text(strip=True)
                # NOTE(review): assumes snippets line up with links by index — confirm per engine.
                snippet = snippets[i].get_text(strip=True) if i < len(snippets) else ""
                try:
                    source = href.split("/")[2].replace("www.", "")
                except IndexError:
                    source = "web"
                results.append({
                    "url": href, "title": title, "snippet": snippet,
                    "price_text": "", "img_url": "",
                    "location": "", "source": source,
                    "source_type": "broker", "category": "Web Search"
                })
                if len(results) >= max_results:
                    break
        except Exception as e:
            # Best-effort boundary: report the failure and try the next engine.
            print(f"[{engine.get('name', '?')}] Error: {e}")
        if len(results) >= max_results:
            break
    return results
def scrape_direct_source(source: dict, query: str, filters: dict = None) -> list[dict]:
    """Fetch one source's search page and return links that look like listings.

    No per-site CSS selectors are used: anchors on the page are collected
    together with a nearby price and thumbnail, scored with simple heuristics,
    and the best candidates are returned.

    Args:
        source: Source entry with "search_url", "name", "type" and "category";
            a truthy "supports_filters" enables the LOA / price placeholders.
        query: Free-text search terms substituted for "{query}".
        filters: Optional filters; "min_loa" (metres), "min_price" and
            "max_price" are read when the source supports filters.

    Returns:
        Up to 20 dicts with the keys url, title, snippet, price_text, img_url,
        location, source, source_type and category. Any failure is logged and
        the results gathered so far (normally []) are returned.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    if filters is None:
        filters = {}
    name = source.get("name", "unknown")
    results = []
    try:
        # Build URL — expand filter placeholders if the source supports them
        raw_url = source["search_url"]
        if source.get("supports_filters"):
            min_loa_m = float(filters.get("min_loa") or 0)
            substitutions = {
                "{loa_min_ft}": int(min_loa_m / 0.3048) if min_loa_m else "",
                "{loa_max_ft}": "",  # no max LOA filter in current UI
                "{price_min}": filters.get("min_price") or "",
                "{price_max}": filters.get("max_price") or "",
            }
            for placeholder, value in substitutions.items():
                raw_url = raw_url.replace(placeholder, str(value))

        # Drop repeated words ("boat for sale for sale" -> "boat for sale")
        # while keeping the original word order.
        clean_q = " ".join(dict.fromkeys(query.split()))
        # str.replace (not str.format) so stray braces in a URL cannot raise.
        url = raw_url.replace("{query}", requests.utils.quote(clean_q))

        time.sleep(1.0)  # throttle
        domain = url.split('/')[2]
        r = requests.get(url, headers=get_headers(referer=f"https://{domain}/"),
                         timeout=25, verify=False)
        # Retry once with a different User-Agent if blocked
        if r.status_code in [403, 429, 503]:
            time.sleep(2)
            r = requests.get(url, headers=get_headers(), timeout=25, verify=False)
        if r.status_code not in [200, 206]:
            print(f"[{name}] HTTP {r.status_code}")
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        # Strip page chrome so navigation links do not pollute the candidates
        for tag in soup(["script", "style", "nav", "footer", "header", "aside",
                         "noscript", "meta", "link"]):
            tag.decompose()

        base_url = "/".join(url.split("/")[:3])
        skip_words = ("login", "register", "signup", "about", "contact", "help",
                      "privacy", "terms", "facebook.com", "twitter.com", "instagram.com")
        # Accept both "$4,500" (symbol first) and "4,500 USD" (code/symbol last);
        # a leading digit is required so a lone comma never counts as a number.
        price_re = re.compile(
            r'(?:US\$|\$|€|£)\s?\d[\d,]*(?:\.\d+)?'
            r'|\d[\d,]*(?:\.\d+)?\s*(?:USD|EUR|GBP|CAD|\$|€|£)'
        )

        raw_links = []
        for a in soup.find_all("a", href=True)[:80]:
            raw_href = a["href"].strip()
            if not raw_href or raw_href.startswith(("#", "javascript")):
                continue
            # Resolve relative, root-relative and protocol-relative links
            href = urljoin(url, raw_href)
            if not href.startswith("http"):
                continue  # mailto:, tel:, data: ...
            if any(s in href.lower() for s in skip_words):
                continue
            text = a.get_text(strip=True)[:150]
            parent = a.find_parent()
            price = ""
            img = ""
            if parent:
                pm = price_re.search(parent.get_text(" ", strip=True))
                if pm:
                    price = pm.group()
                # Walk up to 4 ancestors looking for a thumbnail image
                node = parent
                for _ in range(4):
                    if node is None:
                        break
                    im = node.find("img")
                    if im:
                        src = _extract_best_src(im)
                        if src:
                            # Convert relative to absolute
                            if src.startswith("//"):
                                src = "https:" + src
                            elif src.startswith("/"):
                                src = base_url + src
                            if src.startswith("http") and len(src) > 20:
                                img = src
                                break
                    node = node.parent
            if text and len(text) > 8:
                raw_links.append({"url": href, "title": text, "price": price, "img": img})

        if not raw_links:
            print(f"[{name}] No links found")
            return []

        # De-duplicate by URL, keeping the first occurrence
        by_url = {}
        for lnk in raw_links:
            by_url.setdefault(lnk["url"], lnk)
        unique = list(by_url.values())

        # ── Heuristic listing filter (no AI needed) ──────────────────────────
        # Score each link — higher = more likely to be an actual vessel listing
        BOAT_KW = ("boat", "yacht", "vessel", "sail", "ketch", "sloop", "cutter", "schooner",
                   "yawl", "catamaran", "trimaran", "motor", "tug", "barge", "cruiser", "skiff",
                   "fishing", "trawler", "offshore", "cabin", "dinghy", "pontoon", "runabout")

        def listing_score(lnk):
            url_l = lnk["url"].lower()
            title_l = lnk["title"].lower()
            sc = 0
            if lnk["price"]: sc += 4                    # price is strong signal
            if lnk["img"]: sc += 1                      # has photo
            if re.search(r'/\d{5,}', url_l): sc += 3    # 5+ digit ID
            if re.search(r'/(view|detail|listing|item|vessel|boat|ship|for-sale)[-/]', url_l): sc += 2
            if re.search(r'-for-sale[/-]?$', url_l): sc += 2
            if re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title_l): sc += 3       # year in title
            if re.search(r'\d{2,3}\s*(?:\'|ft|feet|meter)', title_l): sc += 2  # size
            if any(k in title_l for k in BOAT_KW): sc += 1
            if re.search(r'\b(for sale|en vente|vendre|en venta)\b', title_l): sc += 1
            if len(lnk["title"]) > 15: sc += 1          # nav links are short
            return sc

        scored = [(listing_score(lnk), lnk) for lnk in unique[:30]]
        scored.sort(key=lambda x: x[0], reverse=True)
        # Keep links with score >= 3, or fall back to top-5 if nothing qualifies
        good = [lnk for sc, lnk in scored if sc >= 3]
        if not good:
            good = [lnk for _, lnk in scored[:5]]  # best guesses from this source
        for lnk in good[:20]:
            results.append({
                "url": lnk["url"],
                "title": lnk["title"],
                "snippet": f"Price: {lnk['price']}",
                "price_text": lnk["price"],
                "img_url": lnk["img"],
                "location": "",
                "source": name,
                "source_type": source["type"],
                "category": source["category"],
            })
        print(f"[{name}] {len(results)} listings found")
    except Exception as e:
        # Best-effort boundary: one broken source must not abort the whole search
        print(f"[{name}] Error: {e}")
    return results
# Round-robin state used by polite_pause: a pool of sites, the index of the
# next one to visit, and a lock guarding that index across threads.
_interleave_lock = threading.Lock()
_interleave_sites = [
    "https://miami.craigslist.org",
    "https://www.seaboats.net",
    "https://www.barcos.net",
    "https://www.ebay.com",
    "https://boston.craigslist.org",
    "https://seattle.craigslist.org",
]
_interleave_idx = 0


def polite_pause(source_name: str):
    """Pause between pages of one source, visiting another site meanwhile.

    Requests the next site from the ``_interleave_sites`` rotation (errors are
    ignored), sleeps a random 2-5 seconds, then logs that the pause is over.

    Args:
        source_name: Label used in the log line only.
    """
    global _interleave_idx
    # Only the index bookkeeping is done under the lock.
    with _interleave_lock:
        slot = _interleave_idx % len(_interleave_sites)
        _interleave_idx += 1
        decoy_url = _interleave_sites[slot]
    try:
        requests.get(decoy_url, headers=get_headers(), timeout=5, verify=False)
    except Exception:
        # The interleaved request is best-effort; its outcome is irrelevant.
        pass
    # Random human-like delay: 2-5 seconds
    delay = random.uniform(2.0, 5.0)
    time.sleep(delay)
    print(f"[{source_name}] Polite pause done — continuing...")
def _sbl_full_image_url(img_src: str, sid: str) -> str:
    """Return an absolute, full-size SailboatListings image URL.

    Site-relative paths are made absolute, thumbnail/medium path segments are
    replaced by the full-size one, and an empty ``img_src`` falls back to the
    conventional ``photo1.jpg`` path built from listing id ``sid``.
    """
    if img_src and not img_src.startswith("http"):
        img_src = "https://www.sailboatlistings.com" + img_src
    # Upgrade /sailimg/t/ (thumbnail) or /sailimg/m/ (medium) → /sailimg/ (full)
    for thumb in ("/sailimg/t/", "/sailimg/m/"):
        if thumb in img_src:
            img_src = img_src.replace(thumb, "/sailimg/")
            break
    if not img_src:
        img_src = f"https://www.sailboatlistings.com/sailimg/{sid}/photo1.jpg"
    return img_src


def scrape_sailboatlistings(query: str, filters: dict, max_pages: int = 8) -> list[dict]:
    """
    Multi-page scraper for SailboatListings.com.

    Captures MAIN listings (sailboat=XXXXX) with full structured data,
    plus SIDEBAR featured listings (/view/XXXXX) as bonus.

    Args:
        query: Free-text keyword, URL-quoted into the ``keyword`` parameter.
        filters: Search filters. Keys read here: ``min_loa`` / ``max_loa``
            (metres, converted to feet for the site), ``max_price``, ``type``,
            ``hull``, ``year_min``, ``year_max``. Missing, empty and ``None``
            values all mean "no constraint".
        max_pages: Upper bound on result pages fetched.

    Returns:
        One dict per listing with the keys ``url``, ``title``, ``snippet``,
        ``price_text``, ``img_url``, ``location``, ``source``,
        ``source_type``, ``category`` and ``fields`` (label → value for main
        listings, empty for sidebar ones).
    """
    results = []
    seen_urls = set()
    min_loa_m = float(filters.get("min_loa") or 0)
    max_loa_m = float(filters.get("max_loa") or 0)
    max_price = filters.get("max_price") or ""
    loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else ""
    loa_max_ft = int(max_loa_m / 0.3048) if max_loa_m else ""
    # `or ""` (rather than a .get default) so an explicit None does not crash
    # on .lower().
    vessel_type = str(filters.get("type") or "").lower()
    sbl_type_map = {
        "sailboat": "Sail", "sail": "Sail",
        "yacht": "cruiser",
        "motor": "powerboat", "motorboat": "powerboat",
        "fishing": "fishing",
        "tug": "", "barge": "", "offshore": "", "ferry": "", "commercial": "",
    }
    # Default "" → search ALL types on SailboatListings
    sbl_type = sbl_type_map.get(vessel_type, "")
    hull = str(filters.get("hull") or "").lower()
    sbl_hull_map = {
        "fiberglass": "fiberglass", "steel": "steel",
        "aluminum": "aluminum", "wood": "wood",
    }
    sbl_material = sbl_hull_map.get(hull, "")
    # "---" is sent in place of an unset year bound.
    year_min = filters.get("year_min") or "---"
    year_max = filters.get("year_max") or "---"
    base_url = (
        "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi"
        "?db=default&uid=default&sb=33&so=descend&websearch=1"
        f"&manufacturer=&model="
        f"&length-gt={loa_min_ft}&length-lt={loa_max_ft}"
        f"&year-lt={year_max}&year-gt={year_min}&price-lt={max_price}"
        f"&type={sbl_type}&material={sbl_material}&hull=&state="
        f"&keyword={requests.utils.quote(query)}"
        f"&view_records=+Show+Matching+Boats+"
    )
    for page in range(1, max_pages + 1):
        if page > 1:
            polite_pause("SailboatListings")
        try:
            url = base_url if page == 1 else base_url + f"&nh={page}"
            r = requests.get(url, headers=get_headers(), timeout=25, verify=False)
            if r.status_code == 429:
                print(f"[SailboatListings] Rate limited on page {page} — stopping")
                break
            if r.status_code != 200:
                print(f"[SailboatListings] Page {page} HTTP {r.status_code}")
                break
            soup = BeautifulSoup(r.text, "html.parser")
            body_text = soup.get_text().lower()
            if "no records" in body_text or "0 matches" in body_text:
                print(f"[SailboatListings] No more results at page {page}")
                break
            page_results = 0
            # ── MAIN LISTINGS (sailboat=XXXXX) — full structured data ──
            for header_link in soup.find_all("a", class_="sailheader"):
                href = header_link.get("href", "")
                m = re.search(r'sailboat=(\d+)', href)
                if not m:
                    continue
                sid = m.group(1)
                canonical = f"https://www.sailboatlistings.com/view/{sid}"
                if canonical in seen_urls:
                    continue
                seen_urls.add(canonical)
                title = header_link.get_text(strip=True)
                # Parent table contains all structured sailvb/sailvk spans
                listing_table = header_link.find_parent("table")
                if not listing_table:
                    continue
                # Extract structured fields: each sailvb label span is paired
                # with the next sailvk value span.
                fields = {}
                for label_span in listing_table.find_all("span", class_="sailvb"):
                    label = label_span.get_text(strip=True).rstrip(":").strip()
                    value_span = label_span.find_next("span", class_="sailvk")
                    if value_span:
                        fields[label] = value_span.get_text(strip=True)
                price_text = fields.get("Asking", "")
                location = fields.get("Location", "")
                # Build context string from structured fields
                context = " | ".join(f"{k}: {v}" for k, v in fields.items())
                # Extract image — upgrade thumbnail to full-size
                img_tag = listing_table.find("img")
                raw_img = ""
                if img_tag:
                    raw_img = img_tag.get("src", "") or img_tag.get("data-src", "")
                img_src = _sbl_full_image_url(raw_img, sid)
                results.append({
                    "url": canonical,
                    "title": title or context[:80],
                    "snippet": context,
                    "price_text": price_text,
                    "img_url": img_src,
                    "location": location,
                    "source": "SailboatListings",
                    "source_type": "broker",
                    "category": "Veleros Global",
                    "fields": fields,  # pass structured fields for direct extraction
                })
                page_results += 1
            # ── SIDEBAR FEATURED (/view/XXXXX) — less data but more listings ──
            for a in soup.find_all("a", class_="featured"):
                href = a.get("href", "")
                view_m = re.search(r'/view/(\d+)', href)
                if not view_m:
                    continue
                sid = view_m.group(1)
                canonical = f"https://www.sailboatlistings.com/view/{sid}"
                if canonical in seen_urls:
                    continue
                seen_urls.add(canonical)
                link_text = a.get_text(" ", strip=True)
                # Extract price from link text: "45' Alden 45 Falmouth, Maine Asking $355,000"
                price_m = re.search(r'Asking\s*\$([\d,]+)', link_text)
                price_text = f"${price_m.group(1)}" if price_m else ""
                # Extract location from featurespec span
                spec_span = a.find("span", class_="featurespec")
                location = ""
                if spec_span:
                    spec_text = spec_span.get_text(" ", strip=True)
                    # Location is before "Asking"
                    loc_m = re.search(r'^(.+?)\s*Asking', spec_text)
                    if loc_m:
                        location = loc_m.group(1).strip()
                img_tag = a.find("img")
                raw_img = (img_tag.get("src", "") or "") if img_tag else ""
                img_src = _sbl_full_image_url(raw_img, sid)
                results.append({
                    "url": canonical,
                    "title": link_text.split("Asking")[0].strip() if "Asking" in link_text else link_text,
                    "snippet": link_text,
                    "price_text": price_text,
                    "img_url": img_src,
                    "location": location,
                    "source": "SailboatListings",
                    "source_type": "broker",
                    "category": "Veleros Global",
                    "fields": {},  # no structured fields for sidebar listings
                })
                page_results += 1
            print(f"[SailboatListings] Page {page}: {page_results} listings (total: {len(results)})")
            if page_results == 0:
                # An empty page means we ran past the last result page.
                break
        except Exception as e:
            print(f"[SailboatListings] Error page {page}: {e}")
            break
    print(f"[SailboatListings] Done — {len(results)} listings total")
    return results
def _sbl_parse_ft(val):
    """Parse a feet value such as "30'" or "5.25'"; return None if unparseable."""
    if not val:
        return None
    m = re.match(r'([\d.]+)', val)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        # e.g. "1.2.3" or "." — treat as missing rather than dropping the vessel
        return None


def scrape_and_extract_sailboatlistings(query: str, filters: dict, search_id: str, max_pages: int = 8):
    """
    Run the SailboatListings scraper and extract vessel data inline.

    Extraction uses the structured ``fields`` of each raw listing when present
    and regexes over snippet + title otherwise (no model call is made here).
    Each vessel is passed to ``save_vessel`` immediately; for every save that
    returns a positive id, ``search_state['found']`` is incremented and a line
    is appended to ``search_state['log']``.

    The loop stops early when ``search_state`` no longer carries ``search_id``
    or is flagged ``cancelled``.

    Args:
        query: Keyword forwarded to ``scrape_sailboatlistings``.
        filters: Search filters; ``min_loa`` (metres) and ``max_price`` are
            re-applied here to the extracted values.
        search_id: Identifier of the search this run belongs to.
        max_pages: Forwarded to ``scrape_sailboatlistings``.
    """
    print(f"[SBL-Thread] Starting SailboatListings extraction...")
    raw_results = scrape_sailboatlistings(query, filters, max_pages)
    if not raw_results:
        print("[SBL-Thread] No results from SailboatListings")
        return
    sbl_min_loa = float(filters.get("min_loa") or 0)
    sbl_max_price = float(filters.get("max_price") or 0)
    saved = 0
    for raw in raw_results:
        if search_state.get('search_id') != search_id or search_state.get('cancelled'):
            print("[SBL-Thread] Search cancelled — stopping")
            return
        try:
            snippet = raw.get("snippet", "")
            title = raw.get("title", "")
            fields = raw.get("fields", {})  # structured fields from main listings
            src = snippet + " " + title
            # ── Use structured fields directly when available (main listings) ──
            if fields:
                loa_ft = _sbl_parse_ft(fields.get("Length"))
                beam_ft = _sbl_parse_ft(fields.get("Beam"))
                draft_ft = _sbl_parse_ft(fields.get("Draft"))
                year_val = fields.get("Year", "")
                year_m = re.search(r'(\d{4})', year_val)
                asking = fields.get("Asking", "")
                price_r = re.search(r'\$\s*([\d,]{3,})', asking)
                location = fields.get("Location", "")
                hull_val = fields.get("Hull", "").lower()
            else:
                # Fallback: regex for sidebar/featured listings
                length_r = re.search(r'Length:\s*([\d.]+)', src, re.IGNORECASE)
                beam_r = re.search(r'Beam:\s*([\d.]+)', src, re.IGNORECASE)
                draft_r = re.search(r'Draft:\s*([\d.]+)', src, re.IGNORECASE)
                year_r = re.search(r'Year:\s*(\d{4})', src, re.IGNORECASE)
                price_r = re.search(r'(?:Asking|Price):?\s*\$\s*([\d,]{3,})', src, re.IGNORECASE)
                if not price_r:
                    price_r = re.search(r'\$\s*([\d,]{4,})', src)
                loa_ft = _sbl_parse_ft(length_r.group(1)) if length_r else None
                beam_ft = _sbl_parse_ft(beam_r.group(1)) if beam_r else None
                draft_ft = _sbl_parse_ft(draft_r.group(1)) if draft_r else None
                year_m = year_r
                location = raw.get("location", "")
                hull_val = ""
                loc_r = re.search(r'Location:\s*([^\n\r]{3,60}?)(?:\s{2,}|$)', src, re.IGNORECASE)
                if loc_r:
                    location = loc_r.group(1).strip()
                hull_r2 = re.search(r'Hull:\s*([^\n\r]{3,50}?)(?:\s{2,}|$)', src, re.IGNORECASE)
                if hull_r2:
                    hull_val = hull_r2.group(1).lower()
            # Fallback: extract LOA from title e.g. "35' Pearson 35"
            if not loa_ft:
                tm = re.search(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', title, re.IGNORECASE)
                if tm:
                    loa_ft = float(tm.group(1))
            # Site values are in feet; stored values are metres.
            loa_m = round(loa_ft * 0.3048, 1) if loa_ft else None
            beam_m = round(beam_ft * 0.3048, 1) if beam_ft else None
            draft_m = round(draft_ft * 0.3048, 1) if draft_ft else None
            year = int(year_m.group(1)) if year_m else None
            location = location or raw.get("location", "")
            price_usd = None
            if price_r:
                try:
                    price_usd = float(price_r.group(1).replace(",", ""))
                except ValueError:
                    # e.g. a run of commas only — leave the price unset
                    pass
            if not price_usd and raw.get("price_text"):
                pm = re.search(r'[\d,]+', raw["price_text"].replace("$", ""))
                if pm:
                    try:
                        price_usd = float(pm.group().replace(",", ""))
                    except ValueError:
                        pass
            # Skip only if absolutely no data
            if not loa_m and not year and not price_usd:
                continue
            # Apply filters (small tolerances absorb ft→m rounding)
            if sbl_min_loa and loa_m and loa_m < (sbl_min_loa - 0.15):
                continue
            if sbl_max_price and price_usd and price_usd > sbl_max_price * 1.01:
                continue
            # Hull normalisation
            hull_txt = hull_val
            hull = ("Fiberglass" if "fiber" in hull_txt or "glass" in hull_txt else
                    "Steel" if "steel" in hull_txt else
                    "Aluminum" if "alum" in hull_txt else
                    "Wood" if "wood" in hull_txt else
                    "Composite" if "comp" in hull_txt else "Unknown")
            # Algorithmic score (fast, no AI)
            score = 50
            if loa_m:
                score += min(15, int((loa_m - 13) * 1.5)) if loa_m >= 13 else 0
            if year:
                score += min(10, max(0, (year - 1980) // 3))
            price_per_ft = None
            if price_usd and loa_m:
                price_per_ft = price_usd / (loa_m / 0.3048)
                if price_per_ft < 500:
                    score += 15
                elif price_per_ft < 1000:
                    score += 8
            flags = []
            if price_per_ft is not None and price_per_ft < 600:
                flags.append("below_market")
            # Only mention the length when it is known (avoids "LOA: Noneft").
            desc_parts = [f"Velero {title}"]
            if loa_ft:
                desc_parts.append(f"LOA: {loa_ft}ft")
            if location:
                desc_parts.append(location)
            data = {
                "name": title or "SailboatListings boat",
                "vessel_type": "Sailboat",
                "loa_m": loa_m,
                "beam_m": beam_m,
                "draft_m": draft_m,
                "year_built": year,
                "hull": hull,
                "propulsion": "Sail",
                "status": "active",
                "price_usd": price_usd,
                "currency": "USD",
                "location": location,
                "country": "US",
                "description": ". ".join(desc_parts),
                "flags": flags,
                "score": min(100, score),
                "images": [raw["img_url"]] if raw.get("img_url") else [],
                "source_url": raw["url"],
                "source_name": "SailboatListings",
            }
            vid = save_vessel(data)
            if vid > 0:
                search_state['found'] += 1
                saved += 1
                details = []
                if loa_ft:
                    details.append(f"{loa_ft}ft")
                if price_usd:
                    details.append(f"${price_usd:,.0f}")
                detail_txt = f" ({', '.join(details)})" if details else ""
                msg = f"✓ {title}{detail_txt} — SailboatListings"
                print(f"[SBL-Thread] {msg}")
                search_state['log'].append(msg)
        except Exception as e:
            # One bad listing must not abort the whole run.
            print(f"[SBL-Thread] Error on {raw.get('title','?')}: {e}")
    print(f"[SBL-Thread] Done — {saved}/{len(raw_results)} vessels saved")
def stealth_fetch(url: str, max_chars: int = 3000) -> tuple:
    """
    Fetch a Cloudflare-protected page using Playwright with human-like behavior.

    Techniques used:
    - Realistic viewport and user agent
    - Human-like delays and incremental scrolling
    - Accept cookies automatically
    - Disable webdriver flags

    Args:
        url: Page to load.
        max_chars: Maximum length of the returned text.

    Returns:
        ``(text, [image_urls])``. On any failure the error is printed and
        whatever was collected so far is returned (``("", [])`` at worst).
    """
    text = ""
    images = []
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    # NOTE(review): this switches off same-origin protections
                    # in the scraping browser — confirm it is really needed.
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                ]
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1366, 'height': 768},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    java_script_enabled=True,
                    ignore_https_errors=True,
                    extra_http_headers={
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                    }
                )
                # Remove webdriver property
                context.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                    window.chrome = {runtime: {}};
                """)
                page = context.new_page()
                # Navigate with realistic timeout
                page.goto(url, timeout=30000, wait_until='domcontentloaded')
                # Random delay like a human reading
                page.wait_for_timeout(random.randint(1500, 3000))
                # Accept cookies if button exists; the first selector that
                # clicks successfully wins.
                for selector in ['button:has-text("Accept")', 'button:has-text("Accept All")',
                                 '#onetrust-accept-btn-handler', '.cookie-accept']:
                    try:
                        page.click(selector, timeout=1000)
                        page.wait_for_timeout(500)
                        break
                    except Exception:
                        # Selector absent or not clickable — try the next one.
                        pass
                # Scroll down naturally
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(500, 1200))
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(300, 800))
                # Get content
                html = page.content()
            finally:
                # Close explicitly even when navigation fails.
                browser.close()
        soup = BeautifulSoup(html, 'html.parser')
        # Extract images — check all lazy-load attributes
        skip_words = ['logo', 'icon', 'avatar', 'banner', 'pixel', 'sprite',
                      'placeholder', 'blank', 'loading', 'spacer', '1x1']
        # "ad"/"ads" must be a standalone token: a plain substring test would
        # also reject "uploads", "boattrader", "florida", ...
        ad_token = re.compile(r'(?<![a-z])ads?(?![a-z])')
        seen_imgs = set()
        for img in soup.find_all('img'):
            src = _extract_best_src(img)
            if not src or src in seen_imgs:
                continue
            lowered = src.lower()
            if any(s in lowered for s in skip_words) or ad_token.search(lowered):
                continue
            seen_imgs.add(src)
            images.append(src)
            if len(images) >= 12:
                break
        # Drop non-content elements before extracting the visible text.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            tag.decompose()
        text = ' '.join(soup.get_text(' ', strip=True).split())[:max_chars]
    except Exception as e:
        print(f"[Stealth] Error: {e}")
    return text, images
# Sites that need stealth scraping (Cloudflare protected).
# smart_fetch() routes a URL through stealth_fetch() when any of these strings
# occurs in the URL's domain part.
STEALTH_REQUIRED = [
    'yachtworld.com', 'boats.com', 'boattrader.com',
    'rightboat.com', 'boat24.com', 'yachtall.com',
    'botentekoop.nl', 'leboncoin.fr', 'annoncesbateau.com',
    'thehulltruth.com', 'cruisersforum.com',
]
def smart_fetch(url: str, max_chars: int = 3000) -> tuple:
    """Use stealth for protected sites, regular fetch for others.

    Args:
        url: Page to fetch.
        max_chars: Maximum length of the returned text. The plain fetcher has
            its own internal cap, so larger values only take effect on the
            stealth path.

    Returns:
        ``(text, [image_urls])``.
    """
    # Host part of the URL, lower-cased so "WWW.YachtWorld.com" still matches.
    domain = url.split('/')[2].replace('www.', '').lower() if '//' in url else ''
    needs_stealth = any(s in domain for s in STEALTH_REQUIRED)
    if needs_stealth:
        print(f"[Fetch] Using stealth for {domain}")
        return stealth_fetch(url, max_chars)
    # fetch_page_with_images() takes no length argument; enforce the caller's
    # limit here so both paths honour max_chars.
    text, images = fetch_page_with_images(url)
    return text[:max_chars], images
def scrape_yachtworld(query: str, filters: dict, max_pages: int = 5) -> list:
    """
    Dedicated YachtWorld stealth scraper.

    Builds a filtered listing URL and pages through it with Playwright,
    collecting every link whose href contains ``/boat-details/`` or
    ``/yacht/``.

    Args:
        query: Not used by this scraper; results are driven by ``filters``.
        filters: Keys read: ``type``, ``min_loa`` (metres, converted to feet)
            and ``max_price``. Missing, empty or ``None`` means unset.
        max_pages: Upper bound on result pages visited.

    Returns:
        A list of listing dicts (``url``, ``title``, ``snippet``,
        ``price_text``, ``img_url``, ``location``, ``source``,
        ``source_type``, ``category``). Errors are printed and the listings
        gathered so far are returned.
    """
    results = []
    seen = set()
    # Build YachtWorld filtered URL
    # `or ""` so an explicit None does not crash on .lower()
    vessel_type = str(filters.get("type") or "").lower()
    yw_type = "sail" if vessel_type in ["sailboat", "sail", "velero", "yacht", ""] else "power"
    min_loa = filters.get("min_loa", "")
    max_price = filters.get("max_price", "")
    base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/"
    if vessel_type in ["sailboat", "sail", "velero", ""]:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"
    if min_loa:
        ft = int(float(min_loa) / 0.3048)
        base_url += f"length-{ft}/"
    if max_price:
        base_url += f"price-0,{max_price}/"
    print(f"[YachtWorld] Scraping: {base_url}")
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled', '--no-sandbox']
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
                    "window.chrome = {runtime: {}};"
                )
                for page_num in range(1, max_pages + 1):
                    if search_state.get('cancelled'):
                        break
                    page_url = base_url if page_num == 1 else base_url + f"?page={page_num}"
                    try:
                        page = context.new_page()
                        try:
                            page.goto(page_url, timeout=30000, wait_until='domcontentloaded')
                            page.wait_for_timeout(random.randint(2000, 4000))
                            # Scroll to load lazy content
                            for _ in range(3):
                                page.evaluate("window.scrollBy(0, 400)")
                                page.wait_for_timeout(random.randint(400, 800))
                            html = page.content()
                        finally:
                            # Release the tab whether or not navigation worked.
                            page.close()
                        soup = BeautifulSoup(html, 'html.parser')
                        # YachtWorld listing cards
                        page_count = 0
                        for a in soup.find_all('a', href=True):
                            href = a['href']
                            if '/boat-details/' not in href and '/yacht/' not in href:
                                continue
                            if not href.startswith('http'):
                                href = 'https://www.yachtworld.com' + href
                            if href in seen:
                                continue
                            seen.add(href)
                            title = a.get_text(strip=True)
                            # The immediate parent supplies snippet text and image.
                            parent = a.find_parent() or a
                            ctx = parent.get_text(' ', strip=True)[:300]
                            img = ""
                            for im in parent.find_all('img'):
                                src = im.get('src') or im.get('data-src', '')
                                if src and 'rendered_yacht' in src:
                                    img = src
                                    break
                            results.append({
                                "url": href, "title": title,
                                "snippet": ctx, "price_text": "",
                                "img_url": img, "location": "",
                                "source": "YachtWorld",
                                "source_type": "broker",
                                "category": "Brokers Especializados",
                            })
                            page_count += 1
                        print(f"[YachtWorld] Page {page_num}: {page_count} listings")
                        if page_count == 0:
                            break
                        # Polite pause between pages
                        if page_num < max_pages:
                            polite_pause("YachtWorld")
                    except Exception as e:
                        print(f"[YachtWorld] Page {page_num} error: {e}")
                        break
            finally:
                browser.close()
    except Exception as e:
        print(f"[YachtWorld] Fatal error: {e}")
    print(f"[YachtWorld] Total: {len(results)} listings")
    return results
def fetch_page_text(url: str, max_chars: int = 2000) -> str:
    """Return the visible text of ``url``, whitespace-collapsed and cut to ``max_chars``.

    Returns an empty string for any non-200 response or on any error.
    """
    try:
        response = requests.get(url, headers=get_headers(), timeout=15, verify=False)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Remove elements that carry no listing content.
            for junk in soup(["script","style","nav","footer","header","aside","noscript"]):
                junk.decompose()
            words = soup.get_text(" ", strip=True).split()
            return " ".join(words)[:max_chars]
        return ""
    except Exception:
        return ""
def _extract_best_src(img_tag) -> str:
"""Extract the best image URL from an <img> tag, handling lazy-load patterns."""
candidates = [
img_tag.get("src",""),
img_tag.get("data-src",""),
img_tag.get("data-lazy-src",""),
img_tag.get("data-original",""),
img_tag.get("data-lazy",""),
img_tag.get("data-image",""),
img_tag.get("data-full",""),
img_tag.get("data-url",""),
img_tag.get("data-hi-res-src",""),
]
# Also check srcset — take the largest variant
srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","")
if srcset:
parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()]
candidates.extend(parts)
for c in candidates:
c = c.strip()
if c and c.startswith("http") and not c.startswith("data:"):
return c
return ""
def fetch_page_with_images(url: str) -> tuple:
    """Fetch page text AND images.

    Args:
        url: Page to fetch; also sent as the referer.

    Returns:
        ``(text, [image_urls])`` — text is whitespace-collapsed and capped at
        3000 characters, at most 10 image URLs are returned. A non-200
        response or an error falls back to ``fetch_page_text(url)`` for the
        text.
    """
    text = ""
    images = []
    base_url = "/".join(url.split("/")[:3])  # scheme://host
    try:
        r = requests.get(url, headers=get_headers(referer=url), timeout=18, verify=False)
        if r.status_code != 200:
            return fetch_page_text(url), []
        soup = BeautifulSoup(r.text, "html.parser")
        # Extract images before stripping tags
        skip_words = ["logo", "icon", "avatar", "banner", "pixel", "track", "sprite", "button",
                      "placeholder", "blank", "loading", "spacer", "1x1", "transparent"]
        # "ad"/"ads" must be a standalone token: a plain substring test would
        # also reject "uploads", "boattrader", "florida", ...
        ad_token = re.compile(r'(?<![a-z])ads?(?![a-z])')
        seen_imgs = set()
        for img in soup.find_all("img"):
            src = _extract_best_src(img)
            if not src:
                # _extract_best_src only yields absolute URLs; fall back to
                # the raw attribute so relative paths can be normalised below.
                src = (img.get("data-src") or img.get("src") or "").strip()
            if not src:
                continue
            # Normalise relative URLs
            if src.startswith("//"):
                src = "https:" + src
            elif src.startswith("/"):
                src = base_url + src
            if not src.startswith("http"):
                continue
            lowered = src.lower()
            if any(s in lowered for s in skip_words) or ad_token.search(lowered):
                continue
            if src in seen_imgs:
                continue
            # Skip images whose declared width marks them as tiny icons.
            try:
                w = int(str(img.get("width", "0")).replace("px", "") or 0)
            except ValueError:
                w = 0  # e.g. "100%" — no usable pixel width
            if 0 < w < 100:
                continue
            seen_imgs.add(src)
            images.append(src)
            if len(images) >= 10:
                break
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
            tag.decompose()
        text = " ".join(soup.get_text(" ", strip=True).split())[:3000]
    except Exception:
        text = fetch_page_text(url)
    return text, images
# ══════════════════════════════════════════════════════════════════════════════
# DEDICATED SOURCE SCRAPERS
# Each function handles one site's quirks. scrape_source_router dispatches here.
# ══════════════════════════════════════════════════════════════════════════════
def _ebay_hi_res(img_url: str) -> str:
    """Rewrite an eBay CDN size token (s-l140, s-l225, ...) to s-l500."""
    return re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', img_url)


def scrape_ebay(src: dict, query: str, filters: dict) -> list[dict]:
    """
    eBay Marine scraper — uses Playwright (Akamai blocks plain requests).
    Handles all eBay entries: Marine, Auction, Sail, Salvage, etc.

    New eBay layout (2024+) uses:
    - <a class="s-card__link"> for item links
    - Text title in nearby spans/divs
    - <img> with i.ebayimg.com CDN URLs (s-l500 quality)
    Pages without ``li.s-card`` elements are handed to
    ``_parse_ebay_old_layout``.

    Args:
        src: Source descriptor; ``search_url`` must contain a ``{query}``
            placeholder, ``name`` / ``type`` / ``category`` label the results.
        query: Search text; repeated words are removed before quoting.
        filters: Optional filters; only ``type`` is read, to pick the eBay
            category id. ``None`` or a missing/``None`` ``type`` leaves the
            category unchanged.

    Returns:
        A list of listing dicts; empty when ``search_url`` is missing or the
        page could not be loaded.
    """
    results = []
    seen = set()
    raw_url = src.get("search_url", "")
    if not raw_url:
        return []
    clean_q = " ".join(dict.fromkeys(query.strip().split()))
    url = raw_url.replace("{query}", requests.utils.quote(clean_q))
    # ── Adjust eBay category based on vessel type filter ──────────────────────
    # 26429=All Boats  36431=Sailboats  36432=Powerboats  26430=PWC  63613=Kayaks
    # `or` guards so neither filters=None nor type=None crashes on .lower()
    vtype = str((filters or {}).get("type") or "").lower()
    EBAY_CAT = {
        "sailboat": "36431", "sail": "36431", "velero": "36431",
        "motor": "36432", "motorboat": "36432", "yacht": "36432",
        "fishing": "36432", "tug": "36432", "barge": "36432",
        "offshore": "36432", "ferry": "36432",
    }
    if vtype in EBAY_CAT:
        url = re.sub(r'_sacat=\d+', f'_sacat={EBAY_CAT[vtype]}', url)
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"]
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(1500, 2500))
                    # Scroll a bit to trigger lazy images
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(800)
                    html = page.content()
                except Exception as e:
                    print(f"[{src['name']}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()
        if not html:
            return []
        soup = BeautifulSoup(html, "html.parser")
        # ── New layout (2024+): li.s-card ─────────────────────────────────────
        cards = soup.find_all("li", class_="s-card")
        # ── Old layout fallback: li.s-item ────────────────────────────────────
        if not cards:
            return _parse_ebay_old_layout(soup, src)
        for card in cards:
            try:
                # Title + URL — a.s-card__link WITHOUT image-treatment class
                title_link = None
                for a in card.find_all("a", class_="s-card__link"):
                    if "image-treatment" in (a.get("class") or []):
                        continue
                    t = a.get_text(strip=True)
                    if t and not t.lower().startswith("shop on ebay"):
                        title_link = a
                        break
                if not title_link:
                    continue
                href = title_link.get("href", "")
                if "/itm/" not in href:
                    continue
                # Canonical item URL: drops tracking query parameters so the
                # same item is only reported once.
                m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href)
                if not m:
                    continue
                href = m.group(1)
                if href in seen:
                    continue
                seen.add(href)
                # Clean title — strip eBay UI noise appended to link text
                title = title_link.get_text(strip=True)
                title = re.sub(r'\s*Opens in a new window or tab.*', '',
                               title, flags=re.IGNORECASE).strip()
                # Price ── .s-card__price
                price_tag = (card.find(class_="s-card__price") or
                             card.find(class_="s-item__price"))
                price = price_tag.get_text(strip=True) if price_tag else ""
                # Image ── img inside a.s-card__link.image-treatment
                img = ""
                img_link = card.find("a", class_="image-treatment")
                if img_link:
                    im = img_link.find("img")
                    if im:
                        raw = (_extract_best_src(im) or
                               im.get("src", "") or im.get("data-src", ""))
                        if raw:
                            img = _ebay_hi_res(raw)
                # Fallback: any ebayimg.com src in the card
                if not img:
                    for im in card.find_all("img"):
                        raw = (_extract_best_src(im) or im.get("src", ""))
                        if raw and "ebayimg.com" in raw:
                            img = _ebay_hi_res(raw)
                            break
                # Location ── "Located in: XXX" — stop before "Delivery"
                location = ""
                card_text = card.get_text(" ", strip=True)
                lm = re.search(
                    r'[Ll]ocated in[:\s]+([A-Za-z][^,\|•\n$\d]{2,30})',
                    card_text)
                if lm:
                    loc_raw = lm.group(1).strip()
                    # Trim trailing noise like "Delivery or pickup..."
                    loc_raw = re.split(r'\s+[Dd]elivery|\s+[Ss]hipping',
                                       loc_raw)[0].strip()
                    location = loc_raw
                results.append({
                    "url": href,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "eBay"),
                    "source_type": src.get("type", "classifieds"),
                    "category": src.get("category", "Clasificados USA"),
                })
            except Exception:
                # A malformed card is skipped; the rest of the page still counts.
                continue
        print(f"[{src['name']}] {len(results)} listings (new layout)")
    except Exception as e:
        print(f"[{src['name']}] Error: {e}")
    return results
def _parse_ebay_old_layout(soup, src: dict) -> list[dict]:
    """Fallback for the classic eBay li.s-item layout.

    Args:
        soup: Parsed search-results page.
        src: Source descriptor; ``name`` / ``type`` / ``category`` label the
            results.

    Returns:
        A list of listing dicts in the same shape ``scrape_ebay`` produces.
    """
    results = []
    seen = set()
    for item in soup.find_all("li", class_="s-item"):
        try:
            link_tag = item.find("a", class_="s-item__link")
            if not link_tag:
                continue
            href = link_tag.get("href", "")
            if "/itm/" not in href:
                continue
            # Canonicalise to the bare item URL so tracking parameters do not
            # defeat de-duplication. "www." is optional, matching scrape_ebay.
            m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href)
            if m:
                href = m.group(1)
            if href in seen:
                continue
            seen.add(href)
            title_tag = (item.find("span", class_="BOLD") or
                         item.find("div", class_="s-item__title") or
                         item.find("span", class_="s-item__title"))
            title = (title_tag or link_tag).get_text(strip=True)
            # "Shop on eBay" entries are placeholders, not listings.
            if not title or title.lower().startswith("shop on ebay"):
                continue
            price_tag = item.find("span", class_="s-item__price")
            price = price_tag.get_text(strip=True) if price_tag else ""
            img = ""
            img_tag = item.find("img")
            if img_tag:
                img = (_extract_best_src(img_tag) or img_tag.get("src", ""))
                if img:
                    # Ask the CDN for the 500px rendition.
                    img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', img)
            loc_tag = (item.find("span", class_="s-item__location") or
                       item.find("span", class_="s-item__itemLocation"))
            location = ""
            if loc_tag:
                location = (loc_tag.get_text(strip=True)
                            .replace("Located in: ", "").strip())
            results.append({
                "url": href, "title": title, "snippet": f"{price} {location}".strip(),
                "price_text": price, "img_url": img, "location": location,
                "source": src.get("name", "eBay"), "source_type": src.get("type", "classifieds"),
                "category": src.get("category", "Clasificados USA"),
            })
        except Exception:
            # Skip a malformed item and keep parsing the rest.
            continue
    print(f"[{src.get('name','eBay')}] {len(results)} listings (old layout)")
    return results
def scrape_boattrader(src: dict, query: str, filters: dict) -> list[dict]:
    """
    BoatTrader scraper, driven through Playwright (plain requests hit
    Cloudflare Turnstile).

    Markup relied on:
      li.lib-card                  — card root
      a[href^="/boat/...-<ID>/"]   — listing URL
      [class*=listingTitle]        — title element
      [class*=listingPrice]        — price element
      img                          — photo
      "City, ST ZIP" text pattern  — location

    ``filters`` is accepted for signature parity with the other scrapers and
    is not read. Returns a list of listing dicts; empty when ``search_url`` is
    missing or the page could not be loaded.
    """
    listings = []
    visited = set()
    template = src.get("search_url", "")
    if not template:
        return []
    deduped_query = " ".join(dict.fromkeys(query.strip().split()))
    search_url = template.replace("{query}", requests.utils.quote(deduped_query))
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as pw:
            launch_args = ["--disable-blink-features=AutomationControlled",
                           "--no-sandbox", "--disable-dev-shm-usage"]
            browser = pw.chromium.launch(headless=True, args=launch_args)
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US",
                timezone_id="America/New_York",
                ignore_https_errors=True,
            )
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            tab = context.new_page()
            try:
                tab.goto(search_url, timeout=35000, wait_until="domcontentloaded")
                # Give the React app time to hydrate and render the cards.
                tab.wait_for_timeout(random.randint(4000, 6000))
                tab.evaluate("window.scrollBy(0, 600)")
                tab.wait_for_timeout(1500)
                html = tab.content()
            except Exception as nav_err:
                print(f"[{src['name']}] Playwright nav error: {nav_err}")
                html = ""
            finally:
                try:
                    tab.close()
                except:
                    pass
            browser.close()
        if not html:
            return []
        soup = BeautifulSoup(html, "html.parser")
        # Card root: li.lib-card, else any element carrying the lib-card class.
        cards = soup.find_all("li", class_="lib-card")
        if not cards:
            cards = soup.find_all(class_=re.compile(r'\blib-card\b'))
        for card in cards:
            try:
                # Link: /boat/YEAR-MAKE-...-ID/
                anchor = card.find("a", href=re.compile(r'^/boat/[\w-]+-\d+/$'))
                if not anchor:
                    continue
                listing_url = "https://www.boattrader.com" + anchor["href"]
                if listing_url in visited:
                    continue
                visited.add(listing_url)
                # Title: element whose class contains 'listingTitle', else
                # rebuilt from the URL slug (2026-catalina-34-123 → 2026 Catalina 34).
                title_node = card.find(class_=re.compile(r'listingTitle', re.I))
                if title_node:
                    title = title_node.get_text(strip=True)
                else:
                    slug = anchor["href"].strip("/").split("/")[-1]
                    title = slug.rsplit("-", 1)[0].replace("-", " ").title()
                if not title:
                    continue
                # Price: first dollar amount only, ignoring "/mo*" noise.
                price = ""
                price_node = card.find(class_=re.compile(r'listingPrice', re.I))
                if price_node:
                    amount = re.search(r'\$\s*([\d,]+)', price_node.get_text(" ", strip=True))
                    if amount:
                        price = f"${amount.group(1)}"
                # Image: first absolute, non-SVG <img> source in the card.
                img = ""
                for picture in card.find_all("img"):
                    candidate = (_extract_best_src(picture) or
                                 picture.get("src","") or picture.get("data-src",""))
                    if candidate and candidate.startswith("http") and not candidate.endswith(".svg"):
                        img = candidate
                        break
                # Location: "City, ST ZIP" pattern, preferably from the caption element.
                caption = card.find(class_=re.compile(r'listingCaption|listingLocation', re.I))
                if caption:
                    haystack = caption.get_text(" ", strip=True)
                else:
                    haystack = card.get_text(" ", strip=True)
                place = re.search(
                    r'\b([A-Z][a-zA-Z\s]{2,20},\s+[A-Z]{2}(?:\s+\d{5})?)',
                    haystack)
                location = place.group(1).strip() if place else ""
                listings.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "BoatTrader"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                continue
        print(f"[{src['name']}] {len(listings)} listings")
    except Exception as e:
        print(f"[{src['name']}] Error: {e}")
    return listings
def scrape_apolloduck(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Apollo Duck listings with plain requests + BeautifulSoup (no JS).

    Two card types are parsed from the page:
      Sidebar cards:  div.eastSDFPPanel   -> a.SidebarTitle, a.SidebarPrice, img
      Featured cards: div._FeatureAdPanel -> a._FeatureTitle, span._FeaturePrice,
                      img, td._PanelSpecLabel + next sibling td (location)
    Listing URL pattern: https://www.apolloduck.com/boat/{slug}/{id}

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url".
        query: Free-text search query.
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Request/parse errors are logged and whatever was
        collected so far is returned.
    """
    results = []
    seen = set()
    # Resolve the log label once with .get() so logging can never raise
    # KeyError (in particular not from inside the except handler below).
    label = src.get("name", "Apollo Duck")

    # Use Apollo Duck keyword search — returns results filtered by query.
    # Strip a trailing "for sale" / "en venta" / "à vendre" / "zu verkaufen"
    # since the site searches listing titles and those phrases rarely
    # appear there.
    stripped_q = re.sub(
        r'\s*(for\s+sale|en\s+venta|à\s+vendre|zu\s+verkaufen)\s*$',
        '', query.strip(), flags=re.I).strip()
    clean_q = requests.utils.quote(stripped_q or query.strip())
    if clean_q:
        url = f"https://www.apolloduck.com/search.phtml?search={clean_q}&sr=1&q=1"
    else:
        raw_url = src.get("search_url", "") or "https://www.apolloduck.com/boats/used-boats-for-sale"
        url = raw_url.replace("{query}", clean_q)
    is_search = bool(clean_q)  # only featured cards are query-filtered

    try:
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        }
        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")

        def _parse_card(card, title_sel, price_sel, is_featured=False):
            """Common extraction for both card types; appends to results."""
            title_el = card.select_one(title_sel)
            if not title_el:
                return
            title = title_el.get_text(strip=True)
            if not title:
                return

            # URL — from the title link, else any /boat/ link in the card
            href = title_el.get("href", "")
            if not href:
                a = card.find("a", href=re.compile(r'/boat/'))
                href = a["href"] if a else ""
            if not href:
                return
            full_url = ("https://www.apolloduck.com" + href
                        if href.startswith("/") else href)
            if full_url in seen:
                return
            seen.add(full_url)

            # Price
            price_el = card.select_one(price_sel)
            price = price_el.get_text(strip=True) if price_el else ""

            # Image — first absolute non-SVG src, else first srcset candidate
            img = ""
            for im in card.find_all("img"):
                raw = (im.get("src") or im.get("data-src") or
                       im.get("data-lazy-src") or "")
                if raw and raw.startswith("http") and not raw.endswith(".svg"):
                    img = raw
                    break
                # srcset fallback; split first so a whitespace-only
                # attribute cannot raise IndexError
                srcset_parts = im.get("srcset", "").split()
                if srcset_parts:
                    img = srcset_parts[0]
                    break

            # Location — only featured cards have it
            location = ""
            if is_featured:
                for lbl in card.select("td._PanelSpecLabel"):
                    if "location" in lbl.get_text(strip=True).lower():
                        loc_td = lbl.find_next_sibling("td")
                        if loc_td:
                            location = loc_td.get_text(strip=True)
                        break

            results.append({
                "url": full_url,
                "title": title[:120],
                "snippet": f"{price} {location}".strip(),
                "price_text": price,
                "img_url": img,
                "location": location,
                "source": src.get("name", "Apollo Duck"),
                "source_type": src.get("type", "broker"),
                "category": src.get("category", "Venta Especializada"),
            })

        # Featured cards — always query-filtered on search results
        for card in soup.select("div._FeatureAdPanel"):
            try:
                _parse_card(card, "a._FeatureTitle", "span._FeaturePrice",
                            is_featured=True)
            except Exception:
                # Best effort, like the sibling scrapers: one malformed
                # card must not abort the rest of the page.
                continue

        # Sidebar cards — only when browsing a category (NOT on keyword
        # search, because the sidebar shows the same generic listings
        # regardless of query)
        if not is_search:
            for card in soup.select("div.eastSDFPPanel"):
                try:
                    _parse_card(card, "a.SidebarTitle", "a.SidebarPrice")
                except Exception:
                    continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_boatsdotcom(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Boats.com search results, rendering the page with Playwright.

    Two card types are parsed:
      Sponsored/real: li[data-listing-id] -> h2 + div.year, div.price,
                      div.img-container img, div.country
      OEM specs:      li.enhanced.oem     -> h2 + div.year, div.price,
                      div.img-container img (no location)
    Listing URL pattern: https://www.boats.com/{type}/{year}-{make}-{id}/

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url" (may contain a "{query}" placeholder).
        query: Free-text search query (URL-quoted before substitution).
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded; other
        errors are logged and the partial results returned.
    """
    results = []
    seen = set()
    # .get() so that log lines (including those inside except handlers)
    # cannot raise KeyError when the source config has no "name".
    label = src.get("name", "Boats.com")
    raw_url = src.get("search_url", "") or "https://www.boats.com/boats-for-sale/?query={query}"
    clean_q = requests.utils.quote(query.strip())
    url = raw_url.replace("{query}", clean_q)

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"]
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                # Hide the most common automation fingerprints
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    # Scroll a little, then pause, before snapshotting the DOM
                    page.evaluate("window.scrollBy(0, 600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        # Closing is best effort; never mask the real error
                        pass
            finally:
                # Close the browser even if context/page setup failed
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        def _extract_card(card, has_location=True):
            """Append one listing dict for *card* unless it is unusable."""
            # URL — first site-relative link in the card
            a = card.find("a", href=re.compile(r'^/'))
            if not a:
                return
            href = "https://www.boats.com" + a["href"]
            if href in seen:
                return
            seen.add(href)

            # Title = year + model name
            year_el = card.select_one("div.year")
            name_el = card.select_one("h2")
            year = year_el.get_text(strip=True) if year_el else ""
            name = name_el.get_text(strip=True) if name_el else ""
            title = f"{year} {name}".strip() if year else name
            if not title:
                return

            # Price — prefer a "$1,234" amount, else the raw text (truncated)
            price = ""
            price_el = card.select_one("div.price")
            if price_el:
                raw_p = price_el.get_text(" ", strip=True)
                pm = re.search(r'\$\s*([\d,]+)', raw_p)
                price = f"${pm.group(1)}" if pm else raw_p[:30]

            # Image
            img = ""
            img_container = card.select_one("div.img-container")
            if img_container:
                im = img_container.find("img")
                if im:
                    img = (_extract_best_src(im) or im.get("src", "")
                           or im.get("data-src", ""))

            # Location
            location = ""
            if has_location:
                loc_el = card.select_one("div.country")
                if loc_el:
                    location = loc_el.get_text(strip=True)

            results.append({
                "url": href,
                "title": title[:120],
                "snippet": f"{price} {location}".strip(),
                "price_text": price,
                "img_url": img,
                "location": location,
                "source": src.get("name", "Boats.com"),
                "source_type": src.get("type", "broker"),
                "category": src.get("category", "Venta Especializada"),
            })

        # Sponsored/real marketplace listings first, then OEM spec sheets.
        # Each card is guarded so one malformed card cannot abort the page.
        for selector, has_loc in (("li[data-listing-id]", True),
                                  ("li.enhanced.oem", False)):
            for card in soup.select(selector):
                try:
                    _extract_card(card, has_location=has_loc)
                except Exception:
                    continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_craigslist(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Craigslist boat listings: Playwright fetch + BeautifulSoup parse.

    Craigslist has no national search, so three cities are sampled at random
    from a fixed list on every call and their result pages are merged.

      Card root : any element carrying a data-pid attribute
      URL       : a.main[href]
      Title     : card "title" attribute, else span.label
      Price     : span.priceinfo
      Location  : span.result-location
      Image     : img[data-image-index="0"], else the first img in the card

    Args:
        src: Source config. Optional keys read: "name", "type", "category".
        query: Free-text search query.
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category", de-duplicated by URL across cities. Returns [] when no
        city page could be loaded.
    """
    results = []
    seen = set()
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "Craigslist Boats")
    # Regional subdomains to sample from
    CITIES = ["sfbay", "losangeles", "seattle", "miami", "boston",
              "newyork", "chicago", "houston", "dallas", "denver",
              "phoenix", "atlanta", "portland", "sandiego", "tampa",
              "minneapolis", "stlouis", "nashville", "raleigh", "saltlakecity"]
    qs = requests.utils.quote(query.strip())

    try:
        from playwright.sync_api import sync_playwright

        all_html_parts = []
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Fetch 3 random cities to keep runtime reasonable
                for city in random.sample(CITIES, min(3, len(CITIES))):
                    city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel"
                    page = ctx.new_page()
                    try:
                        page.goto(city_url, timeout=25000, wait_until="domcontentloaded")
                        page.wait_for_timeout(2500)
                        all_html_parts.append(page.content())
                    except Exception as e:
                        # A failing city is skipped, but no longer silently
                        print(f"[{label}] {city} nav error: {e}")
                    finally:
                        try:
                            page.close()
                        except Exception:
                            pass
            finally:
                browser.close()

        if not all_html_parts:
            return []

        # Parse all city HTMLs
        for html in all_html_parts:
            soup = BeautifulSoup(html, "html.parser")
            for card in soup.find_all(attrs={"data-pid": True}):
                try:
                    # URL — from the main (image) link
                    a_main = card.find("a", class_="main")
                    if not a_main:
                        continue
                    listing_url = a_main.get("href", "")
                    if not listing_url or listing_url in seen:
                        continue
                    seen.add(listing_url)

                    # Title — from card title attr or span.label
                    title = card.get("title", "")
                    if not title:
                        span = card.find("span", class_="label")
                        title = span.get_text(strip=True) if span else ""
                    if not title:
                        continue

                    # Price
                    price_el = card.find("span", class_="priceinfo")
                    price = price_el.get_text(strip=True) if price_el else ""

                    # Location
                    loc_el = card.find("span", class_="result-location")
                    location = loc_el.get_text(strip=True) if loc_el else ""

                    # Image — img with data-image-index="0", else any img
                    img = ""
                    im = card.find("img", attrs={"data-image-index": "0"})
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")
                    if not img:
                        im = card.find("img")
                        if im:
                            img = im.get("src", "") or im.get("data-src", "")

                    results.append({
                        "url": listing_url,
                        "title": title[:120],
                        "snippet": f"{price} {location}".strip(),
                        "price_text": price,
                        "img_url": img,
                        "location": location,
                        "source": src.get("name", "Craigslist Boats"),
                        "source_type": src.get("type", "classifieds"),
                        "category": src.get("category", "Clasificados Generales"),
                    })
                except Exception:
                    # Best effort: skip a malformed card, keep the rest
                    continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_rightboat(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Rightboat search results, rendering the page with Playwright.

      Card root : elements with data-tracking-bound="true"; the listing href
                  is read from the card element itself
      Image     : first img whose class matches "object-cover"
      Title     : that img's alt text, else an h1-h4 heading, else the URL slug
      Price     : p with a "font-bold" class containing $, £ or €, else the
                  first currency amount found in the card text
      Location  : text of the parent of the <i> icon whose class matches
                  "fa-location"

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url" (may contain a "{query}" placeholder).
        query: Free-text search query (URL-quoted before substitution).
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded.
    """
    results = []
    seen = set()
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "Rightboat")
    raw_url = (src.get("search_url", "")
               or "https://www.rightboat.com/boats-for-sale/?q={query}")
    clean_q = requests.utils.quote(query.strip())
    url = raw_url.replace("{query}", clean_q)

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"]
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(5000, 7000))
                    page.evaluate("window.scrollBy(0, 800)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(attrs={"data-tracking-bound": "true"})

        for card in cards:
            try:
                # URL — the card ITSELF is the <a> element
                href = card.get("href", "")
                if not href or "/boats-for-sale/" not in href:
                    continue
                listing_url = ("https://www.rightboat.com" + href
                               if href.startswith("/") else href)
                if listing_url in seen:
                    continue
                seen.add(listing_url)

                # Image — first object-cover img (main photo)
                img = ""
                im = card.find("img", class_=re.compile(r'object-cover'))
                if im:
                    img = im.get("src", "") or im.get("data-src", "")

                # Title — img alt, then heading, then URL slug
                title = ""
                if im:
                    title = im.get("alt", "").strip()
                if not title:
                    h_el = card.find(re.compile(r'^h[1-4]$'))
                    title = h_el.get_text(strip=True) if h_el else ""
                if not title:
                    # Build from the slug after /boats-for-sale/:
                    #   make/model/rbXXX -> "Make Model"
                    # Splitting on the marker (instead of the whole href)
                    # keeps scheme/host/prefix segments of absolute or
                    # prefixed hrefs out of the title.
                    slug_path = href.split("/boats-for-sale/", 1)[1]
                    parts = [seg for seg in slug_path.strip("/").split("/") if seg]
                    if len(parts) >= 2:
                        title = " ".join(parts[:-1]).replace("-", " ").title()
                if not title:
                    continue

                # Price — bold paragraph with a currency symbol, or regex fallback
                price = ""
                price_el = card.find("p", class_=re.compile(r'font-bold'))
                if price_el:
                    pt = price_el.get_text(strip=True)
                    if re.search(r'[\$£€]', pt):
                        price = pt
                if not price:
                    pm = re.search(r'[\$£€]\s*[\d,]+', card.get_text())
                    if pm:
                        price = pm.group(0)

                # Location — text of the element wrapping the location icon
                location = ""
                pin_icon = card.find("i", class_=re.compile(r'fa-location'))
                if pin_icon:
                    row = pin_icon.find_parent()
                    if row:
                        location = row.get_text(" ", strip=True).strip()

                results.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "Rightboat"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: skip a malformed card, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_cooperss(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Cooper Capital Specialty Salvage (cooperss.com) — salvage /
    insurance-loss vessels.

    The home page is fetched with plain requests. Listings come as paired
    elements that are matched by position:
      .listing-thumb  — image + link to the detail page
      .listing-detail — h5.blue (vessel name) + a two-column table
                        (Year, Size, Location, Minimum Bid, Type of Loss,
                        Bid Deadline)
    Elements carrying the "slick-cloned" class are ignored.

    Args:
        src: Source config. Optional keys read: "name", "category".
        query: Not used — every listing on the page is returned.
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "size_m", "source",
        "source_type" (always "salvage") and "category".
    """
    results = []
    seen = set()
    base = "https://www.cooperss.com"
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "Cooper Salvage")

    def _not_cloned(el):
        """True unless the element is a carousel clone (slick-cloned)."""
        return "slick-cloned" not in (el.get("class") or [])

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS),
                   "Accept-Language": "en-US,en;q=0.9"}
        resp = requests.get(base + "/", headers=headers, timeout=20)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        thumbs = [el for el in soup.find_all(class_="listing-thumb") if _not_cloned(el)]
        details = [el for el in soup.find_all(class_="listing-detail") if _not_cloned(el)]

        for thumb, detail in zip(thumbs, details):
            try:
                # URL
                a = thumb.find("a", href=True)
                if not a:
                    continue
                href = a["href"]
                if not href.startswith("http"):
                    href = base + "/" + href.lstrip("/")
                if href in seen:
                    continue
                seen.add(href)

                # Image
                img_tag = thumb.find("img")
                img = img_tag.get("src", "") if img_tag else ""
                if img and not img.startswith("http"):
                    img = base + "/" + img.lstrip("/")

                # Title — h5.blue (vessel name). Nested links (video button)
                # are removed first so their text does not leak into the name.
                h5 = detail.find("h5", class_="blue")
                if h5 is None:
                    continue
                for tag in h5.find_all("a"):
                    tag.decompose()
                title = h5.get_text(strip=True)
                if not title:
                    continue

                # Parse the detail table: two-cell rows become label -> value
                rows = {}
                for tr in detail.find_all("tr"):
                    tds = tr.find_all("td")
                    if len(tds) == 2:
                        rows[tds[0].get_text(strip=True)] = tds[1].get_text(strip=True)

                year = rows.get("Year", "")
                size = rows.get("Size", "")
                location = rows.get("Location", "")
                min_bid = rows.get("Minimum Bid", "")
                loss_type = rows.get("Type of Loss", "")
                deadline = rows.get("Bid Deadline", "")

                if year:
                    title = f"{year} {title}".strip()

                # Drop a leading "$" from the cell so the label never
                # renders as "Min Bid $$...".
                bid_amount = min_bid.lstrip("$").strip()
                price = f"Min Bid ${bid_amount}" if bid_amount else ""
                deadline_txt = f"Deadline: {deadline}" if deadline else ""
                snippet_parts = [part for part in (price, loss_type, location, deadline_txt)
                                 if part]

                results.append({
                    "url": href,
                    "title": title[:120],
                    "snippet": " | ".join(snippet_parts),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    # NOTE(review): key says metres but the raw "Size" cell
                    # is passed through unconverted — confirm the unit.
                    "size_m": size,
                    "source": src.get("name", "Cooper Salvage"),
                    "source_type": "salvage",
                    "category": src.get("category", "Salvage & Wrecks"),
                })
            except Exception:
                # Best effort: skip a malformed pair, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_inautia(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape iNautia search results, rendering the page with Playwright.

      Card     : elements with a data-grid-index attribute
      Link     : a.grid-listing-link[href]  (/boat/YEAR-MAKE-MODEL-ID/)
      Title    : element whose class matches "listingTitle", else the URL slug
      Price    : 5th "|"-separated field of the link's data-ssr-meta
                 attribute (rendered as EUR), else the "listingPrice" element
      Location : text of the element whose class matches "listingBody"
      Image    : first absolute, non-SVG img in the card

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url" (may contain a "{query}" placeholder).
        query: Free-text search query (URL-quoted before substitution).
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded.
    """
    results = []
    seen = set()
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "iNautia")
    raw_url = (src.get("search_url", "")
               or "https://www.inautia.com/boats/?q={query}")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};")
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(attrs={"data-grid-index": True})

        for card in cards:
            try:
                link_tag = card.find("a", class_=re.compile(r'grid-listing-link'))
                if not link_tag:
                    continue
                href = link_tag.get("href", "")
                if not href:
                    continue
                full_url = ("https://www.inautia.com" + href
                            if href.startswith("/") else href)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — fall back to the slug minus its trailing id
                title_el = card.find(class_=re.compile(r'listingTitle', re.I))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    slug = href.strip("/").split("/")[-1]
                    title = slug.rsplit("-", 1)[0].replace("-", " ").title()
                if not title:
                    continue

                # Price from data-ssr-meta (make|type|length||price_eur)
                price = ""
                meta = link_tag.get("data-ssr-meta", "")
                if meta:
                    parts = meta.split("|")
                    if len(parts) >= 5 and parts[4]:
                        try:
                            price = f"€{int(float(parts[4])):,}"
                        except (ValueError, OverflowError):
                            # Non-numeric or infinite value: fall through to
                            # the visible price instead of losing the card
                            pass
                if not price:
                    price_el = card.find(class_=re.compile(r'listingPrice', re.I))
                    if price_el:
                        raw_p = price_el.get_text(" ", strip=True)
                        pm = re.search(r'[\$€£]\s*[\d,]+', raw_p)
                        price = pm.group(0) if pm else ""

                # Location — full text of the listingBody element
                loc_el = card.find(class_=re.compile(r'listingBody', re.I))
                location = loc_el.get_text(" ", strip=True) if loc_el else ""

                # Image
                img = ""
                for im in card.find_all("img"):
                    raw = (_extract_best_src(im) or im.get("src", "")
                           or im.get("data-src", ""))
                    if raw and raw.startswith("http") and not raw.endswith(".svg"):
                        img = raw
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "iNautia"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: skip a malformed card, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_boat24(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Boat24 (European marketplace), rendering the page with Playwright.

      Card     : div whose class list contains "blurb"
      Link     : data-link attribute (base64, then ROT13); falls back to the
                 first /en/ anchor when the decoded value is not absolute
      Title    : h3.blurb__title or h2.blurb__title
      Price    : p.blurb__price
      Location : p.blurb__location (whitespace collapsed)
      Image    : first img with an absolute URL in data-src, data-lazy,
                 srcset or src (the alpha.gif placeholder is skipped)

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url" (may contain a "{query}" placeholder).
        query: Free-text search query (URL-quoted before substitution).
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded.
    """
    import base64

    results = []
    seen = set()
    BASE = "https://www.boat24.com"
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "Boat24")
    raw_url = (src.get("search_url", "")
               or "https://www.boat24.com/en/usedboats/")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    _rot13 = str.maketrans(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        "NOPQRSTUVWXYZABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm")

    def _decode_link(encoded: str) -> str:
        """Decode a data-link value (base64 -> ROT13); '' on failure."""
        try:
            rot = base64.b64decode(encoded).decode("utf-8", errors="ignore")
            return rot.translate(_rot13)
        except Exception:
            return ""

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});")
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("div", class_=re.compile(r'\bblurb\b'))

        for card in cards:
            try:
                encoded = card.get("data-link", "")
                if not encoded:
                    continue
                listing_url = _decode_link(encoded)
                if not listing_url or not listing_url.startswith("http"):
                    # Try building from the title link instead
                    a = card.find("a", href=re.compile(r'/en/'))
                    if not a:
                        continue
                    listing_url = (BASE + a["href"] if a["href"].startswith("/")
                                   else a["href"])
                if listing_url in seen:
                    continue
                seen.add(listing_url)

                title_el = card.select_one("h3.blurb__title, h2.blurb__title")
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    continue

                price_el = card.select_one("p.blurb__price")
                price = price_el.get_text(strip=True) if price_el else ""

                loc_el = card.select_one("p.blurb__location")
                location = ""
                if loc_el:
                    location = re.sub(r'\s+', ' ',
                                      loc_el.get_text(" ", strip=True)).strip()

                # Image — try lazy-load attributes, srcset, then src.
                # The srcset value is split BEFORE indexing: indexing the
                # split of a missing/empty srcset raised IndexError, which
                # the per-card handler swallowed, dropping the whole listing.
                img = ""
                for im in card.find_all("img"):
                    srcset_parts = im.get("srcset", "").split()
                    raw = (im.get("data-src") or im.get("data-lazy")
                           or (srcset_parts[0] if srcset_parts else "")
                           or im.get("src", ""))
                    if raw and raw.startswith("http") and "/alpha.gif" not in raw:
                        img = raw
                        break

                results.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "Boat24"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: skip a malformed card, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_facebook_marketplace(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Facebook Marketplace search results using a saved login session.

    Cookies are loaded from fb_session.json next to this file and injected
    into a headless Playwright browser context. When the file does not
    exist, a single instructional result (source_type "setup_required") is
    returned instead.
    Setup: POST /api/fb-setup launches a visible browser for the user to log
    in; the session file is saved after login.

    Args:
        src: Source config. Optional key read: "category".
        query: Free-text search query.
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded.
    """
    import json as _json

    results = []
    seen = set()
    SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json")
    SEARCH_URL = ("https://www.facebook.com/marketplace/search/"
                  f"?query={requests.utils.quote(query.strip())}"
                  "&deliveryMethod=local_pick_up")

    if not os.path.exists(SESSION_FILE):
        return [{
            "url": "https://www.facebook.com/marketplace/",
            "title": "⚠ Facebook Marketplace — Configuración requerida",
            "snippet": ("Para habilitar Facebook Marketplace, ve a Fuentes y "
                        "haz clic en 'Configurar FB'. Solo se necesita una vez."),
            "price_text": "",
            "img_url": "",
            "location": "",
            "source": "Facebook Marketplace",
            "source_type": "setup_required",
            "category": src.get("category", "Clasificados Generales"),
        }]

    try:
        from playwright.sync_api import sync_playwright

        # Explicit encoding so loading does not depend on the platform locale
        with open(SESSION_FILE, encoding="utf-8") as f:
            cookies = _json.load(f)

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_cookies(cookies)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};")
                page = context.new_page()
                try:
                    page.goto(SEARCH_URL, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(5000, 7000))
                    page.evaluate("window.scrollBy(0,800)")
                    page.wait_for_timeout(2000)
                    html = page.content()
                except Exception as e:
                    print(f"[Facebook Marketplace] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        # Each listing is an <a> with href /marketplace/item/ID...
        listing_links = soup.find_all(
            "a", href=re.compile(r'/marketplace/item/\d+'))

        for a in listing_links:
            try:
                href = a.get("href", "")
                full_url = ("https://www.facebook.com" + href
                            if href.startswith("/") else href)
                # Normalize to ".../marketplace/item/ID/": drop everything
                # after the id whether or not a slash follows it, so the
                # same item reached via different tracking query strings
                # de-duplicates to a single URL.
                full_url = re.sub(r'(/marketplace/item/\d+).*', r'\1/', full_url)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — span or div with listing title
                title_el = (a.find("span", style=re.compile(r'line-clamp'))
                            or a.find("span", class_=re.compile(r'x1lliihq|xt0psk2'))
                            or a.find("div", class_=re.compile(r'x1lliihq')))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    # Try aria-label on the card
                    title = a.get("aria-label", "")
                if not title:
                    continue

                span_texts = [s.get_text(strip=True) for s in a.find_all("span")]

                # Price — first span that starts with a currency amount
                price = ""
                for t in span_texts:
                    if re.match(r'[\$£€][\d,]+', t):
                        price = t
                        break

                # Image
                img = ""
                im = a.find("img")
                if im:
                    img = im.get("src", "") or im.get("data-src", "")

                # Location — first span that looks like "City, ST", or any
                # other non-price text longer than 3 characters
                location = ""
                for s in span_texts:
                    if not s or s == title:
                        continue
                    if re.search(r'[A-Z][a-z]+,\s+[A-Z]{2}', s) or (
                            not re.match(r'[\$£€\d]', s) and len(s) > 3 and s != price):
                        location = s
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": "Facebook Marketplace",
                    "source_type": "classifieds",
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # Best effort: skip a malformed card, keep the rest
                continue

        print(f"[Facebook Marketplace] {len(results)} listings")
    except Exception as e:
        print(f"[Facebook Marketplace] Error: {e}")
    return results
def scrape_hmy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Query HMY Yachts' Algolia index directly (app ECN3QX1VBL).

    A single POST to the Algolia multi-query endpoint is made; no browser
    is needed. Only hits with SalesStatus "Active" and SaleClassCode "used"
    are requested, 40 per page, first page only.

    Args:
        src: Source config. Optional keys read: "name", "type", "category".
        query: Free-text search query, passed to Algolia as-is.
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Request errors are logged and [] (or the partial list)
        is returned.
    """
    import urllib.parse

    results = []
    seen = set()
    label = src.get("name", "HMY")
    ALGOLIA_URL = "https://ecn3qx1vbl-dsn.algolia.net/1/indexes/*/queries"
    # NOTE(review): presumably the public search-only key used by hmy.com's
    # own frontend — confirm before treating it as non-secret.
    ALGOLIA_HEADERS = {
        "x-algolia-application-id": "ECN3QX1VBL",
        "x-algolia-api-key": "d86ccdd9ac0292ba76ee4755693d0c10",
        "content-type": "application/json",
        "referer": "https://www.hmy.com/",
        "user-agent": random.choice(USER_AGENTS),
    }
    params_str = urllib.parse.urlencode({
        "filters": "SalesStatus:Active",
        "facetFilters": '[["SaleClassCode:used"]]',
        "query": query,
        "hitsPerPage": 40,
        "page": 0,
    })
    payload = {
        "requests": [{
            "indexName": "production_oceanelite_yachts",
            "params": params_str,
        }]
    }

    def _field(hit, key):
        """Return hit[key] as stripped text; '' for missing or null."""
        value = hit.get(key)
        return "" if value is None else str(value).strip()

    try:
        resp = requests.post(ALGOLIA_URL, json=payload, headers=ALGOLIA_HEADERS, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        # "results" may be missing or an empty list; treat both as no hits
        result_sets = data.get("results") or [{}]
        hits = result_sets[0].get("hits") or []

        for h in hits:
            try:
                slug = _field(h, "Slug")
                url = h.get("URL") or (f"https://www.hmy.com/yachts-for-sale/{slug}" if slug else "")
                if not url or url in seen:
                    continue
                seen.add(url)

                # dict.get() defaults only apply to MISSING keys; JSON nulls
                # come back as None and would render as the text "None".
                year = _field(h, "ModelYear")
                make = _field(h, "MakeStringExact")
                model = _field(h, "ModelExact")
                name = _field(h, "BoatName")
                title = " ".join(part for part in (year, make, model) if part)
                if name:
                    title = f'{title} "{name}"'.strip()
                if not title:
                    # Consistent with the other scrapers: no untitled rows
                    continue

                price_text = ""
                price_raw = h.get("NormPrice")
                if price_raw:
                    try:
                        price_text = f"${int(float(price_raw)):,}"
                    except (TypeError, ValueError, OverflowError):
                        # Unparseable price: keep the listing, omit the price
                        price_text = ""

                length = _field(h, "NominalLengthNormalized")
                country = _field(h, "country") or "USA"
                location = f"{length}ft · {country}" if length else country
                img = h.get("mainImage") or ""

                results.append({
                    "url": url,
                    "title": title[:120],
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": country,
                    "source": src.get("name", "HMY Yachts"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: skip a malformed hit, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_boatcrazy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape BoatCrazy (US aggregator), rendering the page with Playwright.

      Card     : elements with class "boat-list-item"; when none are found,
                 the closest ancestor (class matching boat|list|item|card)
                 of every /boat-for-sale/ link is used instead
      Link     : a[href*="/boat-for-sale/"]
      Title    : h3, else an aria-label, else the URL slug (max 80 chars)
      Price    : element with a "price" class, else first "$n" in card text
      Location : element with class "location", else first "City, ST" match
      Image    : img inside the item-img container, else first img in card
    URL pattern: /boat-for-sale/YEAR-MAKE-LOCATION-id

    Args:
        src: Source config. Optional keys read: "name", "type", "category",
            "search_url" (may contain a "{query}" placeholder).
        query: Free-text search query (URL-quoted before substitution).
        filters: Not used by this scraper.

    Returns:
        A list of listing dicts with keys "url", "title", "snippet",
        "price_text", "img_url", "location", "source", "source_type" and
        "category". Returns [] when the page could not be loaded.
    """
    results = []
    seen = set()
    # .get() so log lines cannot raise KeyError when "name" is missing
    label = src.get("name", "BoatCrazy")
    raw_url = src.get("search_url", "") or "https://boatcrazy.com/boats?q={query}"
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--no-sandbox"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};")
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        listing_href = re.compile(r'/boat-for-sale/')
        cards = soup.find_all(class_="boat-list-item")
        if not cards:
            # Fallback: locate cards through their listing links
            cards = []
            for a in soup.find_all("a", href=listing_href):
                parent = a.find_parent(class_=re.compile(r'boat|list|item|card'))
                if parent and parent not in cards:
                    cards.append(parent)

        for card in cards:
            try:
                a = card.find("a", href=listing_href)
                if not a:
                    continue
                href = a["href"]
                full_url = href if href.startswith("http") else "https://boatcrazy.com" + href
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — prefer h3, then aria-label, then slug
                title = ""
                h3 = card.find("h3")
                if h3:
                    title = h3.get_text(strip=True)[:80]
                if not title:
                    al = card.find(attrs={"aria-label": True})
                    if al:
                        title = al["aria-label"][:80]
                if not title:
                    slug = href.rstrip("/").split("/")[-1]
                    # Drop the trailing "-id..." suffix from the slug
                    slug_clean = re.sub(r'-id[-\w]*$', '', slug).replace("-", " ")
                    title = slug_clean.title()[:80]
                if not title:
                    continue

                card_text = card.get_text(" ", strip=True)

                # Price
                price = ""
                price_el = card.find(class_=re.compile(r'\bprice\b'))
                if price_el:
                    pm = re.search(r'\$[\d,]+', price_el.get_text())
                    if pm:
                        price = pm.group(0)
                if not price:
                    pm = re.search(r'\$[\d,]+', card_text)
                    if pm:
                        price = pm.group(0)

                # Location
                location = ""
                loc_el = card.find(class_="location")
                if loc_el:
                    location = loc_el.get_text(strip=True)[:60]
                if not location:
                    lm = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)?,\s*[A-Z]{2})', card_text)
                    if lm:
                        location = lm.group(1)

                # Image
                img = ""
                img_div = card.find(class_=re.compile(r'item.?img|list.?item.?img'))
                if img_div:
                    im = img_div.find("img")
                    if im:
                        img = (_extract_best_src(im) or im.get("src", "")
                               or im.get("data-src", ""))
                if not img:
                    im = card.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")

                results.append({
                    "url": full_url,
                    "title": title,
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "BoatCrazy"),
                    "source_type": src.get("type", "classifieds"),
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # Best effort: skip a malformed card, keep the rest
                continue

        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
def scrape_denison(src: dict, query: str, filters: dict) -> list:
    """
    Denison Yachting — static HTML search results (single request per call).

    Markup this parser relies on:
      Card:     any element with class "boat-item"
      URL:      first <a> whose href matches /yachts-for-sale/<slug>
      Title:    <h2> = .boat_length + make/model/year text + <span> vessel name
      Price:    .boat_price[data-price] + [data-default_currency]
      Location: first <h3> text
      Image:    .news_pic img
      Search:   ?search={query}

    Args:
        src: Source descriptor; "name", "type" and "category" are copied into
            every result, with Denison defaults when absent.
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts (url, title, snippet, price_text, img_url,
        location, source, source_type, category). Request or parse failures
        are printed, never raised; whatever was collected so far is returned.
    """
    from urllib.parse import urljoin

    results = []
    seen = set()
    label = src.get("name", "Denison")
    base = "https://www.denisonyachtsales.com/yachts-for-sale/"
    url = f"{base}?search={requests.utils.quote(query.strip())}"
    listing_re = re.compile(r'/yachts-for-sale/[a-z][a-z0-9-]{4,}$', re.I)
    currency_symbols = {"USD": "$", "EUR": "€", "GBP": "£", "AUD": "A$"}
    try:
        resp = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)},
                            timeout=20, verify=False)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        for card in soup.find_all(class_="boat-item"):
            try:
                a = card.find("a", href=listing_re)
                if not a:
                    continue
                # urljoin resolves "/path", "path" and "//host/path" correctly,
                # and leaves absolute URLs untouched.
                full_url = urljoin(url, a["href"])
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title: length + make/model year + "name"
                h2 = card.find("h2")
                if h2:
                    length_txt = ""
                    length_el = h2.find(class_="boat_length")
                    if length_el:
                        length_txt = length_el.get_text(strip=True)
                        # Detach so the length is not repeated in the h2 text.
                        length_el.extract()
                    name_txt = ""
                    name_el = h2.find("span")
                    if name_el:
                        name_txt = name_el.get_text(strip=True)
                        name_el.extract()
                    rest = " ".join(h2.get_text(" ", strip=True).split())
                    quoted_name = f'"{name_txt}"' if name_txt else ""
                    parts = [p for p in (length_txt, rest, quoted_name) if p]
                    title = " ".join(parts)[:100]
                else:
                    title = (a.get("title", "") or "")[:100]
                if not title:
                    continue

                # Price: prefer the machine-readable attribute; fall back to
                # the displayed text when it is missing or not numeric.
                price_text = ""
                price_el = card.find(class_="boat_price")
                if price_el:
                    raw_price = (price_el.get("data-price") or "").strip()
                    currency = price_el.get("data-default_currency") or "USD"
                    sym = currency_symbols.get(currency, currency + " ")
                    try:
                        # float() first so "1250000.00" is accepted too.
                        price_text = f"{sym}{int(float(raw_price)):,}"
                    except (ValueError, OverflowError):
                        price_text = price_el.get_text(strip=True)[:30]

                # Location
                location = ""
                h3 = card.find("h3")
                if h3:
                    location = h3.get_text(strip=True)[:80]

                # Image
                img = ""
                pic_div = card.find(class_="news_pic")
                if pic_div:
                    im = pic_div.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")
                        if img:
                            img = urljoin(url, img)

                results.append({
                    "url": full_url,
                    "title": title,
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "Denison Yachting"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Brokers USA"),
                })
            except Exception:
                # Best effort: one malformed card must not lose the others.
                continue
        print(f"[{label}] {len(results)} listings")
    except Exception as e:
        print(f"[{label}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: GovPlanet + IronPlanet (Ritchie Bros family — same HTML .sr_lot)
# =============================================================================
def scrape_govplanet(src: dict, query: str, filters: dict) -> list[dict]:
    """
    GovPlanet (recreational marine) and IronPlanet (commercial marine).

    Both share Ritchie Bros HTML: listing cards use the .sr_lot selector
    (a few alternative selectors are tried as well).
      GovPlanet:  https://www.govplanet.com/Recreational+Marine
      IronPlanet: https://www.ironplanet.com/Commercial+Marine+Vessels

    Args:
        src: Source descriptor. "search_url" is required; a "{query}"
            placeholder in it, if present, is replaced with the URL-encoded
            query. "name", "type" and "category" are copied into each result.
        query: Free-text search terms (only used when the URL has a placeholder).
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts; empty on a non-200/206 response. Errors
        are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "GovPlanet")
    try:
        url = src["search_url"].replace(
            "{query}", requests.utils.quote((query or "").strip())
        )
        base = "https://" + url.split("/")[2]
        headers = get_headers(referer=base + "/")
        time.sleep(1.0)  # fixed pause before hitting the site
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[{name}] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        for card in soup.select(".sr_lot, .lot-tile, article.lot, [class*=srItem]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # urljoin handles relative and protocol-relative links, which
                # plain `base + href` concatenation would corrupt.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title = a.get_text(strip=True)[:100] or card.get_text(" ", strip=True)[:80]
                price_el = card.select_one(".price, .lot-price, span[class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: skip a card that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: HiBid (React SPA — Playwright required)
# =============================================================================
def scrape_hibid(src: dict, query: str, filters: dict) -> list[dict]:
    """
    HiBid online auction platform — JS single-page app, rendered with
    Playwright before parsing.

    URL:   https://www.hibid.com/lots?q=<query + " boat", URL-encoded>
    Cards: .lot-tile   Title: h3/.lot-title   Price: .high-bid/.lot-price

    Args:
        src: Source descriptor; "name", "type" and "category" are copied
            into each result.
        query: Free-text search terms; " boat" is appended before encoding.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw lot dicts. Errors (including Playwright being
        unavailable) are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "HiBid")
    try:
        q = requests.utils.quote(query.strip() + " boat")
        url = f"https://www.hibid.com/lots?q={q}"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Hide the automation flag some sites probe for.
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = ctx.new_page()
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(4000)  # let client-side rendering finish
                html = page.content()
            finally:
                # Closing the browser also closes its pages; always run it so
                # a failed navigation does not leave Chromium behind.
                try:
                    browser.close()
                except Exception:
                    pass  # browser already gone; nothing left to release
        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for card in soup.select(".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # Resolve relative / protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: skip a card that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Copart salvage boats (heavy JS SPA — Playwright)
# =============================================================================
def scrape_copart(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Copart salvage/insurance lots for watercraft.

    URL: https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT
    Lots render client-side, so the page is loaded with Playwright and the
    resulting HTML is parsed.

    Args:
        src: Source descriptor; "name" and "category" are copied into each
            result (source_type is always "salvage").
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw lot dicts. Errors are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Copart")
    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.copart.com/vehicleFinderSection/?searchStr={q}&vehicleType=BOAT"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
            )
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Mask common headless-automation fingerprints.
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = ctx.new_page()
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(5000)
                try:
                    page.wait_for_selector(
                        ".lot-row, tr[data-lot], .lot-details, [class*=lottile], [class*=lot-card]",
                        timeout=8000,
                    )
                except Exception:
                    pass  # no lot rows appeared in time; parse whatever rendered
                html = page.content()
            finally:
                # Always release Chromium, even when navigation failed.
                try:
                    browser.close()
                except Exception:
                    pass  # browser already gone; nothing left to release
        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        lot_link_re = re.compile(r"/lot/")
        for row in soup.select(
            "tr[data-lot], .lot-row, [class*=lot-card], [class*=lottile], [class*=lot-item]"
        ):
            try:
                a = row.find("a", href=lot_link_re)
                if not a:
                    continue
                # Resolve relative / protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = row.select_one("[class*=title], [class*=desc], td.des")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = row.select_one("[class*=bid], [class*=price], td.bid")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = row.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": row.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": "salvage",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: skip a row that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Trade a Boat AU (server-rendered Material-UI)
# =============================================================================
def scrape_tradeaboat(src: dict, query: str, filters: dict) -> list[dict]:
    """
    TradeABoat Australia — server-rendered with Material-UI CSS classes.

    Cards use jss* dynamic class names, so listings are located through
    their /details/ links and the nearest enclosing <div> is treated as the
    card.
    URL: https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}

    Args:
        src: Source descriptor; only "category" is copied into each result.
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts (location is always "Australia"); empty on
        a non-200/206 response. Errors are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    try:
        q = requests.utils.quote(query.strip())
        # NOTE(review): the category is fixed to "Sail", so every query is
        # searched within that category only — confirm this is intended.
        url = f"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={q}"
        headers = get_headers(referer="https://www.tradeaboat.com.au/")
        time.sleep(1.0)  # fixed pause before hitting the site
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[Trade a Boat AU] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        visited_parents = set()
        # MUI class names are dynamic (jss77, jss78 …) — find cards via /details/ links
        for a in soup.find_all("a", href=re.compile(r"/details/")):
            try:
                # urljoin handles relative and protocol-relative links, which
                # plain `base + href` concatenation would corrupt.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                # Walk up to find the card container
                card = a.find_parent("div") or a
                # Several links can share one container; emit it only once.
                card_id = id(card)
                if card_id in visited_parents:
                    continue
                visited_parents.add(card_id)
                title_el = card.select_one("h2, h3, [class*=title]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one("[class*=price], [class*=Price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "Australia",
                        "img_url": img,
                        "source": "Trade a Boat AU",
                        "source_type": "broker",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: skip a link that cannot be parsed.
                continue
        print(f"[Trade a Boat AU] {len(results)} listings")
    except Exception as e:
        print(f"[Trade a Boat AU] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Galati Yachts (requests, WordPress / YSP plugin)
# =============================================================================
def scrape_galati(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Galati Yachts — server-rendered WordPress with YachtSalesPlugin.

    URL: https://www.galatiyachts.com/yachts-for-sale/?keywords={query}

    Listing cards are located with a few common selectors; when none match,
    every on-site link containing /yachts/ is used instead and its nearest
    enclosing <div> is treated as the card.

    Args:
        src: Source descriptor; only "category" is copied into each result.
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts (location is always "USA"); empty on a
        non-200/206 response. Errors are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    category = src.get("category", "")

    def _build_item(card, anchor, href, page_url):
        """Build one result dict from a card, or None when the title is too short."""
        title_el = card.select_one("h2, h3, [class*=title]")
        title = (title_el.get_text(strip=True) if title_el
                 else anchor.get_text(strip=True))[:100]
        if not title or len(title) <= 4:
            return None
        price_el = card.select_one("[class*=price], .price")
        price_txt = price_el.get_text(strip=True) if price_el else ""
        img_el = card.find("img")
        img = _extract_best_src(img_el) if img_el else ""
        if img:
            img = urljoin(page_url, img)
        return {
            "title": title, "url": href,
            "snippet": card.get_text(" ", strip=True)[:200],
            "price_text": price_txt, "location": "USA",
            "img_url": img, "source": "Galati Yachts",
            "source_type": "broker", "category": category,
        }

    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.galatiyachts.com/yachts-for-sale/?keywords={q}"
        headers = get_headers(referer="https://www.galatiyachts.com/")
        time.sleep(1.0)  # fixed pause before hitting the site
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[Galati Yachts] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        # YSP listing cards — try common selectors, fall back to /yachts/ links
        cards = soup.select(".ysp-listing, .listing-card, .yacht-card, [class*=yacht-listing]")
        if cards:
            for card in cards:
                try:
                    a = card.find("a", href=True)
                    if not a:
                        continue
                    href = urljoin(url, a["href"])
                    if href in seen:
                        continue
                    seen.add(href)
                    item = _build_item(card, a, href, url)
                    if item:
                        results.append(item)
                except Exception:
                    # Best effort: skip a card that cannot be parsed.
                    continue
        else:
            for a in soup.find_all("a", href=re.compile(r"/yachts/")):
                try:
                    href = urljoin(url, a["href"])
                    if href in seen or "galatiyachts.com" not in href:
                        continue
                    # Too few path separators to be a detail page.
                    if href.count("/") < 4:
                        continue
                    seen.add(href)
                    card = a.find_parent("div") or a
                    item = _build_item(card, a, href, url)
                    if item:
                        results.append(item)
                except Exception:
                    # Same per-item guard as the card branch, so one bad
                    # anchor no longer aborts the whole fallback scan.
                    continue
        print(f"[Galati Yachts] {len(results)} listings")
    except Exception as e:
        print(f"[Galati Yachts] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Luxury brokers (Fraser, Burgess, Worth Ave, Merle Wood, N&J)
# Playwright — JS-heavy sites that won't render with plain requests
# =============================================================================
def scrape_luxury_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for luxury yacht broker sites.

    Covers: Fraser Yachts, Worth Ave Yachts, Merle Wood, Burgess, N&J.
    Collects links whose path contains a listing-like segment (/yacht/,
    /vessel/, /boat/, /listing/, /detail/, /sale/, /for-sale/) and reads
    title, price and image from the link's nearest <div>/<li> ancestor.
    At most 30 results are returned.

    Args:
        src: Source descriptor. "search_url" is required and may contain a
            "{query}" placeholder; "name", "type" and "category" are copied
            into each result.
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts. Errors are printed, never raised.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
            )
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Mask common headless-automation fingerprints.
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = ctx.new_page()
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                # Scroll half-way down, then wait again before reading the DOM.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
                page.wait_for_timeout(1500)
                html = page.content()
            finally:
                # Always release Chromium, even when navigation failed.
                try:
                    browser.close()
                except Exception:
                    pass  # browser already gone; nothing left to release
        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        listing_re = re.compile(
            r'/(yacht[s]?|vessel[s]?|boat[s]?|listing[s]?|detail[s]?|sale|for-sale)/',
            re.I
        )
        price_re = re.compile(r'[\$€£]\s*[\d,\.]+(?:\s*[Mm]illion|M)?')
        for a in soup.find_all("a", href=listing_re):
            try:
                # Resolve relative / protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if href in seen or len(href) < 25:
                    continue
                # Require some path depth so bare section roots are skipped.
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    continue
                seen.add(href)
                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue
                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = price_re.search(ctx_txt)
                price_txt = pm.group() if pm else ""
                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # Best effort: skip a link that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: EU/International brokers blocked on requests (Playwright)
# Covers: Boat24, YachtAll, Annonces Bateau, Inautia ES, Boats&Outboards UK,
# Boatsales AU, YachtMarket, Apollo Duck UK subdomain
# =============================================================================
def scrape_eu_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for EU/AU/UK broker sites that block plain
    requests (403/ECONNREFUSED).

    Loads the search page in a real browser, then treats every same-domain
    link with enough path depth — and without an obvious navigation keyword
    in its URL — as a listing. At most 30 results are returned.

    Args:
        src: Source descriptor. "search_url" is required and may contain a
            "{query}" placeholder; "name", "type" and "category" are copied
            into each result.
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw listing dicts. Errors are printed, never raised.
    """
    from urllib.parse import urljoin

    # Substrings that mark a link as site navigation rather than a listing.
    # NOTE(review): plain substring matching also drops URLs that merely
    # contain these letters (e.g. "tag" inside "vintage") — confirm acceptable.
    skip_markers = (
        "login", "register", "contact", "about", "help", "privacy",
        "sitemap", "category", "search", "tag", "page=", "lang=",
    )
    results = []
    name = src.get("name", "EU Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))
        domain = url.split("/")[2]
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Hide the automation flag some sites probe for.
                ctx.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = ctx.new_page()
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                html = page.content()
            finally:
                # Always release Chromium, even when navigation failed.
                try:
                    browser.close()
                except Exception:
                    pass  # browser already gone; nothing left to release
        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        price_re = re.compile(r'[\$€£]\s*[\d,\.]+')
        for a in soup.find_all("a", href=True):
            try:
                # Resolve relative / protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if domain not in href or href in seen:
                    continue
                # Require some path depth so bare section roots are skipped.
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    continue
                lowered = href.lower()
                if any(marker in lowered for marker in skip_markers):
                    continue
                seen.add(href)
                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue
                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = price_re.search(ctx_txt)
                price_txt = pm.group() if pm else ""
                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # Best effort: skip a link that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
# =============================================================================
# SCRAPER: Forum For-Sale sections (TheHullTruth, Cruisers Forum)
# =============================================================================
def scrape_forum_fs(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape For-Sale classified threads from boating forums (Playwright).

    TheHullTruth:   /boating-forum/search.php?do=process&query={query}&prefixid=FS
    Cruisers Forum: /forums/f152/ (Classifieds subforum)

    Thread rows are matched with vBulletin/XenForo-style selectors. In each
    row a thread-looking link is preferred, otherwise the first link is used.

    Args:
        src: Source descriptor. "search_url" is required and may contain a
            "{query}" placeholder; "name" and "category" are copied into
            each result (source_type is always "classifieds").
        query: Free-text search terms.
        filters: Unused; kept so every scraper shares one signature.

    Returns:
        List of raw thread dicts (no image or location). Errors are printed,
        never raised.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Forum")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                page = ctx.new_page()
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(2000)
                html = page.content()
            finally:
                # Always release Chromium, even when navigation failed.
                try:
                    browser.close()
                except Exception:
                    pass  # browser already gone; nothing left to release
        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        # Compiled once instead of once per row.
        thread_link_re = re.compile(r'showthread|/thread[s]?/|/t/\d|/post', re.I)
        price_re = re.compile(r'\$\s*[\d,]{3,}')
        # vBulletin/XenForo thread rows
        for row in soup.select(
            "li.threadbit, div.threadbit, .thread-item, "
            "tr.odd, tr.even, .search-result, [class*=thread], "
            ".js-threadListItem, li[id*=thread]"
        ):
            try:
                a = row.find("a", href=thread_link_re) or row.find("a", href=True)
                if not a:
                    continue
                # Resolve relative links (e.g. "showthread.php?t=1") against
                # the page URL instead of gluing them onto the host name.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title = a.get_text(strip=True)[:100]
                ctx_txt = row.get_text(" ", strip=True)[:200]
                pm = price_re.search(ctx_txt)
                price_txt = pm.group() if pm else ""
                if title and len(title) > 5:
                    results.append({
                        "title": title, "url": href,
                        "snippet": ctx_txt, "price_text": price_txt,
                        "location": "", "img_url": "",
                        "source": name, "source_type": "classifieds",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Best effort: skip a row that cannot be parsed.
                continue
        print(f"[{name}] {len(results)} threads")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
def scrape_source_router(src: dict, query: str, filters: dict, page: int = 1):
"""Central dispatcher — routes each source to its dedicated scraper."""
name = src.get("name", "")
# ── Dedicated scrapers ────────────────────────────────────────────────────
if name == "YachtWorld":
return scrape_yachtworld(query, filters, max_pages=1)
if name.startswith("eBay"): # covers all 5 eBay entries
return scrape_ebay(src, query, filters)
if name == "BoatTrader":
return scrape_boattrader(src, query, filters)
if name in ("Apollo Duck", "Apollo Duck Workboats"):
return scrape_apolloduck(src, query, filters)
if name == "Boats.com":
return scrape_boatsdotcom(src, query, filters)
if name == "Craigslist": # single multi-city Craigslist entry
return scrape_craigslist(src, query, filters)
if name.startswith("Craigslist "): # individual city entries — one request each
return scrape_direct_source(src, query, filters)
if name in ("GovPlanet", "GovPlanet Recreational",
"IronPlanet", "IronPlanet Marine"):
return scrape_govplanet(src, query, filters)
if name == "HiBid":
return scrape_hibid(src, query, filters)
if name in ("Copart Marine", "Copart Boats", "Copart Watercraft"):
return scrape_copart(src, query, filters)
if name == "Trade a Boat AU":
return scrape_tradeaboat(src, query, filters)
if name == "Galati Yachts":
return scrape_galati(src, query, filters)
if name in ("Fraser Yachts", "Burgess Yachts", "Northrop & Johnson",
"Worth Ave Yachts"):
return scrape_luxury_broker(src, query, filters)
# Boat24 handled below by dedicated scrape_boat24; Inautia handled by scrape_inautia
if name in ("Boat24 EU", "YachtAll", "Annonces Bateau",
"Annonces Bateau FR", "Inautia ES", "Boats & Outboards UK",
"Boats Outboards UK", "Apollo Duck UK",
"Boatsales AU", "YachtMarket", "Boatpoint AU"):
return scrape_eu_broker(src, query, filters)
if name in ("TheHullTruth", "Cruisers Forum"):
return scrape_forum_fs(src, query, filters)
if name == "YachtWorld Commercial":
return scrape_yachtworld(query, filters, max_pages=1)
if name == "Rightboat":
return scrape_rightboat(src, query, filters)
if name in ("Cooper Salvage", "Cooper Capital Salvage"):
return scrape_cooperss(src, query, filters)
if name == "Inautia":
return scrape_inautia(src, query, filters)
if name == "Boat24":
return scrape_boat24(src, query, filters)
if name == "Facebook Marketplace":
return scrape_facebook_marketplace(src, query, filters)
if name == "HMY Yachts":
return scrape_hmy(src, query, filters)
if name == "BoatCrazy":
return scrape_boatcrazy(src, query, filters)
if name == "Denison Yachting":
return scrape_denison(src, query, filters)
# ── Generic HTML scraper (fallback) ──────────────────────────────────────
return scrape_direct_source(src, query, filters)
def extract_vessel_fast(raw: dict) -> dict | None:
"""
Pure-regex vessel extraction — no Ollama call.
Used for results from known boat marketplaces (broker/classifieds/auction/etc.)
Returns a data dict compatible with save_vessel(), or None if too sparse.
"""
title = (raw.get("title") or "").strip()
snippet = (raw.get("snippet") or "")
price_text = (raw.get("price_text") or "")
location = (raw.get("location") or "")
src_name = (raw.get("source") or "").lower()
src_type = (raw.get("source_type") or "")
category = (raw.get("category") or "").lower()
if not title or len(title) < 5:
return None
combined = f"{title} {snippet} {price_text}"
# ── Price ────────────────────────────────────────────────────────────────
price_usd = None
currency_out = "USD"
for txt in [price_text, snippet, title]:
# USD
m = re.search(r'\$\s*([\d,]{3,})', txt)
if m:
try:
v = float(m.group(1).replace(",",""))
if 500 < v < 50_000_000:
price_usd = v; currency_out = "USD"; break
except: pass
# GBP
m = re.search(r'£\s*([\d,]{3,})', txt)
if m:
try:
v = float(m.group(1).replace(",","")) * 1.27
if 500 < v < 50_000_000:
price_usd = round(v); currency_out = "GBP"; break
except: pass
# EUR
m = re.search(r'€\s*([\d,]{3,})', txt)
if m:
try:
v = float(m.group(1).replace(",","")) * 1.09
if 500 < v < 50_000_000:
price_usd = round(v); currency_out = "EUR"; break
except: pass
# plain number + currency word
m = re.search(r'([\d,]{4,})\s*(?:USD|usd|GBP|gbp|EUR|eur)', txt)
if m:
try:
v = float(m.group(1).replace(",",""))
if 500 < v < 50_000_000:
price_usd = round(v); break
except: pass
# ── LOA ──────────────────────────────────────────────────────────────────
loa_m = None
for pat, in_meters in [
(r'(?:loa|length)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', False),
(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', False), # starts with size
(r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet)\b', False),
(r"(\d{2,3}(?:\.\d)?)'", False),
(r'(?:loa|length)[:\s]+([\d.]+)\s*m\b', True),
]:
m = re.search(pat, combined, re.IGNORECASE)
if m:
try:
v = float(m.group(1))
if in_meters:
if 5 < v < 200: loa_m = round(v, 1); break
else:
if 10 < v < 500: loa_m = round(v * 0.3048, 1); break
except: pass
# ── Year ─────────────────────────────────────────────────────────────────
year = None
ym = re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title)
if ym: year = int(ym.group(1))
# ── Vessel type ──────────────────────────────────────────────────────────
cl = combined.lower()
if any(k in src_name for k in ["sailboat","sail"]) or "veleros" in category:
vtype = "Sailboat"
elif any(k in src_name for k in ["workboat","commercial","osv","offshore"]):
vtype = "Offshore"
elif "tug" in src_name: vtype = "Tug"
elif "barge" in src_name: vtype = "Barge"
elif any(k in cl for k in ["sailboat","sailing","velero","ketch","sloop","schooner",
"yawl","cutter","catamaran","trimaran","voilier"]):
vtype = "Sailboat"
elif any(k in cl for k in ["tugboat","tug boat","remolcador"]): vtype = "Tug"
elif "barge" in cl or "barcaza" in cl: vtype = "Barge"
elif any(k in cl for k in ["offshore","osv","supply vessel","crew boat"]): vtype = "Offshore"
elif any(k in cl for k in ["fishing","trawler","seiner","pesquero"]): vtype = "Fishing"
elif any(k in cl for k in ["yacht","motor yacht","motoryacht"]): vtype = "Yacht"
else: vtype = "Motor"
status = ("auction" if src_type == "auction" else
"salvage" if src_type == "salvage" else "active")
# Infer location from source name when missing (e.g. "Craigslist Houston" → "Houston")
if not location and raw.get("source"):
src_full = raw["source"]
if re.search(r'[Cc]raigslist', src_full):
city = re.sub(r'[Cc]raigslist\s*', '', src_full).strip()
if city: location = city
elif "Kijiji" in src_full: location = "Canada"
elif "Gumtree" in src_full: location = "Australia"
elif "LeBonCoin" in src_full: location = "France"
elif "Subito" in src_full: location = "Italy"
# For trusted marketplace sources keep the result even with partial data.
# For web-search results require at least one data point to avoid garbage.
is_trusted = src_type in ("broker", "classifieds", "salvage", "commercial", "auction")
if not is_trusted and not (price_usd or loa_m or year or location):
return None
score = 50
if loa_m:
score += min(10, int(loa_m - 10))
if year and year > 1990:
score += min(10, (year - 1990) // 3)
if price_usd and loa_m:
pft = price_usd / max(loa_m / 0.3048, 1)
if pft < 600: score += 15
elif pft < 1200: score += 8
score = min(100, max(0, score))
return {
"_fast": True, # flag: skip unit-conversion block downstream
"skip": False,
"name": title[:100],
"vessel_type": vtype,
"loa_m": loa_m,
"beam_m": None,
"draft_m": None,
"year_built": year,
"hull": "Unknown",
"propulsion": "Sail" if vtype == "Sailboat" else "Diesel",
"status": status,
"price_usd": price_usd,
"currency": currency_out,
"location": location,
"country": None,
"description": f"{title[:140]}",
"flags": [],
"score": score,
}
def search_with_ai(query: str, filters: dict) -> list:
"""
Hybrid search: direct scraping of open sources + web search to reach
blocked sites (YachtWorld, Boats.com, Apollo Duck, etc.)
"""
vessel_type = filters.get("type", "")
region = filters.get("region", "").lower()
base = query
if vessel_type and vessel_type.lower() not in query.lower():
base = f"{vessel_type} {base}"
# Filter sources by region if specified
# Load custom sources from DB and merge with built-in
try:
conn = get_db()
custom = [dict(r) for r in conn.execute(
"SELECT * FROM custom_sources WHERE active=1").fetchall()]
conn.close()
all_sources = DIRECT_SOURCES + [{
"name": c["name"],
"category": c["category"],
"search_url": c["search_url"],
"result_sel": "a[href]",
"price_sel": "",
"img_sel": "img",
"loc_sel": "",
"type": c["source_type"],
} for c in custom]
except:
all_sources = DIRECT_SOURCES
sources_to_use = all_sources
if region and region not in ["global", "todo", "all", ""]:
region_map = {
"usa": ["USA", "Clasificados USA", "Subastas Gobierno USA", "Subastas USA", "Subastas Gobierno", "Comercial Offshore"],
"europa": ["Europa", "Brokers Europa", "Francia", "Italia", "Reino Unido", "España", "España / Global"],
"caribe": ["Latinoamérica", "Latinoamérica / España", "España / Global"],
"latin": ["Latinoamérica", "Latinoamérica / España", "España", "España / Global"],
"asia": ["Australia / Pacífico"],
"australia": ["Australia / Pacífico"],
}
allowed_cats = None
for key, cats in region_map.items():
if key in region:
allowed_cats = cats
break
if allowed_cats:
sources_to_use = [s for s in all_sources if any(c in s["category"] for c in allowed_cats)]
if not sources_to_use:
sources_to_use = all_sources
# Filter by status
status = filters.get("status", "")
if status == "auction":
sources_to_use = [s for s in sources_to_use if s["type"] in ["auction", "salvage"]] or sources_to_use
elif status == "salvage":
sources_to_use = [s for s in sources_to_use if s["type"] == "salvage"] or sources_to_use
elif status not in ("salvage",):
# Exclude salvage-only sources unless explicitly searching for salvage
sources_to_use = [s for s in sources_to_use if s["type"] != "salvage"] or sources_to_use
# Vessel-type-aware source prioritization
OFFSHORE_TYPES = {"offshore", "tug", "barge", "ferry", "fishing", "commercial", "salvage"}
SAILBOAT_TYPES = {"sailboat", "sail", "velero", "ketch", "sloop", "cutter", "schooner"}
COMMERCIAL_ONLY_SOURCES = {
"Seaboats Tug", "Seaboats Barge", "Seaboats Offshore", "Seaboats Fishing",
"OSV Broker", "OSVBroker", "WorkBoat Classifieds", "VT Halter Marine",
"Maritime Connector", "ShipXchange", "Commercial Vessel",
}
SAILBOAT_ONLY_SOURCES = {"SailboatListings", "SailboatListings View", "Cruisers Forum", "Sailboat Listing"}
vessel_type_lower = vessel_type.lower() if vessel_type else ""
if vessel_type_lower in OFFSHORE_TYPES:
# Skip sailboat-only sources, float commercial ones to front
sources_to_use = [s for s in sources_to_use if s["name"] not in SAILBOAT_ONLY_SOURCES]
commercial = [s for s in sources_to_use if s["type"] in ("commercial", "salvage", "auction")]
rest = [s for s in sources_to_use if s["type"] not in ("commercial", "salvage", "auction")]
sources_to_use = commercial + rest
elif vessel_type_lower in SAILBOAT_TYPES or "sail" in base.lower() or "velero" in base.lower():
# Skip commercial-only offshore sources for sailboat searches
sources_to_use = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES]
elif not vessel_type_lower:
# Generic search: keep all but put commercial sources after general ones
commercial = [s for s in sources_to_use if s["name"] in COMMERCIAL_ONLY_SOURCES]
rest = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES]
sources_to_use = rest + commercial
print(f"[Search] Querying {len(sources_to_use)} sources for: {base}")
search_state['total_sources'] = len(sources_to_use)
search_state['log'].append(f"Consultando {len(sources_to_use)} fuentes...")
def get_query_for_source(src):
"""Match query language to source region."""
cat = src.get("category","").lower()
if any(x in cat for x in ["france","franc","veleros franc"]):
return base
elif any(x in cat for x in ["spain","españa","espana","mexico","colombia","latin"]):
return base
else:
return f"{base} for sale" if "for sale" not in base.lower() else base
# Build web search queries targeting specific sites
web_queries = build_web_queries(base, filters)
total = len(sources_to_use) + len(web_queries)
search_state['total_sources'] = total
search_state['log'].append(f"Consultando {len(sources_to_use)} sitios directos + {len(web_queries)} búsquedas web...")
print(f"[Search] {len(sources_to_use)} direct + {len(web_queries)} web searches for: {base}")
# Run BOTH direct scraping AND web searches in parallel
all_raw = []
# ── SailboatListings: dedicated parallel thread (handles its own AI extraction) ──
# Only for sailboat/velero or generic searches, not for offshore/tug/barge/etc.
sbl_thread = None
if vessel_type_lower not in OFFSHORE_TYPES and vessel_type_lower not in {"motor", "motorboat"}:
sbl_thread = threading.Thread(
target=scrape_and_extract_sailboatlistings,
args=(query, filters, search_state.get('search_id', ''), 8),
daemon=True,
)
sbl_thread.start()
search_state['log'].append("SailboatListings: iniciado en paralelo (hilo dedicado)...")
print("[Search] SailboatListings dedicated thread started")
# ── Breadth-First Search across all sources ──────────────────────────────
# Round 1: page 1 of all sources simultaneously
# Round 2: page 2 of sources that had results
# Round 3: page 3, etc.
# Between rounds, a natural pause occurs as we process results
# This avoids hammering any single source with consecutive requests
MAX_ROUNDS = 6 # max pages per source
active_srcs = {src["name"]: {"src": src, "page": 1, "has_more": True}
for src in sources_to_use}
# Web searches only run once (no pagination)
web_done = False
for round_num in range(1, MAX_ROUNDS + 1):
if search_state.get("cancelled"):
break
round_sources = {name: info for name, info in active_srcs.items()
if info["has_more"]}
if not round_sources:
break
search_state['log'].append(f"Ronda {round_num}: consultando {len(round_sources)} fuentes...")
print(f"[Search] Round {round_num}: {len(round_sources)} active sources")
round_raw = []
with ThreadPoolExecutor(max_workers=12) as executor:
futures = {}
# Submit page N of all active sources
for name, info in round_sources.items():
src = info["src"]
q = get_query_for_source(src)
# Add page parameter to URL if supported and page > 1
src_with_page = dict(src)
if round_num > 1:
url = src["search_url"]
# Common pagination patterns
if "craigslist.org" in url:
src_with_page["search_url"] = url + f"&s={round_num * 25 - 25}"
elif "ebay.com" in url:
src_with_page["search_url"] = url + f"&_pgn={round_num}"
elif "seaboats.net" in url:
src_with_page["search_url"] = url + f"&page={round_num}"
elif "kijiji.ca" in url:
src_with_page["search_url"] = url.rstrip('/') + f"/page-{round_num}/"
else:
# Most sites don't support pagination via URL params we know
# Mark as done after page 1
active_srcs[name]["has_more"] = False
continue
futures[executor.submit(scrape_source_router, src_with_page, q, filters, round_num)] = name
# Web searches on round 1 only
if round_num == 1 and not web_done:
for wq in web_queries:
futures[executor.submit(web_search, wq, 6)] = f"Web:{wq[:20]}"
web_done = True
# Collect results for this round
for future in as_completed(futures, timeout=90):
name = futures[future]
try:
results = future.result()
count = len(results)
round_raw.extend(results)
search_state['sources_done'] += 1
if name.startswith("Web:"):
if count:
search_state['log'].append(f"🌐 Web: {count} resultados")
else:
if count:
search_state['log'].append(f"✓ {name} p{round_num}: {count}")
print(f"[Round {round_num}] {name}: {count} listings")
else:
# No results this round — remove from future rounds
if name in active_srcs:
active_srcs[name]["has_more"] = False
except Exception as e:
search_state['sources_done'] += 1
if name in active_srcs:
active_srcs[name]["has_more"] = False
all_raw.extend(round_raw)
print(f"[Search] Round {round_num} complete: {len(round_raw)} new results (total: {len(all_raw)})")
# Small pause between rounds — natural break
if round_num < MAX_ROUNDS and not search_state.get("cancelled"):
polite_pause("BFS-round")
print(f"[Search] Got {len(all_raw)} raw results, extracting vessel data...")
if not all_raw:
return []
# Extract vessel data — parallel with dedup and real-time save
vessels = []
lock = threading.Lock()
max_price = float(filters.get("max_price") or 0)
min_loa = float(filters.get("min_loa") or 0)
query_words = [w.lower() for w in query.split() if len(w) > 2]
# Deduplicate raw results by URL
seen_urls = set()
unique_raw = []
for r in all_raw:
if r["url"] not in seen_urls:
seen_urls.add(r["url"])
unique_raw.append(r)
print(f"[Extract] Processing {len(unique_raw)} unique URLs...")
SYNONYMS = {
"sailboat":["sail","velero","vela","ketch","sloop","schooner","yawl","voilier"],
"velero": ["sail","sailboat","vela","ketch","sloop"],
"tug": ["tugboat","remolcador","tug boat","schlepper"],
"barge": ["barcaza","chaland","ponton","landing craft","lct"],
"fishing": ["pesquero","trawler","seiner","longliner","fisher"],
"offshore":["osv","supply vessel","supply boat","platform"],
"yacht": ["yate","motoryacht","m/y"],
"motor": ["motorboat","lancha","speedboat","cruiser"],
}
NON_VESSELS = ["outboard motor","engine only","motor only","parts only",
"trailer only","propeller","honda bf","yamaha f","suzuki df",
"life jacket","anchor","marine insurance","boat storage",
# Land vehicles — never boats
"ford expedition","ford explorer","ford f-1","ford ranger",
"ford bronco","ford mustang","ford escape","ford transit",
"chevy silverado","chevy tahoe","chevy suburban","chevy colorado",
"chevrolet silverado","chevrolet tahoe","chevrolet suburban",
"gmc sierra","gmc yukon","gmc terrain","gmc canyon",
"dodge ram","ram 1500","ram 2500","ram 3500",
"jeep wrangler","jeep cherokee","jeep grand","jeep gladiator",
"toyota camry","toyota tacoma","toyota tundra","toyota 4runner",
"toyota highlander","toyota rav4","toyota sienna",
"subaru outback","subaru forester","subaru crosstrek",
"honda cr-v","honda pilot","honda accord","honda civic","honda odyssey",
"tesla model","bmw x","mercedes benz","audi q","volkswagen jetta",
"cadillac escalade","cadillac xt","buick enclave","buick encore",
# Non-vessel services
"sailing lesson","sailing partner","sailing school","sailing class",
"sailing instruction","boating lesson","boat lesson","boating class",
"sailing instructor","boat rental","kayak rental","canoe rental",
]
def expand_query(words):
expanded = set(words)
for w in words:
for key, syns in SYNONYMS.items():
if w == key or w in syns:
expanded.add(key)
expanded.update(syns)
return expanded
expanded_query = expand_query(query_words)
GENERIC_NAMES = {
"sailboat","velero","barco","yacht","boat","vessel","embarcación",
"sailboat for sale","velero en venta","boat for sale","barco en venta",
"motor boat","motorboat","fishing boat","tug boat","tugboat",
"within25 mi","within 25 mi","results","listing","listings",
}
def process_one(raw):
try:
if search_state.get("cancelled"):
return
# Quick title pre-check
title_lower = raw["title"].lower()
if any(kw in title_lower for kw in NON_VESSELS):
return
src_type = raw.get("source_type", "")
all_images = []
data = None
# ── FAST PATH: known boat marketplace → pure regex, no AI ────────
if src_type in ("broker","classifieds","auction","salvage","commercial"):
data = extract_vessel_fast(raw)
if data:
img = raw.get("img_url","")
if img:
all_images = [img]
else:
# Derive thumbnail from URL (no page fetch needed)
listing_url = raw.get("url","")
ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url)
if ebay_m:
all_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"]
cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url)
if cl_m:
all_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"]
# ── Fast path: validate the listing is actually a boat ──────────────
if data and data.get("_fast"):
combined_text = (raw.get("title","") + " " + raw.get("snippet","")).lower()
url_l = raw.get("url","").lower()
# URLs that are guaranteed to be boat listings (trusted sections)
BOAT_URLS = ("/boa","/boat","/sail","sailboatlistings","yachtworld",
"boattrader","seaboats","apolloduck","rightboat","boat24",
"annonces-bateau","barcos.net","tradeaboat","marinetraffic")
is_boat_url = any(k in url_l for k in BOAT_URLS)
# General auction sites (sell everything) need a boat keyword in the text
BOAT_WORDS = ["boat","sail","yacht","vessel","ketch","sloop","catamaran",
"trimaran","mast","hull","marina","keel","watercraft","cruiser",
"trawler","dinghy","skiff","pontoon","motorboat","powerboat",
"sailboat","barge","tugboat","outboard","inboard","nautical",
"marine","stern","bow","aft","draft","beam","knot","starboard"]
has_boat_word = any(k in combined_text for k in BOAT_WORDS)
if not is_boat_url and not has_boat_word:
return # Cars, furniture, etc. from general auction sites — skip
# ── SLOW PATH: web-search results → fetch page + AI ──────────────
if not data:
page_text, page_images = "", []
try:
fut = ThreadPoolExecutor(max_workers=1).submit(fetch_page_with_images, raw["url"])
page_text, page_images = fut.result(timeout=12)
except Exception:
page_text = (f"Title: {raw['title']} "
f"| Location: {raw.get('location','')} | {raw.get('snippet','')}")
if not page_images and raw.get("img_url"):
page_images = [raw["img_url"]]
if not page_images:
listing_url = raw.get("url", "")
ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url)
if ebay_m:
page_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"]
cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url)
if cl_m:
page_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"]
all_images = page_images
status = ("auction" if src_type == "auction"
else "salvage" if src_type == "salvage"
else "active")
context = ("URL: " + raw["url"] + "\nTitle: " + raw["title"] +
"\nPrice: " + raw.get("price_text","") + "\n" + page_text[:1500])
prompt = (
"Analyze this boat listing from " + str(raw.get('source','')) +
". Search was: " + query + "\n"
"TEXT: " + context + "\n\n"
"If NOT a boat for sale respond {skip:true}. "
"If IS a boat respond JSON with: skip=false, name, vessel_type "
"(Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Other), "
"loa_m, beam_m, draft_m (ALWAYS in METERS — detect unit from text; "
"if feet multiply by 0.3048, e.g. 45ft=13.7m, 60ft=18.3m, 100ft=30.5m), "
"year_built, hull, propulsion, "
"status=" + status + ", price_usd, currency, location, country, "
"description (Spanish max 150 chars), flags=[], score 0-100."
)
response = ollama_generate(prompt, model=MODELS['classify'], json_mode=True)
m = re.search(r'\{.*\}', response or '', re.DOTALL)
if not m:
return
data = json.loads(m.group())
if data.get("skip") or not data.get("name"):
return
# Override AI loa_m with regex (AI misses feet→m conversion)
loa_from_ctx = None
for pat in [
r'(?:length|loa|eslora)[:\s]+([\d.]+)\s*(?:ft|\'|feet)',
r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet|\')',
r'^(\d{2,3}(?:\.\d)?)\s*\'',
]:
lm = re.search(pat, context, re.IGNORECASE)
if not lm:
lm = re.search(pat, raw.get("title",""), re.IGNORECASE)
if lm:
try:
ft = float(lm.group(1))
if 10 < ft < 500:
loa_from_ctx = round(ft * 0.3048, 1)
break
except: pass
if loa_from_ctx and not data.get("loa_m"):
data["loa_m"] = loa_from_ctx
elif loa_from_ctx and data.get("loa_m") and data["loa_m"] > 25:
data["loa_m"] = round(data["loa_m"] * 0.3048, 1)
# AI unit conversion guard (only needed for AI output)
ctx_lower = (page_text + " " + raw.get("title","")).lower()
has_feet = bool(re.search(r"\d+\s*(?:ft|feet|')\b|loa[:\s]+\d+\s*(?:ft|')", ctx_lower))
vtype_lower = data.get("vessel_type","").lower()
MAX_M = {"sailboat":25,"yacht":35,"motor":30,"fishing":30,
"tug":60,"barge":120,"offshore":90,"ferry":100,"other":50}
max_reasonable = MAX_M.get(vtype_lower, 50)
for dim in ["loa_m","beam_m","draft_m"]:
val = data.get(dim)
if not val or not isinstance(val,(int,float)):
continue
convert = False
if dim == "loa_m" and (val > 100 or val > max_reasonable or (val > 25 and has_feet)): convert = True
elif dim == "beam_m" and (val > 30 or (val > 8 and has_feet)): convert = True
elif dim == "draft_m"and (val > 15 or (val > 5 and has_feet)): convert = True
if convert:
data[dim] = round(val * 0.3048, 1)
# ── Shared post-processing (fast path + AI path) ──────────────────
if not data or not data.get("name"):
return
# Query match check
combined = (data.get("name","") + " " + data.get("description","") +
" " + data.get("vessel_type","") + " " +
raw.get("title","") + " " + raw.get("url","")).lower()
if query_words:
if not any(qw in combined for qw in expanded_query):
# Skip query-match filter for results from direct scrapers (not web search).
# Web search results have category="Web Search" and may return off-topic pages.
# Direct scraper results already passed through a relevant search query.
is_web_search = raw.get("category","").lower() == "web search"
if is_web_search:
source_lower = raw.get("source","").lower()
if not any(kw in source_lower for kw in
["sailboat","yacht","workboat","offshore","tug","commercial",
"boats","boattrader","apolloduck","rightboat","seaboats",
"yachtworld","govplanet","govdeals","hibid","copart","ebay",
"salvex","kijiji","craigslist","denison","galati","hmy"]):
return
# Non-vessel + generic name check
if any(kw in data.get("name","").lower() for kw in NON_VESSELS):
return
if data.get("name","").lower().strip() in GENERIC_NAMES:
return
# Filters (price + LOA)
if max_price and data.get("price_usd") and data["price_usd"] > max_price * 1.01:
return
if min_loa and data.get("loa_m") and data["loa_m"] < (min_loa - 0.15):
return
data["images"] = all_images[:8]
data["source_url"] = raw["url"]
data["source_name"] = raw["source"]
vid = save_vessel(data)
if vid > 0:
with lock:
search_state["found"] += 1
vessels.append(data)
tag = "[Fast]" if data.get("_fast") else "[AI]"
msg = f"✓ {data.get('name','?')}{raw['source']}"
print(f"{tag} {msg}")
search_state["log"].append(msg)
except Exception as e:
print(f"[Extract] Error: {e}")
# Fast path: more workers + more URLs since most results skip AI now
with ThreadPoolExecutor(max_workers=16) as ex:
futs = [ex.submit(process_one, r) for r in unique_raw[:300]]
for f in as_completed(futs, timeout=180):
if search_state.get("cancelled"):
break
try:
f.result()
except Exception:
pass
print(f"[Search] Done — {len(vessels)} vessels found")
return vessels
return vessels
# ── Fingerprint ───────────────────────────────────────────────────────────────
def fingerprint(v: dict) -> str:
    """Return a short, stable identity hash used to de-duplicate vessels.

    The key is ``name|loa|year|type`` where the name is lower-cased and
    stripped and the LOA is rounded to the nearest whole number; the result is
    the first 16 hex characters of the key's SHA-256 digest.

    A missing, ``None`` or non-string name is coerced to a string instead of
    raising ``AttributeError``; for ordinary string names the output is
    unchanged.
    """
    name = str(v.get('name') or '').lower().strip()
    loa = round(v.get('loa_m') or 0)
    raw = f"{name}|{loa}|{v.get('year_built',0)}|{v.get('vessel_type','')}"
    return hashlib.sha256(raw.encode()).hexdigest()[:16]
def save_vessel(v: dict) -> int:
    """Persist a vessel record unless an equivalent one is already stored.

    Args:
        v: Vessel attributes keyed by column name (``name``, ``loa_m``, ...).

    Returns:
        The id of the existing row with the same fingerprint, the id of the
        newly inserted row, or ``-1`` when the record is rejected or the
        insert fails.
    """
    # Reject pure shells — need at least name + 1 real data field.
    # A non-string name (e.g. a number from a JSON body) is rejected too,
    # instead of crashing on ``.strip()``.
    name = v.get("name")
    if not isinstance(name, str) or name.strip() in ("", "Unknown"):
        return -1
    data_points = sum(1 for f in ('price_usd', 'loa_m', 'year_built', 'location') if v.get(f))
    if data_points < 1:
        return -1

    fp = fingerprint(v)
    conn = get_db()
    try:
        c = conn.cursor()
        existing = c.execute("SELECT id FROM vessels WHERE fingerprint=?", (fp,)).fetchone()
        if existing:
            return existing['id']
        try:
            c.execute("""INSERT INTO vessels
                (name,vessel_type,loa_m,beam_m,draft_m,year_built,hull,propulsion,
                status,price_usd,currency,location,country,source_name,source_url,
                description,images,flags,score,fingerprint,raw_data)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
                (v.get('name'), v.get('vessel_type'), v.get('loa_m'),
                 v.get('beam_m'), v.get('draft_m'), v.get('year_built'),
                 v.get('hull'), v.get('propulsion'), v.get('status','active'),
                 v.get('price_usd'), v.get('currency','USD'),
                 v.get('location'), v.get('country'),
                 v.get('source_name'), v.get('source_url'),
                 v.get('description'), json.dumps(v.get('images',[])),
                 json.dumps(v.get('flags',[])), v.get('score',50),
                 fp, json.dumps(v)))
            vid = c.lastrowid
            conn.commit()
        except Exception as e:
            # Best effort: a failed insert is reported as -1, not raised.
            print(f"[DB] Error: {e}")
            vid = -1
        return vid
    finally:
        # Runs on every path, including a failing SELECT, so the connection
        # is never leaked.
        conn.close()
# ── API Routes ────────────────────────────────────────────────────────────────
def hash_pw(pw):
    """Return the hex-encoded SHA-256 digest of the password string ``pw``."""
    # NOTE(review): this is an unsalted, single-round hash. Stored values are
    # compared by equality in the login query, so switching to a salted KDF
    # needs a coordinated migration rather than a local change here.
    hasher = _hashlib.sha256()
    hasher.update(pw.encode())
    return hasher.hexdigest()
def seed_admin():
    """Create the default ``admin`` account when no such user exists yet.

    The connection is closed on every path, including when the lookup or the
    insert raises.
    """
    # NOTE(review): the default password is hard-coded and echoed to stdout;
    # it should be changed right after the first login.
    conn = get_db()
    try:
        existing = conn.execute("SELECT id FROM users WHERE username='admin'").fetchone()
        if not existing:
            conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)",
                         ('admin', hash_pw('admin123'), 'admin'))
            conn.commit()
            print("[Auth] Default user created: admin / admin123")
    finally:
        conn.close()
@app.route('/api/login', methods=['POST'])
def login():
    """Authenticate a user and start a session.

    Expects a JSON object with ``username`` and ``password``. On success the
    user's id, username and role are stored in the session and echoed back.
    Any other outcome — including a body that is not an object or credentials
    that are not strings — yields a 401 with an error message.
    """
    body = request.json or {}
    username = body.get('username', '') if isinstance(body, dict) else None
    password = body.get('password', '') if isinstance(body, dict) else None
    # Non-string credentials would crash on .strip()/.encode(); treat them as
    # a failed login instead of a server error.
    if not isinstance(username, str) or not isinstance(password, str):
        return jsonify({'ok': False, 'error': 'Usuario o contraseña incorrectos'}), 401
    username = username.strip()

    conn = get_db()
    try:
        user = conn.execute("SELECT * FROM users WHERE username=? AND password=?",
                            (username, hash_pw(password))).fetchone()
    finally:
        conn.close()

    if user:
        session['user_id'] = user['id']
        session['username'] = user['username']
        session['role'] = user['role']
        return jsonify({'ok': True, 'username': user['username'], 'role': user['role']})
    return jsonify({'ok': False, 'error': 'Usuario o contraseña incorrectos'}), 401
@app.route('/api/logout', methods=['POST'])
def logout():
    """Drop everything stored in the current session and acknowledge."""
    session.clear()
    acknowledgement = {'ok': True}
    return jsonify(acknowledgement)
@app.route('/api/me')
def me():
    """Report who is logged in, or answer 401 when the session has no user."""
    if 'user_id' in session:
        identity = {
            'logged_in': True,
            'username': session.get('username'),
            'role': session.get('role'),
        }
        return jsonify(identity)
    return jsonify({'logged_in': False}), 401
@app.route('/api/users', methods=['GET'])
def list_users():
    """Return id, username, role and creation time of every user (admin only)."""
    is_admin = session.get('role') == 'admin'
    if not is_admin:
        return jsonify({'error': 'forbidden'}), 403

    conn = get_db()
    cursor = conn.execute("SELECT id,username,role,created_at FROM users")
    rows = []
    for record in cursor.fetchall():
        rows.append(dict(record))
    conn.close()
    return jsonify({'users': rows})
@app.route('/api/users', methods=['POST'])
def create_user():
    """Create a user account (admin only).

    Expects a JSON object with ``username``, ``password`` and optional
    ``role`` (default ``'user'``).

    Returns:
        ``{'ok': True}`` on success; 403 for non-admins; 400 when a field is
        missing or not a string, or the insert violates a constraint
        (reported as an existing username); 500 for any other database error.
    """
    if session.get('role') != 'admin':
        return jsonify({'error': 'forbidden'}), 403

    body = request.json or {}
    if not isinstance(body, dict):
        body = {}
    username = body.get('username', '')
    password = body.get('password', '')
    role = body.get('role', 'user')
    # Non-string values would crash on .strip()/.encode(); report them the
    # same way as missing fields.
    if not isinstance(username, str) or not isinstance(password, str):
        return jsonify({'error': 'username and password required'}), 400
    username = username.strip()
    if not username or not password:
        return jsonify({'error': 'username and password required'}), 400

    conn = get_db()
    try:
        conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)",
                     (username, hash_pw(password), role))
        conn.commit()
    except sqlite3.IntegrityError:
        # Only a constraint violation is reported as a duplicate username.
        return jsonify({'error': 'username already exists'}), 400
    except sqlite3.Error as e:
        # Other database failures are no longer mislabelled as a duplicate.
        print(f"[Auth] create_user failed: {e}")
        return jsonify({'error': 'database error'}), 500
    finally:
        conn.close()
    return jsonify({'ok': True})
@app.route('/api/change_password', methods=['POST'])
def change_password():
    """Change the logged-in user's password.

    Expects a JSON object with ``old_password`` and ``new_password``.

    Returns:
        ``{'ok': True}`` on success; 401 when nobody is logged in; 400 when
        the current password does not match or the new password is empty or
        not a string.
    """
    if 'user_id' not in session:
        return jsonify({'error': 'not logged in'}), 401

    body = request.json or {}
    if not isinstance(body, dict):
        body = {}
    old_pw = body.get('old_password', '')
    new_pw = body.get('new_password', '')
    if not isinstance(old_pw, str):
        return jsonify({'error': 'Contraseña actual incorrecta'}), 400
    # Previously an empty new password was accepted and stored as the hash of
    # the empty string.
    if not isinstance(new_pw, str) or not new_pw:
        return jsonify({'error': 'La nueva contraseña no puede estar vacía'}), 400

    conn = get_db()
    try:
        user = conn.execute("SELECT * FROM users WHERE id=? AND password=?",
                            (session['user_id'], hash_pw(old_pw))).fetchone()
        if not user:
            return jsonify({'error': 'Contraseña actual incorrecta'}), 400
        conn.execute("UPDATE users SET password=? WHERE id=?",
                     (hash_pw(new_pw), session['user_id']))
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True})
@app.route('/')
def index():
    """Serve ``index.html`` from the ``static`` directory."""
    directory, filename = 'static', 'index.html'
    return send_from_directory(directory, filename)
@app.route('/api/status')
def status():
    """Return a health summary: Ollama models, row counts and source stats."""
    models = ollama_models()
    conn = get_db()

    def count(sql):
        # Every statement passed here is a single-value COUNT(*) query.
        return conn.execute(sql).fetchone()[0]

    counts = {}
    counts['vessels'] = count("SELECT COUNT(*) FROM vessels")
    counts['saved'] = count("SELECT COUNT(*) FROM saved_vessels")
    counts['alerts'] = count("SELECT COUNT(*) FROM alerts WHERE active=1")
    conn.close()

    categories = list({src['category'] for src in DIRECT_SOURCES})
    summary = {
        'ok': True,
        'ollama_models': models,
        'active_model': MODELS['extract'],
        'db_counts': counts,
        'sources_count': len(DIRECT_SOURCES),
        'categories': categories,
    }
    return jsonify(summary)
@app.route('/api/vessels')
def list_vessels():
    """List stored vessels, filtered and sorted by query-string arguments.

    Recognised arguments: ``type``, ``status``, ``hull`` (exact match),
    ``max_price``, ``min_loa`` (a 0.15 tolerance is subtracted), ``year_min``,
    ``year_max``, ``sort`` (``score``, ``price_asc``, ``price_desc``, ``loa``,
    ``year``, ``newest``) and ``limit`` (default 200, clamped to 0..500).

    A numeric argument that cannot be parsed is ignored. Previously a bad
    year left a dangling ``?`` placeholder in the query, a bad price/LOA/limit
    raised, and a negative limit produced an unbounded ``LIMIT``.
    """
    def parsed_arg(name, cast):
        """Return the request argument converted with ``cast``, else None."""
        raw = request.args.get(name)
        if not raw:
            return None
        try:
            return cast(raw)
        except (TypeError, ValueError):
            return None

    clauses = []
    params = []

    for column, arg in (('vessel_type', 'type'), ('status', 'status'), ('hull', 'hull')):
        if value := request.args.get(arg):
            clauses.append(f"{column}=?")
            params.append(value)

    if (max_price := parsed_arg('max_price', float)) is not None:
        clauses.append("price_usd <= ?")
        params.append(max_price)
    if (min_loa := parsed_arg('min_loa', float)) is not None:
        clauses.append("loa_m IS NOT NULL AND loa_m >= ?")
        params.append(round(min_loa - 0.15, 2))
    # Parse before appending the clause, so a bad value can never leave a
    # placeholder without a bound parameter.
    if (year_min := parsed_arg('year_min', int)) is not None:
        clauses.append("year_built >= ?")
        params.append(year_min)
    if (year_max := parsed_arg('year_max', int)) is not None:
        clauses.append("year_built <= ?")
        params.append(year_max)

    # ORDER BY cannot be bound as a parameter; it is taken only from this
    # fixed whitelist, never from the raw request value.
    sorts = {
        'score': 'score DESC', 'price_asc': 'price_usd ASC',
        'price_desc': 'price_usd DESC', 'loa': 'loa_m DESC',
        'year': 'year_built DESC', 'newest': 'created_at DESC'
    }
    order_by = sorts.get(request.args.get('sort', 'score'), 'score DESC')

    limit = parsed_arg('limit', int)
    if limit is None:
        limit = 200
    # SQLite treats a negative LIMIT as "no limit", so clamp at both ends.
    limit = max(0, min(limit, 500))

    q = "SELECT * FROM vessels WHERE 1=1"
    q += ''.join(f" AND {clause}" for clause in clauses)
    q += f" ORDER BY {order_by} LIMIT ?"
    params.append(limit)

    conn = get_db()
    try:
        rows = [dict(r) for r in conn.execute(q, params).fetchall()]
    finally:
        conn.close()
    for r in rows:
        # flags and images are stored as JSON text.
        r['flags'] = json.loads(r.get('flags') or '[]')
        r['images'] = json.loads(r.get('images') or '[]')
    return jsonify({'vessels': rows, 'count': len(rows)})
# Domains the image proxy route is willing to fetch from.
_PROXY_ALLOWED = [
    'sailboatlistings.com',
    'yachtworld.com',
    'boattrader.com',
    'apolloduck.com',
    'rightboat.com',
    'boat24.com',
    'seaboats.net',
    'boats.com',
    'iboats.com',
    'yachtworld.co.uk',
]
@app.route('/api/img_proxy')
def img_proxy():
    """Fetch a remote image server-side and relay it to the browser.

    Query argument ``url`` is the image to fetch. Only ``http``/``https`` URLs
    whose host is an allow-listed domain, or a subdomain of one, are fetched.

    Returns:
        The upstream body with its content type and a one-day cache header;
        404 when ``url`` is missing; 400 when it cannot be parsed; 403 when
        the scheme or host is not allowed; the upstream status when it is not
        200; 502 when the fetch fails.
    """
    from urllib.parse import urlparse
    # Response is not among the imports visible at the top of the file; the
    # local import keeps the success path from ending in the 502 handler.
    from flask import Response

    url = request.args.get('url', '')
    if not url:
        return '', 404
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
    except ValueError:
        return '', 400
    if parsed.scheme not in ('http', 'https'):
        return '', 403
    # Match whole domains. A substring test would also accept hosts such as
    # "boats.com.attacker.example".
    if not any(host == d or host.endswith('.' + d) for d in _PROXY_ALLOWED):
        return '', 403
    try:
        # NOTE(review): redirects are still followed, so an allow-listed site
        # that redirects elsewhere is fetched too — confirm this is acceptable.
        resp = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': f'https://{host}/',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        })
        if resp.status_code == 200:
            ct = resp.headers.get('content-type', 'image/jpeg')
            return Response(resp.content, content_type=ct,
                            headers={'Cache-Control': 'public, max-age=86400'})
        return '', resp.status_code
    except Exception as e:
        app.logger.debug(f"img_proxy error: {e}")
        return '', 502
# Module-wide progress record for the background search. The search route
# resets it, the status route returns it as JSON and the cancel route flags it.
# A 'search_id' key is added once a search has been started or cancelled.
search_state = dict(
    running=False,
    cancelled=False,
    query='',
    found=0,
    total_sources=0,
    sources_done=0,
    log=[],
)
@app.route('/api/search', methods=['POST'])
def search():
    """Start a background search and return immediately.

    Expects a JSON object with ``query`` (non-blank string) and optional
    ``filters`` (object). Stored vessels and saved favourites are cleared,
    the query is recorded in the history, ``search_state`` is reset and a
    daemon thread runs ``search_with_ai``.

    Returns:
        ``{'ok': True, ...}`` once the thread is started; 400 when the query
        is missing, blank or not a string, or ``filters`` is not an object.
    """
    import uuid

    body = request.json or {}
    if not isinstance(body, dict):
        body = {}
    query = body.get('query', '')
    # "filters": null used to reach the worker as None and crash it there.
    filters = body.get('filters') or {}
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'query requerido'}), 400
    if not isinstance(filters, dict):
        return jsonify({'error': 'filters debe ser un objeto'}), 400
    query = query.strip()

    # Clear previous results immediately.
    # NOTE(review): this also wipes saved favourites on every search — confirm
    # that is intended.
    conn = get_db()
    try:
        conn.execute("DELETE FROM vessels")
        conn.execute("DELETE FROM saved_vessels")
        conn.execute("INSERT INTO search_history (query,filters) VALUES (?,?)",
                     (query, json.dumps(filters)))
        conn.commit()
    finally:
        conn.close()

    # Reset state
    search_state['running'] = True
    search_state['cancelled'] = False
    search_state['query'] = query
    search_state['found'] = 0
    search_state['sources_done'] = 0
    search_state['total_sources'] = len(DIRECT_SOURCES)
    search_state['log'] = [f"Iniciando búsqueda: {query}"]

    # Tag this search with a unique ID so old threads don't pollute new searches
    search_id = str(uuid.uuid4())
    search_state['search_id'] = search_id

    def run_bg(sid):
        """Run the search; report completion only if this search is still current."""
        try:
            search_with_ai(query, filters)
        except Exception as e:
            search_state['log'].append(f"Error: {e}")
            print(f"[BG] Error: {e}")
        finally:
            # A newer search or a cancel replaces search_id; a superseded
            # thread must not touch the shared state.
            if search_state.get('search_id') == sid:
                search_state['running'] = False
                total = search_state['found']
                msg = f"✓ Búsqueda completa — {total} embarcaciones encontradas"
                search_state['log'].append(msg)
                print(f"[BG] {msg}")

    t = threading.Thread(target=run_bg, args=(search_id,), daemon=True)
    t.start()
    return jsonify({'ok': True, 'message': 'Búsqueda iniciada en background'})
@app.route('/api/search/status')
def search_status():
    """Return the current contents of ``search_state`` as JSON."""
    current = search_state
    return jsonify(current)
@app.route('/api/search/cancel', methods=['POST'])
def cancel_search():
    """Flag the running search as cancelled and invalidate its search id."""
    import uuid

    search_state['cancelled'] = True
    search_state['running'] = False
    # A fresh id means a still-running worker no longer matches the current
    # search when it finishes.
    replacement_id = str(uuid.uuid4())
    search_state['search_id'] = replacement_id
    log = search_state['log']
    log.append('⏹ Búsqueda cancelada por el usuario')
    return jsonify({'ok': True})
@app.route('/api/fb-status')
def fb_status():
    """Report whether ``fb_session.json`` exists next to this module."""
    module_dir = os.path.dirname(__file__)
    session_path = os.path.join(module_dir, "fb_session.json")
    is_active = os.path.exists(session_path)
    return jsonify({"active": is_active})
@app.route('/api/fb-setup', methods=['POST'])
def fb_setup():
    """
    Open a visible Chromium window on the Facebook login page and wait for the
    user to log in. Once the browser reaches a marketplace/home/feed URL, the
    context's cookies are written to fb_session.json next to this module.

    Returns a JSON object with ``ok`` and ``msg``; a failure before or outside
    the login wait is answered with HTTP 500.
    """
    session_path = os.path.join(os.path.dirname(__file__), "fb_session.json")
    desktop_ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36")
    logged_in_url = re.compile(r'facebook\.com/(marketplace|home|feed)')
    try:
        from playwright.sync_api import sync_playwright

        outcome = {"ok": False, "msg": ""}
        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                headless=False,
                args=["--disable-blink-features=AutomationControlled"])
            context = browser.new_context(
                viewport={"width": 1100, "height": 800},
                user_agent=desktop_ua)
            page = context.new_page()
            page.goto("https://www.facebook.com/login", timeout=30000,
                      wait_until="domcontentloaded")
            try:
                # Allow up to 3 minutes for the user to finish logging in.
                page.wait_for_url(logged_in_url, timeout=180000)
                # Extra settling time so the page finishes loading.
                page.wait_for_timeout(3000)
                cookies = context.cookies()
                with open(session_path, "w") as fh:
                    json.dump(cookies, fh)
                outcome = {"ok": True,
                           "msg": f"Sesión guardada ({len(cookies)} cookies). "
                                  "Facebook Marketplace activado."}
            except Exception as err:
                outcome = {"ok": False, "msg": f"Tiempo agotado o error: {err}"}
            finally:
                # Closing the page is best effort; the browser is closed
                # regardless.
                try:
                    page.close()
                except:
                    pass
                browser.close()
        return jsonify(outcome)
    except Exception as err:
        return jsonify({"ok": False, "msg": str(err)}), 500
@app.route('/api/vessels/<int:vid>', methods=['GET'])
def get_vessel(vid):
    """Return one vessel by id with its JSON columns decoded, or 404."""
    conn = get_db()
    cursor = conn.execute("SELECT * FROM vessels WHERE id=?", (vid,))
    row = cursor.fetchone()
    conn.close()
    if not row:
        return jsonify({'error': 'not found'}), 404

    vessel = dict(row)
    # flags and images are stored as JSON text; empty/NULL decodes to [].
    for column in ('flags', 'images'):
        vessel[column] = json.loads(vessel.get(column) or '[]')
    return jsonify(vessel)
@app.route('/api/vessels', methods=['POST'])
def add_vessel():
    """Store a manually supplied vessel.

    Expects a JSON object of vessel fields; ``source_name`` defaults to
    ``'Manual'``.

    Returns:
        ``{'id': <row id>, 'ok': True}`` when the vessel is stored or already
        exists; 400 with ``ok: False`` when the body is not an object or
        ``save_vessel`` rejects the record (it returns -1). Previously a
        rejected record was still reported as ``ok: True``.
    """
    v = request.json or {}
    if not isinstance(v, dict):
        return jsonify({'id': -1, 'ok': False, 'error': 'JSON object required'}), 400
    v.setdefault('source_name', 'Manual')
    vid = save_vessel(v)
    if vid < 0:
        return jsonify({'id': vid, 'ok': False,
                        'error': 'vessel rejected or could not be saved'}), 400
    return jsonify({'id': vid, 'ok': True})
@app.route('/api/vessels/<int:vid>', methods=['PUT'])
def update_vessel(vid):
    """Update the editable columns of a vessel that appear in the JSON body."""
    body = request.json or {}
    conn = get_db()
    editable = ['name','vessel_type','loa_m','beam_m','draft_m','year_built',
                'hull','propulsion','status','price_usd','location','description','score']
    # Column names come only from the whitelist above; values are bound.
    columns = [col for col in editable if col in body]
    if columns:
        assignments = ', '.join(f"{col}=?" for col in columns)
        values = [body[col] for col in columns]
        conn.execute(f"UPDATE vessels SET {assignments}, updated_at=datetime('now') WHERE id=?",
                     values + [vid])
        conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/vessels/<int:vid>', methods=['DELETE'])
def delete_vessel(vid):
    """Delete a vessel together with every row that refers to it.

    Removes the vessel's saved-favourite and collection-membership rows along
    with the vessel itself. Membership rows used to be left behind, where the
    per-collection vessel count still included them.
    """
    conn = get_db()
    try:
        # Dependent rows first, then the vessel, in one transaction.
        conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,))
        conn.execute("DELETE FROM collection_vessels WHERE vessel_id=?", (vid,))
        conn.execute("DELETE FROM vessels WHERE id=?", (vid,))
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True})
@app.route('/api/saved', methods=['GET'])
def list_saved():
    """Return saved vessels with their notes, most recently saved first."""
    conn = get_db()
    rows = conn.execute("""
        SELECT v.*, s.notes, s.saved_at
        FROM vessels v JOIN saved_vessels s ON v.id=s.vessel_id
        ORDER BY s.saved_at DESC
        """).fetchall()

    def decoded(row):
        # flags and images are stored as JSON text.
        vessel = dict(row)
        vessel['flags'] = json.loads(vessel.get('flags') or '[]')
        vessel['images'] = json.loads(vessel.get('images') or '[]')
        return vessel

    result = [decoded(row) for row in rows]
    conn.close()
    return jsonify({'vessels': result, 'count': len(result)})
@app.route('/api/saved/<int:vid>', methods=['POST'])
def save_vessel_fav(vid):
    """Mark a vessel as saved, with optional notes, unless it already is."""
    payload = request.json or {}
    notes = payload.get('notes', '')
    conn = get_db()
    already_saved = conn.execute(
        "SELECT id FROM saved_vessels WHERE vessel_id=?", (vid,)).fetchone()
    needs_insert = not already_saved
    if needs_insert:
        conn.execute("INSERT INTO saved_vessels (vessel_id, notes) VALUES (?,?)", (vid, notes))
        conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/saved/<int:vid>', methods=['DELETE'])
def unsave_vessel(vid):
    """Remove a vessel from the saved list."""
    conn = get_db()
    statement = "DELETE FROM saved_vessels WHERE vessel_id=?"
    conn.execute(statement, (vid,))
    conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/alerts', methods=['GET'])
def list_alerts():
    """Return every alert whose ``active`` flag is 1."""
    conn = get_db()
    cursor = conn.execute("SELECT * FROM alerts WHERE active=1")
    rows = list(map(dict, cursor.fetchall()))
    conn.close()
    return jsonify({'alerts': rows})
@app.route('/api/alerts', methods=['POST'])
def create_alert():
    """Store a new alert; its filters are saved as JSON text."""
    body = request.json or {}
    conn = get_db()
    alert_name = body.get('name','Alerta')
    filters_json = json.dumps(body.get('filters',{}))
    conn.execute("INSERT INTO alerts (name, filters) VALUES (?,?)",
                 (alert_name, filters_json))
    conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/alerts/<int:aid>', methods=['DELETE'])
def delete_alert(aid):
    """Deactivate an alert; the row is kept with ``active`` set to 0."""
    conn = get_db()
    deactivate = "UPDATE alerts SET active=0 WHERE id=?"
    conn.execute(deactivate, (aid,))
    conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/sources')
def list_sources():
    """Return built-in and custom sources grouped by category.

    Built-in entries come from ``DIRECT_SOURCES``; custom ones are read from
    the ``custom_sources`` table. Custom sources are best effort: if they
    cannot be loaded the failure is logged and the response carries whatever
    was collected up to that point.
    """
    by_cat = {}
    for s in DIRECT_SOURCES:
        by_cat.setdefault(s['category'], []).append({
            'name': s['name'],
            # Only the part before the query string is exposed.
            'url': s['search_url'].split('?')[0],
            'type': s['type'],
            'builtin': True,
        })

    # Add custom sources
    try:
        conn = get_db()
        try:
            custom = [dict(r) for r in conn.execute(
                "SELECT * FROM custom_sources ORDER BY category").fetchall()]
        finally:
            conn.close()
        for c in custom:
            by_cat.setdefault(c['category'] or 'Custom', []).append({
                'name': c['name'], 'url': c['search_url'].split('?')[0],
                'type': c['source_type'], 'builtin': False,
                'id': c['id'], 'active': bool(c['active'])
            })
    except Exception as e:
        # Still best effort, but the failure is now recorded, not swallowed.
        app.logger.debug(f"list_sources: custom sources unavailable: {e}")

    return jsonify({'sources': by_cat, 'total': sum(len(v) for v in by_cat.values())})
@app.route('/api/history')
def search_history():
    """Return the 50 most recent search-history rows, newest first."""
    conn = get_db()
    cursor = conn.execute(
        "SELECT * FROM search_history ORDER BY searched_at DESC LIMIT 50")
    rows = list(map(dict, cursor.fetchall()))
    conn.close()
    return jsonify({'history': rows})
@app.route('/api/analyze', methods=['POST'])
def analyze_text():
    """Extract a vessel from free text, store it and return the extraction.

    Expects JSON with ``text`` (required) and ``source`` (default 'Manual').
    """
    body = request.json or {}
    text = body.get('text', '')
    source = body.get('source', 'Manual')
    if not text:
        return jsonify({'error': 'text requerido'}), 400

    result = extract_vessel_from_text(text, source)
    if result:
        # Save a copy tagged with the source; the response gets the row id.
        record = {**result, 'source_name': source}
        result['id'] = save_vessel(record)
    return jsonify(result)
@app.route('/api/collections', methods=['GET'])
def list_collections():
    """Return all collections, newest first, each with its vessel count."""
    conn = get_db()
    sql = ("SELECT c.*, COUNT(cv.vessel_id) as vessel_count FROM collections c "
           "LEFT JOIN collection_vessels cv ON c.id=cv.collection_id "
           "GROUP BY c.id ORDER BY c.created_at DESC")
    cols = list(map(dict, conn.execute(sql).fetchall()))
    conn.close()
    return jsonify({'collections': cols})
@app.route('/api/collections', methods=['POST'])
def create_collection():
    """Create a collection and return its new id; ``name`` is required."""
    body = request.json or {}
    name = body.get('name','').strip()
    if not name:
        return jsonify({'error': 'name required'}), 400

    conn = get_db()
    values = (
        name,
        body.get('description',''),
        body.get('color','#00b4ff'),
        body.get('icon','📁'),
    )
    conn.execute("INSERT INTO collections (name,description,color,icon) VALUES (?,?,?,?)",
                 values)
    conn.commit()
    new_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
    conn.close()
    return jsonify({'ok': True, 'id': new_id})
@app.route('/api/collections/<int:cid>', methods=['DELETE'])
def delete_collection(cid):
    """Delete a collection together with its membership rows."""
    conn = get_db()
    key = (cid,)
    # Membership rows first, then the collection itself.
    conn.execute("DELETE FROM collection_vessels WHERE collection_id=?", key)
    conn.execute("DELETE FROM collections WHERE id=?", key)
    conn.commit()
    conn.close()
    return jsonify({'ok': True})
@app.route('/api/collections/<int:cid>/vessels', methods=['GET'])
def collection_vessels(cid):
    """Return the vessels of one collection, most recently added first."""
    conn = get_db()
    rows = conn.execute("""
        SELECT v.*, cv.notes, cv.added_at FROM vessels v
        JOIN collection_vessels cv ON v.id=cv.vessel_id
        WHERE cv.collection_id=? ORDER BY cv.added_at DESC""", (cid,)).fetchall()

    def decoded(row):
        # flags and images are stored as JSON text.
        vessel = dict(row)
        vessel['flags'] = json.loads(vessel.get('flags') or '[]')
        vessel['images'] = json.loads(vessel.get('images') or '[]')
        return vessel

    result = [decoded(row) for row in rows]
    conn.close()
    return jsonify({'vessels': result, 'count': len(result)})
@app.route('/api/collections/<int:cid>/vessels', methods=['POST'])
def add_to_collection(cid):
    """Add one or more vessels to collection *cid*.

    Body fields: ``vessel_ids`` (list of vessel ids) and optional ``notes``
    applied to every inserted membership row.

    Returns ``{'ok': True, 'added': <n>}`` where ``n`` counts only the rows
    that were actually inserted; ids that were ignored by ``INSERT OR
    IGNORE`` or rejected by the database are not counted. Responds with 400
    when ``vessel_ids`` is not a list.
    """
    body = request.json or {}
    if not isinstance(body, dict):
        body = {}
    vessel_ids = body.get('vessel_ids', [])
    if not isinstance(vessel_ids, list):
        # Iterating a string would insert one row per character, and None
        # is not iterable at all; reject both up front.
        return jsonify({'error': 'vessel_ids must be a list'}), 400
    notes = body.get('notes', '')
    conn = get_db()
    added = 0
    try:
        for vid in vessel_ids:
            try:
                cur = conn.execute(
                    "INSERT OR IGNORE INTO collection_vessels (collection_id,vessel_id,notes) VALUES (?,?,?)",
                    (cid, vid, notes))
            except (sqlite3.Error, OverflowError):
                # Best effort: skip ids the database cannot store or accept
                # and keep processing the rest of the batch.
                continue
            # rowcount is 0 when OR IGNORE skipped the row, so only real
            # insertions are counted.
            if cur.rowcount > 0:
                added += 1
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True, 'added': added})
@app.route('/api/collections/<int:cid>/vessels/<int:vid>', methods=['DELETE'])
def remove_from_collection(cid, vid):
    """Remove vessel *vid* from collection *cid*.

    Always answers ``{'ok': True}``, whether or not a matching membership
    row existed.
    """
    unlink_sql = "DELETE FROM collection_vessels WHERE collection_id=? AND vessel_id=?"
    db = get_db()
    db.execute(unlink_sql, (cid, vid))
    db.commit()
    db.close()
    return jsonify({'ok': True})
@app.route('/api/custom_sources', methods=['GET'])
def get_custom_sources():
    """List all rows of ``custom_sources``, most recently created first."""
    db = get_db()
    cursor = db.execute("SELECT * FROM custom_sources ORDER BY created_at DESC")
    sources = []
    for record in cursor.fetchall():
        sources.append(dict(record))
    db.close()
    return jsonify({'sources': sources})
@app.route('/api/custom_sources', methods=['POST'])
def add_custom_source():
    """Register a user-defined search source.

    Body fields: ``name`` and ``search_url`` (both required), plus optional
    ``category`` (default ``'Custom'``) and ``source_type`` (default
    ``'broker'``). The row is attributed to the session's ``username``,
    falling back to ``'admin'``.

    If ``search_url`` lacks a ``{query}`` placeholder, a ``q={query}``
    parameter is appended, joined with ``&`` when the URL already has a
    query string and with ``?`` otherwise.

    Returns ``{'ok': True, 'id': <new row id>}``, or 400 when a required
    field is missing, blank, or not a string.
    """
    body = request.json or {}
    if not isinstance(body, dict):
        body = {}
    raw_name = body.get('name', '')
    raw_url = body.get('search_url', '')
    name = raw_name.strip() if isinstance(raw_name, str) else ''
    url = raw_url.strip() if isinstance(raw_url, str) else ''
    if not name or not url:
        return jsonify({'error': 'name and search_url required'}), 400
    # Ensure URL has {query} placeholder
    if '{query}' not in url:
        if '?' in url:
            # Already has a query string: extend it instead of adding a
            # second '?', which would produce a malformed URL.
            joiner = '' if url.endswith(('?', '&')) else '&'
            url = url + joiner + 'q={query}'
        else:
            url = url.rstrip('/') + '?q={query}'
    conn = get_db()
    try:
        cur = conn.execute("""INSERT INTO custom_sources (name,category,search_url,source_type,added_by)
VALUES (?,?,?,?,?)""",
                           (name, body.get('category', 'Custom'),
                            url, body.get('source_type', 'broker'),
                            session.get('username', 'admin')))
        conn.commit()
        sid = cur.lastrowid
    finally:
        # Release the connection even when the INSERT fails.
        conn.close()
    return jsonify({'ok': True, 'id': sid})
@app.route('/api/custom_sources/<int:sid>', methods=['PUT'])
def update_custom_source(sid):
    """Update selected columns of custom source *sid*.

    Only the whitelisted keys ``name``, ``category``, ``search_url``,
    ``source_type`` and ``active`` are read from the JSON body; anything
    else is ignored. When none of them is present no SQL is executed.
    Always answers ``{'ok': True}``.
    """
    payload = request.json or {}
    db = get_db()
    assignments = []
    params = []
    # Column names come from this fixed tuple, never from the client, so
    # interpolating them into the statement is safe.
    for column in ('name', 'category', 'search_url', 'source_type', 'active'):
        if column in payload:
            assignments.append(f"{column}=?")
            params.append(payload[column])
    if assignments:
        set_sql = ', '.join(assignments)
        db.execute(f"UPDATE custom_sources SET {set_sql} WHERE id=?", params + [sid])
        db.commit()
    db.close()
    return jsonify({'ok': True})
@app.route('/api/custom_sources/<int:sid>', methods=['DELETE'])
def delete_custom_source(sid):
    """Delete custom source *sid*; answers ``{'ok': True}`` unconditionally."""
    delete_sql = "DELETE FROM custom_sources WHERE id=?"
    db = get_db()
    db.execute(delete_sql, (sid,))
    db.commit()
    db.close()
    return jsonify({'ok': True})
@app.route('/api/stats')
def stats():
    """Return aggregate statistics about the vessels table.

    Response keys:
      total              number of rows in ``vessels``
      saved              number of rows in ``saved_vessels``
      by_type            vessel count per ``vessel_type``
      by_status          vessel count per ``status``
      by_country         vessel count for the 10 most common countries
      avg_score          mean ``score`` (0 when there are no rows)
      avg_price          mean ``price_usd`` over rows with a positive price
      top_opportunities  the 5 highest-scoring vessels (summary columns)

    In every grouped count a NULL or empty group value is reported under
    the key ``'Unknown'``.
    """
    def tally(rows):
        # A NULL group would become a None dict key; mixed None/str keys
        # cannot be sorted when the response is serialised, so fold empty
        # values into 'Unknown' (summing in case several rows map there).
        counts = {}
        for key, n in rows:
            label = key or 'Unknown'
            counts[label] = counts.get(label, 0) + n
        return counts

    conn = get_db()
    try:
        c = conn.cursor()
        data = {
            'total': c.execute("SELECT COUNT(*) FROM vessels").fetchone()[0],
            'saved': c.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0],
            'by_type': tally(c.execute(
                "SELECT vessel_type, COUNT(*) FROM vessels GROUP BY vessel_type").fetchall()),
            'by_status': tally(c.execute(
                "SELECT status, COUNT(*) FROM vessels GROUP BY status").fetchall()),
            'by_country': tally(c.execute(
                "SELECT country, COUNT(*) FROM vessels WHERE country IS NOT NULL GROUP BY country ORDER BY COUNT(*) DESC LIMIT 10").fetchall()),
            'avg_score': c.execute("SELECT AVG(score) FROM vessels").fetchone()[0] or 0,
            'avg_price': c.execute("SELECT AVG(price_usd) FROM vessels WHERE price_usd > 0").fetchone()[0] or 0,
            'top_opportunities': [dict(r) for r in c.execute(
                "SELECT id,name,vessel_type,price_usd,score,location FROM vessels ORDER BY score DESC LIMIT 5").fetchall()],
        }
    finally:
        # Release the connection even if one of the queries fails.
        conn.close()
    return jsonify(data)
# ── Seed sample data ──────────────────────────────────────────────────────────
def seed_sample_data():
    """Store a fixed set of six demo vessels by passing each to ``save_vessel``."""
    demo_fleet = (
        dict(
            name="M/Y Stella Maris", vessel_type="Yacht",
            loa_m=28.4, beam_m=6.8, draft_m=1.9, year_built=2008,
            hull="Fiberglass", propulsion="Diesel", status="active",
            price_usd=189000, location="Fort Lauderdale, FL", country="US",
            source_name="YachtWorld", source_url="https://yachtworld.com",
            description="Yate motor bien mantenido, twin Volvo IPS, refit 2022.",
            flags=["below_market", "motivated_seller"], score=87,
        ),
        dict(
            name="F/V Cape Hatteras", vessel_type="Fishing",
            loa_m=19.2, beam_m=5.1, draft_m=1.4, year_built=1997,
            hull="Steel", propulsion="Diesel", status="salvage",
            price_usd=22000, location="Gloucester, MA", country="US",
            source_name="GovDeals", source_url="https://govdeals.com",
            description="Ex buque NOAA, motor operativo, casco requiere trabajo.",
            flags=["rare", "salvage_value", "below_market"], score=94,
        ),
        dict(
            name="TUG Bravo Eagle", vessel_type="Tug",
            loa_m=32.0, beam_m=9.4, draft_m=3.8, year_built=1989,
            hull="Steel", propulsion="Diesel", status="auction",
            price_usd=310000, location="New Orleans, LA", country="US",
            source_name="AuctionTime", source_url="https://auctiontime.com",
            description="Remolcador 2400HP, clase ABS, listo para operación comercial.",
            flags=["rare", "auction", "motivated_seller"], score=91,
        ),
        dict(
            name="OSV Pacific Ranger", vessel_type="Offshore",
            loa_m=52.0, beam_m=13.2, draft_m=4.1, year_built=2005,
            hull="Steel", propulsion="Diesel", status="auction",
            price_usd=890000, location="Port Fourchon, LA", country="US",
            source_name="GovPlanet", source_url="https://govplanet.com",
            description="Buque apoyo offshore DP1, 400T carga, documentación completa.",
            flags=["rare", "auction", "government_surplus"], score=79,
        ),
        dict(
            name="Barge RJ-440", vessel_type="Barge",
            loa_m=44.0, beam_m=12.0, draft_m=1.8, year_built=1978,
            hull="Steel", propulsion="None", status="active",
            price_usd=55000, location="Houston, TX", country="US",
            source_name="WorkBoat Classifieds", source_url="https://workboat.com",
            description="Barcaza cubierta, capacidad 800T, buen estado estructural.",
            flags=["below_market", "rare"], score=73,
        ),
        dict(
            name="LCT Endeavour", vessel_type="Barge",
            loa_m=61.0, beam_m=14.6, draft_m=1.5, year_built=1968,
            hull="Steel", propulsion="Diesel", status="salvage",
            price_usd=38000, location="Manila, Filipinas", country="PH",
            source_name="Salvex", source_url="https://salvex.com",
            description="Landing craft, estructura sólida, motores requieren overhaul.",
            flags=["salvage_value", "rare", "below_market"], score=82,
        ),
    )
    for record in demo_fleet:
        save_vessel(record)
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == '__main__':
    # Script entry point: enforce a single running instance through a PID
    # file, pick a free TCP port, initialise the database and start Flask.
    import atexit
    import signal
    import socket
    import sys
    import time

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    PID_FILE = os.path.join(BASE_DIR, ".server.pid")

    # ── Handle existing instance ───────────────────────────────────────────────
    def kill_pid(pid):
        """Forcefully terminate *pid*; return True only if the kill succeeded."""
        if os.name == 'nt':
            import ctypes
            kernel32 = ctypes.windll.kernel32
            handle = kernel32.OpenProcess(1, False, pid)  # 1 = PROCESS_TERMINATE
            if not handle:
                # Process is gone or we lack permission: nothing was killed.
                return False
            try:
                return bool(kernel32.TerminateProcess(handle, -1))
            finally:
                kernel32.CloseHandle(handle)
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            return False
        return True

    def pid_running(pid):
        """Return True if a process with id *pid* currently exists."""
        if os.name == 'nt':
            # os.kill(pid, 0) must NOT be used on Windows: any signal other
            # than CTRL_C_EVENT/CTRL_BREAK_EVENT terminates the target, so a
            # liveness probe would kill the instance it is checking.
            import ctypes
            kernel32 = ctypes.windll.kernel32
            PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
            STILL_ACTIVE = 259
            handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
            if not handle:
                # NOTE(review): also reached when access is denied; treated
                # as "not running" because we could not kill it anyway.
                return False
            try:
                exit_code = ctypes.c_ulong()
                if not kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code)):
                    return False
                return exit_code.value == STILL_ACTIVE
            finally:
                kernel32.CloseHandle(handle)
        try:
            os.kill(pid, 0)  # signal 0: existence check only, nothing is sent
        except PermissionError:
            return True  # exists, but is owned by another user
        except OSError:
            return False
        return True

    if os.path.exists(PID_FILE):
        try:
            with open(PID_FILE) as fh:
                old_pid = int(fh.read().strip())
        except (ValueError, OSError):
            old_pid = None  # PID file corrupted — ignore
        # Reject pid <= 0 (on POSIX those address whole process groups) and
        # our own pid, so a bad PID file can never make us signal ourselves.
        if old_pid is not None and old_pid > 0 and old_pid != os.getpid() and pid_running(old_pid):
            print(f"\n ⚠️ Ya hay una instancia corriendo (PID {old_pid})")
            resp = input(" ¿Cerrar la instancia anterior y continuar? [S/n]: ").strip().lower()
            if resp in ("", "s", "si", "sí", "y", "yes"):
                if kill_pid(old_pid):
                    print(f" ✓ Instancia anterior (PID {old_pid}) cerrada.")
                    time.sleep(1)  # give the OS a moment to release the port
                else:
                    print(" ✗ No se pudo cerrar. Ciérrala manualmente y vuelve a intentar.")
                    sys.exit(1)
            else:
                print(" Saliendo sin cambios.")
                sys.exit(0)

    # ── Write PID file ─────────────────────────────────────────────────────────
    with open(PID_FILE, "w") as f:
        f.write(str(os.getpid()))

    def cleanup_pid():
        """Remove the PID file; a missing file is not an error."""
        try:
            os.remove(PID_FILE)
        except OSError:
            pass

    atexit.register(cleanup_pid)

    def handle_signal(sig, frame):
        """Exit cleanly on SIGINT/SIGTERM, removing the PID file first."""
        print("\n\n 👋 Cerrando Boat&Ship-Finder...")
        cleanup_pid()
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # ── Port selection ─────────────────────────────────────────────────────────
    def port_free(p):
        """Return True if TCP port *p* can be bound on all interfaces."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if os.name != 'nt':
                # On Windows SO_REUSEADDR lets a socket bind a port that is
                # already in use, which would make this probe always pass.
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind(("0.0.0.0", p))
            except (OSError, OverflowError):
                # OverflowError: candidate port number above 65535.
                return False
            return True

    desired = int(os.environ.get('MARINE_PORT', 8765))
    port = desired
    if not port_free(desired):
        for candidate in range(desired + 1, desired + 20):
            if port_free(candidate):
                port = candidate
                break
        else:
            # Nothing free in the scanned range: stop here instead of
            # announcing the busy port and failing later inside app.run().
            print(f"\n ✗ No hay puertos libres entre {desired} y {desired + 19}.")
            sys.exit(1)
        print(f"\n ⚠️ Puerto {desired} ocupado — usando {port}")

    # ── DB init ────────────────────────────────────────────────────────────────
    print("\n" + "="*55)
    print(" Boat&Ship-Finder — Iniciando...")
    print("="*55)
    init_db()
    seed_admin()
    conn = get_db()
    try:
        count = conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0]
    finally:
        conn.close()
    if count == 0:
        print("[DB] Base de datos vacía — lista para búsquedas reales")
    else:
        print(f"[DB] {count} embarcaciones en caché de sesión anterior")
    print(f"\n Local: http://localhost:{port}")
    print(f" Tailscale: http://<tu-ip-tailscale>:{port}")
    print(f" Fuentes directas: {len(DIRECT_SOURCES)}")
    print(f" Modelos Ollama: {list(MODELS.values())}")
    print(f" PID: {os.getpid()} (guardado en .server.pid)")
    print("\n [Ctrl+C para detener]\n")
    app.run(host='0.0.0.0', port=port, debug=False)