# Commit 235a9abbfe
# - Replace hardcoded secret_key with os.environ.get('SECRET_KEY')
# - RuntimeError if SECRET_KEY not set (fail fast)
# - Restrict CORS to localhost:8765 origins (was allow all with credentials)
# - Add .gitignore excluding db, env, __pycache__, backups
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# File stats at this commit: 5403 lines, 247 KiB, Python
"""
Boat&Ship-Finder - Backend Server

Requires: pip install flask flask-cors requests beautifulsoup4 playwright
"""
|
|
|
|
from flask import Flask, jsonify, request, send_from_directory, session
|
|
import hashlib as _hashlib
|
|
from flask_cors import CORS
|
|
import requests
|
|
import json
|
|
import sqlite3
|
|
import os
|
|
import re
|
|
import time
|
|
import hashlib
|
|
from datetime import datetime
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from bs4 import BeautifulSoup
|
|
import threading
|
|
import urllib3
|
|
# Silence urllib3's InsecureRequestWarning process-wide.
# NOTE(review): presumably some requests elsewhere in the file skip TLS
# verification — confirm before relying on this.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Fail fast at import time: without SECRET_KEY the Flask session cookie
# cannot be signed, so refuse to start rather than fall back to a default.
_secret = os.environ.get('SECRET_KEY')
if not _secret:
    raise RuntimeError("SECRET_KEY not set — add SECRET_KEY=<random> to your environment")

app = Flask(__name__, static_folder='static')
app.secret_key = _secret

# Credentialed cross-origin requests are only accepted from the local
# front-end origins listed here.
CORS(
    app,
    origins=["http://localhost:8765", "http://127.0.0.1:8765"],
    supports_credentials=True,
)
|
|
|
|
# SQLite database file; a relative path, so it resolves against the
# process working directory.
DB_PATH = 'marine.db'

# Ollama text-generation endpoint used by the helpers below.
OLLAMA_URL = 'http://localhost:11434/api/generate'

# ── Ollama models per task ────────────────────────────────────────────────────
MODELS = {
    # Spec extraction (faster than 72b, just as accurate)
    'extract': 'qwen2.5:32b',
    # Fast classification
    'classify': 'llama3.1:8b',
    # Embeddings for dedup
    'embed': 'nomic-embed-text:latest',
    # Structured parsing
    'parse': 'qwen3-coder:latest',
}
|
|
|
|
# ── Fuentes globales por categoría ────────────────────────────────────────────
|
|
# Catalogue of known listing sources, grouped by display category.
# Every entry has "name", "url" and "type"; some also carry a "search_url"
# (a few of which contain a "{query}" placeholder) and/or a "category"
# that differs from the group key.
SOURCES = {
    "Subastas USA": [
        {"name": "GovPlanet", "url": "https://www.govplanet.com/boats", "type": "auction"},
        {"name": "GovDeals", "url": "https://www.govdeals.com", "type": "auction"},
        {"name": "PropertyRoom", "url": "https://www.propertyroom.com/boats", "type": "auction"},
        {"name": "PublicSurplus", "url": "https://www.publicsurplus.com", "type": "auction"},
        {"name": "AuctionTime", "url": "https://www.auctiontime.com/boats", "type": "auction"},
        {"name": "IronPlanet", "url": "https://www.ironplanet.com/boats", "type": "auction"},
        {"name": "HiBid", "url": "https://www.hibid.com/boats", "type": "auction"},
        {"name": "Copart Marine", "url": "https://www.copart.com/boats", "type": "auction"},
        {"name": "BidSpotter", "url": "https://www.bidspotter.com/boats", "type": "auction"},
        {"name": "32auctions", "url": "https://www.32auctions.com", "type": "auction"},
    ],
    "Subastas Internacionales": [
        {"name": "Ritchie Bros", "url": "https://www.rbauction.com/boats", "type": "auction"},
        {"name": "Euro Auctions", "url": "https://www.euroauctions.com", "type": "auction"},
        {"name": "Troostwijk", "url": "https://www.troostwijkauctions.com", "type": "auction"},
        {"name": "Surplex", "url": "https://www.surplex.com/marine", "type": "auction"},
        {"name": "BVA Auctions", "url": "https://www.bva-auctions.com", "type": "auction"},
        {"name": "Catawiki Marine", "url": "https://www.catawiki.com/boats", "type": "auction"},
        {"name": "Barnebys", "url": "https://www.barnebys.com/boats", "type": "auction"},
        {"name": "ShipXchange", "url": "https://www.shipxchange.com", "type": "auction"},
    ],
    "Venta Especializada": [
        {"name": "YachtWorld", "url": "https://www.yachtworld.com", "type": "broker"},
        {
            "name": "Boats.com", "url": "https://www.boats.com", "type": "broker",
            "search_url": "https://www.boats.com/boats-for-sale/?query={query}",
        },
        {
            "name": "BoatTrader", "url": "https://www.boattrader.com", "type": "broker",
            "search_url": "https://www.boattrader.com/boats/?query={query}",
        },
        {
            "name": "Apollo Duck", "url": "https://www.apolloduck.com", "type": "broker",
            "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1",
        },
        {
            "name": "Rightboat", "url": "https://www.rightboat.com", "type": "broker",
            "search_url": "https://www.rightboat.com/boats-for-sale/?q={query}",
        },
        {
            "name": "Boat24", "url": "https://www.boat24.com", "type": "broker",
            "search_url": "https://www.boat24.com/en/usedboats/",
        },
        {
            "name": "Inautia", "url": "https://www.inautia.com", "type": "broker",
            "search_url": "https://www.inautia.com/boats/?q={query}",
        },
        # ── US Brokers ────────────────────────────────────────────────────────
        {
            "name": "HMY Yachts", "url": "https://hmy.com", "type": "broker",
            "search_url": "https://www.hmy.com/yachts-for-sale/?SaleClassCode=used",
            "category": "Brokers USA",
        },
        {
            "name": "Denison Yachting", "url": "https://www.denisonyachtsales.com", "type": "broker",
            "search_url": "https://www.denisonyachtsales.com/yachts-for-sale/?search={query}",
            "category": "Brokers USA",
        },
        {
            "name": "United Yacht", "url": "https://www.unitedyacht.com", "type": "broker",
            "search_url": "https://www.unitedyacht.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Northrop & Johnson", "url": "https://www.n-j.com", "type": "broker",
            "search_url": "https://www.n-j.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Worth Ave Yachts", "url": "https://www.worthavenueyachts.com", "type": "broker",
            "search_url": "https://www.worthavenueyachts.com/yachts-for-sale/",
            "category": "Brokers USA",
        },
        {
            "name": "Bluewater Yachting", "url": "https://www.bluewateryachting.com", "type": "broker",
            "category": "Brokers USA",
        },
        {
            "name": "Galati Yachts", "url": "https://www.galatiyachts.com", "type": "broker",
            "search_url": "https://www.galatiyachts.com/boat-search/?q={query}",
            "category": "Brokers USA",
        },
        {
            "name": "Fraser Yachts", "url": "https://www.fraseryachts.com", "type": "broker",
            "search_url": "https://www.fraseryachts.com/en/yachts-for-sale/?search={query}",
            "category": "Brokers INT",
        },
        {
            "name": "Burgess Yachts", "url": "https://www.burgessyachts.com", "type": "broker",
            "search_url": "https://www.burgessyachts.com/en/yacht-sale?q={query}",
            "category": "Brokers INT",
        },
        {
            "name": "Ocean Alexander", "url": "https://www.oceanalexander.com", "type": "broker",
            "search_url": "https://www.oceanalexander.com/find-a-boat/?q={query}",
            "category": "Brokers USA",
        },
        {
            "name": "Merle Wood", "url": "https://www.merlewood.com", "type": "broker",
            "search_url": "https://www.merlewood.com/yachts-for-sale/",
            "category": "Brokers INT",
        },
        # ── Other ─────────────────────────────────────────────────────────────
        {"name": "NauticExpo", "url": "https://www.nauticexpo.com", "type": "broker"},
        {"name": "Seaboats", "url": "https://www.seaboats.net", "type": "broker"},
        {"name": "YachtBroker", "url": "https://www.yachtbroker.com", "type": "broker"},
    ],
    "Comercial / Industrial": [
        {"name": "WorkBoat", "url": "https://www.workboat.com/classifieds", "type": "commercial"},
        {"name": "TradeABoat", "url": "https://www.tradeaboat.com.au", "type": "broker"},
        {"name": "Boatpoint", "url": "https://www.boatpoint.com.au", "type": "broker"},
        {"name": "Boats & Outboards", "url": "https://www.boatsandoutboards.co.uk", "type": "broker"},
        {"name": "Commercial Vessel", "url": "https://www.commercialvessel.com", "type": "commercial"},
        {"name": "ShipServ", "url": "https://www.shipserv.com", "type": "commercial"},
        {"name": "Marine Classifieds", "url": "https://www.marineclassifieds.com", "type": "classifieds"},
        {"name": "Barcos.net", "url": "https://www.barcos.net", "type": "broker"},
        # ── Offshore / DP / OSV ───────────────────────────────────────────────
        {
            "name": "Offshore Vessel Exchange", "url": "https://www.offshorevessel.exchange", "type": "commercial",
            "search_url": "https://www.offshorevessel.exchange/?s={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "MarineTraffic Vessels For Sale", "url": "https://www.marinetraffic.com/en/ads/p/list", "type": "commercial",
            "search_url": "https://www.marinetraffic.com/en/ads/p/list?search={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "YachtWorld Commercial", "url": "https://www.yachtworld.com", "type": "commercial",
            "search_url": "https://www.yachtworld.com/boats-for-sale/type-commercial/?query={query}",
            "category": "Offshore / DP",
        },
        {
            "name": "Apollo Duck Workboats", "url": "https://www.apolloduck.com", "type": "commercial",
            "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1",
            "category": "Offshore / DP",
        },
        {
            "name": "Seawork Classifieds", "url": "https://www.seawork.com", "type": "commercial",
            "search_url": "https://www.seawork.com/classifieds/",
            "category": "Offshore / DP",
        },
        {
            "name": "ShipXchange OSV", "url": "https://www.shipxchange.com", "type": "commercial",
            "search_url": "https://www.shipxchange.com/en/vessel-types/offshore-support-vessel",
            "category": "Offshore / DP",
        },
        {
            "name": "Vessel Sales & Acquisitions", "url": "https://www.vsl.no", "type": "commercial",
            "search_url": "https://www.vsl.no/vessels-for-sale/",
            "category": "Offshore / DP",
        },
    ],
    "Clasificados Generales": [
        {"name": "Craigslist Boats", "url": "https://www.craigslist.org/search/boa", "type": "classifieds"},
        {
            "name": "eBay Motors Marine", "url": "https://www.ebay.com/b/Boats/26429", "type": "classifieds",
            "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10",
        },
        {"name": "Facebook Marketplace", "url": "https://www.facebook.com/marketplace/boats", "type": "classifieds"},
        {
            "name": "BoatCrazy", "url": "https://boatcrazy.com", "type": "classifieds",
            "search_url": "https://boatcrazy.com/boats?q={query}",
            "category": "Clasificados USA",
        },
        {"name": "Kijiji Marine", "url": "https://www.kijiji.ca/b-boats", "type": "classifieds"},
        {"name": "Gumtree Boats", "url": "https://www.gumtree.com/boats", "type": "classifieds"},
        {"name": "Subito.it Barche", "url": "https://www.subito.it/barche", "type": "classifieds"},
        {"name": "LeBonCoin Bateaux", "url": "https://www.leboncoin.fr/bateaux", "type": "classifieds"},
        {"name": "Wallapop Barcos", "url": "https://es.wallapop.com/barcos", "type": "classifieds"},
        {"name": "MercadoLibre", "url": "https://www.mercadolibre.com/barcos", "type": "classifieds"},
        {"name": "OLX Marine", "url": "https://www.olx.com/boats", "type": "classifieds"},
    ],
    "Salvage & Wrecks": [
        {
            "name": "Cooper Capital Salvage", "url": "https://www.cooperss.com", "type": "salvage",
            "search_url": "https://www.cooperss.com/",
            "category": "Salvage USA",
        },
        {
            "name": "Salvex", "url": "https://www.salvex.com", "type": "salvage",
            "search_url": "https://www.salvex.com/search/?q={query}&cat=30",
            "category": "Salvage USA",
        },
        {
            "name": "Copart Marine", "url": "https://www.copart.com", "type": "salvage",
            "search_url": "https://www.copart.com/public/data/lotSearchResults/?query={query}&vehicleType=BOAT",
            "category": "Salvage USA",
        },
        {
            "name": "IAA Watercraft", "url": "https://www.iaai.com", "type": "salvage",
            "search_url": "https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft",
            "category": "Salvage USA",
        },
        {
            "name": "Ritchie Bros Marine", "url": "https://www.rbauction.com", "type": "auction",
            "search_url": "https://www.rbauction.com/used-equipment?q={query}&searchType=MODEL&equipmentCategory=marine",
            "category": "Salvage USA",
        },
        {"name": "NavAuctions", "url": "https://www.navauctions.com", "type": "salvage"},
        {"name": "MarineWrecks", "url": "https://www.marinewrecks.com", "type": "salvage"},
        {"name": "BoatBreakers", "url": "https://www.boatbreakers.com", "type": "salvage"},
        {"name": "Barnacle Marine", "url": "https://www.barnaclemarine.com", "type": "salvage"},
        {"name": "Boat Breakers AU", "url": "https://www.boatbreakersnz.com", "type": "salvage"},
    ],
    "Revistas & Noticias": [
        {"name": "Trade Only Today", "url": "https://www.tradeonlytoday.com", "type": "news"},
        {"name": "Nautical News", "url": "https://www.nauticalnews.com", "type": "news"},
        {"name": "Boat International", "url": "https://www.boatinternational.com/yachts", "type": "magazine"},
        {"name": "Superyacht Times", "url": "https://www.superyachttimes.com", "type": "magazine"},
        {"name": "The Triton", "url": "https://www.the-triton.com/classifieds", "type": "magazine"},
        {"name": "Passagemaker", "url": "https://www.passagemaker.com", "type": "magazine"},
        {"name": "WorkBoat Mag", "url": "https://www.workboat.com", "type": "magazine"},
        {"name": "Lloyd's List", "url": "https://lloydslist.maritimeintelligence.informa.com", "type": "news"},
        {"name": "Tradewinds", "url": "https://www.tradewindsnews.com", "type": "news"},
        {"name": "Maritime Executive", "url": "https://www.maritime-executive.com", "type": "news"},
        {"name": "Splash247", "url": "https://splash247.com", "type": "news"},
        {"name": "Bairdmaritime", "url": "https://www.bairdmaritime.com", "type": "news"},
    ],
    "Registros & Gobierno": [
        {"name": "USCG Docs", "url": "https://www.dco.uscg.mil/nvdc", "type": "registry"},
        {"name": "UK Ship Register", "url": "https://www.ukshipregister.co.uk", "type": "registry"},
        {"name": "Panama Registry", "url": "https://www.segumar.com", "type": "registry"},
        {"name": "Marshall Islands", "url": "https://www.register-iri.com", "type": "registry"},
        {"name": "Liberian Registry", "url": "https://www.liscr.com", "type": "registry"},
        {"name": "Bahamas Maritime", "url": "https://www.bahamasmaritime.com", "type": "registry"},
        {"name": "IHS Sea-web", "url": "https://maritime.ihs.com", "type": "registry"},
    ],
}
|
|
|
|
# ── Database ──────────────────────────────────────────────────────────────────
|
|
def init_db():
    """Create the SQLite schema at ``DB_PATH`` if it does not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    safe. The connection is closed in a ``finally`` block so it is not
    leaked when the script or the commit raises.

    Raises:
        sqlite3.Error: if the database cannot be opened or the schema
            script fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS vessels (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                vessel_type TEXT,
                loa_m REAL,
                beam_m REAL,
                draft_m REAL,
                year_built INTEGER,
                hull TEXT,
                propulsion TEXT,
                status TEXT,
                price_usd REAL,
                currency TEXT DEFAULT 'USD',
                location TEXT,
                country TEXT,
                source_name TEXT,
                source_url TEXT,
                description TEXT,
                images TEXT,
                flags TEXT,
                score REAL DEFAULT 0,
                fingerprint TEXT UNIQUE,
                raw_data TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                updated_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS saved_vessels (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                vessel_id INTEGER REFERENCES vessels(id),
                notes TEXT,
                saved_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS search_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                query TEXT,
                filters TEXT,
                results INTEGER,
                searched_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS custom_sources (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                category TEXT DEFAULT 'Custom',
                search_url TEXT NOT NULL,
                source_type TEXT DEFAULT 'broker',
                active INTEGER DEFAULT 1,
                added_by TEXT,
                last_status TEXT DEFAULT 'unknown',
                created_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                password TEXT NOT NULL,
                role TEXT DEFAULT 'user',
                created_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS collections (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                description TEXT,
                color TEXT DEFAULT '#00b4ff',
                icon TEXT DEFAULT '📁',
                created_at TEXT DEFAULT (datetime('now'))
            );
            CREATE TABLE IF NOT EXISTS collection_vessels (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                collection_id INTEGER REFERENCES collections(id),
                vessel_id INTEGER REFERENCES vessels(id),
                notes TEXT,
                added_at TEXT DEFAULT (datetime('now')),
                UNIQUE(collection_id, vessel_id)
            );
            CREATE TABLE IF NOT EXISTS alerts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                filters TEXT,
                last_match INTEGER DEFAULT 0,
                active INTEGER DEFAULT 1,
                created_at TEXT DEFAULT (datetime('now'))
            );
            CREATE INDEX IF NOT EXISTS idx_vessels_type ON vessels(vessel_type);
            CREATE INDEX IF NOT EXISTS idx_vessels_status ON vessels(status);
            CREATE INDEX IF NOT EXISTS idx_vessels_price ON vessels(price_usd);
            CREATE INDEX IF NOT EXISTS idx_vessels_score ON vessels(score DESC);
        """)
        conn.commit()
    finally:
        # Release the file handle even when schema creation fails.
        conn.close()
|
|
|
|
def get_db():
    """Open and return a new SQLite connection to ``DB_PATH``.

    The connection's ``row_factory`` is set to ``sqlite3.Row``. Nothing
    here closes the connection; that is left to the caller.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
|
|
|
|
# ── Ollama helpers ─────────────────────────────────────────────────────────────
|
|
_ollama_sem = threading.Semaphore(3) # max 3 concurrent Ollama calls
|
|
|
|
def ollama_generate(prompt: str, model: str = None, json_mode: bool = False) -> str:
    """Send ``prompt`` to the Ollama generate endpoint and return its reply.

    Args:
        prompt: Text sent as the ``prompt`` field.
        model: Ollama model name; falls back to ``MODELS['classify']``
            when falsy.
        json_mode: When true, adds ``"format": "json"`` to the request.

    Returns:
        The ``response`` field of the JSON reply, or ``""`` when the field
        is absent or any exception occurs (the error is printed).
    """
    request_body = {
        "model": model or MODELS['classify'],
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    if json_mode:
        request_body["format"] = "json"

    # The semaphore is held for the whole HTTP round trip.
    with _ollama_sem:
        try:
            reply = requests.post(OLLAMA_URL, json=request_body, timeout=120)
            reply.raise_for_status()
            answer = reply.json().get("response", "")
        except Exception as e:
            print(f"[Ollama] Error: {e}")
            answer = ""
    return answer
|
|
|
|
def ollama_models() -> list:
    """Return the names of the models reported by the Ollama server.

    The tags endpoint is derived from ``OLLAMA_URL`` (its last path
    segment is replaced by ``tags``), so changing ``OLLAMA_URL`` also
    redirects this lookup.

    Returns:
        A list of model names, or ``[]`` when the server is unreachable
        or the reply does not have the expected shape.
    """
    from urllib.parse import urljoin

    tags_url = urljoin(OLLAMA_URL, "tags")
    try:
        r = requests.get(tags_url, timeout=5)
        return [m["name"] for m in r.json().get("models", [])]
    except Exception:
        # Deliberately best-effort and silent. ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return []
|
|
|
|
def _parse_vessel_json(candidate):
    """Decode ``candidate`` as JSON and return it only if it is a dict, else None."""
    try:
        parsed = json.loads(candidate)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_vessel_from_text(text: str, source: str) -> dict:
    """Use Ollama to extract structured vessel data from raw text.

    Only the first 3000 characters of ``text`` go into the prompt. The
    reply is parsed as JSON; if that does not yield an object, the span
    from the first ``{`` to the last ``}`` in the reply is tried instead.

    Args:
        text: Raw listing text.
        source: Source name, interpolated into the prompt.

    Returns:
        The decoded dict, or ``{}`` when the reply is empty, cannot be
        parsed into an object, or has a truthy ``"skip"`` key.
    """
    prompt = f"""Eres un experto en inteligencia de mercado marítimo.
Analiza este texto de un anuncio de embarcación y extrae los datos disponibles.
Fuente: {source}

TEXTO:
{text[:3000]}

Responde SOLO con JSON válido. Si el texto NO es un listing de embarcación específica responde {{"skip": true}}.

{{
  "skip": false,
  "name": "nombre del barco o descripción corta",
  "vessel_type": "Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Salvage|Other",
  "loa_m": número o null,
  "beam_m": número o null,
  "draft_m": número o null,
  "year_built": número o null,
  "hull": "Fiberglass|Steel|Aluminum|Wood|Composite|Unknown",
  "propulsion": "Diesel|Gasoline|Electric|Sail|None|Unknown",
  "status": "active|auction|salvage|abandoned|sold",
  "price_usd": número o null,
  "currency": "USD|EUR|GBP|CAD|AUD|etc",
  "location": "ciudad, país",
  "country": "código ISO 2 letras",
  "description": "resumen en español máximo 200 caracteres",
  "flags": ["below_market","rare","auction","salvage_value","motivated_seller","commercial","government_surplus"],
  "score": número del 0 al 100 según oportunidad para un broker
}}"""

    response = ollama_generate(prompt, model=MODELS['extract'], json_mode=True)
    if not response or not isinstance(response, str):
        # Nothing usable to parse; also keeps re.search from raising on
        # a non-string reply.
        return {}

    data = _parse_vessel_json(response)
    if data is None:
        # The model may wrap the object in extra text or in a list:
        # fall back to the outermost {...} span.
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            data = _parse_vessel_json(match.group())

    if not data or data.get("skip"):
        return {}
    return data
|
|
|
|
# ── Direct source scrapers — no search engine middleman ──────────────────
|
|
|
|
import random
|
|
|
|
USER_AGENTS = [
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
|
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15',
|
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
|
|
]
|
|
|
|
def get_headers(referer=None):
    """Build browser-like HTTP request headers with a random User-Agent.

    Args:
        referer: Optional URL to send as the ``Referer`` header. The header
            is omitted when this is ``None`` or empty.

    Returns:
        A new ``dict`` mapping header names to values. A fresh dict is
        created on every call, so callers may mutate it freely.
    """
    # Function-scope import so this block does not depend on the file header.
    import importlib.util

    # Only advertise Brotli when a decoder is installed. requests/urllib3
    # decode "br" responses solely when `brotli` or `brotlicffi` is available;
    # otherwise a server that honours "br" hands back bytes that end up as
    # garbage in `response.text` and the HTML parser finds nothing.
    encodings = ['gzip', 'deflate']
    if any(importlib.util.find_spec(mod) for mod in ('brotli', 'brotlicffi')):
        encodings.append('br')

    h = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,fr;q=0.7',
        'Accept-Encoding': ', '.join(encodings),
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0',
    }
    if referer:
        h['Referer'] = referer
    return h
|
|
|
|
# Header set built once, when this module is imported: its User-Agent is
# picked a single time and then stays fixed. Code that wants a fresh random
# User-Agent per request should call get_headers() instead.
HEADERS = get_headers()
|
|
|
|
|
|
# Sites we scrape directly (confirmed working).
# Each entry is a dict with:
#   name         - label used in log lines and copied to each result's "source"
#   category     - grouping label copied to each result's "category"
#   search_url   - URL template; "{query}" is replaced with the URL-quoted query.
#                  Entries without "{query}" always fetch the same fixed page.
#   type         - copied to each result's "source_type"
#   supports_filters (optional) - when True, scrape_direct_source also expands
#                  {loa_min_ft}, {loa_max_ft}, {price_min} and {price_max}.
DIRECT_SOURCES = [
    # ── Craigslist ─────────────────────────────────────────────────────────────
    # Single multi-city entry (uses scrape_craigslist internally — Playwright, 3+ cities)
    {"name":"Craigslist", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    # Individual cities — each makes one targeted request via scrape_direct_source
    {"name":"Craigslist Miami", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Tampa", "category":"Clasificados USA", "search_url":"https://tampa.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    # NOTE(review): "Fort Laud" queries the miami subdomain again (only adds sort=date) — confirm intended.
    {"name":"Craigslist Fort Laud", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}&sort=date", "type":"classifieds"},
    {"name":"Craigslist New Orleans", "category":"Clasificados USA", "search_url":"https://neworleans.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Houston", "category":"Clasificados USA", "search_url":"https://houston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Seattle", "category":"Clasificados USA", "search_url":"https://seattle.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist LA", "category":"Clasificados USA", "search_url":"https://losangeles.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    # NOTE(review): same URL as the generic "Craigslist" entry above.
    {"name":"Craigslist SF", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Jacksonville", "category":"Clasificados USA", "search_url":"https://jacksonville.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Sarasota", "category":"Clasificados USA", "search_url":"https://sarasota.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Chicago", "category":"Clasificados USA", "search_url":"https://chicago.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Boston", "category":"Clasificados USA", "search_url":"https://boston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Atlanta", "category":"Clasificados USA", "search_url":"https://atlanta.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Baltimore", "category":"Clasificados USA", "search_url":"https://baltimore.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Norfolk", "category":"Clasificados USA", "search_url":"https://norfolk.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist San Diego", "category":"Clasificados USA", "search_url":"https://sandiego.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Portland OR", "category":"Clasificados USA", "search_url":"https://portland.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Minneapolis", "category":"Clasificados USA", "search_url":"https://minneapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Detroit", "category":"Clasificados USA", "search_url":"https://detroit.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Cleveland", "category":"Clasificados USA", "search_url":"https://cleveland.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Charlotte", "category":"Clasificados USA", "search_url":"https://charlotte.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Denver", "category":"Clasificados USA", "search_url":"https://denver.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Phoenix", "category":"Clasificados USA", "search_url":"https://phoenix.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Annapolis", "category":"Clasificados USA", "search_url":"https://annapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist New Jersey", "category":"Clasificados USA", "search_url":"https://newjersey.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Galveston", "category":"Clasificados USA", "search_url":"https://galveston.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Pensacola", "category":"Clasificados USA", "search_url":"https://pensacola.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Mobile AL", "category":"Clasificados USA", "search_url":"https://mobile.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Key West", "category":"Clasificados USA", "search_url":"https://keys.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Corpus", "category":"Clasificados USA", "search_url":"https://corpuschristi.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Beaumont", "category":"Clasificados USA", "search_url":"https://beaumont.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    {"name":"Craigslist Baton Rouge", "category":"Clasificados USA", "search_url":"https://batonrouge.craigslist.org/search/boa?query={query}", "type":"classifieds"},
    # NOTE: gulfcoast.craigslist.org (Biloxi) no longer exists — replaced with Mobile AL

    # ── eBay ──────────────────────────────────────────────────────────────────
    {"name":"eBay Marine", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", "type":"classifieds"},
    {"name":"eBay Auction", "category":"Subastas USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_Auction=1", "type":"auction"},
    {"name":"eBay Motors Sail", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=36431&LH_BIN=1&_sop=10", "type":"classifieds"},
    {"name":"eBay Boats Complete", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+boat&_sacat=26429&LH_BIN=1&_sop=15", "type":"classifieds"},
    {"name":"eBay Salvage Boats", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"},

    # ── Government auctions ───────────────────────────────────────────────────
    {"name":"GovDeals", "category":"Subastas Gobierno", "search_url":"https://www.govdeals.com/index.cfm?fa=Main.AdvSearchResultsNew&kWord={query}&category=70", "type":"auction"},
    {"name":"PublicSurplus", "category":"Subastas Gobierno", "search_url":"https://www.publicsurplus.com/sms/browse/home?search={query}", "type":"auction"},
    {"name":"PropertyRoom", "category":"Subastas Gobierno", "search_url":"https://www.propertyroom.com/s?q={query}+boat", "type":"auction"},
    # GovPlanet: correct URL confirmed working (Recreational Marine category)
    {"name":"GovPlanet", "category":"Subastas Gobierno", "search_url":"https://www.govplanet.com/Recreational+Marine", "type":"auction"},
    # IronPlanet: correct URL confirmed working (Commercial Marine Vessels)
    {"name":"IronPlanet", "category":"Subastas Gobierno", "search_url":"https://www.ironplanet.com/Commercial+Marine+Vessels", "type":"auction"},
    # HiBid: React SPA — scrape_hibid uses Playwright
    {"name":"HiBid", "category":"Subastas USA", "search_url":"https://www.hibid.com/lots?q={query}+boat", "type":"auction"},
    {"name":"AuctionTime", "category":"Subastas USA", "search_url":"https://www.auctiontime.com/listings/search?q={query}+boat", "type":"auction"},
    {"name":"BidSpotter", "category":"Subastas USA", "search_url":"https://www.bidspotter.com/en-us/auction-catalogues?q={query}+boat", "type":"auction"},
    # Copart: Playwright scraper handles JS-rendered lots
    {"name":"Copart Marine", "category":"Subastas USA", "search_url":"https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT", "type":"auction"},

    # ── Salvage ───────────────────────────────────────────────────────────────
    {"name":"Salvex Marine", "category":"Salvage / Subastas", "search_url":"https://salvex.com/listings/?q={query}&cat=marine", "type":"salvage"},
    {"name":"Barnacle Marine", "category":"Salvage / Subastas", "search_url":"https://www.barnaclemarine.com/?s={query}", "type":"salvage"},
    # NOTE(review): identical URL, category and type to "eBay Salvage Boats" above — likely redundant.
    {"name":"eBay Salvage", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"},
    {"name":"Cooper Capital Salvage", "category":"Salvage USA", "search_url":"https://www.cooperss.com/", "type":"salvage"},
    {"name":"IAA Watercraft", "category":"Salvage USA", "search_url":"https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", "type":"salvage"},

    # ── Specialised sales — main sites ───────────────────────────────────────
    {"name":"YachtWorld", "category":"Venta Especializada", "search_url":"https://www.yachtworld.com/boats-for-sale/", "type":"broker"},
    {"name":"BoatTrader", "category":"Venta Especializada", "search_url":"https://www.boattrader.com/boats/?query={query}", "type":"broker"},
    {"name":"Boats.com", "category":"Venta Especializada", "search_url":"https://www.boats.com/boats-for-sale/?query={query}", "type":"broker"},
    {"name":"Apollo Duck", "category":"Venta Especializada", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"broker"},
    {"name":"Rightboat", "category":"Venta Especializada", "search_url":"https://www.rightboat.com/boats-for-sale/?q={query}", "type":"broker"},
    # Boat24: 403 on requests — scrape_eu_broker uses Playwright
    {"name":"Boat24", "category":"Venta Especializada", "search_url":"https://www.boat24.com/en/boats/?q={query}", "type":"broker"},
    # YachtMarket: uses scrape_eu_broker (Playwright) in case of blocks
    {"name":"YachtMarket", "category":"Venta Especializada", "search_url":"https://www.yachtmarket.com/boats-for-sale/?q={query}", "type":"broker"},

    # ── SailboatListings (dedicated thread also runs in parallel) ────────────
    {"name":"SailboatListings", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=&material=&hull=&state=&keyword={query}&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True},
    # Same search restricted to type=Sail; the keyword field is left empty, so the query text is not sent.
    {"name":"SailboatListings View", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=Sail&material=&hull=&state=&keyword=&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True},
    # Forums: Playwright scraper handles vBulletin/XenForo FS sections
    {"name":"TheHullTruth", "category":"Veleros Global", "search_url":"https://www.thehulltruth.com/boating-forum/search.php?do=process&query={query}&prefixid=FS&type=post", "type":"classifieds"},
    {"name":"Cruisers Forum", "category":"Veleros Global", "search_url":"https://www.cruisersforum.com/forums/f152/", "type":"classifieds"},

    # ── Commercial / Offshore ─────────────────────────────────────────────────
    {"name":"WorkBoat Classifieds", "category":"Comercial Offshore", "search_url":"https://www.workboat.com/classifieds/?keywords={query}", "type":"commercial"},
    {"name":"Commercial Vessel", "category":"Comercial Offshore", "search_url":"https://www.commercialvessel.com/search?keywords={query}", "type":"commercial"},
    {"name":"OSV Broker", "category":"Comercial Offshore", "search_url":"https://www.osvbroker.com/?s={query}", "type":"commercial"},
    {"name":"Marine Classifieds", "category":"Comercial Offshore", "search_url":"https://www.marineclassifieds.com/search.php?search={query}", "type":"commercial"},
    {"name":"Seaboats", "category":"Comercial Global", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=0", "type":"commercial"},
    {"name":"Seaboats Offshore", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=offshore+support+vessels", "type":"commercial"},
    {"name":"Seaboats Tug", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=tugs+%26+pushboats", "type":"commercial"},
    {"name":"Seaboats Barge", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=barges+%26+lighters", "type":"commercial"},
    {"name":"Seaboats Fishing", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=fishing+vessels", "type":"commercial"},
    # NOTE(review): same URL as "Apollo Duck" above; only category/type differ.
    {"name":"Apollo Duck Workboats", "category":"Comercial Offshore", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"commercial"},
    {"name":"YachtWorld Commercial", "category":"Comercial Offshore", "search_url":"https://www.yachtworld.com/boats-for-sale/type-commercial/", "type":"commercial"},

    # ── Australia / Pacific ──────────────────────────────────────────────────
    # Trade a Boat AU: server-rendered, correct URL confirmed working
    {"name":"Trade a Boat AU", "category":"Australia / Pacifico", "search_url":"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}", "type":"broker"},
    # Boatsales.com.au (Boatpoint redirects here): scrape_eu_broker via Playwright
    {"name":"Boatsales AU", "category":"Australia / Pacifico", "search_url":"https://www.boatsales.com.au/boats-for-sale/?q={query}", "type":"broker"},

    # ── United Kingdom ────────────────────────────────────────────────────────
    # Boats & Outboards UK: 403 on requests — scrape_eu_broker uses Playwright
    {"name":"Boats & Outboards UK", "category":"Reino Unido", "search_url":"https://www.boatsandoutboards.co.uk/boats-for-sale/?q={query}", "type":"broker"},
    # Apollo Duck UK: use same apolloduck.com (no separate UK subdomain)
    {"name":"Apollo Duck UK", "category":"Reino Unido", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1&country=GB", "type":"broker"},

    # ── France ────────────────────────────────────────────────────────────────
    # Annonces Bateau: 403 on requests — scrape_eu_broker uses Playwright
    {"name":"Annonces Bateau", "category":"Francia", "search_url":"https://www.annoncesbateau.com/bateaux/annonces-bateaux?keyword={query}", "type":"broker"},

    # ── Spain / Mediterranean ────────────────────────────────────────────────
    # Inautia ES: 403 on requests — scrape_eu_broker uses Playwright
    {"name":"Inautia ES", "category":"Espana / Global", "search_url":"https://www.inautia.es/barca?q={query}", "type":"broker"},
    {"name":"Barcos.net", "category":"Espana / Global", "search_url":"https://www.barcos.net/busqueda/?q={query}", "type":"broker"},

    # ── Europe / Global ───────────────────────────────────────────────────────
    # YachtAll: 403 on requests — scrape_eu_broker uses Playwright
    {"name":"YachtAll", "category":"Clasificados EU", "search_url":"https://yachtall.com/yachts/?search={query}", "type":"broker"},

    # ── US brokers ────────────────────────────────────────────────────────────
    {"name":"HMY Yachts", "category":"Brokers USA", "search_url":"https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", "type":"broker"},
    {"name":"Denison Yachting", "category":"Brokers USA", "search_url":"https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", "type":"broker"},
    {"name":"BoatCrazy", "category":"Brokers USA", "search_url":"https://boatcrazy.com/boats?q={query}", "type":"classifieds"},
    # Galati Yachts: server-rendered WP site — scrape_galati uses requests
    {"name":"Galati Yachts", "category":"Brokers USA", "search_url":"https://www.galatiyachts.com/yachts-for-sale/?keywords={query}", "type":"broker"},
    {"name":"United Yacht Sales", "category":"Brokers USA", "search_url":"https://www.unitedyacht.com/yachts-for-sale/", "type":"broker"},
    # Worth Ave Yachts: hybrid server-rendered — scrape_luxury_broker uses Playwright
    {"name":"Worth Ave Yachts", "category":"Brokers USA", "search_url":"https://www.worthavenueyachts.com/yachts-for-sale/", "type":"broker"},

    # ── International brokers ─────────────────────────────────────────────────
    # Fraser Yachts: Vue/JS SPA — scrape_luxury_broker uses Playwright
    {"name":"Fraser Yachts", "category":"Brokers Internacional", "search_url":"https://www.fraseryachts.com/en/yachts-for-sale/", "type":"broker"},
    # Burgess Yachts: JS-loaded — scrape_luxury_broker uses Playwright
    {"name":"Burgess Yachts", "category":"Brokers Internacional", "search_url":"https://www.burgessyachts.com/en/yachts/sale/", "type":"broker"},
    # Northrop & Johnson: JS-loaded — scrape_luxury_broker uses Playwright
    {"name":"Northrop & Johnson", "category":"Brokers Internacional", "search_url":"https://www.njcharters.com/yachts-for-sale/", "type":"broker"},
    {"name":"Merle Wood", "category":"Brokers Internacional", "search_url":"https://www.merlewood.com/yachts-for-sale/", "type":"broker"},

    # ── Canada ────────────────────────────────────────────────────────────────
    {"name":"Kijiji Boats CA", "category":"Canada", "search_url":"https://www.kijiji.ca/b-boats/{query}/k0c132", "type":"classifieds"},
]
|
|
|
|
# Web search queries — find listings on ANY site, including ones that block
# direct scraping (search engines return results from YachtWorld, Boats.com,
# Apollo Duck, etc.).
# "{query}" is replaced at runtime by build_web_queries(), which also appends
# price / length hints to templates that do not start with "site:".
# Every template must be unique: a repeated entry makes the same search run twice.
WEB_SEARCH_TEMPLATES = [
    # Generic, multi-language
    '"{query}" boat for sale',
    '"{query}" sailboat for sale',
    '"{query}" vessel for sale',
    '"{query}" yacht for sale',
    '"{query}" barco venta',
    '"{query}" bateau vendre occasion',
    # Brokers and forums
    'site:yachtworld.com {query} for sale sail cruiser',
    'site:boats.com {query} sailboat for sale',
    'site:apolloduck.com {query} for sale',
    'site:rightboat.com {query} for sale',
    'site:boat24.com {query} for sale',
    'site:yachtall.com {query} sailboat',
    'site:annoncesbateau.com {query} voilier',
    'site:cruisersforum.com {query} for sale',
    'site:thehulltruth.com {query} for sale fs',
    # Auctions
    'site:govplanet.com {query} vessel',
    'site:ironplanet.com {query} boat vessel',
    'site:govdeals.com {query} vessel boat',
    'site:publicsurplus.com {query} vessel',
    'site:hibid.com {query} boat',
    'site:copart.com {query} boat vessel',
    'site:rbauction.com {query} boat',
    '"{query}" boat auction government surplus',
    '"{query}" vessel auction salvage',
    # Salvage specific
    'site:salvex.com {query} marine vessel',
    'site:copart.com {query} boat salvage',
    'site:iaai.com {query} boat',
    'site:boatbreakers.com {query}',
    '"{query}" salvage boat for sale',
    '"{query}" insurance total loss boat',
    '"{query}" wrecked boat for sale parts',
    '"{query}" boat salvage title for sale',
    'site:seaboats.net {query}',
    'site:workboat.com {query} for sale',
    'site:commercialvessel.com {query}',
    # Offshore / commercial
    'site:osvbroker.com {query}',
    'site:marineclassifieds.com {query} for sale',
    'site:apolloduck.com {query} offshore tug barge',
    '"{query}" offshore supply vessel for sale',
    '"{query}" OSV for sale broker',
    '"{query}" crew boat for sale',
    '"{query}" workboat for sale',
    '"{query}" tug for sale',
    '"{query}" barge for sale',
    '"{query}" supply vessel for sale',
    '"{query}" fishing vessel for sale',
    '"{query}" commercial vessel for sale',
    # Australia / Pacific
    'site:tradeaboat.com.au {query} for sale',
    'site:boatpoint.com.au {query} for sale',
    # Europe classifieds
    # (the annoncesbateau.com template is listed once, under "Brokers and forums")
    'site:boatsandoutboards.co.uk {query} for sale',
    'site:inautia.com {query} barco venta',
]
|
|
|
|
def build_web_queries(base_query: str, filters: dict) -> list:
    """Build web search queries filtered by vessel type/status to avoid irrelevant searches.

    Each entry of ``WEB_SEARCH_TEMPLATES`` is kept or dropped according to
    what kind of search this looks like (salvage / offshore / sail), then
    ``{query}`` is substituted. Templates that do not start with ``site:``
    also get length and price hints appended.

    Args:
        base_query: Free-text search terms entered by the user.
        filters: Optional search filters; ``None`` is treated as empty.
            Keys read: ``max_price``, ``min_loa`` (metres), ``type``, ``status``.

    Returns:
        Search query strings in template order, without duplicates.
    """
    filters = filters or {}
    query_l = base_query.lower()

    price_ctx = f" under ${filters['max_price']}" if filters.get("max_price") else ""

    loa_ctx = ""
    if filters.get("min_loa"):
        try:
            ft = int(float(filters["min_loa"]) / 0.3048)  # metres -> feet
            loa_ctx = f" {ft}ft+"
        except (TypeError, ValueError, OverflowError):
            # A malformed length filter should not abort the whole search.
            loa_ctx = ""

    vtype = (filters.get("type", "") or "").lower()
    status = (filters.get("status", "") or "").lower()

    # Substrings that mark a template as belonging to a search category.
    # They are matched against the raw template, before {query} is filled in.
    salvage_kwords = ("salvage", "copart", "iaai", "boatbreakers", "insurance",
                      "total loss", "wrecked", "salvage title")
    offshore_kwords = ("workboat", "commercial", "osvbroker", "offshore", "osv",
                       "crew boat", "supply vessel", "tug", "barge", "fishing vessel")
    sail_kwords = ("sailboat", "yachtall", "annoncesbateau", "voilier",
                   "cruisersforum", "sail cruiser")

    is_salvage = status == "salvage" or "salvage" in query_l
    is_offshore = (
        vtype in {"offshore", "tug", "barge", "ferry", "fishing", "commercial"}
        or any(k in query_l for k in ("tug", "barge", "osv", "crew boat", "workboat"))
    )
    is_sail = (
        vtype in {"sailboat", "velero", "sail"}
        or any(k in query_l for k in ("sail", "velero", "ketch", "sloop"))
    )

    queries = []
    for tmpl in WEB_SEARCH_TEMPLATES:
        tmpl_l = tmpl.lower()
        # Skip salvage templates for non-salvage searches
        if not is_salvage and any(k in tmpl_l for k in salvage_kwords):
            continue
        # Skip offshore templates for clearly non-offshore searches (sailboat/velero)
        if is_sail and not is_offshore and any(k in tmpl_l for k in offshore_kwords):
            continue
        # Skip sailboat templates for offshore/salvage searches
        if (is_offshore or is_salvage) and not is_sail and any(k in tmpl_l for k in sail_kwords):
            continue

        q = tmpl.replace("{query}", base_query)
        # "site:" searches stay exact; free-text ones get the size/price hints.
        if not q.startswith("site:"):
            q += loa_ctx + price_ctx
        queries.append(q)

    # Order-preserving de-duplication so a repeated template is searched once.
    return list(dict.fromkeys(queries))
|
|
|
|
# Search engines queried by web_search(), tried in list order.
#   name        - engine label
#   url         - results-page template; "{query}" receives the URL-quoted query
#   link_sel    - CSS selector for the result title links
#   snippet_sel - CSS selector for the result snippets; web_search() pairs them
#                 with the links by position
SEARCH_ENGINES = [
    {
        "name": "DuckDuckGo",
        "url": "https://html.duckduckgo.com/html/?q={query}",
        "link_sel": "a.result__a",
        "snippet_sel": "a.result__snippet",
    },
    {
        "name": "Bing",
        "url": "https://www.bing.com/search?q={query}&count=20",
        "link_sel": "h2 a",
        "snippet_sel": ".b_caption p",
    },
]
|
|
|
|
def web_search(query: str, max_results: int = 8) -> list[dict]:
    """Search web engines for real listings.

    Engines in ``SEARCH_ENGINES`` are tried in order until ``max_results``
    links have been collected. A failing engine is reported on stdout and
    skipped; it never raises to the caller.

    Args:
        query: Search string; it is URL-quoted before being sent.
        max_results: Maximum number of results to return. Values <= 0
            return an empty list without any network traffic.

    Returns:
        A list of result dicts with the keys ``url``, ``title``, ``snippet``,
        ``price_text``, ``img_url``, ``location``, ``source``,
        ``source_type`` and ``category``.
    """
    if max_results <= 0:
        return []

    results = []
    seen = set()
    # URL fragments of search engines, social networks and other non-listing pages.
    skip = ["google.", "bing.", "duckduckgo.", "yahoo.", "wikipedia.", "youtube.",
            "facebook.com/login", "instagram.", "twitter.", "linkedin.",
            "pinterest.", "reddit.com/r/", ".pdf", "amazon.com/s?"]

    for engine in SEARCH_ENGINES:
        try:
            url = engine["url"].format(query=requests.utils.quote(query))
            time.sleep(1.0)  # throttle between engine requests
            # NOTE(review): TLS verification is disabled here, as elsewhere in this file.
            r = requests.get(url, headers=get_headers(), timeout=20, verify=False)
            if r.status_code != 200:
                print(f"[{engine['name']}] HTTP {r.status_code}")
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            links = soup.select(engine["link_sel"])
            # Snippets are paired with links by position, so a result without
            # a snippet shifts the pairing for the ones after it.
            snippets = soup.select(engine["snippet_sel"])

            # Look at twice as many links as needed: some get filtered out below.
            for i, link in enumerate(links[:max_results * 2]):
                href = link.get("href", "")
                # Clean DDG redirect: the real target is in the "uddg" parameter
                if "duckduckgo.com" in href:
                    m = re.search(r'uddg=([^&]+)', href)
                    if m:
                        href = requests.utils.unquote(m.group(1))
                if not href.startswith("http"):
                    continue
                if any(s in href for s in skip):
                    continue
                if href in seen:
                    continue
                seen.add(href)
                title = link.get_text(strip=True)
                snippet = snippets[i].get_text(strip=True) if i < len(snippets) else ""
                try:
                    source = href.split("/")[2].replace("www.", "")
                except IndexError:
                    # e.g. "http:foo" — no host component to extract
                    source = "web"
                results.append({
                    "url": href, "title": title, "snippet": snippet,
                    "price_text": "", "img_url": "",
                    "location": "", "source": source,
                    "source_type": "broker", "category": "Web Search"
                })
                if len(results) >= max_results:
                    break
        except Exception as e:
            # Best effort: report the failure and move on to the next engine.
            print(f"[{engine.get('name', 'search')}] Error: {e}")
        if len(results) >= max_results:
            break
    return results
|
|
|
|
|
|
|
|
# Price with the currency marker in front of the amount ("$12,500", "EUR 9.900").
_DIRECT_PRICE_PREFIX_RE = re.compile(r'(?:USD|EUR|GBP|CAD|\$|€|£)\s*\d(?:[\d.,]*\d)?')
# Price with the currency marker after the amount ("12,500 USD", "9.900 €").
_DIRECT_PRICE_SUFFIX_RE = re.compile(r'\d(?:[\d.,]*\d)?\s*(?:USD|EUR|GBP|CAD|\$|€|£)')

# Words that suggest a link title describes a vessel.
_DIRECT_BOAT_KW = ["boat", "yacht", "vessel", "sail", "ketch", "sloop", "cutter", "schooner",
                   "yawl", "catamaran", "trimaran", "motor", "tug", "barge", "cruiser", "skiff",
                   "fishing", "trawler", "offshore", "cabin", "dinghy", "pontoon", "runabout"]

# URL fragments of account / legal / social pages that are never listings.
_DIRECT_SKIP_WORDS = ["login", "register", "signup", "about", "contact", "help",
                      "privacy", "terms", "facebook.com", "twitter.com", "instagram.com"]


def _direct_find_price(text: str) -> str:
    """Return the first price-looking substring of *text*, or "" if none."""
    # Prefer the "$12,500" form so a year followed by a price ("2005 $12,500")
    # is not misread as "2005 $".
    m = _DIRECT_PRICE_PREFIX_RE.search(text) or _DIRECT_PRICE_SUFFIX_RE.search(text)
    return m.group() if m else ""


def _direct_search_url(source: dict, query: str, filters: dict) -> str:
    """Expand the placeholders of ``source["search_url"]`` into a request URL."""
    raw_url = source["search_url"]
    if source.get("supports_filters"):
        min_loa_m = float(filters.get("min_loa") or 0)
        substitutions = {
            "{loa_min_ft}": int(min_loa_m / 0.3048) if min_loa_m else "",  # metres -> feet
            "{loa_max_ft}": "",  # no max LOA filter in current UI
            "{price_min}": filters.get("min_price") or "",
            "{price_max}": filters.get("max_price") or "",
        }
        for placeholder, value in substitutions.items():
            # Filter values come from the user, so they are URL-quoted too.
            raw_url = raw_url.replace(placeholder, requests.utils.quote(str(value)))
    # Drop repeated words ("ketch for sale for sale" -> "ketch for sale"),
    # keeping the first occurrence of each.
    clean_q = ' '.join(dict.fromkeys(query.split()))
    return raw_url.replace("{query}", requests.utils.quote(clean_q))


def _direct_listing_score(lnk: dict) -> int:
    """Heuristic score for a link — higher means more likely a vessel listing."""
    url_l = lnk["url"].lower()
    title_l = lnk["title"].lower()
    sc = 0
    if lnk["price"]: sc += 4                                   # price is strong signal
    if lnk["img"]: sc += 1                                     # has photo
    if re.search(r'/\d{5,}', url_l): sc += 3                   # 5+ digit ID
    if re.search(r'/(view|detail|listing|item|vessel|boat|ship|for-sale)[-/]', url_l): sc += 2
    if re.search(r'-for-sale[/-]?$', url_l): sc += 2
    if re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title_l): sc += 3   # year in title
    if re.search(r'\d{2,3}\s*(?:\'|ft|feet|meter)', title_l): sc += 2  # size
    if any(k in title_l for k in _DIRECT_BOAT_KW): sc += 1
    if re.search(r'\b(for sale|en vente|vendre|en venta)\b', title_l): sc += 1
    if len(lnk["title"]) > 15: sc += 1                         # nav links are short
    return sc


def scrape_direct_source(source: dict, query: str, filters: dict = None) -> list[dict]:
    """Fetch one source's search page and return the links that look like listings.

    No per-site CSS selectors are used: every anchor on the page is collected
    together with a nearby price and thumbnail, then ranked by a heuristic
    score (price present, numeric ID in the URL, year/size in the title, ...).

    Args:
        source: Entry of ``DIRECT_SOURCES`` (keys ``name``, ``category``,
            ``search_url``, ``type`` and optionally ``supports_filters``).
        query: Search terms; repeated words are removed before sending.
        filters: Optional filters (``min_loa`` in metres, ``min_price``,
            ``max_price``); only applied when the source supports them.

    Returns:
        Up to 20 result dicts (``url``, ``title``, ``snippet``,
        ``price_text``, ``img_url``, ``location``, ``source``,
        ``source_type``, ``category``). Any error is printed and yields
        whatever was collected so far (normally an empty list).
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    if filters is None:
        filters = {}
    results = []
    try:
        url = _direct_search_url(source, query, filters)
        time.sleep(1.0)  # throttle
        domain = url.split('/')[2]
        headers = get_headers(referer=f"https://{domain}/")
        r = requests.get(url, headers=headers, timeout=25, verify=False)

        # Retry with different UA if blocked
        if r.status_code in [403, 429, 503]:
            time.sleep(2)
            headers = get_headers()
            r = requests.get(url, headers=headers, timeout=25, verify=False)

        if r.status_code not in [200, 206]:
            print(f"[{source['name']}] HTTP {r.status_code}")
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        # Remove page chrome so navigation links do not crowd out listings.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "meta", "link"]):
            tag.decompose()

        raw_links = []
        for a in soup.find_all("a", href=True)[:80]:
            href = a["href"].strip()
            if not href or href.startswith("#") or href.startswith("javascript"):
                continue
            # Resolve relative and protocol-relative links against the page URL.
            href = urljoin(url, href)
            if not href.startswith("http"):
                continue  # mailto:, tel:, data: ...
            if any(s in href.lower() for s in _DIRECT_SKIP_WORDS):
                continue
            text = a.get_text(strip=True)[:150]
            parent = a.find_parent()
            price = ""
            img = ""
            if parent:
                price = _direct_find_price(parent.get_text(" ", strip=True))
            # Traverse up to 4 levels to find a thumbnail image
            node = parent
            for _ in range(4):
                if node is None:
                    break
                im = node.find("img")
                if im:
                    src = _extract_best_src(im)
                    if src:
                        src = urljoin(url, src)  # relative -> absolute
                        if src.startswith("http") and len(src) > 20:
                            img = src
                            break
                node = node.parent
            if text and len(text) > 8:
                raw_links.append({"url": href, "title": text, "price": price, "img": img})

        if not raw_links:
            print(f"[{source['name']}] No links found")
            return []

        # De-duplicate by URL, keeping the first occurrence.
        unique = {}
        for lnk in raw_links:
            unique.setdefault(lnk["url"], lnk)

        # ── Heuristic listing filter (no AI needed) ──────────────────────────
        scored = [(_direct_listing_score(lnk), lnk) for lnk in list(unique.values())[:30]]
        scored.sort(key=lambda pair: pair[0], reverse=True)

        # Keep links with score >= 3, or fall back to top-5 if nothing qualifies
        good = [lnk for sc, lnk in scored if sc >= 3]
        if not good:
            good = [lnk for _, lnk in scored[:5]]  # best guesses from this source

        for lnk in good[:20]:
            results.append({
                "url": lnk["url"],
                "title": lnk["title"],
                "snippet": f"Price: {lnk['price']}",
                "price_text": lnk["price"],
                "img_url": lnk["img"],
                "location": "",
                "source": source["name"],
                "source_type": source["type"],
                "category": source["category"],
            })

        print(f"[{source['name']}] {len(results)} listings found")
    except Exception as e:
        # Best effort: one broken source must not abort the whole search.
        print(f"[{source['name']}] Error: {e}")
    return results
|
|
|
|
|
|
# Interleave queue for polite scraping: polite_pause() walks this list
# round-robin, requesting one of these sites per call.
_interleave_lock = threading.Lock()  # guards _interleave_idx
_interleave_sites = [
    "https://miami.craigslist.org",
    "https://www.seaboats.net",
    "https://www.barcos.net",
    "https://www.ebay.com",
    "https://boston.craigslist.org",
    "https://seattle.craigslist.org",
]
_interleave_idx = 0  # running call counter; used modulo len(_interleave_sites)
|
|
|
|
def polite_pause(source_name: str):
|
|
"""
|
|
Between pages of the same site, make a quick request to a different
|
|
site so we look like a human browsing — not a bot hammering one server.
|
|
"""
|
|
global _interleave_idx
|
|
with _interleave_lock:
|
|
site = _interleave_sites[_interleave_idx % len(_interleave_sites)]
|
|
_interleave_idx += 1
|
|
try:
|
|
requests.get(site, headers=get_headers(), timeout=5, verify=False)
|
|
except Exception:
|
|
pass
|
|
# Random human-like delay: 2-5 seconds
|
|
time.sleep(random.uniform(2.0, 5.0))
|
|
print(f"[{source_name}] Polite pause done — continuing...")
|
|
|
|
_SBL_SITE = "https://www.sailboatlistings.com"

# Search-filter vessel type -> value of the site's "type" query field.
# "" means "search all types"; unknown vessel types also fall back to "".
_SBL_TYPE_MAP = {
    "sailboat": "Sail", "sail": "Sail",
    "yacht": "cruiser",
    "motor": "powerboat", "motorboat": "powerboat",
    "fishing": "fishing",
    "tug": "", "barge": "", "offshore": "", "ferry": "", "commercial": "",
}

# Search-filter hull material -> value of the site's "material" query field.
_SBL_HULL_MAP = {
    "fiberglass": "fiberglass", "steel": "steel",
    "aluminum": "aluminum", "wood": "wood",
}


def _sbl_search_url(query: str, filters: dict) -> str:
    """Build the first-page search URL for *query* and the given filters."""
    quote = requests.utils.quote

    min_loa_m = float(filters.get("min_loa") or 0)
    max_loa_m = float(filters.get("max_loa") or 0)
    # Filters carry metres; the URL length fields are filled with feet.
    loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else ""
    loa_max_ft = int(max_loa_m / 0.3048) if max_loa_m else ""

    # `or ""` keeps a key that is present with a None value from crashing.
    vessel_type = str(filters.get("type") or "").lower()
    hull = str(filters.get("hull") or "").lower()
    sbl_type = _SBL_TYPE_MAP.get(vessel_type, "")
    sbl_material = _SBL_HULL_MAP.get(hull, "")

    # "---" is the value sent when no year bound is requested.
    year_min = quote(str(filters.get("year_min") or "---"))
    year_max = quote(str(filters.get("year_max") or "---"))
    max_price = quote(str(filters.get("max_price") or ""))

    return (
        "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi"
        "?db=default&uid=default&sb=33&so=descend&websearch=1"
        "&manufacturer=&model="
        f"&length-gt={loa_min_ft}&length-lt={loa_max_ft}"
        f"&year-lt={year_max}&year-gt={year_min}&price-lt={max_price}"
        f"&type={sbl_type}&material={sbl_material}&hull=&state="
        f"&keyword={quote(query)}"
        "&view_records=+Show+Matching+Boats+"
    )


def _sbl_image_url(img_tag, listing_id: str) -> str:
    """Return an absolute, full-size image URL for a listing.

    Falls back to the conventional ``photo1.jpg`` path when the card has no
    usable <img>.
    """
    from urllib.parse import urljoin

    img_src = ""
    if img_tag:
        img_src = img_tag.get("src", "") or img_tag.get("data-src", "") or ""
    if img_src and not img_src.startswith("http"):
        # urljoin also copes with protocol-relative ("//cdn/...") sources.
        img_src = urljoin(_SBL_SITE + "/", img_src)
    # Upgrade /sailimg/t/ (thumbnail) or /sailimg/m/ (medium) -> /sailimg/ (full)
    for thumb in ("/sailimg/t/", "/sailimg/m/"):
        if thumb in img_src:
            img_src = img_src.replace(thumb, "/sailimg/")
            break
    if not img_src:
        img_src = f"https://www.sailboatlistings.com/sailimg/{listing_id}/photo1.jpg"
    return img_src


def _sbl_main_listing(header_link, seen_urls: set):
    """Convert one main-result header link into a result dict, or None to skip."""
    m = re.search(r'sailboat=(\d+)', header_link.get("href", ""))
    if not m:
        return None
    sid = m.group(1)
    canonical = f"https://www.sailboatlistings.com/view/{sid}"
    if canonical in seen_urls:
        return None
    seen_urls.add(canonical)

    # Parent table contains all structured sailvb/sailvk spans
    listing_table = header_link.find_parent("table")
    if not listing_table:
        return None

    # Each label span (sailvb) is paired with the next value span (sailvk).
    fields = {}
    for label_span in listing_table.find_all("span", class_="sailvb"):
        label = label_span.get_text(strip=True).rstrip(":").strip()
        value_span = label_span.find_next("span", class_="sailvk")
        if value_span:
            fields[label] = value_span.get_text(strip=True)

    context = " | ".join(f"{k}: {v}" for k, v in fields.items())
    title = header_link.get_text(strip=True)

    return {
        "url": canonical,
        "title": title or context[:80],
        "snippet": context,
        "price_text": fields.get("Asking", ""),
        "img_url": _sbl_image_url(listing_table.find("img"), sid),
        "location": fields.get("Location", ""),
        "source": "SailboatListings",
        "source_type": "broker",
        "category": "Veleros Global",
        "fields": fields,  # pass structured fields for direct extraction
    }


def _sbl_featured_listing(anchor, seen_urls: set):
    """Convert one sidebar "featured" link into a result dict, or None to skip."""
    view_m = re.search(r'/view/(\d+)', anchor.get("href", ""))
    if not view_m:
        return None
    sid = view_m.group(1)
    canonical = f"https://www.sailboatlistings.com/view/{sid}"
    if canonical in seen_urls:
        return None
    seen_urls.add(canonical)

    link_text = anchor.get_text(" ", strip=True)
    # Link text looks like: "45' Alden 45 Falmouth, Maine Asking $355,000"
    price_m = re.search(r'Asking\s*\$([\d,]+)', link_text)
    price_text = f"${price_m.group(1)}" if price_m else ""

    # In the featurespec span the location is whatever precedes "Asking".
    location = ""
    spec_span = anchor.find("span", class_="featurespec")
    if spec_span:
        spec_text = spec_span.get_text(" ", strip=True)
        loc_m = re.search(r'^(.+?)\s*Asking', spec_text)
        if loc_m:
            location = loc_m.group(1).strip()

    if "Asking" in link_text:
        title = link_text.split("Asking")[0].strip()
    else:
        title = link_text

    return {
        "url": canonical,
        "title": title,
        "snippet": link_text,
        "price_text": price_text,
        "img_url": _sbl_image_url(anchor.find("img"), sid),
        "location": location,
        "source": "SailboatListings",
        "source_type": "broker",
        "category": "Veleros Global",
        "fields": {},  # no structured fields for sidebar listings
    }


def scrape_sailboatlistings(query: str, filters: dict, max_pages: int = 8) -> list[dict]:
    """
    Multi-page scraper for SailboatListings.com.

    Captures MAIN listings (sailboat=XXXXX) with full structured data,
    plus SIDEBAR featured listings (/view/XXXXX) as bonus.

    Args:
        query: Free-text keyword sent as the ``keyword`` URL parameter.
        filters: Search filters. Keys read here: ``min_loa``/``max_loa``
            (metres), ``max_price``, ``type``, ``hull``, ``year_min``,
            ``year_max``. Missing or None values mean "no restriction".
        max_pages: Maximum number of result pages to request.

    Returns:
        One dict per unique listing (keys: url, title, snippet, price_text,
        img_url, location, source, source_type, category, fields).
        Paging stops early on HTTP 429 or any non-200 status, on a "no results"
        page, on a page that yields no new listings, or on any exception.
    """
    results = []
    seen_urls = set()
    base_url = _sbl_search_url(query, filters)

    for page in range(1, max_pages + 1):
        if page > 1:
            polite_pause("SailboatListings")

        try:
            url = base_url if page == 1 else base_url + f"&nh={page}"
            r = requests.get(url, headers=get_headers(), timeout=25, verify=False)

            if r.status_code == 429:
                print(f"[SailboatListings] Rate limited on page {page} — stopping")
                break
            if r.status_code != 200:
                print(f"[SailboatListings] Page {page} HTTP {r.status_code}")
                break

            soup = BeautifulSoup(r.text, "html.parser")
            body_text = soup.get_text().lower()

            # The lookbehind keeps "10 matches" / "120 matches" from being
            # mistaken for the empty-result marker "0 matches".
            if "no records" in body_text or re.search(r'(?<!\d)0 matches', body_text):
                print(f"[SailboatListings] No more results at page {page}")
                break

            page_results = 0

            # ── MAIN LISTINGS (sailboat=XXXXX) — full structured data ──
            for header_link in soup.find_all("a", class_="sailheader"):
                item = _sbl_main_listing(header_link, seen_urls)
                if item is not None:
                    results.append(item)
                    page_results += 1

            # ── SIDEBAR FEATURED (/view/XXXXX) — less data but more listings ──
            for anchor in soup.find_all("a", class_="featured"):
                item = _sbl_featured_listing(anchor, seen_urls)
                if item is not None:
                    results.append(item)
                    page_results += 1

            print(f"[SailboatListings] Page {page}: {page_results} listings (total: {len(results)})")
            if page_results == 0:
                break

        except Exception as e:
            print(f"[SailboatListings] Error page {page}: {e}")
            break

    print(f"[SailboatListings] Done — {len(results)} listings total")
    return results
|
|
|
|
def _sbl_parse_ft(val):
    """Parse the leading number of a feet value such as "30'" or "5.25'".

    Returns None when *val* is empty or does not start with a number.
    """
    if not val:
        return None
    m = re.match(r'\d+\.?\d*|\.\d+', val)
    return float(m.group(0)) if m else None


def _sbl_hull_label(hull_txt: str) -> str:
    """Map free-text hull material onto one of the normalised hull labels."""
    if "fiber" in hull_txt or "glass" in hull_txt:
        return "Fiberglass"
    if "steel" in hull_txt:
        return "Steel"
    if "alum" in hull_txt:
        return "Aluminum"
    if "wood" in hull_txt:
        return "Wood"
    if "comp" in hull_txt:
        return "Composite"
    return "Unknown"


def _sbl_score(loa_m, year, price_usd):
    """Algorithmic score (fast, no AI). Returns ``(score, flags)``."""
    score = 50
    flags = []
    if loa_m and loa_m >= 13:
        score += min(15, int((loa_m - 13) * 1.5))
    if year:
        score += min(10, max(0, (year - 1980) // 3))
    if price_usd and loa_m:
        price_per_ft = price_usd / (loa_m / 0.3048)
        if price_per_ft < 500:
            score += 15
        elif price_per_ft < 1000:
            score += 8
        if price_per_ft < 600:
            flags.append("below_market")
    return min(100, score), flags


def _sbl_build_vessel(raw: dict, min_loa_m: float, max_price_usd: float):
    """Turn one raw SailboatListings result into a vessel dict.

    Returns ``(data, loa_ft)``, or None when the listing carries no usable
    data or is rejected by the LOA / price filters.
    """
    snippet = raw.get("snippet", "")
    title = raw.get("title", "")
    fields = raw.get("fields", {})  # structured fields from main listings
    src = snippet + " " + title

    if fields:
        # ── Use structured fields directly when available (main listings) ──
        loa_ft = _sbl_parse_ft(fields.get("Length"))
        beam_ft = _sbl_parse_ft(fields.get("Beam"))
        draft_ft = _sbl_parse_ft(fields.get("Draft"))
        year_m = re.search(r'(\d{4})', fields.get("Year", ""))
        price_r = re.search(r'\$\s*([\d,]{3,})', fields.get("Asking", ""))
        location = fields.get("Location", "")
        hull_val = fields.get("Hull", "").lower()
    else:
        # Fallback: regex for sidebar/featured listings
        length_r = re.search(r'Length:\s*([\d.]+)', src, re.IGNORECASE)
        beam_r = re.search(r'Beam:\s*([\d.]+)', src, re.IGNORECASE)
        draft_r = re.search(r'Draft:\s*([\d.]+)', src, re.IGNORECASE)
        year_m = re.search(r'Year:\s*(\d{4})', src, re.IGNORECASE)
        price_r = re.search(r'(?:Asking|Price):?\s*\$\s*([\d,]{3,})', src, re.IGNORECASE)
        if not price_r:
            price_r = re.search(r'\$\s*([\d,]{4,})', src)
        loa_ft = _sbl_parse_ft(length_r.group(1)) if length_r else None
        beam_ft = _sbl_parse_ft(beam_r.group(1)) if beam_r else None
        draft_ft = _sbl_parse_ft(draft_r.group(1)) if draft_r else None
        location = raw.get("location", "")
        hull_val = ""
        loc_r = re.search(r'Location:\s*([^\n\r]{3,60}?)(?:\s{2,}|$)', src, re.IGNORECASE)
        if loc_r:
            location = loc_r.group(1).strip()
        hull_r = re.search(r'Hull:\s*([^\n\r]{3,50}?)(?:\s{2,}|$)', src, re.IGNORECASE)
        if hull_r:
            hull_val = hull_r.group(1).lower()

    # Fallback: extract LOA from title e.g. "35' Pearson 35"
    if not loa_ft:
        tm = re.search(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', title, re.IGNORECASE)
        if tm:
            loa_ft = float(tm.group(1))

    loa_m = round(loa_ft * 0.3048, 1) if loa_ft else None
    beam_m = round(beam_ft * 0.3048, 1) if beam_ft else None
    draft_m = round(draft_ft * 0.3048, 1) if draft_ft else None
    year = int(year_m.group(1)) if year_m else None
    location = location or raw.get("location", "")

    price_usd = None
    if price_r:
        try:
            price_usd = float(price_r.group(1).replace(",", ""))
        except ValueError:
            pass  # the match was only commas
    if not price_usd and raw.get("price_text"):
        pm = re.search(r'[\d,]+', raw["price_text"].replace("$", ""))
        if pm:
            try:
                price_usd = float(pm.group().replace(",", ""))
            except ValueError:
                pass

    # Skip only if absolutely no data
    if not loa_m and not year and not price_usd:
        return None

    # Apply filters, with a small tolerance for ft<->m rounding.
    if min_loa_m and loa_m and loa_m < (min_loa_m - 0.15):
        return None
    if max_price_usd and price_usd and price_usd > max_price_usd * 1.01:
        return None

    score, flags = _sbl_score(loa_m, year, price_usd)

    # Only mention what is known, so a missing LOA no longer renders as
    # "LOA: Noneft" and a missing location leaves no dangling ". ".
    description_parts = [f"Velero {title}"]
    if loa_ft:
        description_parts.append(f"LOA: {loa_ft}ft")
    if location:
        description_parts.append(location)

    data = {
        "name": title or "SailboatListings boat",
        "vessel_type": "Sailboat",
        "loa_m": loa_m,
        "beam_m": beam_m,
        "draft_m": draft_m,
        "year_built": year,
        "hull": _sbl_hull_label(hull_val),
        "propulsion": "Sail",
        "status": "active",
        "price_usd": price_usd,
        "currency": "USD",
        "location": location,
        "country": "US",
        "description": ". ".join(description_parts).strip("."),
        "flags": flags,
        "score": score,
        "images": [raw["img_url"]] if raw.get("img_url") else [],
        "source_url": raw["url"],
        "source_name": "SailboatListings",
    }
    return data, loa_ft


def scrape_and_extract_sailboatlistings(query: str, filters: dict, search_id: str, max_pages: int = 8):
    """
    Runs SailboatListings scraping + extraction inline.

    Each vessel is passed to save_vessel() as soon as it is parsed; for every
    vessel stored (positive id returned) ``search_state['found']`` is
    incremented and a line is appended to ``search_state['log']``.

    Args:
        query: Keyword forwarded to scrape_sailboatlistings().
        filters: Search filters; ``min_loa`` (metres) and ``max_price`` are
            additionally re-checked here against the parsed values.
        search_id: Id of the search this run belongs to. Processing stops as
            soon as ``search_state`` reports a different id or a cancellation.
        max_pages: Forwarded to scrape_sailboatlistings().
    """
    print(f"[SBL-Thread] Starting SailboatListings extraction...")
    raw_results = scrape_sailboatlistings(query, filters, max_pages)

    if not raw_results:
        print("[SBL-Thread] No results from SailboatListings")
        return

    sbl_min_loa = float(filters.get("min_loa") or 0)
    sbl_max_price = float(filters.get("max_price") or 0)
    saved = 0

    for raw in raw_results:
        if search_state.get('search_id') != search_id or search_state.get('cancelled'):
            print("[SBL-Thread] Search cancelled — stopping")
            return

        try:
            built = _sbl_build_vessel(raw, sbl_min_loa, sbl_max_price)
            if built is None:
                continue
            data, loa_ft = built

            vid = save_vessel(data)
            if vid > 0:
                search_state['found'] += 1
                saved += 1
                details = []
                if loa_ft:
                    details.append(f"{loa_ft}ft")
                if data["price_usd"]:
                    details.append(f"${data['price_usd']:,.0f}")
                suffix = f" ({', '.join(details)})" if details else ""
                msg = f"✓ {raw.get('title', '')}{suffix} — SailboatListings"
                print(f"[SBL-Thread] {msg}")
                search_state['log'].append(msg)

        except Exception as e:
            # One malformed listing must not abort the remaining ones.
            print(f"[SBL-Thread] Error on {raw.get('title','?')}: {e}")

    print(f"[SBL-Thread] Done — {saved}/{len(raw_results)} vessels saved")
|
|
|
|
# Image URLs matching this pattern are treated as page chrome, not photos.
# "ad"/"ads" must stand alone (not be part of a longer word): a plain
# substring test for "ad" also rejected "/uploads/", "header", "shadow", ...
_STEALTH_IMG_SKIP_RE = re.compile(
    r'logo|icon|avatar|banner|pixel|sprite|placeholder|blank|loading|spacer|1x1'
    r'|(?<![a-z])ads?(?![a-z])'
)


def stealth_fetch(url: str, max_chars: int = 3000) -> tuple:
    """
    Fetch a page through headless Chromium (Playwright).

    Loads *url*, waits, tries to dismiss a cookie banner, scrolls twice and
    then parses the rendered HTML.

    Args:
        url: Page to load.
        max_chars: Maximum length of the returned text.

    Returns:
        ``(text, image_urls)``: whitespace-collapsed page text with
        script/style/nav/footer/header/aside removed, and up to 12 image URLs.
        On any error the failure is printed and whatever was collected so far
        (possibly ``("", [])``) is returned; nothing is raised.
    """
    text = ""
    images = []
    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # NOTE(review): '--no-sandbox' and '--disable-web-security' weaken
            # browser isolation; confirm they are really needed here.
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                ]
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1366, 'height': 768},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    java_script_enabled=True,
                    ignore_https_errors=True,
                    extra_http_headers={
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                    }
                )
                context.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                    window.chrome = {runtime: {}};
                """)
                page = context.new_page()

                page.goto(url, timeout=30000, wait_until='domcontentloaded')
                page.wait_for_timeout(random.randint(1500, 3000))

                # Accept cookies if a known button exists; first hit wins.
                for selector in ['button:has-text("Accept")', 'button:has-text("Accept All")',
                                 '#onetrust-accept-btn-handler', '.cookie-accept']:
                    try:
                        page.click(selector, timeout=1000)
                        page.wait_for_timeout(500)
                        break
                    except Exception:
                        # Selector absent or not clickable: try the next one.
                        continue

                # Scroll down in two steps
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(500, 1200))
                page.evaluate("window.scrollBy(0, 300)")
                page.wait_for_timeout(random.randint(300, 800))

                html = page.content()
            finally:
                # Close the browser on error paths too.
                browser.close()

        soup = BeautifulSoup(html, 'html.parser')

        # Extract images before stripping tags; _extract_best_src checks the
        # lazy-load attributes as well as src.
        seen_imgs = set()
        for img in soup.find_all('img'):
            src = _extract_best_src(img)
            if not src or src in seen_imgs:
                continue
            if _STEALTH_IMG_SKIP_RE.search(src.lower()):
                continue
            seen_imgs.add(src)
            images.append(src)
            if len(images) >= 12:
                break

        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            tag.decompose()
        text = ' '.join(soup.get_text(' ', strip=True).split())[:max_chars]

    except Exception as e:
        print(f"[Stealth] Error: {e}")
    return text, images
|
|
|
|
|
|
# Sites that need stealth scraping (Cloudflare protected)
STEALTH_REQUIRED = [
    'yachtworld.com', 'boats.com', 'boattrader.com',
    'rightboat.com', 'boat24.com', 'yachtall.com',
    'botentekoop.nl', 'leboncoin.fr', 'annoncesbateau.com',
    'thehulltruth.com', 'cruisersforum.com',
]


def smart_fetch(url: str, max_chars: int = 3000) -> tuple:
    """Use stealth for protected sites, regular fetch for others.

    A URL is "protected" when its host equals an entry of STEALTH_REQUIRED
    or is a subdomain of one. Matching on the domain boundary avoids false
    positives such as ``usedboats.com`` matching ``boats.com``.

    Args:
        url: Page to fetch.
        max_chars: Text length limit; forwarded to stealth_fetch() only,
            because fetch_page_with_images() takes no such argument.

    Returns:
        ``(text, image_urls)`` from whichever fetcher was used.
    """
    from urllib.parse import urlsplit

    try:
        host = (urlsplit(url).hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): treat as unprotected.
        host = ""
    domain = host[4:] if host.startswith("www.") else host

    needs_stealth = any(
        domain == site or domain.endswith("." + site)
        for site in STEALTH_REQUIRED
    )
    if needs_stealth:
        print(f"[Fetch] Using stealth for {domain}")
        return stealth_fetch(url, max_chars)
    return fetch_page_with_images(url)
|
|
|
|
|
|
def scrape_yachtworld(query: str, filters: dict, max_pages: int = 5) -> list:
    """
    Dedicated YachtWorld scraper (headless Chromium via Playwright).

    Builds a filtered listing URL and walks up to *max_pages* result pages,
    collecting every link whose href contains ``/boat-details/`` or
    ``/yacht/``.

    Args:
        query: Accepted for interface parity with the other scrapers; the
            YachtWorld URL is built from *filters* only.
        filters: Keys read: ``type``, ``min_loa`` (metres), ``max_price``.
            Missing, None or unparsable values are treated as "no filter".
        max_pages: Maximum number of result pages to load.

    Returns:
        One dict per unique listing URL (keys: url, title, snippet,
        price_text, img_url, location, source, source_type, category).
        Errors are printed, never raised.
    """
    results = []
    by_url = {}  # listing URL -> its result dict, for de-duplication

    # `or ""` keeps a key that is present with a None value from crashing.
    vessel_type = str(filters.get("type") or "").lower()
    yw_type = "sail" if vessel_type in ["sailboat", "sail", "velero", "yacht", ""] else "power"
    min_loa = filters.get("min_loa", "")
    max_price = filters.get("max_price", "")

    base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/"
    if vessel_type in ["sailboat", "sail", "velero", ""]:
        base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/"
    if min_loa:
        try:
            ft = int(float(min_loa) / 0.3048)
        except (TypeError, ValueError):
            # A malformed filter must not abort the whole scrape.
            print(f"[YachtWorld] Ignoring invalid min_loa filter: {min_loa!r}")
        else:
            base_url += f"length-{ft}/"
    if max_price:
        base_url += f"price-0,{max_price}/"

    print(f"[YachtWorld] Scraping: {base_url}")

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled', '--no-sandbox']
            )
            try:
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent=random.choice(USER_AGENTS),
                    locale='en-US',
                    timezone_id='America/New_York',
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
                    "window.chrome = {runtime: {}};"
                )

                for page_num in range(1, max_pages + 1):
                    if search_state.get('cancelled'):
                        break

                    page_url = base_url if page_num == 1 else base_url + f"?page={page_num}"
                    try:
                        page = context.new_page()
                        try:
                            page.goto(page_url, timeout=30000, wait_until='domcontentloaded')
                            page.wait_for_timeout(random.randint(2000, 4000))

                            # Scroll to load lazy content
                            for _ in range(3):
                                page.evaluate("window.scrollBy(0, 400)")
                                page.wait_for_timeout(random.randint(400, 800))

                            html = page.content()
                        finally:
                            page.close()

                        soup = BeautifulSoup(html, 'html.parser')

                        page_count = 0
                        for a in soup.find_all('a', href=True):
                            href = a['href']
                            if '/boat-details/' not in href and '/yacht/' not in href:
                                continue
                            if not href.startswith('http'):
                                href = 'https://www.yachtworld.com' + href
                            title = a.get_text(strip=True)

                            known = by_url.get(href)
                            if known is not None:
                                # An image-only link may have registered this
                                # URL with an empty title; let a later text
                                # link for the same listing fill it in.
                                if not known["title"] and title:
                                    known["title"] = title
                                continue

                            parent = a.find_parent() or a
                            ctx = parent.get_text(' ', strip=True)[:300]
                            img = ""
                            for im in parent.find_all('img'):
                                src = im.get('src') or im.get('data-src', '')
                                if src and 'rendered_yacht' in src:
                                    img = src
                                    break

                            record = {
                                "url": href, "title": title,
                                "snippet": ctx, "price_text": "",
                                "img_url": img, "location": "",
                                "source": "YachtWorld",
                                "source_type": "broker",
                                "category": "Brokers Especializados",
                            }
                            by_url[href] = record
                            results.append(record)
                            page_count += 1

                        print(f"[YachtWorld] Page {page_num}: {page_count} listings")
                        if page_count == 0:
                            break

                        # Polite pause between pages
                        if page_num < max_pages:
                            polite_pause("YachtWorld")

                    except Exception as e:
                        print(f"[YachtWorld] Page {page_num} error: {e}")
                        break
            finally:
                browser.close()
    except Exception as e:
        print(f"[YachtWorld] Fatal error: {e}")

    print(f"[YachtWorld] Total: {len(results)} listings")
    return results
|
|
|
|
def fetch_page_text(url: str, max_chars: int = 2000) -> str:
    """Return the visible text of *url*, or "" on any failure.

    The page is fetched with requests, boilerplate elements (script, style,
    nav, footer, header, aside, noscript) are removed, whitespace runs are
    collapsed to single spaces and the result is cut to *max_chars*.
    A non-200 response or any exception yields the empty string.
    """
    boilerplate = ["script", "style", "nav", "footer", "header", "aside", "noscript"]
    try:
        response = requests.get(url, headers=get_headers(), timeout=15, verify=False)
        if response.status_code == 200:
            document = BeautifulSoup(response.text, "html.parser")
            for element in document(boilerplate):
                element.decompose()
            words = document.get_text(" ", strip=True).split()
            return " ".join(words)[:max_chars]
        return ""
    except Exception:
        return ""
|
|
|
|
def _extract_best_src(img_tag) -> str:
|
|
"""Extract the best image URL from an <img> tag, handling lazy-load patterns."""
|
|
candidates = [
|
|
img_tag.get("src",""),
|
|
img_tag.get("data-src",""),
|
|
img_tag.get("data-lazy-src",""),
|
|
img_tag.get("data-original",""),
|
|
img_tag.get("data-lazy",""),
|
|
img_tag.get("data-image",""),
|
|
img_tag.get("data-full",""),
|
|
img_tag.get("data-url",""),
|
|
img_tag.get("data-hi-res-src",""),
|
|
]
|
|
# Also check srcset — take the largest variant
|
|
srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","")
|
|
if srcset:
|
|
parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()]
|
|
candidates.extend(parts)
|
|
for c in candidates:
|
|
c = c.strip()
|
|
if c and c.startswith("http") and not c.startswith("data:"):
|
|
return c
|
|
return ""
|
|
|
|
# Image URLs matching this pattern are treated as page chrome, not photos.
# "ad"/"ads" must stand alone (not be part of a longer word): a plain
# substring test for "ad" also rejected "/uploads/", "header", "shadow", ...
_PAGE_IMG_SKIP_RE = re.compile(
    r'logo|icon|avatar|banner|pixel|track|sprite|button|placeholder|blank'
    r'|loading|spacer|1x1|transparent|(?<![a-z])ads?(?![a-z])'
)


def fetch_page_with_images(url: str) -> tuple:
    """Fetch page text AND images. Returns (text, [image_urls])

    Text is whitespace-collapsed, boilerplate tags are removed, and it is cut
    to 3000 characters. At most 10 image URLs are returned; URLs that look
    like page chrome and images declaring a width under 100px are skipped.
    On a non-200 response or any exception the text comes from
    fetch_page_text() instead.
    """
    from urllib.parse import urljoin

    text = ""
    images = []
    try:
        r = requests.get(url, headers=get_headers(referer=url), timeout=18, verify=False)
        if r.status_code != 200:
            return fetch_page_text(url), []
        soup = BeautifulSoup(r.text, "html.parser")

        # Extract images before stripping tags
        seen_imgs = set()
        for img in soup.find_all("img"):
            src = _extract_best_src(img)
            if not src:
                # _extract_best_src only yields absolute URLs, so resolve
                # site-relative ("/x.jpg") and protocol-relative ("//cdn/x.jpg")
                # sources here.
                for attr in ("src", "data-src", "data-lazy-src", "data-original"):
                    candidate = (img.get(attr) or "").strip()
                    if candidate.startswith("/"):
                        src = urljoin(url, candidate)
                        break
            if not src or not src.startswith("http"):
                continue
            if _PAGE_IMG_SKIP_RE.search(src.lower()):
                continue
            if src in seen_imgs:
                continue
            try:
                w = int(str(img.get("width", "0")).replace("px", "") or 0)
            except (TypeError, ValueError):
                w = 0  # e.g. width="100%": unknown size, keep the image
            if 0 < w < 100:
                continue
            seen_imgs.add(src)
            images.append(src)
            if len(images) >= 10:
                break

        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
            tag.decompose()
        text = " ".join(soup.get_text(" ", strip=True).split())[:3000]
    except Exception:
        # Best effort: retry through the simpler text-only fetcher.
        text = fetch_page_text(url)
    return text, images
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
# DEDICATED SOURCE SCRAPERS
|
|
# Each function handles one site's quirks. scrape_source_router dispatches here.
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Vessel-type filter -> eBay category id substituted into `_sacat=`.
# 26429=All Boats  36431=Sailboats  36432=Powerboats  26430=PWC  63613=Kayaks
_EBAY_CATEGORY_BY_TYPE = {
    "sailboat": "36431", "sail": "36431", "velero": "36431",
    "motor": "36432", "motorboat": "36432", "yacht": "36432",
    "fishing": "36432", "tug": "36432", "barge": "36432",
    "offshore": "36432", "ferry": "36432",
}


def _ebay_image_500(raw: str) -> str:
    """Rewrite an eBay image URL's ``s-l<N>`` size token to ``s-l500``."""
    return re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', raw)


def _ebay_parse_card(card, src: dict, seen: set):
    """Convert one ``li.s-card`` element into a result dict, or None to skip."""
    # Title + URL: a.s-card__link WITHOUT the image-treatment class
    title_link = None
    for a in card.find_all("a", class_="s-card__link"):
        if "image-treatment" in (a.get("class") or []):
            continue
        t = a.get_text(strip=True)
        if t and not t.lower().startswith("shop on ebay"):
            title_link = a
            break
    if not title_link:
        return None

    href = title_link.get("href", "")
    if "/itm/" not in href:
        return None
    m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href)
    if not m:
        return None
    # Canonical item URL (query string dropped) doubles as the dedup key.
    href = m.group(1)
    if href in seen:
        return None
    seen.add(href)

    # Clean title: strip eBay UI noise appended to link text
    title = title_link.get_text(strip=True)
    title = re.sub(r'\s*Opens in a new window or tab.*', '',
                   title, flags=re.IGNORECASE).strip()

    price_tag = (card.find(class_="s-card__price") or
                 card.find(class_="s-item__price"))
    price = price_tag.get_text(strip=True) if price_tag else ""

    # Image: img inside a.s-card__link.image-treatment
    img = ""
    img_link = card.find("a", class_="image-treatment")
    if img_link:
        im = img_link.find("img")
        if im:
            raw = (_extract_best_src(im) or
                   im.get("src", "") or im.get("data-src", ""))
            if raw:
                img = _ebay_image_500(raw)
    # Fallback: any ebayimg.com src in the card
    if not img:
        for im in card.find_all("img"):
            raw = (_extract_best_src(im) or im.get("src", ""))
            if raw and "ebayimg.com" in raw:
                img = _ebay_image_500(raw)
                break

    # Location: "Located in: XXX", cut before "Delivery"/"Shipping"
    location = ""
    card_text = card.get_text(" ", strip=True)
    lm = re.search(
        r'[Ll]ocated in[:\s]+([A-Za-z][^,\|•\n$\d]{2,30})',
        card_text)
    if lm:
        loc_raw = lm.group(1).strip()
        location = re.split(r'\s+[Dd]elivery|\s+[Ss]hipping',
                            loc_raw)[0].strip()

    return {
        "url": href,
        "title": title[:120],
        "snippet": f"{price} {location}".strip(),
        "price_text": price,
        "img_url": img,
        "location": location,
        "source": src.get("name", "eBay"),
        "source_type": src.get("type", "classifieds"),
        "category": src.get("category", "Clasificados USA"),
    }


def scrape_ebay(src: dict, query: str, filters: dict) -> list[dict]:
    """
    eBay Marine scraper — loads the search page with Playwright.
    Handles all eBay entries: Marine, Auction, Sail, Salvage, etc.

    New eBay layout (2024+) uses:
      - <a class="s-card__link"> for item links
      - Text title in nearby spans/divs
      - <img> with i.ebayimg.com CDN URLs (s-l500 quality)
    When no ``li.s-card`` element is present, parsing is delegated to
    _parse_ebay_old_layout().

    Args:
        src: Source descriptor. ``search_url`` (with a ``{query}``
            placeholder) is required; ``name``, ``type`` and ``category``
            are optional and copied into each result.
        query: Search keywords; repeated words are collapsed.
        filters: Optional filters; only ``type`` is used, to swap the
            ``_sacat`` category id in the URL.

    Returns:
        One dict per unique item (keys: url, title, snippet, price_text,
        img_url, location, source, source_type, category). Empty when
        ``search_url`` is missing or navigation fails. Errors are printed,
        never raised.
    """
    results = []
    seen = set()
    # .get() so a descriptor without "name" cannot raise inside an except handler.
    name = src.get("name", "eBay")

    raw_url = src.get("search_url", "")
    if not raw_url:
        return []

    # dict.fromkeys de-duplicates the query words while keeping their order.
    clean_q = " ".join(dict.fromkeys(query.strip().split()))
    url = raw_url.replace("{query}", requests.utils.quote(clean_q))

    # ── Adjust eBay category based on vessel type filter ──────────────────────
    vtype = str(filters.get("type") or "").lower() if filters else ""
    if vtype in _EBAY_CATEGORY_BY_TYPE:
        url = re.sub(r'_sacat=\d+', f'_sacat={_EBAY_CATEGORY_BY_TYPE[vtype]}', url)

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"]
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(1500, 2500))
                    # Scroll a bit to trigger lazy images
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(800)
                    html = page.content()
                except Exception as e:
                    print(f"[{name}] Playwright nav error: {e}")
                    html = ""
            finally:
                # Closing the browser also disposes of its pages.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        # ── New layout (2024+): li.s-card ─────────────────────────────────────
        cards = soup.find_all("li", class_="s-card")

        # ── Old layout fallback: li.s-item ────────────────────────────────────
        if not cards:
            return _parse_ebay_old_layout(soup, src)

        for card in cards:
            try:
                item = _ebay_parse_card(card, src, seen)
            except Exception as e:
                # One malformed card must not drop the rest of the page.
                print(f"[{name}] Skipping card: {e}")
                continue
            if item is not None:
                results.append(item)

        print(f"[{name}] {len(results)} listings (new layout)")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
|
|
|
|
|
|
def _parse_ebay_old_layout(soup, src: dict) -> list[dict]:
    """Fallback for the classic eBay li.s-item layout.

    Args:
        soup: Parsed search-results page (BeautifulSoup tree).
        src: Source descriptor; ``name``, ``type`` and ``category`` are
            copied into each result, with eBay defaults when missing.

    Returns:
        One dict per unique item (keys: url, title, snippet, price_text,
        img_url, location, source, source_type, category).
    """
    results = []
    seen = set()
    name = src.get("name", "eBay")

    for item in soup.find_all("li", class_="s-item"):
        try:
            link_tag = item.find("a", class_="s-item__link")
            if not link_tag:
                continue
            href = link_tag.get("href", "")
            if "/itm/" not in href:
                continue
            # Same pattern as the new-layout parser ("www." optional), so the
            # canonical URL, and with it de-duplication, is consistent.
            m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href)
            if m:
                href = m.group(1)
            if href in seen:
                continue
            seen.add(href)

            title_tag = (item.find("span", class_="BOLD") or
                         item.find("div", class_="s-item__title") or
                         item.find("span", class_="s-item__title"))
            title = (title_tag or link_tag).get_text(strip=True)
            # "Shop on eBay" entries are promo cards, not listings.
            if not title or title.lower().startswith("shop on ebay"):
                continue

            price_tag = item.find("span", class_="s-item__price")
            price = price_tag.get_text(strip=True) if price_tag else ""

            img = ""
            img_tag = item.find("img")
            if img_tag:
                img = (_extract_best_src(img_tag) or img_tag.get("src", ""))
                if img:
                    # Rewrite the s-l<N> size token to s-l500.
                    img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', img)

            loc_tag = (item.find("span", class_="s-item__location") or
                       item.find("span", class_="s-item__itemLocation"))
            location = ""
            if loc_tag:
                location = (loc_tag.get_text(strip=True)
                            .replace("Located in: ", "").strip())

            results.append({
                "url": href, "title": title, "snippet": f"{price} {location}".strip(),
                "price_text": price, "img_url": img, "location": location,
                "source": name, "source_type": src.get("type", "classifieds"),
                "category": src.get("category", "Clasificados USA"),
            })
        except Exception as e:
            # One malformed item must not drop the rest of the page.
            print(f"[{name}] Skipping item: {e}")
            continue

    print(f"[{name}] {len(results)} listings (old layout)")
    return results
|
|
|
|
|
|
def scrape_boattrader(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape BoatTrader search results with Playwright and parse them with BS4.

    Playwright is used because plain requests hit a Cloudflare Turnstile
    challenge (original author's note).

    Card structure the parser relies on:
      li.lib-card                    — card root
      a[href^="/boat/...-<ID>/"]     — listing URL
      [class*=listingTitle]          — title element
      [class*=listingPrice]          — price element
      img                            — photo
      "City, ST ZIP" pattern in text — location

    Args:
        src: Source config. ``search_url`` (containing a ``{query}``
            placeholder) is required; ``name``, ``type`` and ``category``
            are optional labels copied into each result.
        query: Free-text search terms; repeated words are collapsed.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of listing dicts (url, title, snippet, price_text, img_url,
        location, source, source_type, category). Empty when ``search_url``
        is missing, when navigation fails, or on any unexpected error.
    """
    # One label for both log lines and the "source" field, so a config
    # without "name" can no longer raise KeyError from the logging path.
    label = src.get("name", "BoatTrader")
    results = []
    seen = set()

    raw_url = src.get("search_url", "")
    if not raw_url:
        return []

    # dict.fromkeys drops duplicated words while preserving their order.
    clean_q = " ".join(dict.fromkeys(query.strip().split()))
    url = raw_url.replace("{query}", requests.utils.quote(clean_q))

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"],
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    # Give the client-side app time to render listing cards.
                    page.wait_for_timeout(random.randint(4000, 6000))
                    page.evaluate("window.scrollBy(0, 600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        # Card root: li.lib-card, falling back to any element with that class.
        cards = soup.find_all("li", class_="lib-card")
        if not cards:
            cards = soup.find_all(class_=re.compile(r'\blib-card\b'))

        link_re = re.compile(r'^/boat/[\w-]+-\d+/$')
        title_re = re.compile(r'listingTitle', re.I)
        price_re = re.compile(r'listingPrice', re.I)
        caption_re = re.compile(r'listingCaption|listingLocation', re.I)

        for card in cards:
            try:
                # Link: /boat/YEAR-MAKE-...-ID/
                link_tag = card.find("a", href=link_re)
                if not link_tag:
                    continue
                href = "https://www.boattrader.com" + link_tag["href"]
                if href in seen:
                    continue
                seen.add(href)

                # Title: element whose class contains 'listingTitle'.
                title_el = card.find(class_=title_re)
                if title_el:
                    title = title_el.get_text(strip=True)
                else:
                    # Fallback: build from the URL slug
                    # (2026-catalina-34-123 -> "2026 Catalina 34").
                    slug = link_tag["href"].strip("/").split("/")[-1]
                    title = slug.rsplit("-", 1)[0].replace("-", " ").title()
                if not title:
                    continue

                # Price: keep only the first dollar amount, ignoring any
                # trailing "/mo*" financing text.
                price = ""
                price_el = card.find(class_=price_re)
                if price_el:
                    raw_price = price_el.get_text(" ", strip=True)
                    pm = re.search(r'\$\s*([\d,]+)', raw_price)
                    if pm:
                        price = f"${pm.group(1)}"

                # Image: first <img> with an absolute, non-SVG source.
                img = ""
                for im in card.find_all("img"):
                    raw = (_extract_best_src(im) or
                           im.get("src", "") or im.get("data-src", ""))
                    if raw and raw.startswith("http") and not raw.endswith(".svg"):
                        img = raw
                        break

                # Location: "City, ST ZIP" pattern, searched in the caption
                # element when present, otherwise in the whole card text.
                location = ""
                caption_el = card.find(class_=caption_re)
                if caption_el:
                    search_text = caption_el.get_text(" ", strip=True)
                else:
                    search_text = card.get_text(" ", strip=True)
                lm = re.search(
                    r'\b([A-Z][a-zA-Z\s]{2,20},\s+[A-Z]{2}(?:\s+\d{5})?)',
                    search_text)
                if lm:
                    location = lm.group(1).strip()

                results.append({
                    "url": href,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": label,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: a malformed card must not abort the others.
                continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_apolloduck(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Apollo Duck with plain requests + BS4 (no JS rendering used).

    Two card types are parsed:
      Sidebar cards:  div.eastSDFPPanel  → a.SidebarTitle, a.SidebarPrice, img
      Featured cards: div._FeatureAdPanel → a._FeatureTitle, span._FeaturePrice,
                      img, td._PanelSpecLabel + sibling td (location)

    Listing URL pattern: https://www.apolloduck.com/boat/{slug}/{id}

    Args:
        src: Source config; ``search_url`` is only used when the query is
            empty. ``name``, ``type`` and ``category`` are optional labels.
        query: Free-text search terms. A trailing "for sale" / "en venta" /
            "à vendre" / "zu verkaufen" is removed before searching.
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts; empty on HTTP or parsing failure.
    """
    label = src.get("name", "Apollo Duck")
    results = []
    seen = set()

    # Strip trailing sale phrases: the site searches listing titles, where
    # those phrases rarely appear (original author's note).
    stripped_q = re.sub(
        r'\s*(for\s+sale|en\s+venta|à\s+vendre|zu\s+verkaufen)\s*$',
        '', query.strip(), flags=re.I).strip()
    clean_q = requests.utils.quote(stripped_q or query.strip())
    if clean_q:
        url = f"https://www.apolloduck.com/search.phtml?search={clean_q}&sr=1&q=1"
    else:
        raw_url = (src.get("search_url", "")
                   or "https://www.apolloduck.com/boats/used-boats-for-sale")
        url = raw_url.replace("{query}", clean_q)
    is_search = bool(clean_q)  # only featured cards are query-filtered

    def _first_image(card):
        """Return the first usable image URL in the card, or ""."""
        for im in card.find_all("img"):
            raw = (im.get("src") or im.get("data-src") or
                   im.get("data-lazy-src") or "")
            if raw and raw.startswith("http") and not raw.endswith(".svg"):
                return raw
            # srcset fallback; guard against a whitespace-only attribute,
            # which previously raised IndexError.
            candidates = im.get("srcset", "").split()
            if candidates:
                return candidates[0]
        return ""

    def _parse_card(card, title_sel, price_sel, is_featured=False):
        """Build one listing dict from a card, or return None to skip it."""
        title_el = card.select_one(title_sel)
        if not title_el:
            return None
        title = title_el.get_text(strip=True)
        if not title:
            return None

        # URL: from the title link, else any /boat/ link in the card.
        href = title_el.get("href", "")
        if not href:
            a = card.find("a", href=re.compile(r'/boat/'))
            href = a["href"] if a else ""
        if not href:
            return None
        full_url = ("https://www.apolloduck.com" + href
                    if href.startswith("/") else href)
        if full_url in seen:
            return None
        seen.add(full_url)

        price_el = card.select_one(price_sel)
        price = price_el.get_text(strip=True) if price_el else ""

        img = _first_image(card)

        # Location: only looked up on featured cards.
        location = ""
        if is_featured:
            for lbl in card.select("td._PanelSpecLabel"):
                if "location" in lbl.get_text(strip=True).lower():
                    loc_td = lbl.find_next_sibling("td")
                    if loc_td:
                        location = loc_td.get_text(strip=True)
                        break

        return {
            "url": full_url,
            "title": title[:120],
            "snippet": f"{price} {location}".strip(),
            "price_text": price,
            "img_url": img,
            "location": location,
            "source": label,
            "source_type": src.get("type", "broker"),
            "category": src.get("category", "Venta Especializada"),
        }

    def _collect(cards, title_sel, price_sel, is_featured=False):
        """Parse each card in isolation so one bad card cannot abort the rest."""
        for card in cards:
            try:
                item = _parse_card(card, title_sel, price_sel, is_featured)
            except Exception:
                continue
            if item:
                results.append(item)

    try:
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        }
        resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")

        # Featured cards are parsed on every page type.
        _collect(soup.select("div._FeatureAdPanel"),
                 "a._FeatureTitle", "span._FeaturePrice", is_featured=True)

        # Sidebar cards only when browsing a category: on keyword searches the
        # sidebar shows the same generic listings regardless of the query
        # (original author's note).
        if not is_search:
            _collect(soup.select("div.eastSDFPPanel"),
                     "a.SidebarTitle", "a.SidebarPrice")

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_boatsdotcom(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Boats.com search results with Playwright and parse them with BS4.

    Two card types are parsed:
      Sponsored/real: li[data-listing-id] → h2 + div.year, div.price,
                      div.img-container img, div.country
      OEM specs:      li.enhanced.oem     → h2 + div.year, div.price,
                      div.img-container img (no location)

    Listing URL pattern: https://www.boats.com/{type}/{year}-{make}-{id}/

    Args:
        src: Source config; ``search_url`` may contain ``{query}`` and falls
            back to the site's default search URL. ``name``, ``type`` and
            ``category`` are optional labels.
        query: Free-text search terms (URL-quoted before substitution).
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts; empty when navigation fails or on error.
    """
    label = src.get("name", "Boats.com")
    results = []
    seen = set()

    raw_url = (src.get("search_url", "")
               or "https://www.boats.com/boats-for-sale/?query={query}")
    clean_q = requests.utils.quote(query.strip())
    url = raw_url.replace("{query}", clean_q)

    def _extract_card(card, has_location=True):
        """Build one listing dict from a card, or return None to skip it."""
        # URL: first site-relative link inside the card.
        a = card.find("a", href=re.compile(r'^/'))
        if not a:
            return None
        href = "https://www.boats.com" + a["href"]
        if href in seen:
            return None
        seen.add(href)

        # Title = year + model name.
        year_el = card.select_one("div.year")
        name_el = card.select_one("h2")
        year = year_el.get_text(strip=True) if year_el else ""
        name = name_el.get_text(strip=True) if name_el else ""
        title = f"{year} {name}".strip() if year else name
        if not title:
            return None

        # Price: first dollar amount, else the first 30 chars of raw text.
        price = ""
        price_el = card.select_one("div.price")
        if price_el:
            raw_p = price_el.get_text(" ", strip=True)
            pm = re.search(r'\$\s*([\d,]+)', raw_p)
            price = f"${pm.group(1)}" if pm else raw_p[:30]

        # Image: the <img> inside div.img-container.
        img = ""
        img_container = card.select_one("div.img-container")
        if img_container:
            im = img_container.find("img")
            if im:
                img = (_extract_best_src(im) or im.get("src", "")
                       or im.get("data-src", ""))

        location = ""
        if has_location:
            loc_el = card.select_one("div.country")
            if loc_el:
                location = loc_el.get_text(strip=True)

        return {
            "url": href,
            "title": title[:120],
            "snippet": f"{price} {location}".strip(),
            "price_text": price,
            "img_url": img,
            "location": location,
            "source": label,
            "source_type": src.get("type", "broker"),
            "category": src.get("category", "Venta Especializada"),
        }

    def _collect(cards, has_location):
        """Parse each card in isolation so one bad card cannot abort the rest."""
        for card in cards:
            try:
                item = _extract_card(card, has_location=has_location)
            except Exception:
                continue
            if item:
                results.append(item)

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"],
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    timezone_id="America/New_York",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    page.evaluate("window.scrollBy(0, 600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        # Sponsored/real marketplace listings first, then OEM spec sheets.
        _collect(soup.select("li[data-listing-id]"), has_location=True)
        _collect(soup.select("li.enhanced.oem"), has_location=False)

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_craigslist(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Craigslist boat listings with Playwright and parse them with BS4.

    Card root : any element with a data-pid attribute
    URL       : a.main[href]
    Title     : card "title" attribute, else span.label
    Price     : span.priceinfo
    Location  : span.result-location
    Image     : img[data-image-index="0"], else the first img in the card

    Three regional sites are sampled at random per call, so results vary
    between calls.

    Args:
        src: Source config; ``name``, ``type`` and ``category`` are optional
            labels copied into each result.
        query: Free-text search terms (URL-quoted into the search URL).
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts, de-duplicated by URL across the sampled
        cities; empty when no page could be fetched or on error.
    """
    label = src.get("name", "Craigslist Boats")
    results = []
    seen = set()

    # Regional subdomains searched one by one (no single nationwide search,
    # per the original author's note).
    CITIES = ["sfbay", "losangeles", "seattle", "miami", "boston",
              "newyork", "chicago", "houston", "dallas", "denver",
              "phoenix", "atlanta", "portland", "sandiego", "tampa",
              "minneapolis", "stlouis", "nashville", "raleigh", "saltlakecity"]
    qs = requests.utils.quote(query.strip())

    try:
        from playwright.sync_api import sync_playwright

        all_html_parts = []
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    ignore_https_errors=True,
                )
                # Fetch 3 random cities to keep runtime reasonable.
                for city in random.sample(CITIES, min(3, len(CITIES))):
                    city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel"
                    page = ctx.new_page()
                    try:
                        page.goto(city_url, timeout=25000, wait_until="domcontentloaded")
                        page.wait_for_timeout(2500)
                        all_html_parts.append(page.content())
                    except Exception as e:
                        # Skip this city but leave a trace instead of
                        # swallowing the failure silently.
                        print(f"[{label}] nav error ({city}): {e}")
                    finally:
                        try:
                            page.close()
                        except Exception:
                            pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not all_html_parts:
            return []

        for html in all_html_parts:
            soup = BeautifulSoup(html, "html.parser")
            for card in soup.find_all(attrs={"data-pid": True}):
                try:
                    a_main = card.find("a", class_="main")
                    if not a_main:
                        continue
                    listing_url = a_main.get("href", "")
                    if not listing_url or listing_url in seen:
                        continue
                    seen.add(listing_url)

                    # Title: card title attribute, else span.label text.
                    title = card.get("title", "")
                    if not title:
                        span = card.find("span", class_="label")
                        title = span.get_text(strip=True) if span else ""
                    if not title:
                        continue

                    price_el = card.find("span", class_="priceinfo")
                    price = price_el.get_text(strip=True) if price_el else ""

                    loc_el = card.find("span", class_="result-location")
                    location = loc_el.get_text(strip=True) if loc_el else ""

                    # Image: prefer the first gallery image, else any img.
                    img = ""
                    im = card.find("img", attrs={"data-image-index": "0"})
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")
                    if not img:
                        im = card.find("img")
                        if im:
                            img = im.get("src", "") or im.get("data-src", "")

                    results.append({
                        "url": listing_url,
                        "title": title[:120],
                        "snippet": f"{price} {location}".strip(),
                        "price_text": price,
                        "img_url": img,
                        "location": location,
                        "source": label,
                        "source_type": src.get("type", "classifieds"),
                        "category": src.get("category", "Clasificados Generales"),
                    })
                except Exception:
                    # Best effort: a malformed card must not abort the others.
                    continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_rightboat(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Rightboat search results with Playwright and parse them with BS4.

    Card root : elements with data-tracking-bound="true"; the card's own
                ``href`` attribute is the listing link
    Image     : first img whose class contains "object-cover"
    Title     : that image's alt text, else an h1-h4 heading, else URL slug
    Price     : p whose class contains "font-bold" (if it holds a currency
                symbol), else the first currency amount in the card text
    Location  : text of the parent of the fa-location* icon

    Args:
        src: Source config; ``search_url`` may contain ``{query}`` and falls
            back to the site's default search URL. ``name``, ``type`` and
            ``category`` are optional labels.
        query: Free-text search terms (URL-quoted before substitution).
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts; empty when navigation fails or on error.
    """
    label = src.get("name", "Rightboat")
    results = []
    seen = set()

    raw_url = (src.get("search_url", "")
               or "https://www.rightboat.com/boats-for-sale/?q={query}")
    clean_q = requests.utils.quote(query.strip())
    url = raw_url.replace("{query}", clean_q)

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"],
            )
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US",
                    ignore_https_errors=True,
                )
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                )
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(5000, 7000))
                    page.evaluate("window.scrollBy(0, 800)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] Playwright nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(attrs={"data-tracking-bound": "true"})

        for card in cards:
            try:
                # URL: read from the card element's own href attribute.
                href = card.get("href", "")
                if not href or "/boats-for-sale/" not in href:
                    continue
                listing_url = ("https://www.rightboat.com" + href
                               if href.startswith("/") else href)
                if listing_url in seen:
                    continue
                seen.add(listing_url)

                # Image: first object-cover img.
                img = ""
                im = card.find("img", class_=re.compile(r'object-cover'))
                if im:
                    img = im.get("src", "") or im.get("data-src", "")

                # Title: img alt text, then heading, then URL slug
                # (/boats-for-sale/make/model/rbXXX).
                title = im.get("alt", "").strip() if im else ""
                if not title:
                    h_el = card.find(re.compile(r'^h[1-4]$'))
                    title = h_el.get_text(strip=True) if h_el else ""
                if not title:
                    parts = href.strip("/").split("/")
                    if len(parts) >= 3:
                        title = " ".join(parts[1:-1]).replace("-", " ").title()
                if not title:
                    continue

                # Price: bold paragraph if it carries a currency symbol,
                # otherwise the first currency amount anywhere in the card.
                price = ""
                price_el = card.find("p", class_=re.compile(r'font-bold'))
                if price_el:
                    pt = price_el.get_text(strip=True)
                    if re.search(r'[\$£€]', pt):
                        price = pt
                if not price:
                    pm = re.search(r'[\$£€]\s*[\d,]+', card.get_text())
                    if pm:
                        price = pm.group(0)

                # Location: text of the element wrapping the pin icon.
                location = ""
                pin_icon = card.find("i", class_=re.compile(r'fa-location'))
                if pin_icon:
                    row = pin_icon.find_parent()
                    if row:
                        location = row.get_text(" ", strip=True).strip()

                results.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": label,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: a malformed card must not abort the others.
                continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_cooperss(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Cooper Capital Specialty Salvage (cooperss.com) via requests + BS4.

    Salvage / insurance-loss vessels (original author's note).

    Structure (paired by position in the page):
      div.listing-thumb  — image + link (assets/detail/?name=marine&id=N)
      div.listing-detail — h5.blue (name) + two-column table
                           (Year, Size, Location, Minimum Bid, ...)

    Args:
        src: Source config; ``name`` and ``category`` are optional labels.
        query: Not used: the site's front page is fetched regardless of
            the query.
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts (including ``size_m``, the raw "Size" cell);
        empty on HTTP or parsing failure.
    """
    # Shared by log lines and the "source" field so a config without "name"
    # cannot raise KeyError from inside the error handler.
    label = src.get("name", "Cooper Salvage")
    results = []
    seen = set()
    base = "https://www.cooperss.com"

    def _not_cloned(el):
        """True unless the element carries the carousel's duplicate marker."""
        return "slick-cloned" not in (el.get("class") or [])

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS),
                   "Accept-Language": "en-US,en;q=0.9"}
        resp = requests.get(base + "/", headers=headers, timeout=20)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        thumbs = [el for el in soup.find_all(class_="listing-thumb")
                  if _not_cloned(el)]
        details = [el for el in soup.find_all(class_="listing-detail")
                   if _not_cloned(el)]

        for thumb, detail in zip(thumbs, details):
            try:
                # URL
                a = thumb.find("a", href=True)
                if not a:
                    continue
                href = a["href"]
                if not href.startswith("http"):
                    href = base + "/" + href.lstrip("/")
                if href in seen:
                    continue
                seen.add(href)

                # Image
                img_tag = thumb.find("img")
                img = img_tag.get("src", "") if img_tag else ""
                if img and not img.startswith("http"):
                    img = base + "/" + img.lstrip("/")

                # Title: h5.blue text with nested links removed first, so
                # their text (e.g. a video button) does not leak into it.
                title = ""
                h5 = detail.find("h5", class_="blue")
                if h5:
                    for tag in h5.find_all("a"):
                        tag.decompose()
                    title = h5.get_text(strip=True)
                if not title:
                    continue

                # Detail table: label cell -> value cell for 2-cell rows.
                rows = {}
                for tr in detail.find_all("tr"):
                    tds = tr.find_all("td")
                    if len(tds) == 2:
                        rows[tds[0].get_text(strip=True)] = tds[1].get_text(strip=True)

                year = rows.get("Year", "")
                size = rows.get("Size", "")
                location = rows.get("Location", "")
                min_bid = rows.get("Minimum Bid", "")
                loss_type = rows.get("Type of Loss", "")
                deadline = rows.get("Bid Deadline", "")

                if year:
                    title = f"{year} {title}".strip()

                # Drop a leading "$" the cell may already carry so the
                # formatted price never shows "$$".
                bid = min_bid.lstrip("$").strip()
                price = f"Min Bid ${bid}" if bid else ""
                deadline_text = f"Deadline: {deadline}" if deadline else ""
                snippet_parts = [part for part in
                                 (price, loss_type, location, deadline_text)
                                 if part]

                results.append({
                    "url": href,
                    "title": title[:120],
                    "snippet": " | ".join(snippet_parts),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "size_m": size,
                    "source": label,
                    "source_type": "salvage",
                    "category": src.get("category", "Salvage & Wrecks"),
                })
            except Exception:
                # Best effort: a malformed pair must not abort the others.
                continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_inautia(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape iNautia search results with Playwright and parse them with BS4.

    Card:     any element with a data-grid-index attribute
    Link:     a.grid-listing-link[href] → /boat/YEAR-MAKE-MODEL-ID/
    Title:    [class*=listingTitle], else derived from the URL slug
    Price:    5th "|"-separated field of the link's data-ssr-meta attribute
              (formatted as EUR), else [class*=listingPrice]
    Location: [class*=listingBody] text
    Image:    first absolute, non-SVG img in the card

    Args:
        src: Source config; ``search_url`` may contain ``{query}`` and falls
            back to the site's default search URL. ``name``, ``type`` and
            ``category`` are optional labels.
        query: Free-text search terms (URL-quoted before substitution).
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts; empty when navigation fails or on error.
    """
    label = src.get("name", "iNautia")
    results = []
    seen = set()

    raw_url = (src.get("search_url", "")
               or "https://www.inautia.com/boats/?q={query}")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                    "window.chrome={runtime:{}};")
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    page.evaluate("window.scrollBy(0,600)")
                    page.wait_for_timeout(1500)
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(attrs={"data-grid-index": True})

        for card in cards:
            try:
                link_tag = card.find("a", class_=re.compile(r'grid-listing-link'))
                if not link_tag:
                    continue
                href = link_tag.get("href", "")
                if not href:
                    continue
                full_url = ("https://www.inautia.com" + href
                            if href.startswith("/") else href)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title, falling back to the URL slug minus its trailing id.
                title_el = card.find(class_=re.compile(r'listingTitle', re.I))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    slug = href.strip("/").split("/")[-1]
                    title = slug.rsplit("-", 1)[0].replace("-", " ").title()
                if not title:
                    continue

                # Price from data-ssr-meta (make|type|length||price_eur).
                price = ""
                meta = link_tag.get("data-ssr-meta", "")
                if meta:
                    parts = meta.split("|")
                    if len(parts) >= 5 and parts[4]:
                        try:
                            price = f"€{int(float(parts[4])):,}"
                        except (ValueError, OverflowError):
                            # Non-numeric or infinite value: fall through to
                            # the visible price element instead of dropping
                            # the whole card.
                            pass
                if not price:
                    price_el = card.find(class_=re.compile(r'listingPrice', re.I))
                    if price_el:
                        raw_p = price_el.get_text(" ", strip=True)
                        pm = re.search(r'[\$€£]\s*[\d,]+', raw_p)
                        price = pm.group(0) if pm else ""

                # Location: listingBody text ("Broker | City, Country" per
                # the original author's note).
                loc_el = card.find(class_=re.compile(r'listingBody', re.I))
                location = loc_el.get_text(" ", strip=True) if loc_el else ""

                # Image: first absolute, non-SVG source.
                img = ""
                for im in card.find_all("img"):
                    raw = (_extract_best_src(im) or im.get("src", "")
                           or im.get("data-src", ""))
                    if raw and raw.startswith("http") and not raw.endswith(".svg"):
                        img = raw
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": label,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: a malformed card must not abort the others.
                continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_boat24(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape Boat24 listings with Playwright and parse them with BS4.

    Card:     div whose class matches "blurb" (e.g. div.blurb.blurb--strip)
    Link:     data-link attribute (base64 → ROT13 → URL), else an /en/ link
    Title:    h3.blurb__title or h2.blurb__title
    Price:    p.blurb__price
    Location: p.blurb__location
    Image:    data-src / data-lazy / first srcset entry / src of an img

    Args:
        src: Source config; ``search_url`` may contain ``{query}``. The
            built-in default URL has no placeholder, so the query only
            takes effect when ``search_url`` provides one. ``name``,
            ``type`` and ``category`` are optional labels.
        query: Free-text search terms (URL-quoted before substitution).
        filters: Accepted for signature parity; not read by this function.

    Returns:
        A list of listing dicts; empty when navigation fails or on error.
    """
    label = src.get("name", "Boat24")
    results = []
    seen = set()
    BASE = "https://www.boat24.com"

    raw_url = (src.get("search_url", "")
               or "https://www.boat24.com/en/usedboats/")
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    _rot13 = str.maketrans(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        "NOPQRSTUVWXYZABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm")

    def _decode_link(encoded: str) -> str:
        """Decode a data-link value (base64, then ROT13); "" on failure."""
        try:
            import base64
            rot = base64.b64decode(encoded).decode("utf-8", errors="ignore")
            return rot.translate(_rot13)
        except Exception:
            return ""

    def _pick_image(card) -> str:
        """Return the first usable image URL in the card, or ""."""
        for im in card.find_all("img"):
            # An absent/empty srcset must not raise: the previous
            # `"".split()[0]` threw IndexError and dropped the whole card.
            srcset_parts = im.get("srcset", "").split()
            raw = (im.get("data-src") or im.get("data-lazy")
                   or (srcset_parts[0] if srcset_parts else "")
                   or im.get("src", ""))
            if raw and raw.startswith("http") and "/alpha.gif" not in raw:
                return raw
        return ""

    try:
        from playwright.sync_api import sync_playwright

        html = ""
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    viewport={"width": 1280, "height": 900},
                    user_agent=random.choice(USER_AGENTS),
                    locale="en-US", ignore_https_errors=True)
                context.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});")
                page = context.new_page()
                try:
                    page.goto(url, timeout=35000, wait_until="domcontentloaded")
                    page.wait_for_timeout(random.randint(4000, 6000))
                    html = page.content()
                except Exception as e:
                    print(f"[{label}] nav error: {e}")
                    html = ""
                finally:
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Close the browser even if context/page creation failed.
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("div", class_=re.compile(r'\bblurb\b'))
        for card in cards:
            try:
                encoded = card.get("data-link", "")
                if not encoded:
                    continue
                listing_url = _decode_link(encoded)
                if not listing_url or not listing_url.startswith("http"):
                    # Decoding failed: fall back to an /en/ link in the card.
                    a = card.find("a", href=re.compile(r'/en/'))
                    if not a:
                        continue
                    link = a["href"]
                    listing_url = BASE + link if link.startswith("/") else link
                if listing_url in seen:
                    continue
                seen.add(listing_url)

                title_el = card.select_one("h3.blurb__title, h2.blurb__title")
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    continue

                price_el = card.select_one("p.blurb__price")
                price = price_el.get_text(strip=True) if price_el else ""

                location = ""
                loc_el = card.select_one("p.blurb__location")
                if loc_el:
                    # Collapse runs of whitespace into single spaces.
                    location = re.sub(
                        r'\s+', ' ', loc_el.get_text(" ", strip=True)).strip()

                img = _pick_image(card)

                results.append({
                    "url": listing_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": label,
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Best effort: a malformed card must not abort the others.
                continue

        print(f"[{label}] {len(results)} listings")

    except Exception as e:
        print(f"[{label}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_facebook_marketplace(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Facebook Marketplace scraper (Playwright, cookie-authenticated).

    Requires a saved session file: fb_session.json (cookies from a logged-in
    session), located next to this module. If it is not found, a single
    instructional result with source_type "setup_required" is returned.

    Setup: POST /api/fb-setup → launches a visible browser for the user to log in.
    Session file is saved automatically after login.

    Args:
        src: Source config dict; only "category" is read here.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used by this scraper; kept for the common scraper signature.

    Returns:
        A list of listing dicts (url, title, snippet, price_text, img_url,
        location, source, source_type, category). Returns [] when navigation
        fails; on any other error, whatever was collected so far.
    """
    import json as _json

    results = []
    seen = set()

    SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json")
    SEARCH_URL = ("https://www.facebook.com/marketplace/search/"
                  f"?query={requests.utils.quote(query.strip())}"
                  "&deliveryMethod=local_pick_up")

    if not os.path.exists(SESSION_FILE):
        return [{
            "url": "https://www.facebook.com/marketplace/",
            "title": "⚠ Facebook Marketplace — Configuración requerida",
            "snippet": ("Para habilitar Facebook Marketplace, ve a Fuentes y "
                        "haz clic en 'Configurar FB'. Solo se necesita una vez."),
            "price_text": "",
            "img_url": "",
            "location": "",
            "source": "Facebook Marketplace",
            "source_type": "setup_required",
            "category": src.get("category", "Clasificados Generales"),
        }]

    try:
        from playwright.sync_api import sync_playwright
        with open(SESSION_FILE, encoding="utf-8") as f:
            cookies = _json.load(f)

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled",
                      "--no-sandbox", "--disable-dev-shm-usage"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_cookies(cookies)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};")
            page = context.new_page()
            try:
                page.goto(SEARCH_URL, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(5000, 7000))
                # Scroll once so lazily rendered cards below the fold load.
                page.evaluate("window.scrollBy(0,800)")
                page.wait_for_timeout(2000)
                html = page.content()
            except Exception as e:
                print(f"[Facebook Marketplace] nav error: {e}")
                html = ""
            finally:
                # Best-effort cleanup: a failed page.close() must not mask
                # the navigation result or prevent browser shutdown.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")

        # FB Marketplace listing cards — data-testid or aria-label patterns
        # Each listing is usually an <a> with href /marketplace/item/ID/
        listing_links = soup.find_all(
            "a", href=re.compile(r'/marketplace/item/\d+'))

        for a in listing_links:
            try:
                href = a.get("href", "")
                full_url = ("https://www.facebook.com" + href
                            if href.startswith("/") else href)
                # Normalize to ".../marketplace/item/<ID>/" whether or not the
                # href had a trailing slash before its query string, so the
                # same item is de-duplicated regardless of tracking params.
                full_url = re.sub(r'(/marketplace/item/\d+).*', r'\1/', full_url)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — span or div with listing title
                title_el = (a.find("span", style=re.compile(r'line-clamp'))
                            or a.find("span", class_=re.compile(r'x1lliihq|xt0psk2'))
                            or a.find("div", class_=re.compile(r'x1lliihq')))
                title = title_el.get_text(strip=True) if title_el else ""
                if not title:
                    # Try aria-label on the card
                    title = a.get("aria-label", "")
                if not title:
                    continue

                # Text of every <span> in the card, computed once and reused
                # for both the price and the location heuristics.
                span_texts = [s.get_text(strip=True) for s in a.find_all("span")]

                # Price — first span that starts with a currency amount
                price = next(
                    (t for t in span_texts if re.match(r'[\$£€][\d,]+', t)), "")

                # Image
                img = ""
                im = a.find("img")
                if im:
                    img = im.get("src", "") or im.get("data-src", "")

                # Location — first "City, ST"-looking span, or failing that the
                # first non-price, non-title span longer than 3 characters.
                location = ""
                for text in span_texts:
                    if not text or text == title:
                        continue
                    if re.search(r'[A-Z][a-z]+,\s+[A-Z]{2}', text) or (
                            not re.match(r'[\$£€\d]', text)
                            and len(text) > 3 and text != price):
                        location = text
                        break

                results.append({
                    "url": full_url,
                    "title": title[:120],
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": "Facebook Marketplace",
                    "source_type": "classifieds",
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # One malformed card must not abort the whole page.
                continue

        print(f"[Facebook Marketplace] {len(results)} listings")

    except Exception as e:
        print(f"[Facebook Marketplace] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_hmy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    HMY Yachts — queries Algolia directly (app ECN3QX1VBL).
    Fast, no Playwright needed.

    Args:
        src: Source config dict; "name", "type" and "category" are read with
            defaults.
        query: Free-text search string sent as the Algolia "query" param.
        filters: Not used by this scraper; kept for the common scraper signature.

    Returns:
        A list of listing dicts (url, title, snippet, price_text, img_url,
        location, source, source_type, category). Errors are printed and the
        listings collected so far are returned.
    """
    import urllib.parse

    results = []
    seen = set()
    name = src.get("name", "HMY")

    ALGOLIA_URL = "https://ecn3qx1vbl-dsn.algolia.net/1/indexes/*/queries"
    ALGOLIA_HEADERS = {
        "x-algolia-application-id": "ECN3QX1VBL",
        "x-algolia-api-key": "d86ccdd9ac0292ba76ee4755693d0c10",
        "content-type": "application/json",
        "referer": "https://www.hmy.com/",
        "user-agent": random.choice(USER_AGENTS),
    }

    # The multi-query endpoint takes each request's params as one
    # URL-encoded string.
    params_str = urllib.parse.urlencode({
        "filters": "SalesStatus:Active",
        "facetFilters": '[["SaleClassCode:used"]]',
        "query": query,
        "hitsPerPage": 40,
        "page": 0,
    })

    payload = {
        "requests": [{
            "indexName": "production_oceanelite_yachts",
            "params": params_str,
        }]
    }

    try:
        resp = requests.post(ALGOLIA_URL, json=payload, headers=ALGOLIA_HEADERS, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        # `or [{}]` also covers an explicit empty "results" list, which would
        # otherwise raise IndexError on [0].
        hits = (data.get("results") or [{}])[0].get("hits", [])

        for h in hits:
            try:
                slug = h.get("Slug", "")
                url = h.get("URL") or (f"https://www.hmy.com/yachts-for-sale/{slug}" if slug else "")
                if not url or url in seen:
                    continue
                seen.add(url)

                # Join only the parts that are present so a missing year/make
                # does not leave doubled spaces in the title.
                title = " ".join(
                    str(part) for part in (
                        h.get("ModelYear", ""),
                        h.get("MakeStringExact", ""),
                        h.get("ModelExact", ""),
                    ) if part
                )
                boat_name = h.get("BoatName", "")
                if boat_name:
                    title += f' "{boat_name}"'

                # A non-numeric price leaves the price blank instead of
                # discarding the whole listing.
                price_raw = h.get("NormPrice", 0)
                try:
                    price_text = f"${int(float(price_raw)):,}" if price_raw else ""
                except (TypeError, ValueError):
                    price_text = ""

                length = h.get("NominalLengthNormalized", "")
                country = h.get("country", "USA")
                # Length is shown in the snippet only; "location" stays the
                # bare country.
                location = f"{length}ft · {country}" if length else country

                img = h.get("mainImage", "")

                results.append({
                    "url": url,
                    "title": title[:120],
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": country,
                    "source": src.get("name", "HMY Yachts"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Venta Especializada"),
                })
            except Exception:
                # Skip a malformed hit, keep the rest.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_boatcrazy(src: dict, query: str, filters: dict) -> list[dict]:
    """
    BoatCrazy — US aggregator with 105+ listings per page.

    Card: div.boat-list-item
    Link: a[href*="/boat-for-sale/"]
    Image: div.item-img img or div.list-itemimg img
    Details: div.item-details
    URL pattern: /boat-for-sale/YEAR-MAKE-LOCATION-id

    Args:
        src: Source config dict; "search_url" (with a "{query}" placeholder),
            "name", "type" and "category" are read with defaults.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used by this scraper; kept for the common scraper signature.

    Returns:
        A list of listing dicts. Returns [] when navigation fails; on any
        other error, whatever was collected so far.
    """
    from urllib.parse import urljoin

    results = []
    seen = set()
    # Resolved once with a default so log lines (including the one in the
    # except handler) cannot raise KeyError when the source has no "name".
    name = src.get("name", "BoatCrazy")
    site = "https://boatcrazy.com/"

    raw_url = src.get("search_url", "") or "https://boatcrazy.com/boats?q={query}"
    url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--no-sandbox"])
            context = browser.new_context(
                viewport={"width": 1280, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US", ignore_https_errors=True)
            context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};")
            page = context.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(random.randint(4000, 6000))
                html = page.content()
            except Exception as e:
                print(f"[{name}] nav error: {e}")
                html = ""
            finally:
                # Best-effort cleanup.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(class_="boat-list-item")
        if not cards:
            # fallback: find by link pattern
            cards = []
            for a in soup.find_all("a", href=re.compile(r'/boat-for-sale/')):
                parent = a.find_parent(class_=re.compile(r'boat|list|item|card'))
                if parent and parent not in cards:
                    cards.append(parent)

        for card in cards:
            try:
                a = card.find("a", href=re.compile(r'/boat-for-sale/'))
                if not a:
                    continue
                href = a["href"]
                # urljoin leaves absolute URLs untouched and correctly resolves
                # root-relative, path-relative and protocol-relative hrefs.
                full_url = urljoin(site, href)
                if full_url in seen:
                    continue
                seen.add(full_url)

                # Title — prefer h3, then aria-label, then slug
                title = ""
                h3 = card.find("h3")
                if h3:
                    title = h3.get_text(strip=True)[:80]
                if not title:
                    al = card.find(attrs={"aria-label": True})
                    if al:
                        title = al["aria-label"][:80]
                if not title:
                    slug = href.rstrip("/").split("/")[-1]
                    slug_clean = re.sub(r'-id[-\w]*$', '', slug).replace("-", " ")
                    title = slug_clean.title()[:80]
                if not title:
                    continue

                # Price — dedicated price element first, then any "$1,234"
                # anywhere in the card text.
                price = ""
                price_el = card.find(class_=re.compile(r'\bprice\b'))
                if price_el:
                    pm = re.search(r'\$[\d,]+', price_el.get_text())
                    if pm:
                        price = pm.group(0)
                if not price:
                    pm = re.search(r'\$[\d,]+', card.get_text(" ", strip=True))
                    if pm:
                        price = pm.group(0)

                # Location — .location element, else a "City, ST" pattern
                location = ""
                loc_el = card.find(class_="location")
                if loc_el:
                    location = loc_el.get_text(strip=True)[:60]
                if not location:
                    lm = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)?,\s*[A-Z]{2})',
                                   card.get_text(" ", strip=True))
                    if lm:
                        location = lm.group(1)

                # Image
                img = ""
                img_div = card.find(class_=re.compile(r'item.?img|list.?item.?img'))
                if img_div:
                    im = img_div.find("img")
                    if im:
                        img = (_extract_best_src(im) or im.get("src", "")
                               or im.get("data-src", ""))
                if not img:
                    im = card.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")
                if img:
                    img = urljoin(site, img)

                results.append({
                    "url": full_url,
                    "title": title,
                    "snippet": f"{price} {location}".strip(),
                    "price_text": price,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "BoatCrazy"),
                    "source_type": src.get("type", "classifieds"),
                    "category": src.get("category", "Clasificados Generales"),
                })
            except Exception:
                # Skip a malformed card, keep the rest.
                continue

        print(f"[{name}] {len(results)} listings")

    except Exception as e:
        print(f"[{name}] Error: {e}")

    return results
|
|
|
|
|
|
def scrape_denison(src: dict, query: str, filters: dict) -> list:
    """
    Denison Yachting — plain-HTML scraper (requests + BeautifulSoup).

    Page structure relied on:
        Card:     .boat-item
        URL:      a[href] matching /yachts-for-sale/<slug>
        Title:    h2 → .boat_length + remaining heading text + quoted span (name);
                  falls back to the anchor's title attribute when there is no h2
        Price:    .boat_price[data-price] with [data-default_currency]
        Location: h3 text
        Image:    .news_pic img
        Search:   ?search={query}

    `filters` is not used. Errors are printed; the listings gathered so far
    are returned.
    """
    listings = []
    visited = set()

    site_root = "https://www.denisonyachtsales.com"
    search_url = (f"{site_root}/yachts-for-sale/"
                  f"?search={requests.utils.quote(query.strip())}")

    listing_href = re.compile(r'/yachts-for-sale/[a-z][a-z0-9-]{4,}$', re.I)
    symbols = {"USD": "$", "EUR": "€", "GBP": "£", "AUD": "A$"}

    try:
        response = requests.get(
            search_url,
            headers={"User-Agent": random.choice(USER_AGENTS)},
            timeout=20,
            verify=False,
        )
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")

        for item in document.find_all(class_="boat-item"):
            try:
                anchor = item.find("a", href=listing_href)
                if not anchor:
                    continue

                link = anchor["href"]
                if not link.startswith("http"):
                    link = site_root + link
                if link in visited:
                    continue
                visited.add(link)

                # Title: length + make/model year + "name"
                heading = item.find("h2")
                if heading:
                    # Pull the length badge out first so it is neither picked
                    # up as the name span nor repeated in the remaining text.
                    length_label = ""
                    length_tag = heading.find(class_="boat_length")
                    if length_tag:
                        length_label = length_tag.get_text(strip=True)
                        length_tag.extract()

                    boat_name = ""
                    name_tag = heading.find("span")
                    if name_tag:
                        boat_name = name_tag.get_text(strip=True)
                        name_tag.extract()

                    remainder = " ".join(heading.get_text(" ", strip=True).split())
                    pieces = [length_label, remainder]
                    if boat_name:
                        pieces.append(f'"{boat_name}"')
                    title = " ".join(piece for piece in pieces if piece)[:100]
                else:
                    title = (anchor.get("title", "") or "")[:100]
                if not title:
                    continue

                # Price: numeric data attribute, formatted with a currency symbol
                price_text = ""
                price_tag = item.find(class_="boat_price")
                if price_tag:
                    amount = price_tag.get("data-price", "")
                    code = price_tag.get("data-default_currency", "USD")
                    symbol = symbols.get(code, code + " ")
                    if amount:
                        try:
                            price_text = f"{symbol}{int(amount):,}"
                        except ValueError:
                            # Non-integer attribute: show the visible text instead.
                            price_text = price_tag.get_text(strip=True)[:30]

                # Location
                location_tag = item.find("h3")
                location = location_tag.get_text(strip=True)[:80] if location_tag else ""

                # Image
                img = ""
                picture = item.find(class_="news_pic")
                if picture:
                    image_tag = picture.find("img")
                    if image_tag:
                        img = image_tag.get("src", "") or image_tag.get("data-src", "")

                listings.append({
                    "url": link,
                    "title": title,
                    "snippet": f"{price_text} · {location}".strip(" ·"),
                    "price_text": price_text,
                    "img_url": img,
                    "location": location,
                    "source": src.get("name", "Denison Yachting"),
                    "source_type": src.get("type", "broker"),
                    "category": src.get("category", "Brokers USA"),
                })
            except Exception:
                continue

        print(f"[{src.get('name','Denison')}] {len(listings)} listings")

    except Exception as e:
        print(f"[{src.get('name','Denison')}] Error: {e}")

    return listings
|
|
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: GovPlanet + IronPlanet (Ritchie Bros family — same HTML .sr_lot)
|
|
# =============================================================================
|
|
def scrape_govplanet(src: dict, query: str, filters: dict) -> list[dict]:
    """
    GovPlanet (recreational marine) and IronPlanet (commercial marine).
    Both share Ritchie Bros HTML: listing cards use .sr_lot selector.
    GovPlanet: https://www.govplanet.com/Recreational+Marine
    IronPlanet: https://www.ironplanet.com/Commercial+Marine+Vessels

    Args:
        src: Source config dict; "search_url" is required and fetched as-is,
            "name", "type" and "category" are optional.
        query: Not used — the category page in src["search_url"] is fetched
            without substituting the query.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of listing dicts; [] on a non-200/206 response. Errors are
        printed and the listings collected so far are returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolved with a default so the log line in the except handler cannot
    # itself raise KeyError when the source has no "name".
    name = src.get("name", "GovPlanet")
    try:
        url = src["search_url"]
        base = "https://" + url.split("/")[2]
        headers = get_headers(referer=base + "/")
        time.sleep(1.0)  # fixed 1 s pause before the request
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[{name}] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        seen = set()
        for card in soup.select(".sr_lot, .lot-tile, article.lot, [class*=srItem]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # urljoin keeps absolute URLs and resolves root-relative,
                # path-relative and protocol-relative ones against the page.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title = a.get_text(strip=True)[:100] or card.get_text(" ", strip=True)[:80]
                price_el = card.select_one(".price, .lot-price, span[class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Skip a malformed card, keep the rest.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: HiBid (React SPA — Playwright required)
|
|
# =============================================================================
|
|
def scrape_hibid(src: dict, query: str, filters: dict) -> list[dict]:
    """
    HiBid online auction platform — React SPA requires Playwright.
    URL: https://www.hibid.com/lots?q={query}+boat
    Cards: .lot-tile  Title: h3/.lot-title  Price: .high-bid/.lot-price

    Args:
        src: Source config dict; "name", "type" and "category" are read.
        query: Free-text search string; " boat" is appended before quoting.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of lot dicts. Errors (including navigation failures) are
        printed and the lots collected so far are returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolved with a default so the log line in the except handler cannot
    # itself raise KeyError when the source has no "name".
    name = src.get("name", "HiBid")
    try:
        q = requests.utils.quote((query.strip() + " boat"))
        url = f"https://www.hibid.com/lots?q={q}"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                page.wait_for_timeout(4000)
                html = page.content()
            finally:
                # Best-effort cleanup; a navigation error still propagates to
                # the outer handler after the browser is closed.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for card in soup.select(".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"):
            try:
                a = card.find("a", href=True)
                if not a:
                    continue
                # Resolve relative/protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = card.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": card.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": src.get("type", "auction"),
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Skip a malformed card, keep the rest.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: Copart salvage boats (heavy JS SPA — Playwright)
|
|
# =============================================================================
|
|
def scrape_copart(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Copart salvage/insurance lots for watercraft.
    URL: https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT
    Lots render in a React table after JS executes.

    Args:
        src: Source config dict; "name" and "category" are read.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of lot dicts with source_type "salvage". Errors are printed
        and the lots collected so far are returned.
    """
    from urllib.parse import urljoin

    results = []
    # Resolved with a default so the log line in the except handler cannot
    # itself raise KeyError when the source has no "name".
    name = src.get("name", "Copart")
    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.copart.com/vehicleFinderSection/?searchStr={q}&vehicleType=BOAT"
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"]
            )
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(5000)
                # Give the lot table up to 8 s more to appear; if it never
                # does, parse whatever HTML is there.
                try:
                    page.wait_for_selector(
                        ".lot-row, tr[data-lot], .lot-details, [class*=lottile], [class*=lot-card]",
                        timeout=8000
                    )
                except Exception:
                    pass
                html = page.content()
            finally:
                # Best-effort cleanup.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for row in soup.select(
            "tr[data-lot], .lot-row, [class*=lot-card], [class*=lottile], [class*=lot-item]"
        ):
            try:
                a = row.find("a", href=re.compile(r"/lot/"))
                if not a:
                    continue
                # Resolve relative/protocol-relative links against the page URL.
                href = urljoin(url, a["href"])
                if href in seen:
                    continue
                seen.add(href)
                title_el = row.select_one("[class*=title], [class*=desc], td.des")
                title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
                price_el = row.select_one("[class*=bid], [class*=price], td.bid")
                price_txt = price_el.get_text(strip=True) if price_el else ""
                img_el = row.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                if title and len(title) > 4:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": row.get_text(" ", strip=True)[:200],
                        "price_text": price_txt,
                        "location": "",
                        "img_url": img,
                        "source": name,
                        "source_type": "salvage",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # Skip a malformed row, keep the rest.
                continue
        print(f"[{name}] {len(results)} lots")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: Trade a Boat AU (server-rendered Material-UI)
|
|
# =============================================================================
|
|
def scrape_tradeaboat(src: dict, query: str, filters: dict) -> list[dict]:
    """
    TradeABoat Australia — plain requests scraper.

    The site's Material-UI class names are generated (jss77, jss78 …), so
    cards are located through their /details/ links instead of by class.
    URL: https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}

    `filters` is not used; only "category" is read from `src`. Returns [] on
    a non-200/206 response; other errors are printed and the listings
    gathered so far are returned.
    """
    listings = []
    try:
        encoded = requests.utils.quote(query.strip())
        search_url = (
            "https://www.tradeaboat.com.au/search/Boats"
            f"?category=Sail&keywords={encoded}"
        )
        request_headers = get_headers(referer="https://www.tradeaboat.com.au/")
        time.sleep(1.0)
        response = requests.get(search_url, headers=request_headers,
                                timeout=25, verify=False)
        if response.status_code not in (200, 206):
            print(f"[Trade a Boat AU] HTTP {response.status_code}")
            return []

        document = BeautifulSoup(response.text, "html.parser")
        site_root = "https://www.tradeaboat.com.au"
        visited_urls = set()
        visited_cards = set()

        for anchor in document.find_all("a", href=re.compile(r"/details/")):
            try:
                link = anchor["href"]
                if not link.startswith("http"):
                    link = site_root + link
                if link in visited_urls:
                    continue
                visited_urls.add(link)

                # Nearest enclosing <div> is treated as the card; each card
                # object is processed at most once.
                container = anchor.find_parent("div") or anchor
                marker = id(container)
                if marker in visited_cards:
                    continue
                visited_cards.add(marker)

                heading = container.select_one("h2, h3, [class*=title]")
                if heading:
                    title = heading.get_text(strip=True)[:100]
                else:
                    title = anchor.get_text(strip=True)[:100]

                price_node = container.select_one("[class*=price], [class*=Price]")
                price_txt = price_node.get_text(strip=True) if price_node else ""

                image_node = container.find("img")
                img = _extract_best_src(image_node) if image_node else ""
                if img and img.startswith("/"):
                    img = site_root + img

                if not (title and len(title) > 4):
                    continue
                listings.append({
                    "title": title,
                    "url": link,
                    "snippet": container.get_text(" ", strip=True)[:200],
                    "price_text": price_txt,
                    "location": "Australia",
                    "img_url": img,
                    "source": "Trade a Boat AU",
                    "source_type": "broker",
                    "category": src.get("category", ""),
                })
            except Exception:
                continue
        print(f"[Trade a Boat AU] {len(listings)} listings")
    except Exception as e:
        print(f"[Trade a Boat AU] Error: {e}")
    return listings
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: Galati Yachts (requests, WordPress / YSP plugin)
|
|
# =============================================================================
|
|
def scrape_galati(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Galati Yachts — server-rendered WordPress with YachtSalesPlugin.
    URL: https://www.galatiyachts.com/yachts-for-sale/?keywords={query}

    Listing cards are looked up by a set of known class selectors; when none
    match, anchors whose href contains /yachts/ are used instead and their
    nearest enclosing <div> is treated as the card.

    Args:
        src: Source config dict; only "category" is read.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of listing dicts; [] on a non-200/206 response. Errors are
        printed and the listings collected so far are returned.
    """
    from urllib.parse import urljoin

    results = []
    seen = set()
    category = src.get("category", "")

    def _append_listing(card, a, href, page_url):
        # Extract title/price/image from one card and append a result row.
        title_el = card.select_one("h2, h3, [class*=title]")
        title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100]
        price_el = card.select_one("[class*=price], .price")
        price_txt = price_el.get_text(strip=True) if price_el else ""
        img_el = card.find("img")
        img = _extract_best_src(img_el) if img_el else ""
        if img:
            # Handles root-relative and protocol-relative image URLs.
            img = urljoin(page_url, img)
        if title and len(title) > 4:
            results.append({
                "title": title, "url": href,
                "snippet": card.get_text(" ", strip=True)[:200],
                "price_text": price_txt, "location": "USA",
                "img_url": img, "source": "Galati Yachts",
                "source_type": "broker", "category": category,
            })

    try:
        q = requests.utils.quote(query.strip())
        url = f"https://www.galatiyachts.com/yachts-for-sale/?keywords={q}"
        headers = get_headers(referer="https://www.galatiyachts.com/")
        time.sleep(1.0)  # fixed 1 s pause before the request
        r = requests.get(url, headers=headers, timeout=25, verify=False)
        if r.status_code not in (200, 206):
            print(f"[Galati Yachts] HTTP {r.status_code}")
            return []
        soup = BeautifulSoup(r.text, "html.parser")

        # YSP listing cards — try common selectors, fallback to /yachts/ links
        cards = soup.select(".ysp-listing, .listing-card, .yacht-card, [class*=yacht-listing]")
        if cards:
            for card in cards:
                try:
                    a = card.find("a", href=True)
                    if not a:
                        continue
                    href = urljoin(url, a["href"])
                    if href in seen:
                        continue
                    seen.add(href)
                    _append_listing(card, a, href, url)
                except Exception:
                    continue
        else:
            # fallback: group by /yachts/ anchor
            for a in soup.find_all("a", href=re.compile(r"/yachts/")):
                # Per-item guard (as in the card branch) so one malformed
                # anchor does not abort the remaining listings.
                try:
                    href = urljoin(url, a["href"])
                    if href in seen or "galatiyachts.com" not in href:
                        continue
                    # Too few path segments to be an individual listing page.
                    if href.count("/") < 4:
                        continue
                    seen.add(href)
                    _append_listing(a.find_parent("div") or a, a, href, url)
                except Exception:
                    continue
        print(f"[Galati Yachts] {len(results)} listings")
    except Exception as e:
        print(f"[Galati Yachts] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: Luxury brokers (Fraser, Burgess, Worth Ave, Merle Wood, N&J)
|
|
# Playwright — JS-heavy sites that won't render with plain requests
|
|
# =============================================================================
|
|
def scrape_luxury_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for luxury yacht broker sites.
    Covers: Fraser Yachts, Worth Ave Yachts, Merle Wood, Burgess, N&J.
    Collects links whose path contains a listing-like segment (/yacht/,
    /vessel/, /boat/, /listing/, /detail/, /sale/, /for-sale/), capped at 30.

    Args:
        src: Source config dict; "search_url" (with a "{query}" placeholder)
            is required, "name", "type" and "category" are optional.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of at most 30 listing dicts. Errors are printed and the
        listings collected so far are returned.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"]
            )
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "window.chrome={runtime:{}};"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                # Scroll halfway down so lazily loaded cards render.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
                page.wait_for_timeout(1500)
                html = page.content()
            finally:
                # Best-effort cleanup.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        LISTING_RE = re.compile(
            r'/(yacht[s]?|vessel[s]?|boat[s]?|listing[s]?|detail[s]?|sale|for-sale)/',
            re.I
        )
        for a in soup.find_all("a", href=LISTING_RE):
            try:
                # urljoin keeps absolute URLs and resolves root-relative,
                # path-relative and protocol-relative ones against the page.
                href = urljoin(url, a["href"])
                if href in seen or len(href) < 25:
                    continue
                # Require at least one path segment after the host.
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    continue
                seen.add(href)
                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue
                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = re.search(r'[\$€£]\s*[\d,\.]+(?:\s*[Mm]illion|M)?', ctx_txt)
                price_txt = pm.group() if pm else ""
                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # Skip a malformed anchor, keep the rest.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: EU/International brokers blocked on requests (Playwright)
|
|
# Covers: Boat24, YachtAll, Annonces Bateau, Inautia ES, Boats&Outboards UK,
|
|
# Boatsales AU, YachtMarket, Apollo Duck UK subdomain
|
|
# =============================================================================
|
|
def scrape_eu_broker(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Generic Playwright scraper for EU/AU/UK broker sites that block plain
    requests (403/ECONNREFUSED). Navigates with real browser, extracts listings.

    Every same-site link with at least one path segment that does not look
    like a navigation/utility link is treated as a listing, capped at 30.

    Args:
        src: Source config dict; "search_url" (with a "{query}" placeholder)
            is required, "name", "type" and "category" are optional.
        query: Free-text search string, URL-quoted into the search URL.
        filters: Not used; kept for the common scraper signature.

    Returns:
        A list of at most 30 listing dicts. Errors are printed and the
        listings collected so far are returned.
    """
    from urllib.parse import urljoin, urlparse

    results = []
    name = src.get("name", "EU Broker")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))
        domain = url.split("/")[2].lower()

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            ctx = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={"width": 1280, "height": 900},
                locale="en-US",
                ignore_https_errors=True,
            )
            ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            )
            page = ctx.new_page()
            try:
                page.goto(url, timeout=35000, wait_until="domcontentloaded")
                page.wait_for_timeout(3000)
                html = page.content()
            finally:
                # Best-effort cleanup.
                try:
                    page.close()
                except Exception:
                    pass
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        for a in soup.find_all("a", href=True):
            try:
                raw_href = a["href"].strip()
                # Fragment-only / query-only links point back at the search
                # page itself, never at a listing.
                if not raw_href or raw_href[0] in "#?":
                    continue
                href = urljoin(url, raw_href)
                parsed = urlparse(href)
                # Drops javascript:, mailto:, tel: and similar pseudo-links.
                if parsed.scheme not in ("http", "https"):
                    continue
                # Same-site check on the actual host rather than a substring
                # of the whole URL, so off-site share links that merely embed
                # the domain in their query string are rejected.
                host = parsed.netloc.lower()
                if host != domain and not host.endswith("." + domain):
                    continue
                if href in seen:
                    continue
                # Require at least one path segment after the host.
                path = href.split("?")[0].rstrip("/")
                if path.count("/") < 3:
                    continue
                if any(s in href.lower() for s in [
                    "login","register","contact","about","help","privacy",
                    "sitemap","category","search","tag","page=","lang="
                ]):
                    continue
                seen.add(href)
                parent = a.find_parent("div") or a.find_parent("li") or a
                title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80]
                title = " ".join(title.split())[:100]
                if len(title) < 5:
                    continue
                ctx_txt = parent.get_text(" ", strip=True)[:300]
                pm = re.search(r'[\$€£]\s*[\d,\.]+', ctx_txt)
                price_txt = pm.group() if pm else ""
                img_el = parent.find("img")
                img = _extract_best_src(img_el) if img_el else ""
                if img:
                    img = urljoin(url, img)
                results.append({
                    "title": title, "url": href,
                    "snippet": ctx_txt[:200], "price_text": price_txt,
                    "location": "", "img_url": img,
                    "source": name, "source_type": src.get("type", "broker"),
                    "category": src.get("category", ""),
                })
                if len(results) >= 30:
                    break
            except Exception:
                # Skip a malformed anchor, keep the rest.
                continue
        print(f"[{name}] {len(results)} listings")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPER: Forum For-Sale sections (TheHullTruth, Cruisers Forum)
|
|
# =============================================================================
|
|
def scrape_forum_fs(src: dict, query: str, filters: dict) -> list[dict]:
    """
    Scrape For-Sale classified threads from a boating forum (Playwright).

    Known URL shapes:
      TheHullTruth:   /boating-forum/search.php?do=process&query={query}&prefixid=FS
      Cruisers Forum: /forums/f152/  (Classifieds subforum)

    Args:
        src: Source descriptor. ``search_url`` is required; a ``{query}``
            placeholder in it is replaced by the URL-quoted query. ``name``
            and ``category`` are optional.
        query: Free-text search string.
        filters: Accepted for signature parity with the other scrapers;
            not read by this function.

    Returns:
        A list of raw listing dicts (title, url, snippet, price_text,
        location, img_url, source, source_type, category). Any failure is
        printed and whatever was collected so far (possibly an empty list)
        is returned; the function itself does not raise.
    """
    from urllib.parse import urljoin

    results = []
    name = src.get("name", "Forum")
    try:
        raw_url = src["search_url"]
        url = raw_url.replace("{query}", requests.utils.quote(query.strip()))

        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={"width": 1280, "height": 900},
                    locale="en-US",
                    ignore_https_errors=True,
                )
                page = ctx.new_page()
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    page.wait_for_timeout(2000)
                    html = page.content()
                finally:
                    # Closing the page is best-effort; the browser is closed below.
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                # Always release the browser, even when navigation failed.
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        seen = set()
        # Compiled once: hrefs that look like thread/post permalinks.
        thread_href = re.compile(r'showthread|/thread[s]?/|/t/\d|/post', re.I)

        # vBulletin/XenForo thread rows
        for row in soup.select(
            "li.threadbit, div.threadbit, .thread-item, "
            "tr.odd, tr.even, .search-result, [class*=thread], "
            ".js-threadListItem, li[id*=thread]"
        ):
            try:
                a = row.find("a", href=thread_href)
                if not a:
                    a = row.find("a", href=True)
                if not a:
                    continue
                # Resolve against the page URL so that "showthread.php?t=1",
                # "/threads/x" and "//host/x" all become valid absolute URLs.
                href = urljoin(url, a["href"])
                if not href.startswith("http"):
                    continue  # javascript:, mailto:, etc. are not listings
                if href in seen:
                    continue
                seen.add(href)
                title = a.get_text(strip=True)[:100]
                ctx_txt = row.get_text(" ", strip=True)[:200]
                pm = re.search(r'\$\s*[\d,]{3,}', ctx_txt)
                price_txt = pm.group() if pm else ""
                if title and len(title) > 5:
                    results.append({
                        "title": title, "url": href,
                        "snippet": ctx_txt, "price_text": price_txt,
                        "location": "", "img_url": "",
                        "source": name, "source_type": "classifieds",
                        "category": src.get("category", ""),
                    })
            except Exception:
                # One malformed row must not abort the whole page.
                continue
        print(f"[{name}] {len(results)} threads")
    except Exception as e:
        print(f"[{name}] Error: {e}")
    return results
|
|
|
|
|
|
def scrape_source_router(src: dict, query: str, filters: dict, page: int = 1):
|
|
"""Central dispatcher — routes each source to its dedicated scraper."""
|
|
name = src.get("name", "")
|
|
|
|
# ── Dedicated scrapers ────────────────────────────────────────────────────
|
|
if name == "YachtWorld":
|
|
return scrape_yachtworld(query, filters, max_pages=1)
|
|
|
|
if name.startswith("eBay"): # covers all 5 eBay entries
|
|
return scrape_ebay(src, query, filters)
|
|
|
|
if name == "BoatTrader":
|
|
return scrape_boattrader(src, query, filters)
|
|
|
|
if name in ("Apollo Duck", "Apollo Duck Workboats"):
|
|
return scrape_apolloduck(src, query, filters)
|
|
|
|
if name == "Boats.com":
|
|
return scrape_boatsdotcom(src, query, filters)
|
|
|
|
if name == "Craigslist": # single multi-city Craigslist entry
|
|
return scrape_craigslist(src, query, filters)
|
|
|
|
if name.startswith("Craigslist "): # individual city entries — one request each
|
|
return scrape_direct_source(src, query, filters)
|
|
|
|
if name in ("GovPlanet", "GovPlanet Recreational",
|
|
"IronPlanet", "IronPlanet Marine"):
|
|
return scrape_govplanet(src, query, filters)
|
|
|
|
if name == "HiBid":
|
|
return scrape_hibid(src, query, filters)
|
|
|
|
if name in ("Copart Marine", "Copart Boats", "Copart Watercraft"):
|
|
return scrape_copart(src, query, filters)
|
|
|
|
if name == "Trade a Boat AU":
|
|
return scrape_tradeaboat(src, query, filters)
|
|
|
|
if name == "Galati Yachts":
|
|
return scrape_galati(src, query, filters)
|
|
|
|
if name in ("Fraser Yachts", "Burgess Yachts", "Northrop & Johnson",
|
|
"Worth Ave Yachts"):
|
|
return scrape_luxury_broker(src, query, filters)
|
|
|
|
# Boat24 handled below by dedicated scrape_boat24; Inautia handled by scrape_inautia
|
|
if name in ("Boat24 EU", "YachtAll", "Annonces Bateau",
|
|
"Annonces Bateau FR", "Inautia ES", "Boats & Outboards UK",
|
|
"Boats Outboards UK", "Apollo Duck UK",
|
|
"Boatsales AU", "YachtMarket", "Boatpoint AU"):
|
|
return scrape_eu_broker(src, query, filters)
|
|
|
|
if name in ("TheHullTruth", "Cruisers Forum"):
|
|
return scrape_forum_fs(src, query, filters)
|
|
|
|
if name == "YachtWorld Commercial":
|
|
return scrape_yachtworld(query, filters, max_pages=1)
|
|
|
|
if name == "Rightboat":
|
|
return scrape_rightboat(src, query, filters)
|
|
|
|
if name in ("Cooper Salvage", "Cooper Capital Salvage"):
|
|
return scrape_cooperss(src, query, filters)
|
|
|
|
if name == "Inautia":
|
|
return scrape_inautia(src, query, filters)
|
|
|
|
if name == "Boat24":
|
|
return scrape_boat24(src, query, filters)
|
|
|
|
if name == "Facebook Marketplace":
|
|
return scrape_facebook_marketplace(src, query, filters)
|
|
|
|
if name == "HMY Yachts":
|
|
return scrape_hmy(src, query, filters)
|
|
|
|
if name == "BoatCrazy":
|
|
return scrape_boatcrazy(src, query, filters)
|
|
|
|
if name == "Denison Yachting":
|
|
return scrape_denison(src, query, filters)
|
|
|
|
# ── Generic HTML scraper (fallback) ──────────────────────────────────────
|
|
return scrape_direct_source(src, query, filters)
|
|
|
|
|
|
def extract_vessel_fast(raw: dict) -> dict | None:
|
|
"""
|
|
Pure-regex vessel extraction — no Ollama call.
|
|
Used for results from known boat marketplaces (broker/classifieds/auction/etc.)
|
|
Returns a data dict compatible with save_vessel(), or None if too sparse.
|
|
"""
|
|
title = (raw.get("title") or "").strip()
|
|
snippet = (raw.get("snippet") or "")
|
|
price_text = (raw.get("price_text") or "")
|
|
location = (raw.get("location") or "")
|
|
src_name = (raw.get("source") or "").lower()
|
|
src_type = (raw.get("source_type") or "")
|
|
category = (raw.get("category") or "").lower()
|
|
|
|
if not title or len(title) < 5:
|
|
return None
|
|
|
|
combined = f"{title} {snippet} {price_text}"
|
|
|
|
# ── Price ────────────────────────────────────────────────────────────────
|
|
price_usd = None
|
|
currency_out = "USD"
|
|
for txt in [price_text, snippet, title]:
|
|
# USD
|
|
m = re.search(r'\$\s*([\d,]{3,})', txt)
|
|
if m:
|
|
try:
|
|
v = float(m.group(1).replace(",",""))
|
|
if 500 < v < 50_000_000:
|
|
price_usd = v; currency_out = "USD"; break
|
|
except: pass
|
|
# GBP
|
|
m = re.search(r'£\s*([\d,]{3,})', txt)
|
|
if m:
|
|
try:
|
|
v = float(m.group(1).replace(",","")) * 1.27
|
|
if 500 < v < 50_000_000:
|
|
price_usd = round(v); currency_out = "GBP"; break
|
|
except: pass
|
|
# EUR
|
|
m = re.search(r'€\s*([\d,]{3,})', txt)
|
|
if m:
|
|
try:
|
|
v = float(m.group(1).replace(",","")) * 1.09
|
|
if 500 < v < 50_000_000:
|
|
price_usd = round(v); currency_out = "EUR"; break
|
|
except: pass
|
|
# plain number + currency word
|
|
m = re.search(r'([\d,]{4,})\s*(?:USD|usd|GBP|gbp|EUR|eur)', txt)
|
|
if m:
|
|
try:
|
|
v = float(m.group(1).replace(",",""))
|
|
if 500 < v < 50_000_000:
|
|
price_usd = round(v); break
|
|
except: pass
|
|
|
|
# ── LOA ──────────────────────────────────────────────────────────────────
|
|
loa_m = None
|
|
for pat, in_meters in [
|
|
(r'(?:loa|length)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', False),
|
|
(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', False), # starts with size
|
|
(r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet)\b', False),
|
|
(r"(\d{2,3}(?:\.\d)?)'", False),
|
|
(r'(?:loa|length)[:\s]+([\d.]+)\s*m\b', True),
|
|
]:
|
|
m = re.search(pat, combined, re.IGNORECASE)
|
|
if m:
|
|
try:
|
|
v = float(m.group(1))
|
|
if in_meters:
|
|
if 5 < v < 200: loa_m = round(v, 1); break
|
|
else:
|
|
if 10 < v < 500: loa_m = round(v * 0.3048, 1); break
|
|
except: pass
|
|
|
|
# ── Year ─────────────────────────────────────────────────────────────────
|
|
year = None
|
|
ym = re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title)
|
|
if ym: year = int(ym.group(1))
|
|
|
|
# ── Vessel type ──────────────────────────────────────────────────────────
|
|
cl = combined.lower()
|
|
if any(k in src_name for k in ["sailboat","sail"]) or "veleros" in category:
|
|
vtype = "Sailboat"
|
|
elif any(k in src_name for k in ["workboat","commercial","osv","offshore"]):
|
|
vtype = "Offshore"
|
|
elif "tug" in src_name: vtype = "Tug"
|
|
elif "barge" in src_name: vtype = "Barge"
|
|
elif any(k in cl for k in ["sailboat","sailing","velero","ketch","sloop","schooner",
|
|
"yawl","cutter","catamaran","trimaran","voilier"]):
|
|
vtype = "Sailboat"
|
|
elif any(k in cl for k in ["tugboat","tug boat","remolcador"]): vtype = "Tug"
|
|
elif "barge" in cl or "barcaza" in cl: vtype = "Barge"
|
|
elif any(k in cl for k in ["offshore","osv","supply vessel","crew boat"]): vtype = "Offshore"
|
|
elif any(k in cl for k in ["fishing","trawler","seiner","pesquero"]): vtype = "Fishing"
|
|
elif any(k in cl for k in ["yacht","motor yacht","motoryacht"]): vtype = "Yacht"
|
|
else: vtype = "Motor"
|
|
|
|
status = ("auction" if src_type == "auction" else
|
|
"salvage" if src_type == "salvage" else "active")
|
|
|
|
# Infer location from source name when missing (e.g. "Craigslist Houston" → "Houston")
|
|
if not location and raw.get("source"):
|
|
src_full = raw["source"]
|
|
if re.search(r'[Cc]raigslist', src_full):
|
|
city = re.sub(r'[Cc]raigslist\s*', '', src_full).strip()
|
|
if city: location = city
|
|
elif "Kijiji" in src_full: location = "Canada"
|
|
elif "Gumtree" in src_full: location = "Australia"
|
|
elif "LeBonCoin" in src_full: location = "France"
|
|
elif "Subito" in src_full: location = "Italy"
|
|
|
|
# For trusted marketplace sources keep the result even with partial data.
|
|
# For web-search results require at least one data point to avoid garbage.
|
|
is_trusted = src_type in ("broker", "classifieds", "salvage", "commercial", "auction")
|
|
if not is_trusted and not (price_usd or loa_m or year or location):
|
|
return None
|
|
|
|
score = 50
|
|
if loa_m:
|
|
score += min(10, int(loa_m - 10))
|
|
if year and year > 1990:
|
|
score += min(10, (year - 1990) // 3)
|
|
if price_usd and loa_m:
|
|
pft = price_usd / max(loa_m / 0.3048, 1)
|
|
if pft < 600: score += 15
|
|
elif pft < 1200: score += 8
|
|
score = min(100, max(0, score))
|
|
|
|
return {
|
|
"_fast": True, # flag: skip unit-conversion block downstream
|
|
"skip": False,
|
|
"name": title[:100],
|
|
"vessel_type": vtype,
|
|
"loa_m": loa_m,
|
|
"beam_m": None,
|
|
"draft_m": None,
|
|
"year_built": year,
|
|
"hull": "Unknown",
|
|
"propulsion": "Sail" if vtype == "Sailboat" else "Diesel",
|
|
"status": status,
|
|
"price_usd": price_usd,
|
|
"currency": currency_out,
|
|
"location": location,
|
|
"country": None,
|
|
"description": f"{title[:140]}",
|
|
"flags": [],
|
|
"score": score,
|
|
}
|
|
|
|
|
|
def search_with_ai(query: str, filters: dict) -> list:
    """
    Hybrid search: direct scraping of open sources + web search to reach
    blocked sites (YachtWorld, Boats.com, Apollo Duck, etc.)

    Pipeline, as implemented below:
      1. Build the source list (built-in ``DIRECT_SOURCES`` plus active rows
         of ``custom_sources``), then filter/reorder it by region, status and
         vessel type.
      2. Optionally start the SailboatListings scraper in its own thread.
      3. Query sources breadth-first, one page per round, with web searches
         submitted on round 1 only.
      4. De-duplicate raw results by URL and turn each into a vessel record
         (regex fast path for marketplace sources, Ollama for the rest),
         saving accepted records through ``save_vessel``.

    Args:
        query: Free-text search string.
        filters: Optional keys read here: ``type``, ``region``, ``status``,
            ``max_price``, ``min_loa``.

    Returns:
        The list of vessel dicts that were saved during this search.
        Progress is reported through the module-level ``search_state``.
    """
    # Before Python 3.11 this is a different class from the builtin TimeoutError.
    from concurrent.futures import TimeoutError as FuturesTimeout

    vessel_type = filters.get("type", "")
    region = (filters.get("region") or "").lower()

    base = query
    if vessel_type and vessel_type.lower() not in query.lower():
        base = f"{vessel_type} {base}"

    # Load custom sources from DB and merge with built-in
    try:
        conn = get_db()
        try:
            custom = [dict(r) for r in conn.execute(
                "SELECT * FROM custom_sources WHERE active=1").fetchall()]
        finally:
            conn.close()
        all_sources = DIRECT_SOURCES + [{
            "name": c["name"],
            "category": c["category"],
            "search_url": c["search_url"],
            "result_sel": "a[href]",
            "price_sel": "",
            "img_sel": "img",
            "loc_sel": "",
            "type": c["source_type"],
        } for c in custom]
    except Exception as e:
        # Custom sources are optional; fall back to the built-in list.
        print(f"[Search] Custom sources unavailable: {e}")
        all_sources = DIRECT_SOURCES

    # Filter sources by region if specified
    sources_to_use = all_sources
    if region and region not in ["global", "todo", "all", ""]:
        region_map = {
            "usa": ["USA", "Clasificados USA", "Subastas Gobierno USA", "Subastas USA", "Subastas Gobierno", "Comercial Offshore"],
            "europa": ["Europa", "Brokers Europa", "Francia", "Italia", "Reino Unido", "España", "España / Global"],
            "caribe": ["Latinoamérica", "Latinoamérica / España", "España / Global"],
            "latin": ["Latinoamérica", "Latinoamérica / España", "España", "España / Global"],
            "asia": ["Australia / Pacífico"],
            "australia": ["Australia / Pacífico"],
        }
        allowed_cats = None
        for key, cats in region_map.items():
            if key in region:
                allowed_cats = cats
                break
        if allowed_cats:
            sources_to_use = [s for s in all_sources if any(c in s["category"] for c in allowed_cats)]
        if not sources_to_use:
            sources_to_use = all_sources

    # Filter by status (each filter falls back to the unfiltered list if it
    # would leave nothing to query)
    status = filters.get("status", "")
    if status == "auction":
        sources_to_use = [s for s in sources_to_use if s["type"] in ["auction", "salvage"]] or sources_to_use
    elif status == "salvage":
        sources_to_use = [s for s in sources_to_use if s["type"] == "salvage"] or sources_to_use
    else:
        # Exclude salvage-only sources unless explicitly searching for salvage
        sources_to_use = [s for s in sources_to_use if s["type"] != "salvage"] or sources_to_use

    # Vessel-type-aware source prioritization
    OFFSHORE_TYPES = {"offshore", "tug", "barge", "ferry", "fishing", "commercial", "salvage"}
    SAILBOAT_TYPES = {"sailboat", "sail", "velero", "ketch", "sloop", "cutter", "schooner"}
    COMMERCIAL_ONLY_SOURCES = {
        "Seaboats Tug", "Seaboats Barge", "Seaboats Offshore", "Seaboats Fishing",
        "OSV Broker", "OSVBroker", "WorkBoat Classifieds", "VT Halter Marine",
        "Maritime Connector", "ShipXchange", "Commercial Vessel",
    }
    SAILBOAT_ONLY_SOURCES = {"SailboatListings", "SailboatListings View", "Cruisers Forum", "Sailboat Listing"}
    vessel_type_lower = vessel_type.lower() if vessel_type else ""

    if vessel_type_lower in OFFSHORE_TYPES:
        # Skip sailboat-only sources, float commercial ones to front
        sources_to_use = [s for s in sources_to_use if s["name"] not in SAILBOAT_ONLY_SOURCES]
        commercial = [s for s in sources_to_use if s["type"] in ("commercial", "salvage", "auction")]
        rest = [s for s in sources_to_use if s["type"] not in ("commercial", "salvage", "auction")]
        sources_to_use = commercial + rest
    elif vessel_type_lower in SAILBOAT_TYPES or "sail" in base.lower() or "velero" in base.lower():
        # Skip commercial-only offshore sources for sailboat searches
        sources_to_use = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES]
    elif not vessel_type_lower:
        # Generic search: keep all but put commercial sources after general ones
        commercial = [s for s in sources_to_use if s["name"] in COMMERCIAL_ONLY_SOURCES]
        rest = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES]
        sources_to_use = rest + commercial

    print(f"[Search] Querying {len(sources_to_use)} sources for: {base}")
    search_state['total_sources'] = len(sources_to_use)
    search_state['log'].append(f"Consultando {len(sources_to_use)} fuentes...")

    def get_query_for_source(src):
        """Match query language to source region."""
        cat = src.get("category", "").lower()
        if any(x in cat for x in ["france", "franc", "veleros franc"]):
            return base
        elif any(x in cat for x in ["spain", "españa", "espana", "mexico", "colombia", "latin"]):
            return base
        else:
            # English-language sources: make sure the query says "for sale".
            return f"{base} for sale" if "for sale" not in base.lower() else base

    # Build web search queries targeting specific sites
    web_queries = build_web_queries(base, filters)

    total = len(sources_to_use) + len(web_queries)
    search_state['total_sources'] = total
    search_state['log'].append(f"Consultando {len(sources_to_use)} sitios directos + {len(web_queries)} búsquedas web...")
    print(f"[Search] {len(sources_to_use)} direct + {len(web_queries)} web searches for: {base}")

    # Run BOTH direct scraping AND web searches in parallel
    all_raw = []

    # ── SailboatListings: dedicated parallel thread (handles its own AI extraction) ──
    # Only for sailboat/velero or generic searches, not for offshore/tug/barge/etc.
    sbl_thread = None
    if vessel_type_lower not in OFFSHORE_TYPES and vessel_type_lower not in {"motor", "motorboat"}:
        sbl_thread = threading.Thread(
            target=scrape_and_extract_sailboatlistings,
            args=(query, filters, search_state.get('search_id', ''), 8),
            daemon=True,
        )
        sbl_thread.start()
        search_state['log'].append("SailboatListings: iniciado en paralelo (hilo dedicado)...")
        print("[Search] SailboatListings dedicated thread started")

    # ── Breadth-First Search across all sources ──────────────────────────────
    # Round 1: page 1 of all sources simultaneously
    # Round 2: page 2 of sources that had results
    # Round 3: page 3, etc.
    # Between rounds, a natural pause occurs as we process results
    # This avoids hammering any single source with consecutive requests

    MAX_ROUNDS = 6  # max pages per source
    active_srcs = {src["name"]: {"src": src, "page": 1, "has_more": True}
                   for src in sources_to_use}

    # Web searches only run once (no pagination)
    web_done = False

    for round_num in range(1, MAX_ROUNDS + 1):
        if search_state.get("cancelled"):
            break

        round_sources = {name: info for name, info in active_srcs.items()
                         if info["has_more"]}
        if not round_sources:
            break

        search_state['log'].append(f"Ronda {round_num}: consultando {len(round_sources)} fuentes...")
        print(f"[Search] Round {round_num}: {len(round_sources)} active sources")

        round_raw = []
        with ThreadPoolExecutor(max_workers=12) as executor:
            futures = {}

            # Submit page N of all active sources
            for name, info in round_sources.items():
                src = info["src"]
                q = get_query_for_source(src)
                # Add page parameter to URL if supported and page > 1
                src_with_page = dict(src)
                if round_num > 1:
                    url = src["search_url"]
                    # Common pagination patterns
                    if "craigslist.org" in url:
                        src_with_page["search_url"] = url + f"&s={round_num * 25 - 25}"
                    elif "ebay.com" in url:
                        src_with_page["search_url"] = url + f"&_pgn={round_num}"
                    elif "seaboats.net" in url:
                        src_with_page["search_url"] = url + f"&page={round_num}"
                    elif "kijiji.ca" in url:
                        src_with_page["search_url"] = url.rstrip('/') + f"/page-{round_num}/"
                    else:
                        # Most sites don't support pagination via URL params we know
                        # Mark as done after page 1
                        active_srcs[name]["has_more"] = False
                        continue
                futures[executor.submit(scrape_source_router, src_with_page, q, filters, round_num)] = name

            # Web searches on round 1 only
            if round_num == 1 and not web_done:
                for wq in web_queries:
                    futures[executor.submit(web_search, wq, 6)] = f"Web:{wq[:20]}"
                web_done = True

            # Collect results for this round
            try:
                for future in as_completed(futures, timeout=90):
                    name = futures[future]
                    try:
                        results = future.result()
                        count = len(results)
                        round_raw.extend(results)
                        search_state['sources_done'] += 1

                        if name.startswith("Web:"):
                            if count:
                                search_state['log'].append(f"🌐 Web: {count} resultados")
                        else:
                            if count:
                                search_state['log'].append(f"✓ {name} p{round_num}: {count}")
                                print(f"[Round {round_num}] {name}: {count} listings")
                            else:
                                # No results this round — remove from future rounds
                                if name in active_srcs:
                                    active_srcs[name]["has_more"] = False
                    except Exception as e:
                        print(f"[Round {round_num}] {name}: {e}")
                        search_state['sources_done'] += 1
                        if name in active_srcs:
                            active_srcs[name]["has_more"] = False
            except FuturesTimeout:
                # Keep what the fast sources returned instead of losing the
                # whole search; slow sources are dropped from later rounds.
                stalled = [n for f, n in futures.items() if not f.done()]
                for f in futures:
                    f.cancel()  # only affects tasks that have not started yet
                for n in stalled:
                    if n in active_srcs:
                        active_srcs[n]["has_more"] = False
                print(f"[Search] Round {round_num}: timed out waiting for {len(stalled)} sources")

        all_raw.extend(round_raw)
        print(f"[Search] Round {round_num} complete: {len(round_raw)} new results (total: {len(all_raw)})")

        # Small pause between rounds — natural break
        if round_num < MAX_ROUNDS and not search_state.get("cancelled"):
            polite_pause("BFS-round")

    print(f"[Search] Got {len(all_raw)} raw results, extracting vessel data...")

    if not all_raw:
        return []

    # Extract vessel data — parallel with dedup and real-time save
    vessels = []
    lock = threading.Lock()
    max_price = float(filters.get("max_price") or 0)
    min_loa = float(filters.get("min_loa") or 0)
    query_words = [w.lower() for w in query.split() if len(w) > 2]

    # Deduplicate raw results by URL
    seen_urls = set()
    unique_raw = []
    for r in all_raw:
        if r["url"] not in seen_urls:
            seen_urls.add(r["url"])
            unique_raw.append(r)

    print(f"[Extract] Processing {len(unique_raw)} unique URLs...")

    SYNONYMS = {
        "sailboat":["sail","velero","vela","ketch","sloop","schooner","yawl","voilier"],
        "velero":  ["sail","sailboat","vela","ketch","sloop"],
        "tug":     ["tugboat","remolcador","tug boat","schlepper"],
        "barge":   ["barcaza","chaland","ponton","landing craft","lct"],
        "fishing": ["pesquero","trawler","seiner","longliner","fisher"],
        "offshore":["osv","supply vessel","supply boat","platform"],
        "yacht":   ["yate","motoryacht","m/y"],
        "motor":   ["motorboat","lancha","speedboat","cruiser"],
    }
    NON_VESSELS = ["outboard motor","engine only","motor only","parts only",
                   "trailer only","propeller","honda bf","yamaha f","suzuki df",
                   "life jacket","anchor","marine insurance","boat storage",
                   # Land vehicles — never boats
                   "ford expedition","ford explorer","ford f-1","ford ranger",
                   "ford bronco","ford mustang","ford escape","ford transit",
                   "chevy silverado","chevy tahoe","chevy suburban","chevy colorado",
                   "chevrolet silverado","chevrolet tahoe","chevrolet suburban",
                   "gmc sierra","gmc yukon","gmc terrain","gmc canyon",
                   "dodge ram","ram 1500","ram 2500","ram 3500",
                   "jeep wrangler","jeep cherokee","jeep grand","jeep gladiator",
                   "toyota camry","toyota tacoma","toyota tundra","toyota 4runner",
                   "toyota highlander","toyota rav4","toyota sienna",
                   "subaru outback","subaru forester","subaru crosstrek",
                   "honda cr-v","honda pilot","honda accord","honda civic","honda odyssey",
                   "tesla model","bmw x","mercedes benz","audi q","volkswagen jetta",
                   "cadillac escalade","cadillac xt","buick enclave","buick encore",
                   # Non-vessel services
                   "sailing lesson","sailing partner","sailing school","sailing class",
                   "sailing instruction","boating lesson","boat lesson","boating class",
                   "sailing instructor","boat rental","kayak rental","canoe rental",
                   ]

    def expand_query(words):
        """Return *words* plus every synonym group any of them belongs to."""
        expanded = set(words)
        for w in words:
            for key, syns in SYNONYMS.items():
                if w == key or w in syns:
                    expanded.add(key)
                    expanded.update(syns)
        return expanded

    expanded_query = expand_query(query_words)

    GENERIC_NAMES = {
        "sailboat","velero","barco","yacht","boat","vessel","embarcación",
        "sailboat for sale","velero en venta","boat for sale","barco en venta",
        "motor boat","motorboat","fishing boat","tug boat","tugboat",
        "within25 mi","within 25 mi","results","listing","listings",
    }

    def process_one(raw):
        """Turn one raw listing into a saved vessel; never raises."""
        try:
            if search_state.get("cancelled"):
                return

            # Quick title pre-check
            title_lower = raw["title"].lower()
            if any(kw in title_lower for kw in NON_VESSELS):
                return

            src_type = raw.get("source_type", "")
            all_images = []
            data = None

            # ── FAST PATH: known boat marketplace → pure regex, no AI ────────
            if src_type in ("broker","classifieds","auction","salvage","commercial"):
                data = extract_vessel_fast(raw)
                if data:
                    img = raw.get("img_url","")
                    if img:
                        all_images = [img]
                    else:
                        # Derive thumbnail from URL (no page fetch needed)
                        listing_url = raw.get("url","")
                        ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url)
                        if ebay_m:
                            all_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"]
                        cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url)
                        if cl_m:
                            all_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"]

            # ── Fast path: validate the listing is actually a boat ──────────────
            if data and data.get("_fast"):
                combined_text = (raw.get("title","") + " " + raw.get("snippet","")).lower()
                url_l = raw.get("url","").lower()

                # URLs that are guaranteed to be boat listings (trusted sections)
                BOAT_URLS = ("/boa","/boat","/sail","sailboatlistings","yachtworld",
                             "boattrader","seaboats","apolloduck","rightboat","boat24",
                             "annonces-bateau","barcos.net","tradeaboat","marinetraffic")
                is_boat_url = any(k in url_l for k in BOAT_URLS)

                # General auction sites (sell everything) need a boat keyword in the text
                BOAT_WORDS = ["boat","sail","yacht","vessel","ketch","sloop","catamaran",
                              "trimaran","mast","hull","marina","keel","watercraft","cruiser",
                              "trawler","dinghy","skiff","pontoon","motorboat","powerboat",
                              "sailboat","barge","tugboat","outboard","inboard","nautical",
                              "marine","stern","bow","aft","draft","beam","knot","starboard"]
                has_boat_word = any(k in combined_text for k in BOAT_WORDS)

                if not is_boat_url and not has_boat_word:
                    return  # Cars, furniture, etc. from general auction sites — skip

            # ── SLOW PATH: web-search results → fetch page + AI ──────────────
            if not data:
                page_text, page_images = "", []
                # A one-shot executor is used only to put a hard timeout on the
                # fetch; shut it down without waiting so a hung fetch cannot
                # block this worker and the helper thread is released.
                fetch_pool = ThreadPoolExecutor(max_workers=1)
                try:
                    fut = fetch_pool.submit(fetch_page_with_images, raw["url"])
                    page_text, page_images = fut.result(timeout=12)
                except Exception:
                    page_text = (f"Title: {raw['title']} "
                                 f"| Location: {raw.get('location','')} | {raw.get('snippet','')}")
                finally:
                    fetch_pool.shutdown(wait=False)

                if not page_images and raw.get("img_url"):
                    page_images = [raw["img_url"]]
                if not page_images:
                    listing_url = raw.get("url", "")
                    ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url)
                    if ebay_m:
                        page_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"]
                    cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url)
                    if cl_m:
                        page_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"]
                all_images = page_images

                status = ("auction" if src_type == "auction"
                          else "salvage" if src_type == "salvage"
                          else "active")

                context = ("URL: " + raw["url"] + "\nTitle: " + raw["title"] +
                           "\nPrice: " + raw.get("price_text","") + "\n" + page_text[:1500])

                prompt = (
                    "Analyze this boat listing from " + str(raw.get('source','')) +
                    ". Search was: " + query + "\n"
                    "TEXT: " + context + "\n\n"
                    "If NOT a boat for sale respond {skip:true}. "
                    "If IS a boat respond JSON with: skip=false, name, vessel_type "
                    "(Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Other), "
                    "loa_m, beam_m, draft_m (ALWAYS in METERS — detect unit from text; "
                    "if feet multiply by 0.3048, e.g. 45ft=13.7m, 60ft=18.3m, 100ft=30.5m), "
                    "year_built, hull, propulsion, "
                    "status=" + status + ", price_usd, currency, location, country, "
                    "description (Spanish max 150 chars), flags=[], score 0-100."
                )

                response = ollama_generate(prompt, model=MODELS['classify'], json_mode=True)
                m = re.search(r'\{.*\}', response or '', re.DOTALL)
                if not m:
                    return
                data = json.loads(m.group())
                if data.get("skip") or not data.get("name"):
                    return

                # Cross-check AI loa_m against a regex read of the text
                # (the AI sometimes misses the feet→m conversion)
                loa_from_ctx = None
                for pat in [
                    r'(?:length|loa|eslora)[:\s]+([\d.]+)\s*(?:ft|\'|feet)',
                    r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet|\')',
                    r'^(\d{2,3}(?:\.\d)?)\s*\'',
                ]:
                    lm = re.search(pat, context, re.IGNORECASE)
                    if not lm:
                        lm = re.search(pat, raw.get("title",""), re.IGNORECASE)
                    if lm:
                        try:
                            ft = float(lm.group(1))
                        except ValueError:
                            continue
                        if 10 < ft < 500:
                            loa_from_ctx = round(ft * 0.3048, 1)
                            break
                if loa_from_ctx and not data.get("loa_m"):
                    data["loa_m"] = loa_from_ctx
                elif loa_from_ctx and data.get("loa_m") and data["loa_m"] > 25:
                    # Text is in feet and the AI value is large: treat it as feet.
                    data["loa_m"] = round(data["loa_m"] * 0.3048, 1)

                # AI unit conversion guard (only needed for AI output)
                ctx_lower = (page_text + " " + raw.get("title","")).lower()
                has_feet = bool(re.search(r"\d+\s*(?:ft|feet|')\b|loa[:\s]+\d+\s*(?:ft|')", ctx_lower))
                vtype_lower = (data.get("vessel_type") or "").lower()
                MAX_M = {"sailboat":25,"yacht":35,"motor":30,"fishing":30,
                         "tug":60,"barge":120,"offshore":90,"ferry":100,"other":50}
                max_reasonable = MAX_M.get(vtype_lower, 50)
                for dim in ["loa_m","beam_m","draft_m"]:
                    val = data.get(dim)
                    if not val or not isinstance(val,(int,float)):
                        continue
                    convert = False
                    if dim == "loa_m" and (val > 100 or val > max_reasonable or (val > 25 and has_feet)):
                        convert = True
                    elif dim == "beam_m" and (val > 30 or (val > 8 and has_feet)):
                        convert = True
                    elif dim == "draft_m" and (val > 15 or (val > 5 and has_feet)):
                        convert = True
                    if convert:
                        data[dim] = round(val * 0.3048, 1)

            # ── Shared post-processing (fast path + AI path) ──────────────────
            if not data or not data.get("name"):
                return

            # Query match check (AI output may carry nulls, hence the `or ""`)
            combined = ((data.get("name") or "") + " " + (data.get("description") or "") +
                        " " + (data.get("vessel_type") or "") + " " +
                        raw.get("title","") + " " + raw.get("url","")).lower()
            if query_words:
                if not any(qw in combined for qw in expanded_query):
                    # Skip query-match filter for results from direct scrapers (not web search).
                    # Web search results have category="Web Search" and may return off-topic pages.
                    # Direct scraper results already passed through a relevant search query.
                    is_web_search = raw.get("category","").lower() == "web search"
                    if is_web_search:
                        source_lower = raw.get("source","").lower()
                        if not any(kw in source_lower for kw in
                                   ["sailboat","yacht","workboat","offshore","tug","commercial",
                                    "boats","boattrader","apolloduck","rightboat","seaboats",
                                    "yachtworld","govplanet","govdeals","hibid","copart","ebay",
                                    "salvex","kijiji","craigslist","denison","galati","hmy"]):
                            return

            # Non-vessel + generic name check
            name_lower = (data.get("name") or "").lower()
            if any(kw in name_lower for kw in NON_VESSELS):
                return
            if name_lower.strip() in GENERIC_NAMES:
                return

            # Filters (price + LOA), with a small tolerance on each bound
            if max_price and data.get("price_usd") and data["price_usd"] > max_price * 1.01:
                return
            if min_loa and data.get("loa_m") and data["loa_m"] < (min_loa - 0.15):
                return

            data["images"] = all_images[:8]
            data["source_url"] = raw["url"]
            data["source_name"] = raw["source"]

            vid = save_vessel(data)
            if vid > 0:
                with lock:
                    search_state["found"] += 1
                    vessels.append(data)
                    tag = "[Fast]" if data.get("_fast") else "[AI]"
                    msg = f"✓ {data.get('name','?')} — {raw['source']}"
                    print(f"{tag} {msg}")
                    search_state["log"].append(msg)
        except Exception as e:
            print(f"[Extract] Error: {e}")

    # Fast path: more workers + more URLs since most results skip AI now
    with ThreadPoolExecutor(max_workers=16) as ex:
        futs = [ex.submit(process_one, r) for r in unique_raw[:300]]
        try:
            for f in as_completed(futs, timeout=180):
                if search_state.get("cancelled"):
                    break
                try:
                    f.result()
                except Exception:
                    pass  # process_one already reports its own errors
        except FuturesTimeout:
            # Return what has been saved so far rather than failing the search.
            for f in futs:
                f.cancel()
            print("[Extract] Timed out waiting for extraction workers")

    print(f"[Search] Done — {len(vessels)} vessels found")
    return vessels
|
|
|
|
|
|
|
|
# ── Fingerprint ───────────────────────────────────────────────────────────────
|
|
def fingerprint(v: dict) -> str:
    """Return a 16-hex-char dedup key for a vessel dict.

    The key is the first 16 characters of the SHA-256 of
    ``name|loa|year|type``, where the name is lower-cased and stripped and
    the length is rounded to a whole number. A missing or ``None`` name is
    treated as an empty string instead of raising.
    """
    name = (v.get('name') or '').lower().strip()
    raw = f"{name}|{round(v.get('loa_m') or 0)}|{v.get('year_built',0)}|{v.get('vessel_type','')}"
    return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
|
|
|
def save_vessel(v: dict) -> int:
    """Insert a vessel record unless an identical fingerprint already exists.

    Args:
        v: Vessel fields keyed by column name (``name``, ``price_usd``, ...).

    Returns:
        The row id of the existing or newly inserted vessel, or ``-1`` when
        the record is rejected (no usable name, no data fields) or the
        INSERT fails.
    """
    # Reject pure shells — need at least a real name + 1 real data field.
    # A non-string name (possible with hand-posted JSON) is rejected instead
    # of crashing on ``.strip()``.
    name = v.get("name")
    if not isinstance(name, str) or name.strip() in ("", "Unknown"):
        return -1
    data_points = sum(1 for f in ['price_usd', 'loa_m', 'year_built', 'location'] if v.get(f))
    if data_points < 1:
        return -1

    fp = fingerprint(v)
    conn = get_db()
    # The outer try/finally guarantees the connection is closed even when the
    # dedup lookup itself raises.
    try:
        c = conn.cursor()
        existing = c.execute("SELECT id FROM vessels WHERE fingerprint=?", (fp,)).fetchone()
        if existing:
            return existing['id']
        try:
            c.execute("""INSERT INTO vessels
                (name,vessel_type,loa_m,beam_m,draft_m,year_built,hull,propulsion,
                 status,price_usd,currency,location,country,source_name,source_url,
                 description,images,flags,score,fingerprint,raw_data)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
                (v.get('name'), v.get('vessel_type'), v.get('loa_m'),
                 v.get('beam_m'), v.get('draft_m'), v.get('year_built'),
                 v.get('hull'), v.get('propulsion'), v.get('status', 'active'),
                 v.get('price_usd'), v.get('currency', 'USD'),
                 v.get('location'), v.get('country'),
                 v.get('source_name'), v.get('source_url'),
                 v.get('description'), json.dumps(v.get('images', [])),
                 json.dumps(v.get('flags', [])), v.get('score', 50),
                 fp, json.dumps(v)))
            vid = c.lastrowid
            conn.commit()
        except Exception as e:
            # Best effort: a failed insert is logged and reported as -1.
            print(f"[DB] Error: {e}")
            vid = -1
        return vid
    finally:
        conn.close()
|
|
|
|
# ── API Routes ────────────────────────────────────────────────────────────────
|
|
|
|
def hash_pw(pw):
    """Return the hex SHA-256 digest of the password string ``pw``.

    NOTE(review): the digest is unsalted and uses a fast hash. Callers in this
    file compare stored digests by equality, so switching schemes would need a
    migration of the stored values.
    """
    hasher = _hashlib.sha256()
    hasher.update(pw.encode())
    return hasher.hexdigest()
|
|
|
|
def seed_admin():
    """Create the default ``admin`` account if no user named 'admin' exists.

    The account is created with the password ``admin123`` and role ``admin``;
    the credentials are printed so the operator can log in and change them.
    """
    conn = get_db()
    # try/finally so the connection is released even if the users table is
    # missing or the insert fails.
    try:
        existing = conn.execute("SELECT id FROM users WHERE username='admin'").fetchone()
        if not existing:
            conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)",
                         ('admin', hash_pw('admin123'), 'admin'))
            conn.commit()
            print("[Auth] Default user created: admin / admin123")
    finally:
        conn.close()
|
|
|
|
@app.route('/api/login', methods=['POST'])
def login():
    """Authenticate a user and store their identity in the session.

    Expects a JSON object with ``username`` and ``password``. On success the
    session receives ``user_id``, ``username`` and ``role`` and the response is
    ``{'ok': True, 'username': ..., 'role': ...}``; otherwise a 401 with an
    error message is returned.
    """
    invalid = (jsonify({'ok': False, 'error': 'Usuario o contraseña incorrectos'}), 401)

    body = request.json or {}
    if not isinstance(body, dict):
        return invalid
    username = body.get('username', '')
    password = body.get('password', '')
    # Non-string credentials can never match a stored account; answer 401
    # instead of failing on .strip() / .encode().
    if not isinstance(username, str) or not isinstance(password, str):
        return invalid
    username = username.strip()

    conn = get_db()
    try:
        user = conn.execute("SELECT * FROM users WHERE username=? AND password=?",
                            (username, hash_pw(password))).fetchone()
    finally:
        conn.close()

    if user:
        session['user_id'] = user['id']
        session['username'] = user['username']
        session['role'] = user['role']
        return jsonify({'ok': True, 'username': user['username'], 'role': user['role']})
    return invalid
|
|
|
|
@app.route('/api/logout', methods=['POST'])
def logout():
    """Drop every key from the current session and acknowledge."""
    session.clear()
    acknowledgement = {'ok': True}
    return jsonify(acknowledgement)
|
|
|
|
@app.route('/api/me')
def me():
    """Report who is logged in, or 401 with ``logged_in: False`` if nobody is."""
    if 'user_id' in session:
        identity = {
            'logged_in': True,
            'username': session.get('username'),
            'role': session.get('role'),
        }
        return jsonify(identity)
    return jsonify({'logged_in': False}), 401
|
|
|
|
@app.route('/api/users', methods=['GET'])
def list_users():
    """List all user accounts (id, username, role, created_at); admin only."""
    is_admin = session.get('role') == 'admin'
    if not is_admin:
        return jsonify({'error': 'forbidden'}), 403
    db = get_db()
    cursor = db.execute("SELECT id,username,role,created_at FROM users")
    accounts = [dict(record) for record in cursor.fetchall()]
    db.close()
    return jsonify({'users': accounts})
|
|
|
|
@app.route('/api/users', methods=['POST'])
def create_user():
    """Create a user account; admin only.

    Expects JSON with ``username``, ``password`` and optional ``role``
    (default ``'user'``). Returns 403 for non-admins, 400 for missing fields
    or a constraint violation (reported as a duplicate username), and 500 for
    any other database error.
    """
    if session.get('role') != 'admin':
        return jsonify({'error': 'forbidden'}), 403
    body = request.json or {}
    username = body.get('username', '')
    password = body.get('password', '')
    role = body.get('role', 'user')
    # Non-string values are treated like missing ones rather than crashing.
    if not isinstance(username, str) or not isinstance(password, str):
        return jsonify({'error': 'username and password required'}), 400
    username = username.strip()
    if not username or not password:
        return jsonify({'error': 'username and password required'}), 400
    conn = get_db()
    try:
        conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)",
                     (username, hash_pw(password), role))
        conn.commit()
    except sqlite3.IntegrityError:
        # Constraint violation — presumably the UNIQUE username; verify
        # against the schema.
        return jsonify({'error': 'username already exists'}), 400
    except sqlite3.Error as e:
        # Any other DB failure is not a duplicate; report it as such.
        print(f"[Auth] Error creating user: {e}")
        return jsonify({'error': 'could not create user'}), 500
    finally:
        conn.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/change_password', methods=['POST'])
def change_password():
    """Change the logged-in user's password.

    Expects JSON with ``old_password`` and ``new_password``. Returns 401 when
    not logged in and 400 when the new password is empty or the current
    password does not match.
    """
    if 'user_id' not in session:
        return jsonify({'error': 'not logged in'}), 401
    body = request.json or {}
    old_pw = body.get('old_password', '')
    new_pw = body.get('new_password', '')
    # An empty new password must not be accepted (create_user enforces the
    # same rule for new accounts).
    if not isinstance(new_pw, str) or not new_pw:
        return jsonify({'error': 'new_password required'}), 400
    if not isinstance(old_pw, str):
        return jsonify({'error': 'Contraseña actual incorrecta'}), 400
    conn = get_db()
    try:
        user = conn.execute("SELECT * FROM users WHERE id=? AND password=?",
                            (session['user_id'], hash_pw(old_pw))).fetchone()
        if not user:
            return jsonify({'error': 'Contraseña actual incorrecta'}), 400
        conn.execute("UPDATE users SET password=? WHERE id=?",
                     (hash_pw(new_pw), session['user_id']))
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/')
def index():
    """Serve the frontend entry page, ``index.html`` from the ``static`` folder."""
    directory, filename = 'static', 'index.html'
    return send_from_directory(directory, filename)
|
|
|
|
@app.route('/api/status')
def status():
    """Return a health summary: Ollama models, row counts and source stats."""
    available_models = ollama_models()
    db = get_db()

    def count(sql):
        # Each query is a single-row COUNT(*); take its first column.
        return db.execute(sql).fetchone()[0]

    totals = {
        'vessels': count("SELECT COUNT(*) FROM vessels"),
        'saved': count("SELECT COUNT(*) FROM saved_vessels"),
        'alerts': count("SELECT COUNT(*) FROM alerts WHERE active=1"),
    }
    db.close()

    distinct_categories = {src['category'] for src in DIRECT_SOURCES}
    summary = {
        'ok': True,
        'ollama_models': available_models,
        'active_model': MODELS['extract'],
        'db_counts': totals,
        'sources_count': len(DIRECT_SOURCES),
        'categories': list(distinct_categories),
    }
    return jsonify(summary)
|
|
|
|
@app.route('/api/vessels')
def list_vessels():
    """List vessels filtered and sorted by query-string parameters.

    Supported parameters: ``type``, ``status``, ``hull`` (exact match),
    ``max_price``, ``min_loa`` (with a 0.15 m tolerance), ``year_min``,
    ``year_max``, ``sort`` (one of the keys below) and ``limit`` (default 200,
    capped at 500). Numeric parameters that cannot be parsed are ignored.

    Returns:
        JSON ``{'vessels': [...], 'count': n}`` with ``flags`` and ``images``
        decoded from their stored JSON text.
    """
    def _number(arg, cast):
        """Parse query parameter ``arg`` with ``cast``; None if absent/invalid."""
        raw = request.args.get(arg)
        if not raw:
            return None
        try:
            return cast(raw)
        except (TypeError, ValueError):
            return None

    q = "SELECT * FROM vessels WHERE 1=1"
    params = []

    for column, arg in (('vessel_type', 'type'), ('status', 'status'), ('hull', 'hull')):
        if value := request.args.get(arg):
            q += f" AND {column}=?"
            params.append(value)

    # Every numeric filter is parsed BEFORE its placeholder is added, so a bad
    # value can never leave a "?" without a matching parameter.
    if (max_price := _number('max_price', float)) is not None:
        q += " AND price_usd <= ?"
        params.append(max_price)
    if (min_loa := _number('min_loa', float)) is not None:
        q += " AND loa_m IS NOT NULL AND loa_m >= ?"
        params.append(round(min_loa - 0.15, 2))
    if (year_min := _number('year_min', int)) is not None:
        q += " AND year_built >= ?"
        params.append(year_min)
    if (year_max := _number('year_max', int)) is not None:
        q += " AND year_built <= ?"
        params.append(year_max)

    # ORDER BY comes from a fixed whitelist, never from raw user input.
    sorts = {
        'score': 'score DESC', 'price_asc': 'price_usd ASC',
        'price_desc': 'price_usd DESC', 'loa': 'loa_m DESC',
        'year': 'year_built DESC', 'newest': 'created_at DESC'
    }
    q += f" ORDER BY {sorts.get(request.args.get('sort', 'score'), 'score DESC')}"

    # A negative LIMIT means "no limit" in SQLite, which would bypass the cap;
    # invalid or negative values fall back to the default.
    limit = _number('limit', int)
    if limit is None or limit < 0:
        limit = 200
    q += " LIMIT ?"
    params.append(min(limit, 500))

    conn = get_db()
    try:
        rows = [dict(r) for r in conn.execute(q, params).fetchall()]
    finally:
        conn.close()
    for r in rows:
        r['flags'] = json.loads(r.get('flags') or '[]')
        r['images'] = json.loads(r.get('images') or '[]')
    return jsonify({'vessels': rows, 'count': len(rows)})
|
|
|
|
# Domains whose images may be fetched through /api/img_proxy. A host is
# accepted when it equals one of these entries or is a subdomain of one.
_PROXY_ALLOWED = [
    'sailboatlistings.com', 'yachtworld.com', 'boattrader.com',
    'apolloduck.com', 'rightboat.com', 'boat24.com', 'seaboats.net',
    'boats.com', 'iboats.com', 'yachtworld.co.uk',
]


@app.route('/api/img_proxy')
def img_proxy():
    """Fetch a remote listing image server-side and relay it to the client.

    Query parameter ``url`` is the image address. Returns 404 when it is
    missing, 403 when the scheme is not http(s) or the host is not on the
    allow-list, the upstream status code when the fetch does not return 200,
    and 502 when the request itself fails.
    """
    url = request.args.get('url', '')
    if not url:
        return '', 404
    from urllib.parse import urlparse
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal) — treat as not allowed.
        return '', 403
    if parsed.scheme not in ('http', 'https'):
        return '', 403
    # Match whole domain labels: a substring test would also accept hosts such
    # as "boats.com.attacker.example" and let callers reach arbitrary servers.
    if not any(host == d or host.endswith('.' + d) for d in _PROXY_ALLOWED):
        return '', 403
    try:
        resp = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': f'https://{host}/',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        })
        if resp.status_code == 200:
            ct = resp.headers.get('content-type', 'image/jpeg')
            # app.response_class is Flask's Response type; using it avoids
            # depending on a separate ``Response`` import.
            return app.response_class(resp.content, content_type=ct,
                                      headers={'Cache-Control': 'public, max-age=86400'})
        return '', resp.status_code
    except Exception as e:
        app.logger.debug(f"img_proxy error: {e}")
        return '', 502
|
|
|
|
|
|
# Global search state
|
|
# Shared, mutable progress record for the current background search; it is
# read by the status endpoint and updated by the search code.
search_state = dict(
    running=False,
    cancelled=False,
    query='',
    found=0,
    total_sources=0,
    sources_done=0,
    log=[],
)
|
|
|
|
@app.route('/api/search', methods=['POST'])
def search():
    """Start a new background search, replacing any previous results.

    Expects JSON with ``query`` (required) and optional ``filters``. Clears
    the ``vessels`` and ``saved_vessels`` tables, records the query in
    ``search_history``, resets ``search_state`` and launches a daemon thread
    that runs ``search_with_ai``. Returns immediately; progress is exposed
    through ``search_state``.
    """
    body = request.json or {}
    query = body.get('query', '')
    filters = body.get('filters', {})
    if not query:
        return jsonify({'error': 'query requerido'}), 400

    # Clear previous results immediately.
    # NOTE(review): collection_vessels rows are not cleared here, so existing
    # collections may keep pointing at deleted vessel ids — confirm intended.
    conn = get_db()
    try:
        conn.execute("DELETE FROM vessels")
        conn.execute("DELETE FROM saved_vessels")
        conn.execute("INSERT INTO search_history (query,filters) VALUES (?,?)",
                     (query, json.dumps(filters)))
        conn.commit()
    finally:
        conn.close()

    # Tag this search with a unique ID so old threads don't pollute new
    # searches. The ID is rotated BEFORE the state is reset so a finishing
    # older thread can no longer see its own id alongside the new state.
    import uuid
    search_id = str(uuid.uuid4())
    search_state['search_id'] = search_id

    # Reset state
    search_state['running'] = True
    search_state['cancelled'] = False
    search_state['query'] = query
    search_state['found'] = 0
    search_state['sources_done'] = 0
    search_state['total_sources'] = len(DIRECT_SOURCES)
    search_state['log'] = [f"Iniciando búsqueda: {query}"]

    # Run search in background thread
    def run_bg(sid):
        try:
            search_with_ai(query, filters)
        except Exception as e:
            # Only the current search may write to the shared log; a
            # superseded or cancelled thread just prints.
            if search_state.get('search_id') == sid:
                search_state['log'].append(f"Error: {e}")
            print(f"[BG] Error: {e}")
        finally:
            if search_state.get('search_id') == sid:
                search_state['running'] = False
                total = search_state['found']
                msg = f"✓ Búsqueda completa — {total} embarcaciones encontradas"
                search_state['log'].append(msg)
                print(f"[BG] {msg}")

    t = threading.Thread(target=run_bg, args=(search_id,), daemon=True)
    t.start()

    return jsonify({'ok': True, 'message': 'Búsqueda iniciada en background'})
|
|
|
|
@app.route('/api/search/status')
def search_status():
    """Return the shared ``search_state`` dict as JSON for progress polling."""
    snapshot = search_state
    return jsonify(snapshot)
|
|
|
|
@app.route('/api/search/cancel', methods=['POST'])
def cancel_search():
    """Flag the running search as cancelled and detach its worker thread."""
    import uuid
    search_state['cancelled'] = True
    search_state['running'] = False
    # A fresh id means the background thread's completion check no longer
    # matches, so it will not touch the state when it finishes.
    replacement_id = str(uuid.uuid4())
    search_state['search_id'] = replacement_id
    notice = '⏹ Búsqueda cancelada por el usuario'
    search_state['log'].append(notice)
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/fb-status')
def fb_status():
    """Report whether ``fb_session.json`` exists next to this module."""
    module_dir = os.path.dirname(__file__)
    session_path = os.path.join(module_dir, "fb_session.json")
    has_session = os.path.exists(session_path)
    return jsonify({"active": has_session})
|
|
|
|
|
|
@app.route('/api/fb-setup', methods=['POST'])
def fb_setup():
    """
    Launch a visible Chromium window so the user can log in to Facebook.

    Waits up to 3 minutes for the page URL to reach marketplace/home/feed,
    then saves the browser context's cookies to fb_session.json next to this
    module. Returns ``{"ok": bool, "msg": str}``; setup failures outside the
    login wait (missing Playwright, launch or navigation errors) return 500.
    """
    SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json")
    import json as _json
    try:
        from playwright.sync_api import sync_playwright
        result = {"ok": False, "msg": ""}
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=False,
                args=["--disable-blink-features=AutomationControlled"])
            # Outer try/finally: the browser window is closed even when
            # context creation or the initial navigation fails.
            try:
                context = browser.new_context(
                    viewport={"width": 1100, "height": 800},
                    user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                "AppleWebKit/537.36 (KHTML, like Gecko) "
                                "Chrome/122.0.0.0 Safari/537.36"))
                page = context.new_page()
                page.goto("https://www.facebook.com/login", timeout=30000,
                          wait_until="domcontentloaded")
                # Wait up to 3 minutes for user to log in and reach marketplace
                try:
                    page.wait_for_url(
                        re.compile(r'facebook\.com/(marketplace|home|feed)'),
                        timeout=180000)
                    # Give extra time to fully load
                    page.wait_for_timeout(3000)
                    cookies = context.cookies()
                    with open(SESSION_FILE, "w") as f:
                        _json.dump(cookies, f)
                    result = {"ok": True,
                              "msg": f"Sesión guardada ({len(cookies)} cookies). "
                                     "Facebook Marketplace activado."}
                except Exception as e:
                    result = {"ok": False, "msg": f"Tiempo agotado o error: {e}"}
                finally:
                    # The user may already have closed the page by hand.
                    try:
                        page.close()
                    except Exception:
                        pass
            finally:
                browser.close()
        return jsonify(result)
    except Exception as e:
        return jsonify({"ok": False, "msg": str(e)}), 500
|
|
|
|
|
|
@app.route('/api/vessels/<int:vid>', methods=['GET'])
def get_vessel(vid):
    """Return one vessel by id with ``flags``/``images`` decoded, or 404."""
    db = get_db()
    record = db.execute("SELECT * FROM vessels WHERE id=?", (vid,)).fetchone()
    db.close()
    if not record:
        return jsonify({'error': 'not found'}), 404
    vessel = dict(record)
    # Both columns hold JSON text; NULL/empty decodes to an empty list.
    for column in ('flags', 'images'):
        vessel[column] = json.loads(vessel.get(column) or '[]')
    return jsonify(vessel)
|
|
|
|
@app.route('/api/vessels', methods=['POST'])
def add_vessel():
    """Store a manually supplied vessel record.

    Expects a JSON object of vessel fields; ``source_name`` defaults to
    ``'Manual'``. Returns ``{'id': <row id>, 'ok': True}`` on success, or a
    400 with ``ok: False`` when the body is not an object or ``save_vessel``
    rejects the record (it returns -1 for those).
    """
    v = request.json or {}
    if not isinstance(v, dict):
        return jsonify({'id': -1, 'ok': False, 'error': 'JSON object required'}), 400
    v['source_name'] = v.get('source_name', 'Manual')
    vid = save_vessel(v)
    if vid <= 0:
        # save_vessel signals rejection/failure with -1; do not claim success.
        return jsonify({'id': vid, 'ok': False, 'error': 'vessel rejected'}), 400
    return jsonify({'id': vid, 'ok': True})
|
|
|
|
@app.route('/api/vessels/<int:vid>', methods=['PUT'])
def update_vessel(vid):
    """Update the editable columns of a vessel from the JSON body.

    Only keys from the fixed column whitelist are applied; ``updated_at`` is
    refreshed whenever at least one column changes.
    """
    payload = request.json or {}
    db = get_db()
    editable = ('name', 'vessel_type', 'loa_m', 'beam_m', 'draft_m', 'year_built',
                'hull', 'propulsion', 'status', 'price_usd', 'location',
                'description', 'score')
    changes = {column: payload[column] for column in editable if column in payload}
    if changes:
        # Column names come from the whitelist above, values are bound.
        assignments = ', '.join(f"{column}=?" for column in changes)
        sql = f"UPDATE vessels SET {assignments}, updated_at=datetime('now') WHERE id=?"
        db.execute(sql, list(changes.values()) + [vid])
        db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/vessels/<int:vid>', methods=['DELETE'])
def delete_vessel(vid):
    """Delete a vessel together with its saved-list and collection entries."""
    conn = get_db()
    try:
        conn.execute("DELETE FROM vessels WHERE id=?", (vid,))
        conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,))
        # Also drop collection memberships; otherwise orphan rows remain and
        # are still counted by the per-collection vessel_count query.
        conn.execute("DELETE FROM collection_vessels WHERE vessel_id=?", (vid,))
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/saved', methods=['GET'])
def list_saved():
    """Return saved vessels (with notes and saved_at), most recently saved first."""
    db = get_db()
    saved_rows = db.execute("""
        SELECT v.*, s.notes, s.saved_at
        FROM vessels v JOIN saved_vessels s ON v.id=s.vessel_id
        ORDER BY s.saved_at DESC
    """).fetchall()
    vessels = []
    for record in saved_rows:
        item = dict(record)
        # JSON text columns; NULL/empty decodes to an empty list.
        for column in ('flags', 'images'):
            item[column] = json.loads(item.get(column) or '[]')
        vessels.append(item)
    db.close()
    return jsonify({'vessels': vessels, 'count': len(vessels)})
|
|
|
|
@app.route('/api/saved/<int:vid>', methods=['POST'])
def save_vessel_fav(vid):
    """Mark a vessel as saved, optionally with ``notes`` from the JSON body.

    The body is optional: a request without JSON is treated as "no notes".
    Saving an already-saved vessel is a no-op.
    """
    # get_json(silent=True) returns None instead of raising when the request
    # carries no (or invalid) JSON, which is legitimate for this endpoint.
    body = request.get_json(silent=True)
    notes = body.get('notes', '') if isinstance(body, dict) else ''
    conn = get_db()
    try:
        existing = conn.execute("SELECT id FROM saved_vessels WHERE vessel_id=?", (vid,)).fetchone()
        if not existing:
            conn.execute("INSERT INTO saved_vessels (vessel_id, notes) VALUES (?,?)", (vid, notes))
            conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/saved/<int:vid>', methods=['DELETE'])
def unsave_vessel(vid):
    """Remove a vessel from the saved list."""
    db = get_db()
    sql = "DELETE FROM saved_vessels WHERE vessel_id=?"
    db.execute(sql, (vid,))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/alerts', methods=['GET'])
def list_alerts():
    """Return every alert whose ``active`` flag is 1."""
    db = get_db()
    cursor = db.execute("SELECT * FROM alerts WHERE active=1")
    active_alerts = [dict(record) for record in cursor.fetchall()]
    db.close()
    return jsonify({'alerts': active_alerts})
|
|
|
|
@app.route('/api/alerts', methods=['POST'])
def create_alert():
    """Create an alert from JSON ``name`` (default 'Alerta') and ``filters``."""
    payload = request.json or {}
    db = get_db()
    alert_name = payload.get('name', 'Alerta')
    # Filters are stored as JSON text.
    alert_filters = json.dumps(payload.get('filters', {}))
    db.execute("INSERT INTO alerts (name, filters) VALUES (?,?)",
               (alert_name, alert_filters))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/alerts/<int:aid>', methods=['DELETE'])
def delete_alert(aid):
    """Deactivate an alert (sets ``active=0``; the row itself is kept)."""
    db = get_db()
    sql = "UPDATE alerts SET active=0 WHERE id=?"
    db.execute(sql, (aid,))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/sources')
def list_sources():
    """Return built-in and custom sources grouped by category.

    Built-in entries come from ``DIRECT_SOURCES``; custom ones from the
    ``custom_sources`` table. Loading custom sources is best effort: on
    failure the error is logged and whatever was collected so far is
    returned.
    """
    by_cat = {}
    for s in DIRECT_SOURCES:
        by_cat.setdefault(s['category'], []).append({
            'name': s['name'], 'url': s['search_url'].split('?')[0],
            'type': s['type'], 'builtin': True})
    # Add custom sources
    try:
        conn = get_db()
        try:
            custom = [dict(r) for r in conn.execute(
                "SELECT * FROM custom_sources ORDER BY category").fetchall()]
        finally:
            conn.close()
        for c in custom:
            cat = c['category'] or 'Custom'
            by_cat.setdefault(cat, []).append({
                'name': c['name'], 'url': c['search_url'].split('?')[0],
                'type': c['source_type'], 'builtin': False,
                'id': c['id'], 'active': bool(c['active'])
            })
    except Exception as e:
        # Keep the endpoint usable, but leave a trace of what went wrong.
        print(f"[Sources] Custom sources skipped: {e}")
    return jsonify({'sources': by_cat, 'total': sum(len(v) for v in by_cat.values())})
|
|
|
|
@app.route('/api/history')
def search_history():
    """Return the 50 most recent search_history rows, newest first."""
    db = get_db()
    sql = "SELECT * FROM search_history ORDER BY searched_at DESC LIMIT 50"
    entries = [dict(record) for record in db.execute(sql).fetchall()]
    db.close()
    return jsonify({'history': entries})
|
|
|
|
@app.route('/api/analyze', methods=['POST'])
def analyze_text():
    """Extract a vessel from free text and store it.

    Expects JSON with ``text`` (required) and optional ``source`` (default
    'Manual'). The extraction result is returned as JSON; when it is truthy it
    is also saved and gains an ``id`` key holding ``save_vessel``'s result.
    """
    payload = request.json or {}
    text = payload.get('text', '')
    source = payload.get('source', 'Manual')
    if text:
        extracted = extract_vessel_from_text(text, source)
        if extracted:
            extracted['id'] = save_vessel({**extracted, 'source_name': source})
        return jsonify(extracted)
    return jsonify({'error': 'text requerido'}), 400
|
|
|
|
@app.route('/api/collections', methods=['GET'])
def list_collections():
    """Return all collections, newest first, each with a ``vessel_count``."""
    db = get_db()
    sql = ("SELECT c.*, COUNT(cv.vessel_id) as vessel_count FROM collections c "
           "LEFT JOIN collection_vessels cv ON c.id=cv.collection_id "
           "GROUP BY c.id ORDER BY c.created_at DESC")
    collections = [dict(record) for record in db.execute(sql).fetchall()]
    db.close()
    return jsonify({'collections': collections})
|
|
|
|
@app.route('/api/collections', methods=['POST'])
def create_collection():
    """Create a collection and return its new id.

    JSON fields: ``name`` (required), ``description``, ``color`` (default
    '#00b4ff') and ``icon`` (default '📁'). Returns 400 when the name is blank.
    """
    payload = request.json or {}
    title = payload.get('name', '').strip()
    if not title:
        return jsonify({'error': 'name required'}), 400
    db = get_db()
    values = (
        title,
        payload.get('description', ''),
        payload.get('color', '#00b4ff'),
        payload.get('icon', '📁'),
    )
    db.execute("INSERT INTO collections (name,description,color,icon) VALUES (?,?,?,?)", values)
    db.commit()
    new_id = db.execute("SELECT last_insert_rowid()").fetchone()[0]
    db.close()
    return jsonify({'ok': True, 'id': new_id})
|
|
|
|
@app.route('/api/collections/<int:cid>', methods=['DELETE'])
def delete_collection(cid):
    """Delete a collection; its membership rows are removed first."""
    db = get_db()
    statements = (
        "DELETE FROM collection_vessels WHERE collection_id=?",
        "DELETE FROM collections WHERE id=?",
    )
    for sql in statements:
        db.execute(sql, (cid,))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/collections/<int:cid>/vessels', methods=['GET'])
def collection_vessels(cid):
    """Return the vessels in a collection, most recently added first."""
    db = get_db()
    members = db.execute("""
        SELECT v.*, cv.notes, cv.added_at FROM vessels v
        JOIN collection_vessels cv ON v.id=cv.vessel_id
        WHERE cv.collection_id=? ORDER BY cv.added_at DESC""", (cid,)).fetchall()
    vessels = []
    for record in members:
        item = dict(record)
        # JSON text columns; NULL/empty decodes to an empty list.
        for column in ('flags', 'images'):
            item[column] = json.loads(item.get(column) or '[]')
        vessels.append(item)
    db.close()
    return jsonify({'vessels': vessels, 'count': len(vessels)})
|
|
|
|
@app.route('/api/collections/<int:cid>/vessels', methods=['POST'])
def add_to_collection(cid):
    """Add vessels to a collection.

    Expects JSON with ``vessel_ids`` (list) and optional ``notes``. Returns
    ``{'ok': True, 'added': n}`` where ``n`` counts only rows actually
    inserted; ids already in the collection or rejected by the database are
    skipped. Returns 400 when ``vessel_ids`` is not a list.
    """
    body = request.json or {}
    vessel_ids = body.get('vessel_ids', [])
    notes = body.get('notes', '')
    if not isinstance(vessel_ids, list):
        return jsonify({'error': 'vessel_ids must be a list'}), 400
    conn = get_db()
    added = 0
    try:
        for vid in vessel_ids:
            try:
                cur = conn.execute(
                    "INSERT OR IGNORE INTO collection_vessels (collection_id,vessel_id,notes) VALUES (?,?,?)",
                    (cid, vid, notes))
            except sqlite3.Error:
                # Bad id value for this entry — skip it, keep the rest.
                continue
            # rowcount is 0 when OR IGNORE suppressed the insert.
            if cur.rowcount > 0:
                added += 1
        conn.commit()
    finally:
        conn.close()
    return jsonify({'ok': True, 'added': added})
|
|
|
|
@app.route('/api/collections/<int:cid>/vessels/<int:vid>', methods=['DELETE'])
def remove_from_collection(cid, vid):
    """Remove one vessel from one collection."""
    db = get_db()
    sql = "DELETE FROM collection_vessels WHERE collection_id=? AND vessel_id=?"
    db.execute(sql, (cid, vid))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/custom_sources', methods=['GET'])
def get_custom_sources():
    """Return all custom source rows, newest first."""
    db = get_db()
    sql = "SELECT * FROM custom_sources ORDER BY created_at DESC"
    sources = [dict(record) for record in db.execute(sql).fetchall()]
    db.close()
    return jsonify({'sources': sources})
|
|
|
|
@app.route('/api/custom_sources', methods=['POST'])
def add_custom_source():
    """Register a user-defined search source.

    JSON fields: ``name`` and ``search_url`` (both required), ``category``
    (default 'Custom') and ``source_type`` (default 'broker'). When the URL
    has no ``{query}`` placeholder, a ``q={query}`` parameter is appended.
    Returns ``{'ok': True, 'id': <new id>}`` or 400 when a required field is
    missing.
    """
    body = request.json or {}
    name = body.get('name', '').strip()
    url = body.get('search_url', '').strip()
    if not name or not url:
        return jsonify({'error': 'name and search_url required'}), 400
    # Ensure URL has {query} placeholder
    if '{query}' not in url:
        base = url.rstrip('/')
        # Join with "&" when the URL already has a query string; a second "?"
        # would produce a malformed URL.
        if '?' not in base:
            separator = '?'
        elif base.endswith(('?', '&')):
            separator = ''
        else:
            separator = '&'
        url = f"{base}{separator}q={{query}}"
    conn = get_db()
    try:
        conn.execute("""INSERT INTO custom_sources (name,category,search_url,source_type,added_by)
                        VALUES (?,?,?,?,?)""",
                     (name, body.get('category', 'Custom'),
                      url, body.get('source_type', 'broker'),
                      session.get('username', 'admin')))
        conn.commit()
        sid = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
    finally:
        conn.close()
    return jsonify({'ok': True, 'id': sid})
|
|
|
|
@app.route('/api/custom_sources/<int:sid>', methods=['PUT'])
def update_custom_source(sid):
    """Update the editable columns of a custom source from the JSON body."""
    payload = request.json or {}
    db = get_db()
    editable = ('name', 'category', 'search_url', 'source_type', 'active')
    changes = {column: payload[column] for column in editable if column in payload}
    if changes:
        # Column names come from the whitelist above, values are bound.
        assignments = ', '.join(f"{column}=?" for column in changes)
        sql = f"UPDATE custom_sources SET {assignments} WHERE id=?"
        db.execute(sql, list(changes.values()) + [sid])
        db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/custom_sources/<int:sid>', methods=['DELETE'])
def delete_custom_source(sid):
    """Delete a custom source by id."""
    db = get_db()
    sql = "DELETE FROM custom_sources WHERE id=?"
    db.execute(sql, (sid,))
    db.commit()
    db.close()
    return jsonify({'ok': True})
|
|
|
|
@app.route('/api/stats')
def stats():
    """Return aggregate statistics about the stored vessels.

    The response holds totals, per-type / per-status / per-country counts
    (top 10 countries), average score and price, and the five highest-scoring
    vessels. NULL or empty group keys are reported as ``'Unknown'``.
    """
    conn = get_db()
    try:
        c = conn.cursor()

        def grouped(sql):
            # A NULL group key would become a None dict key, which cannot be
            # ordered against string keys when the JSON encoder sorts them.
            # Fold NULL/empty into 'Unknown', summing if both occur.
            counts = {}
            for key, n in c.execute(sql).fetchall():
                label = key or 'Unknown'
                counts[label] = counts.get(label, 0) + n
            return counts

        data = {
            'total': c.execute("SELECT COUNT(*) FROM vessels").fetchone()[0],
            'saved': c.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0],
            'by_type': grouped("SELECT vessel_type, COUNT(*) FROM vessels GROUP BY vessel_type"),
            'by_status': grouped("SELECT status, COUNT(*) FROM vessels GROUP BY status"),
            'by_country': grouped("SELECT country, COUNT(*) FROM vessels WHERE country IS NOT NULL GROUP BY country ORDER BY COUNT(*) DESC LIMIT 10"),
            'avg_score': c.execute("SELECT AVG(score) FROM vessels").fetchone()[0] or 0,
            'avg_price': c.execute("SELECT AVG(price_usd) FROM vessels WHERE price_usd > 0").fetchone()[0] or 0,
            'top_opportunities': [dict(r) for r in c.execute(
                "SELECT id,name,vessel_type,price_usd,score,location FROM vessels ORDER BY score DESC LIMIT 5").fetchall()],
        }
    finally:
        conn.close()
    return jsonify(data)
|
|
|
|
# ── Seed sample data ──────────────────────────────────────────────────────────
|
|
def seed_sample_data():
    """Insert six demonstration vessels through ``save_vessel``."""
    demo_vessels = (
        {"name":"M/Y Stella Maris","vessel_type":"Yacht","loa_m":28.4,"beam_m":6.8,"draft_m":1.9,"year_built":2008,"hull":"Fiberglass","propulsion":"Diesel","status":"active","price_usd":189000,"location":"Fort Lauderdale, FL","country":"US","source_name":"YachtWorld","source_url":"https://yachtworld.com","description":"Yate motor bien mantenido, twin Volvo IPS, refit 2022.","flags":["below_market","motivated_seller"],"score":87},
        {"name":"F/V Cape Hatteras","vessel_type":"Fishing","loa_m":19.2,"beam_m":5.1,"draft_m":1.4,"year_built":1997,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":22000,"location":"Gloucester, MA","country":"US","source_name":"GovDeals","source_url":"https://govdeals.com","description":"Ex buque NOAA, motor operativo, casco requiere trabajo.","flags":["rare","salvage_value","below_market"],"score":94},
        {"name":"TUG Bravo Eagle","vessel_type":"Tug","loa_m":32.0,"beam_m":9.4,"draft_m":3.8,"year_built":1989,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":310000,"location":"New Orleans, LA","country":"US","source_name":"AuctionTime","source_url":"https://auctiontime.com","description":"Remolcador 2400HP, clase ABS, listo para operación comercial.","flags":["rare","auction","motivated_seller"],"score":91},
        {"name":"OSV Pacific Ranger","vessel_type":"Offshore","loa_m":52.0,"beam_m":13.2,"draft_m":4.1,"year_built":2005,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":890000,"location":"Port Fourchon, LA","country":"US","source_name":"GovPlanet","source_url":"https://govplanet.com","description":"Buque apoyo offshore DP1, 400T carga, documentación completa.","flags":["rare","auction","government_surplus"],"score":79},
        {"name":"Barge RJ-440","vessel_type":"Barge","loa_m":44.0,"beam_m":12.0,"draft_m":1.8,"year_built":1978,"hull":"Steel","propulsion":"None","status":"active","price_usd":55000,"location":"Houston, TX","country":"US","source_name":"WorkBoat Classifieds","source_url":"https://workboat.com","description":"Barcaza cubierta, capacidad 800T, buen estado estructural.","flags":["below_market","rare"],"score":73},
        {"name":"LCT Endeavour","vessel_type":"Barge","loa_m":61.0,"beam_m":14.6,"draft_m":1.5,"year_built":1968,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":38000,"location":"Manila, Filipinas","country":"PH","source_name":"Salvex","source_url":"https://salvex.com","description":"Landing craft, estructura sólida, motores requieren overhaul.","flags":["salvage_value","rare","below_market"],"score":82},
    )
    for vessel in demo_vessels:
        save_vessel(vessel)
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
if __name__ == '__main__':
    # Script entry point: enforce a single instance through a PID file, pick a
    # free port, initialise the database and start the Flask server.
    import socket, signal, atexit, sys

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    PID_FILE = os.path.join(BASE_DIR, ".server.pid")

    # ── Handle existing instance ───────────────────────────────────────────────
    def kill_pid(pid):
        """Terminate process ``pid``; return True only if a kill was issued."""
        try:
            import ctypes
            kernel32 = ctypes.windll.kernel32  # AttributeError off Windows
        except Exception:
            kernel32 = None
        if kernel32 is not None:
            try:
                # 1 == PROCESS_TERMINATE. OpenProcess returns 0 on failure and
                # TerminateProcess returns 0 on failure; both must be checked
                # or a failed kill is reported as success.
                handle = kernel32.OpenProcess(1, False, pid)
                if not handle:
                    return False
                terminated = kernel32.TerminateProcess(handle, -1)
                kernel32.CloseHandle(handle)
                return bool(terminated)
            except Exception:
                pass  # fall through to os.kill below
        try:
            os.kill(pid, 9)
            return True
        except Exception:
            return False

    def pid_running(pid):
        """Return True when signalling ``pid`` with signal 0 does not fail.

        NOTE(review): on Windows, os.kill treats signal 0 as CTRL_C_EVENT
        rather than a pure liveness probe — confirm behaviour there.
        """
        try:
            os.kill(pid, 0)
            return True
        except OSError:
            return False

    if os.path.exists(PID_FILE):
        try:
            # Context manager so the PID file handle is always closed.
            with open(PID_FILE) as pid_fh:
                old_pid = int(pid_fh.read().strip())
            if pid_running(old_pid):
                print(f"\n ⚠️ Ya hay una instancia corriendo (PID {old_pid})")
                resp = input(" ¿Cerrar la instancia anterior y continuar? [S/n]: ").strip().lower()
                if resp in ("", "s", "si", "sí", "y", "yes"):
                    if kill_pid(old_pid):
                        print(f" ✓ Instancia anterior (PID {old_pid}) cerrada.")
                        # Give the old process a moment to release the port.
                        time.sleep(1)
                    else:
                        print(f" ✗ No se pudo cerrar. Ciérrala manualmente y vuelve a intentar.")
                        sys.exit(1)
                else:
                    print(" Saliendo sin cambios.")
                    sys.exit(0)
        except (ValueError, IOError):
            pass  # PID file corrupted — ignore

    # ── Write PID file ─────────────────────────────────────────────────────────
    with open(PID_FILE, "w") as f:
        f.write(str(os.getpid()))

    def cleanup_pid():
        """Remove the PID file; a missing file is not an error."""
        try:
            os.remove(PID_FILE)
        except OSError:
            pass

    atexit.register(cleanup_pid)

    def handle_signal(sig, frame):
        """SIGINT/SIGTERM handler: say goodbye, drop the PID file, exit 0."""
        print("\n\n 👋 Cerrando Boat&Ship-Finder...")
        cleanup_pid()
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # ── Port selection ─────────────────────────────────────────────────────────
    def port_free(p):
        """Return True when TCP port ``p`` can be bound on all interfaces."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind(("0.0.0.0", p))
                return True
            except OSError:
                return False

    desired = int(os.environ.get('MARINE_PORT', 8765))
    port = desired
    if not port_free(desired):
        # Try the next 19 ports; stop with a clear message instead of falling
        # back silently to the port already known to be busy.
        port = next((p for p in range(desired + 1, desired + 20) if port_free(p)), None)
        if port is None:
            print(f"\n ✗ No hay puertos libres entre {desired} y {desired + 19}")
            sys.exit(1)
        print(f"\n ⚠️ Puerto {desired} ocupado — usando {port}")

    # ── DB init ────────────────────────────────────────────────────────────────
    print("\n" + "="*55)
    print(" Boat&Ship-Finder — Iniciando...")
    print("="*55)
    init_db()
    seed_admin()
    conn = get_db()
    count = conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0]
    conn.close()
    if count == 0:
        print("[DB] Base de datos vacía — lista para búsquedas reales")
    else:
        print(f"[DB] {count} embarcaciones en caché de sesión anterior")

    print(f"\n Local: http://localhost:{port}")
    print(f" Tailscale: http://<tu-ip-tailscale>:{port}")
    print(f" Fuentes directas: {len(DIRECT_SOURCES)}")
    print(f" Modelos Ollama: {list(MODELS.values())}")
    print(f" PID: {os.getpid()} (guardado en .server.pid)")
    print("\n [Ctrl+C para detener]\n")
    # Binds on all interfaces so the Tailscale address printed above works.
    app.run(host='0.0.0.0', port=port, debug=False)
|