""" Boat&Ship-Finder - Backend Server Requiere: pip install flask flask-cors requests beautifulsoup4 playwright """ from flask import Flask, jsonify, request, send_from_directory, session import hashlib as _hashlib from flask_cors import CORS import requests import json import sqlite3 import os import re import time import hashlib from datetime import datetime from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup import threading import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) app = Flask(__name__, static_folder='static') _secret = os.environ.get('SECRET_KEY') if not _secret: raise RuntimeError("SECRET_KEY not set — add SECRET_KEY= to your environment") app.secret_key = _secret CORS(app, origins=["http://localhost:8765", "http://127.0.0.1:8765"], supports_credentials=True) DB_PATH = 'marine.db' OLLAMA_URL = 'http://localhost:11434/api/generate' # ── Modelos Ollama por tarea ────────────────────────────────────────────────── MODELS = { 'extract': 'qwen2.5:32b', # Extracción de specs (más rápido que 72b, igual de preciso) 'classify': 'llama3.1:8b', # Clasificación rápida 'embed': 'nomic-embed-text:latest', # Embeddings para dedup 'parse': 'qwen3-coder:latest' # Parsing estructurado } # ── Fuentes globales por categoría ──────────────────────────────────────────── SOURCES = { "Subastas USA": [ {"name": "GovPlanet", "url": "https://www.govplanet.com/boats", "type": "auction"}, {"name": "GovDeals", "url": "https://www.govdeals.com", "type": "auction"}, {"name": "PropertyRoom", "url": "https://www.propertyroom.com/boats", "type": "auction"}, {"name": "PublicSurplus", "url": "https://www.publicsurplus.com", "type": "auction"}, {"name": "AuctionTime", "url": "https://www.auctiontime.com/boats", "type": "auction"}, {"name": "IronPlanet", "url": "https://www.ironplanet.com/boats", "type": "auction"}, {"name": "HiBid", "url": "https://www.hibid.com/boats", "type": "auction"}, {"name": "Copart Marine", "url": "https://www.copart.com/boats", "type": "auction"}, {"name": "BidSpotter", "url": "https://www.bidspotter.com/boats", "type": "auction"}, {"name": "32auctions", "url": "https://www.32auctions.com", "type": "auction"}, ], "Subastas Internacionales": [ {"name": "Ritchie Bros", "url": "https://www.rbauction.com/boats", "type": "auction"}, {"name": "Euro Auctions", "url": "https://www.euroauctions.com", "type": "auction"}, {"name": "Troostwijk", "url": "https://www.troostwijkauctions.com", "type": "auction"}, {"name": "Surplex", "url": "https://www.surplex.com/marine", "type": "auction"}, {"name": "BVA Auctions", "url": "https://www.bva-auctions.com", "type": "auction"}, {"name": "Catawiki Marine", "url": "https://www.catawiki.com/boats", "type": "auction"}, {"name": "Barnebys", "url": "https://www.barnebys.com/boats", "type": "auction"}, {"name": "ShipXchange", "url": "https://www.shipxchange.com", "type": "auction"}, ], "Venta Especializada": [ {"name": "YachtWorld", "url": "https://www.yachtworld.com", "type": "broker"}, {"name": "Boats.com", "url": "https://www.boats.com", "type": "broker", "search_url": "https://www.boats.com/boats-for-sale/?query={query}"}, {"name": "BoatTrader", "url": "https://www.boattrader.com", "type": "broker", "search_url": "https://www.boattrader.com/boats/?query={query}"}, {"name": "Apollo Duck", "url": "https://www.apolloduck.com", "type": "broker", "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1"}, {"name": "Rightboat", "url": "https://www.rightboat.com", "type": "broker", "search_url": "https://www.rightboat.com/boats-for-sale/?q={query}"}, {"name": "Boat24", "url": "https://www.boat24.com", "type": "broker", "search_url": "https://www.boat24.com/en/usedboats/"}, {"name": "Inautia", "url": "https://www.inautia.com", "type": "broker", "search_url": "https://www.inautia.com/boats/?q={query}"}, # ── US Brokers ──────────────────────────────────────────────────────── {"name": "HMY Yachts", "url": "https://hmy.com", "type": "broker", "search_url": "https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", "category": "Brokers USA"}, {"name": "Denison Yachting","url": "https://www.denisonyachtsales.com", "type": "broker", "search_url": "https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", "category": "Brokers USA"}, {"name": "United Yacht", "url": "https://www.unitedyacht.com", "type": "broker", "search_url": "https://www.unitedyacht.com/yachts-for-sale/", "category": "Brokers USA"}, {"name": "Northrop & Johnson","url": "https://www.n-j.com", "type": "broker", "search_url": "https://www.n-j.com/yachts-for-sale/", "category": "Brokers USA"}, {"name": "Worth Ave Yachts","url": "https://www.worthavenueyachts.com", "type": "broker", "search_url": "https://www.worthavenueyachts.com/yachts-for-sale/", "category": "Brokers USA"}, {"name": "Bluewater Yachting","url": "https://www.bluewateryachting.com", "type": "broker", "category": "Brokers USA"}, {"name": "Galati Yachts", "url": "https://www.galatiyachts.com", "type": "broker", "search_url": "https://www.galatiyachts.com/boat-search/?q={query}", "category": "Brokers USA"}, {"name": "Fraser Yachts", "url": "https://www.fraseryachts.com", "type": "broker", "search_url": "https://www.fraseryachts.com/en/yachts-for-sale/?search={query}", "category": "Brokers INT"}, {"name": "Burgess Yachts", "url": "https://www.burgessyachts.com", "type": "broker", "search_url": "https://www.burgessyachts.com/en/yacht-sale?q={query}", "category": "Brokers INT"}, {"name": "Ocean Alexander", "url": "https://www.oceanalexander.com", "type": "broker", "search_url": "https://www.oceanalexander.com/find-a-boat/?q={query}", "category": "Brokers USA"}, {"name": "Merle Wood", "url": "https://www.merlewood.com", "type": "broker", "search_url": "https://www.merlewood.com/yachts-for-sale/", "category": "Brokers INT"}, # ── Other ───────────────────────────────────────────────────────────── {"name": "NauticExpo", "url": "https://www.nauticexpo.com", "type": "broker"}, {"name": "Seaboats", "url": "https://www.seaboats.net", "type": "broker"}, {"name": "YachtBroker", "url": "https://www.yachtbroker.com", "type": "broker"}, ], "Comercial / Industrial": [ {"name": "WorkBoat", "url": "https://www.workboat.com/classifieds", "type": "commercial"}, {"name": "TradeABoat", "url": "https://www.tradeaboat.com.au", "type": "broker"}, {"name": "Boatpoint", "url": "https://www.boatpoint.com.au", "type": "broker"}, {"name": "Boats & Outboards","url": "https://www.boatsandoutboards.co.uk", "type": "broker"}, {"name": "Commercial Vessel","url": "https://www.commercialvessel.com", "type": "commercial"}, {"name": "ShipServ", "url": "https://www.shipserv.com", "type": "commercial"}, {"name": "Marine Classifieds","url": "https://www.marineclassifieds.com", "type": "classifieds"}, {"name": "Barcos.net", "url": "https://www.barcos.net", "type": "broker"}, # ── Offshore / DP / OSV ─────────────────────────────────────────────── {"name": "Offshore Vessel Exchange","url": "https://www.offshorevessel.exchange","type": "commercial", "search_url": "https://www.offshorevessel.exchange/?s={query}", "category": "Offshore / DP"}, {"name": "MarineTraffic Vessels For Sale","url": "https://www.marinetraffic.com/en/ads/p/list","type": "commercial", "search_url": "https://www.marinetraffic.com/en/ads/p/list?search={query}", "category": "Offshore / DP"}, {"name": "YachtWorld Commercial","url": "https://www.yachtworld.com","type": "commercial", "search_url": "https://www.yachtworld.com/boats-for-sale/type-commercial/?query={query}", "category": "Offshore / DP"}, {"name": "Apollo Duck Workboats","url": "https://www.apolloduck.com","type": "commercial", "search_url": "https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "category": "Offshore / DP"}, {"name": "Seawork Classifieds","url": "https://www.seawork.com","type": "commercial", "search_url": "https://www.seawork.com/classifieds/", "category": "Offshore / DP"}, {"name": "ShipXchange OSV", "url": "https://www.shipxchange.com", "type": "commercial", "search_url": "https://www.shipxchange.com/en/vessel-types/offshore-support-vessel", "category": "Offshore / DP"}, {"name": "Vessel Sales & Acquisitions","url": "https://www.vsl.no", "type": "commercial", "search_url": "https://www.vsl.no/vessels-for-sale/", "category": "Offshore / DP"}, ], "Clasificados Generales": [ {"name": "Craigslist Boats", "url": "https://www.craigslist.org/search/boa", "type": "classifieds"}, {"name": "eBay Motors Marine","url": "https://www.ebay.com/b/Boats/26429", "type": "classifieds", "search_url": "https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10"}, {"name": "Facebook Marketplace","url": "https://www.facebook.com/marketplace/boats","type": "classifieds"}, {"name": "BoatCrazy", "url": "https://boatcrazy.com", "type": "classifieds", "search_url": "https://boatcrazy.com/boats?q={query}", "category": "Clasificados USA"}, {"name": "Kijiji Marine", "url": "https://www.kijiji.ca/b-boats", "type": "classifieds"}, {"name": "Gumtree Boats", "url": "https://www.gumtree.com/boats", "type": "classifieds"}, {"name": "Subito.it Barche", "url": "https://www.subito.it/barche", "type": "classifieds"}, {"name": "LeBonCoin Bateaux","url": "https://www.leboncoin.fr/bateaux", "type": "classifieds"}, {"name": "Wallapop Barcos", "url": "https://es.wallapop.com/barcos", "type": "classifieds"}, {"name": "MercadoLibre", "url": "https://www.mercadolibre.com/barcos", "type": "classifieds"}, {"name": "OLX Marine", "url": "https://www.olx.com/boats", "type": "classifieds"}, ], "Salvage & Wrecks": [ {"name": "Cooper Capital Salvage", "url": "https://www.cooperss.com", "type": "salvage", "search_url": "https://www.cooperss.com/", "category": "Salvage USA"}, {"name": "Salvex", "url": "https://www.salvex.com", "type": "salvage", "search_url": "https://www.salvex.com/search/?q={query}&cat=30", "category": "Salvage USA"}, {"name": "Copart Marine", "url": "https://www.copart.com", "type": "salvage", "search_url": "https://www.copart.com/public/data/lotSearchResults/?query={query}&vehicleType=BOAT", "category": "Salvage USA"}, {"name": "IAA Watercraft", "url": "https://www.iaai.com", "type": "salvage", "search_url": "https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", "category": "Salvage USA"}, {"name": "Ritchie Bros Marine","url": "https://www.rbauction.com", "type": "auction", "search_url": "https://www.rbauction.com/used-equipment?q={query}&searchType=MODEL&equipmentCategory=marine", "category": "Salvage USA"}, {"name": "NavAuctions", "url": "https://www.navauctions.com", "type": "salvage"}, {"name": "MarineWrecks", "url": "https://www.marinewrecks.com", "type": "salvage"}, {"name": "BoatBreakers", "url": "https://www.boatbreakers.com", "type": "salvage"}, {"name": "Barnacle Marine", "url": "https://www.barnaclemarine.com", "type": "salvage"}, {"name": "Boat Breakers AU","url": "https://www.boatbreakersnz.com", "type": "salvage"}, ], "Revistas & Noticias": [ {"name": "Trade Only Today","url": "https://www.tradeonlytoday.com", "type": "news"}, {"name": "Nautical News", "url": "https://www.nauticalnews.com", "type": "news"}, {"name": "Boat International","url": "https://www.boatinternational.com/yachts","type": "magazine"}, {"name": "Superyacht Times","url": "https://www.superyachttimes.com", "type": "magazine"}, {"name": "The Triton", "url": "https://www.the-triton.com/classifieds", "type": "magazine"}, {"name": "Passagemaker", "url": "https://www.passagemaker.com", "type": "magazine"}, {"name": "WorkBoat Mag", "url": "https://www.workboat.com", "type": "magazine"}, {"name": "Lloyd's List", "url": "https://lloydslist.maritimeintelligence.informa.com", "type": "news"}, {"name": "Tradewinds", "url": "https://www.tradewindsnews.com", "type": "news"}, {"name": "Maritime Executive","url": "https://www.maritime-executive.com", "type": "news"}, {"name": "Splash247", "url": "https://splash247.com", "type": "news"}, {"name": "Bairdmaritime", "url": "https://www.bairdmaritime.com", "type": "news"}, ], "Registros & Gobierno": [ {"name": "USCG Docs", "url": "https://www.dco.uscg.mil/nvdc", "type": "registry"}, {"name": "UK Ship Register","url": "https://www.ukshipregister.co.uk", "type": "registry"}, {"name": "Panama Registry", "url": "https://www.segumar.com", "type": "registry"}, {"name": "Marshall Islands","url": "https://www.register-iri.com", "type": "registry"}, {"name": "Liberian Registry","url": "https://www.liscr.com", "type": "registry"}, {"name": "Bahamas Maritime","url": "https://www.bahamasmaritime.com", "type": "registry"}, {"name": "IHS Sea-web", "url": "https://maritime.ihs.com", "type": "registry"}, ], } # ── Database ────────────────────────────────────────────────────────────────── def init_db(): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.executescript(""" CREATE TABLE IF NOT EXISTS vessels ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, vessel_type TEXT, loa_m REAL, beam_m REAL, draft_m REAL, year_built INTEGER, hull TEXT, propulsion TEXT, status TEXT, price_usd REAL, currency TEXT DEFAULT 'USD', location TEXT, country TEXT, source_name TEXT, source_url TEXT, description TEXT, images TEXT, flags TEXT, score REAL DEFAULT 0, fingerprint TEXT UNIQUE, raw_data TEXT, created_at TEXT DEFAULT (datetime('now')), updated_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS saved_vessels ( id INTEGER PRIMARY KEY AUTOINCREMENT, vessel_id INTEGER REFERENCES vessels(id), notes TEXT, saved_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS search_history ( id INTEGER PRIMARY KEY AUTOINCREMENT, query TEXT, filters TEXT, results INTEGER, searched_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS custom_sources ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category TEXT DEFAULT 'Custom', search_url TEXT NOT NULL, source_type TEXT DEFAULT 'broker', active INTEGER DEFAULT 1, added_by TEXT, last_status TEXT DEFAULT 'unknown', created_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS users ( id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT UNIQUE NOT NULL, password TEXT NOT NULL, role TEXT DEFAULT 'user', created_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS collections ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT, color TEXT DEFAULT '#00b4ff', icon TEXT DEFAULT '📁', created_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS collection_vessels ( id INTEGER PRIMARY KEY AUTOINCREMENT, collection_id INTEGER REFERENCES collections(id), vessel_id INTEGER REFERENCES vessels(id), notes TEXT, added_at TEXT DEFAULT (datetime('now')), UNIQUE(collection_id, vessel_id) ); CREATE TABLE IF NOT EXISTS alerts ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, filters TEXT, last_match INTEGER DEFAULT 0, active INTEGER DEFAULT 1, created_at TEXT DEFAULT (datetime('now')) ); CREATE INDEX IF NOT EXISTS idx_vessels_type ON vessels(vessel_type); CREATE INDEX IF NOT EXISTS idx_vessels_status ON vessels(status); CREATE INDEX IF NOT EXISTS idx_vessels_price ON vessels(price_usd); CREATE INDEX IF NOT EXISTS idx_vessels_score ON vessels(score DESC); """) conn.commit() conn.close() def get_db(): conn = sqlite3.connect(DB_PATH) conn.row_factory = sqlite3.Row return conn # ── Ollama helpers ───────────────────────────────────────────────────────────── _ollama_sem = threading.Semaphore(3) # max 3 concurrent Ollama calls def ollama_generate(prompt: str, model: str = None, json_mode: bool = False) -> str: model = model or MODELS['classify'] payload = { "model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 2048} } if json_mode: payload["format"] = "json" with _ollama_sem: try: r = requests.post(OLLAMA_URL, json=payload, timeout=120) r.raise_for_status() return r.json().get("response", "") except Exception as e: print(f"[Ollama] Error: {e}") return "" def ollama_models() -> list: try: r = requests.get("http://localhost:11434/api/tags", timeout=5) return [m["name"] for m in r.json().get("models", [])] except: return [] def extract_vessel_from_text(text: str, source: str) -> dict: """Use Ollama to extract structured vessel data from raw text.""" prompt = f"""Eres un experto en inteligencia de mercado marítimo. Analiza este texto de un anuncio de embarcación y extrae los datos disponibles. Fuente: {source} TEXTO: {text[:3000]} Responde SOLO con JSON válido. Si el texto NO es un listing de embarcación específica responde {{"skip": true}}. {{ "skip": false, "name": "nombre del barco o descripción corta", "vessel_type": "Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Salvage|Other", "loa_m": número o null, "beam_m": número o null, "draft_m": número o null, "year_built": número o null, "hull": "Fiberglass|Steel|Aluminum|Wood|Composite|Unknown", "propulsion": "Diesel|Gasoline|Electric|Sail|None|Unknown", "status": "active|auction|salvage|abandoned|sold", "price_usd": número o null, "currency": "USD|EUR|GBP|CAD|AUD|etc", "location": "ciudad, país", "country": "código ISO 2 letras", "description": "resumen en español máximo 200 caracteres", "flags": ["below_market","rare","auction","salvage_value","motivated_seller","commercial","government_surplus"], "score": número del 0 al 100 según oportunidad para un broker }}""" response = ollama_generate(prompt, model=MODELS['extract'], json_mode=True) try: data = json.loads(response) if data.get("skip"): return {} return data except: match = re.search(r'\{.*\}', response, re.DOTALL) if match: try: data = json.loads(match.group()) if data.get("skip"): return {} return data except: pass return {} # ── Direct source scrapers — no search engine middleman ────────────────── import random USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0', ] def get_headers(referer=None): ua = random.choice(USER_AGENTS) h = { 'User-Agent': ua, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,fr;q=0.7', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Cache-Control': 'max-age=0', } if referer: h['Referer'] = referer return h HEADERS = get_headers() # Each source has its own search URL pattern and CSS selectors # Sites we scrape directly (confirmed working) DIRECT_SOURCES = [ # ── Craigslist ───────────────────────────────────────────────────────────── # Single multi-city entry (uses scrape_craigslist internally — Playwright, 3+ cities) {"name":"Craigslist", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"}, # Individual cities — each makes one targeted request via scrape_direct_source {"name":"Craigslist Miami", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Tampa", "category":"Clasificados USA", "search_url":"https://tampa.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Fort Laud", "category":"Clasificados USA", "search_url":"https://miami.craigslist.org/search/boa?query={query}&sort=date", "type":"classifieds"}, {"name":"Craigslist New Orleans","category":"Clasificados USA", "search_url":"https://neworleans.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Houston", "category":"Clasificados USA", "search_url":"https://houston.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Seattle", "category":"Clasificados USA", "search_url":"https://seattle.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist LA", "category":"Clasificados USA", "search_url":"https://losangeles.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist SF", "category":"Clasificados USA", "search_url":"https://sfbay.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Jacksonville","category":"Clasificados USA", "search_url":"https://jacksonville.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Sarasota", "category":"Clasificados USA", "search_url":"https://sarasota.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Chicago", "category":"Clasificados USA", "search_url":"https://chicago.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Boston", "category":"Clasificados USA", "search_url":"https://boston.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Atlanta", "category":"Clasificados USA", "search_url":"https://atlanta.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Baltimore", "category":"Clasificados USA", "search_url":"https://baltimore.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Norfolk", "category":"Clasificados USA", "search_url":"https://norfolk.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist San Diego", "category":"Clasificados USA", "search_url":"https://sandiego.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Portland OR","category":"Clasificados USA", "search_url":"https://portland.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Minneapolis","category":"Clasificados USA", "search_url":"https://minneapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Detroit", "category":"Clasificados USA", "search_url":"https://detroit.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Cleveland", "category":"Clasificados USA", "search_url":"https://cleveland.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Charlotte", "category":"Clasificados USA", "search_url":"https://charlotte.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Denver", "category":"Clasificados USA", "search_url":"https://denver.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Phoenix", "category":"Clasificados USA", "search_url":"https://phoenix.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Annapolis", "category":"Clasificados USA", "search_url":"https://annapolis.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist New Jersey", "category":"Clasificados USA", "search_url":"https://newjersey.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Galveston", "category":"Clasificados USA", "search_url":"https://galveston.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Pensacola", "category":"Clasificados USA", "search_url":"https://pensacola.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Mobile AL", "category":"Clasificados USA", "search_url":"https://mobile.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Key West", "category":"Clasificados USA", "search_url":"https://keys.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Corpus", "category":"Clasificados USA", "search_url":"https://corpuschristi.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Beaumont", "category":"Clasificados USA", "search_url":"https://beaumont.craigslist.org/search/boa?query={query}", "type":"classifieds"}, {"name":"Craigslist Baton Rouge","category":"Clasificados USA", "search_url":"https://batonrouge.craigslist.org/search/boa?query={query}", "type":"classifieds"}, # NOTE: gulfcoast.craigslist.org (Biloxi) no longer exists — replaced with Mobile AL # ── eBay ────────────────────────────────────────────────────────────────── {"name":"eBay Marine", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_BIN=1&_sop=10", "type":"classifieds"}, {"name":"eBay Auction", "category":"Subastas USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=26429&LH_Auction=1", "type":"auction"}, {"name":"eBay Motors Sail", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}&_sacat=36431&LH_BIN=1&_sop=10", "type":"classifieds"}, {"name":"eBay Boats Complete", "category":"Clasificados USA", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+boat&_sacat=26429&LH_BIN=1&_sop=15", "type":"classifieds"}, {"name":"eBay Salvage Boats", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"}, # ── Subastas Gobierno ───────────────────────────────────────────────────── {"name":"GovDeals", "category":"Subastas Gobierno", "search_url":"https://www.govdeals.com/index.cfm?fa=Main.AdvSearchResultsNew&kWord={query}&category=70", "type":"auction"}, {"name":"PublicSurplus", "category":"Subastas Gobierno", "search_url":"https://www.publicsurplus.com/sms/browse/home?search={query}", "type":"auction"}, {"name":"PropertyRoom", "category":"Subastas Gobierno", "search_url":"https://www.propertyroom.com/s?q={query}+boat", "type":"auction"}, # GovPlanet: correct URL confirmed working (Recreational Marine category) {"name":"GovPlanet", "category":"Subastas Gobierno", "search_url":"https://www.govplanet.com/Recreational+Marine", "type":"auction"}, # IronPlanet: correct URL confirmed working (Commercial Marine Vessels) {"name":"IronPlanet", "category":"Subastas Gobierno", "search_url":"https://www.ironplanet.com/Commercial+Marine+Vessels", "type":"auction"}, # HiBid: React SPA — scrape_hibid uses Playwright {"name":"HiBid", "category":"Subastas USA", "search_url":"https://www.hibid.com/lots?q={query}+boat", "type":"auction"}, {"name":"AuctionTime", "category":"Subastas USA", "search_url":"https://www.auctiontime.com/listings/search?q={query}+boat", "type":"auction"}, {"name":"BidSpotter", "category":"Subastas USA", "search_url":"https://www.bidspotter.com/en-us/auction-catalogues?q={query}+boat", "type":"auction"}, # Copart: Playwright scraper handles JS-rendered lots {"name":"Copart Marine", "category":"Subastas USA", "search_url":"https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT", "type":"auction"}, # ── Salvage ─────────────────────────────────────────────────────────────── {"name":"Salvex Marine", "category":"Salvage / Subastas", "search_url":"https://salvex.com/listings/?q={query}&cat=marine", "type":"salvage"}, {"name":"Barnacle Marine", "category":"Salvage / Subastas", "search_url":"https://www.barnaclemarine.com/?s={query}", "type":"salvage"}, {"name":"eBay Salvage", "category":"Salvage / Subastas", "search_url":"https://www.ebay.com/sch/i.html?_nkw={query}+salvage+boat&_sacat=26429&LH_Auction=1", "type":"salvage"}, {"name":"Cooper Capital Salvage","category":"Salvage USA", "search_url":"https://www.cooperss.com/", "type":"salvage"}, {"name":"IAA Watercraft", "category":"Salvage USA", "search_url":"https://www.iaai.com/Search?SearchText={query}&vehicleType=Watercraft", "type":"salvage"}, # ── Venta Especializada — principales ──────────────────────────────────── {"name":"YachtWorld", "category":"Venta Especializada", "search_url":"https://www.yachtworld.com/boats-for-sale/", "type":"broker"}, {"name":"BoatTrader", "category":"Venta Especializada", "search_url":"https://www.boattrader.com/boats/?query={query}", "type":"broker"}, {"name":"Boats.com", "category":"Venta Especializada", "search_url":"https://www.boats.com/boats-for-sale/?query={query}", "type":"broker"}, {"name":"Apollo Duck", "category":"Venta Especializada", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"broker"}, {"name":"Rightboat", "category":"Venta Especializada", "search_url":"https://www.rightboat.com/boats-for-sale/?q={query}", "type":"broker"}, # Boat24: 403 on requests — scrape_eu_broker uses Playwright {"name":"Boat24", "category":"Venta Especializada", "search_url":"https://www.boat24.com/en/boats/?q={query}", "type":"broker"}, # YachtMarket: uses scrape_eu_broker (Playwright) in case of blocks {"name":"YachtMarket", "category":"Venta Especializada", "search_url":"https://www.yachtmarket.com/boats-for-sale/?q={query}", "type":"broker"}, # ── SailboatListings (dedicated thread also runs in parallel) ──────────── {"name":"SailboatListings", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=&material=&hull=&state=&keyword={query}&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True}, {"name":"SailboatListings View", "category":"Veleros Global", "search_url":"https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi?db=default&uid=default&sb=33&so=descend&websearch=1&manufacturer=&model=&length-gt={loa_min_ft}&length-lt={loa_max_ft}&year-lt=---&year-gt=---&price-lt={price_max}&type=Sail&material=&hull=&state=&keyword=&view_records=+Show+Matching+Boats+", "type":"broker", "supports_filters": True}, # Forums: Playwright scraper handles vBulletin/XenForo FS sections {"name":"TheHullTruth", "category":"Veleros Global", "search_url":"https://www.thehulltruth.com/boating-forum/search.php?do=process&query={query}&prefixid=FS&type=post", "type":"classifieds"}, {"name":"Cruisers Forum", "category":"Veleros Global", "search_url":"https://www.cruisersforum.com/forums/f152/", "type":"classifieds"}, # ── Comercial / Offshore ────────────────────────────────────────────────── {"name":"WorkBoat Classifieds", "category":"Comercial Offshore", "search_url":"https://www.workboat.com/classifieds/?keywords={query}", "type":"commercial"}, {"name":"Commercial Vessel", "category":"Comercial Offshore", "search_url":"https://www.commercialvessel.com/search?keywords={query}", "type":"commercial"}, {"name":"OSV Broker", "category":"Comercial Offshore", "search_url":"https://www.osvbroker.com/?s={query}", "type":"commercial"}, {"name":"Marine Classifieds", "category":"Comercial Offshore", "search_url":"https://www.marineclassifieds.com/search.php?search={query}", "type":"commercial"}, {"name":"Seaboats", "category":"Comercial Global", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=0", "type":"commercial"}, {"name":"Seaboats Offshore", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=offshore+support+vessels", "type":"commercial"}, {"name":"Seaboats Tug", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=tugs+%26+pushboats", "type":"commercial"}, {"name":"Seaboats Barge", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=barges+%26+lighters", "type":"commercial"}, {"name":"Seaboats Fishing", "category":"Comercial Offshore", "search_url":"https://www.seaboats.net/search.php?q={query}&cat=fishing+vessels", "type":"commercial"}, {"name":"Apollo Duck Workboats", "category":"Comercial Offshore", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1", "type":"commercial"}, {"name":"YachtWorld Commercial", "category":"Comercial Offshore", "search_url":"https://www.yachtworld.com/boats-for-sale/type-commercial/", "type":"commercial"}, # ── Australia / Pacífico ───────────────────────────────────────────────── # Trade a Boat AU: server-rendered, correct URL confirmed working {"name":"Trade a Boat AU", "category":"Australia / Pacifico","search_url":"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query}", "type":"broker"}, # Boatsales.com.au (Boatpoint redirects here): scrape_eu_broker via Playwright {"name":"Boatsales AU", "category":"Australia / Pacifico","search_url":"https://www.boatsales.com.au/boats-for-sale/?q={query}", "type":"broker"}, # ── Reino Unido ─────────────────────────────────────────────────────────── # Boats & Outboards UK: 403 on requests — scrape_eu_broker uses Playwright {"name":"Boats & Outboards UK", "category":"Reino Unido", "search_url":"https://www.boatsandoutboards.co.uk/boats-for-sale/?q={query}", "type":"broker"}, # Apollo Duck UK: use same apolloduck.com (no separate UK subdomain) {"name":"Apollo Duck UK", "category":"Reino Unido", "search_url":"https://www.apolloduck.com/search.phtml?search={query}&sr=1&q=1&country=GB", "type":"broker"}, # ── Francia ─────────────────────────────────────────────────────────────── # Annonces Bateau: 403 on requests — scrape_eu_broker uses Playwright {"name":"Annonces Bateau", "category":"Francia", "search_url":"https://www.annoncesbateau.com/bateaux/annonces-bateaux?keyword={query}", "type":"broker"}, # ── España / Mediterráneo ──────────────────────────────────────────────── # Inautia ES: 403 on requests — scrape_eu_broker uses Playwright {"name":"Inautia ES", "category":"Espana / Global", "search_url":"https://www.inautia.es/barca?q={query}", "type":"broker"}, {"name":"Barcos.net", "category":"Espana / Global", "search_url":"https://www.barcos.net/busqueda/?q={query}", "type":"broker"}, # ── Europa / Global ─────────────────────────────────────────────────────── # YachtAll: 403 on requests — scrape_eu_broker uses Playwright {"name":"YachtAll", "category":"Clasificados EU", "search_url":"https://yachtall.com/yachts/?search={query}", "type":"broker"}, # ── Brokers USA ─────────────────────────────────────────────────────────── {"name":"HMY Yachts", "category":"Brokers USA", "search_url":"https://www.hmy.com/yachts-for-sale/?SaleClassCode=used", "type":"broker"}, {"name":"Denison Yachting", "category":"Brokers USA", "search_url":"https://www.denisonyachtsales.com/yachts-for-sale/?search={query}", "type":"broker"}, {"name":"BoatCrazy", "category":"Brokers USA", "search_url":"https://boatcrazy.com/boats?q={query}", "type":"classifieds"}, # Galati Yachts: server-rendered WP site — scrape_galati uses requests {"name":"Galati Yachts", "category":"Brokers USA", "search_url":"https://www.galatiyachts.com/yachts-for-sale/?keywords={query}", "type":"broker"}, {"name":"United Yacht Sales", "category":"Brokers USA", "search_url":"https://www.unitedyacht.com/yachts-for-sale/", "type":"broker"}, # Worth Ave Yachts: hybrid server-rendered — scrape_luxury_broker uses Playwright {"name":"Worth Ave Yachts", "category":"Brokers USA", "search_url":"https://www.worthavenueyachts.com/yachts-for-sale/", "type":"broker"}, # ── Brokers Internacionales ─────────────────────────────────────────────── # Fraser Yachts: Vue/JS SPA — scrape_luxury_broker uses Playwright {"name":"Fraser Yachts", "category":"Brokers Internacional","search_url":"https://www.fraseryachts.com/en/yachts-for-sale/", "type":"broker"}, # Burgess Yachts: JS-loaded — scrape_luxury_broker uses Playwright {"name":"Burgess Yachts", "category":"Brokers Internacional","search_url":"https://www.burgessyachts.com/en/yachts/sale/", "type":"broker"}, # Northrop & Johnson: JS-loaded — scrape_luxury_broker uses Playwright {"name":"Northrop & Johnson", "category":"Brokers Internacional","search_url":"https://www.njcharters.com/yachts-for-sale/", "type":"broker"}, {"name":"Merle Wood", "category":"Brokers Internacional","search_url":"https://www.merlewood.com/yachts-for-sale/", "type":"broker"}, # ── Canada ──────────────────────────────────────────────────────────────── {"name":"Kijiji Boats CA", "category":"Canada", "search_url":"https://www.kijiji.ca/b-boats/{query}/k0c132", "type":"classifieds"}, ] # Web search queries — finds listings on ANY site including blocked ones # DuckDuckGo returns results from YachtWorld, Boats.com, Apollo Duck, etc. # Base web search templates — {query} is replaced at runtime # Dynamic templates also get price/loa filters appended when available WEB_SEARCH_TEMPLATES = [ '"{query}" boat for sale', '"{query}" sailboat for sale', '"{query}" vessel for sale', '"{query}" yacht for sale', '"{query}" barco venta', '"{query}" bateau vendre occasion', 'site:yachtworld.com {query} for sale sail cruiser', 'site:boats.com {query} sailboat for sale', 'site:apolloduck.com {query} for sale', 'site:rightboat.com {query} for sale', 'site:boat24.com {query} for sale', 'site:yachtall.com {query} sailboat', 'site:annoncesbateau.com {query} voilier', 'site:cruisersforum.com {query} for sale', 'site:thehulltruth.com {query} for sale fs', 'site:govplanet.com {query} vessel', 'site:ironplanet.com {query} boat vessel', 'site:govdeals.com {query} vessel boat', 'site:publicsurplus.com {query} vessel', 'site:hibid.com {query} boat', 'site:copart.com {query} boat vessel', 'site:rbauction.com {query} boat', '"{query}" boat auction government surplus', '"{query}" vessel auction salvage', # Salvage specific 'site:salvex.com {query} marine vessel', 'site:copart.com {query} boat salvage', 'site:iaai.com {query} boat', 'site:boatbreakers.com {query}', '"{query}" salvage boat for sale', '"{query}" insurance total loss boat', '"{query}" wrecked boat for sale parts', '"{query}" boat salvage title for sale', 'site:seaboats.net {query}', 'site:workboat.com {query} for sale', 'site:commercialvessel.com {query}', # Offshore / commercial 'site:osvbroker.com {query}', 'site:marineclassifieds.com {query} for sale', 'site:apolloduck.com {query} offshore tug barge', '"{query}" offshore supply vessel for sale', '"{query}" OSV for sale broker', '"{query}" crew boat for sale', '"{query}" workboat for sale', '"{query}" tug for sale', '"{query}" barge for sale', '"{query}" supply vessel for sale', '"{query}" fishing vessel for sale', '"{query}" commercial vessel for sale', # Australia / Pacific 'site:tradeaboat.com.au {query} for sale', 'site:boatpoint.com.au {query} for sale', # Europe classifieds 'site:boatsandoutboards.co.uk {query} for sale', 'site:annoncesbateau.com {query} voilier', 'site:inautia.com {query} barco venta', ] def build_web_queries(base_query: str, filters: dict) -> list: """Build web search queries filtered by vessel type/status to avoid irrelevant searches.""" price_ctx = "" loa_ctx = "" if filters.get("max_price"): price_ctx = f" under ${filters['max_price']}" if filters.get("min_loa"): ft = int(float(filters["min_loa"]) / 0.3048) loa_ctx = f" {ft}ft+" vtype = (filters.get("type","") or "").lower() status = (filters.get("status","") or "").lower() # Categorize templates so we only include relevant ones SALVAGE_KWORDS = {"salvage","copart","iaai","boatbreakers","insurance","total loss","wrecked","salvage title"} OFFSHORE_KWORDS = {"workboat","commercial","osvbroker","offshore","osv","crew boat","supply vessel","tug","barge","fishing vessel"} SAIL_KWORDS = {"sailboat","yachtall","annoncesbateau","voilier","cruisersforum","sail cruiser"} GENERIC_KWORDS = {"boat for sale","vessel for sale","yacht for sale","barco venta","bateau","yachtworld","boats.com","apolloduck","rightboat","boat24","govplanet","ironplanet","govdeals","publicsurplus","hibid","rbauction","tradeaboat","boatpoint","boatsandoutboards","inautia"} is_salvage = status == "salvage" or "salvage" in base_query.lower() is_offshore = vtype in {"offshore","tug","barge","ferry","fishing","commercial"} or any(k in base_query.lower() for k in {"tug","barge","osv","crew boat","workboat"}) is_sail = vtype in {"sailboat","velero","sail"} or any(k in base_query.lower() for k in {"sail","velero","ketch","sloop"}) queries = [] for tmpl in WEB_SEARCH_TEMPLATES: tmpl_l = tmpl.lower() # Skip salvage templates for non-salvage searches if any(k in tmpl_l for k in SALVAGE_KWORDS) and not is_salvage: continue # Skip offshore templates for clearly non-offshore searches (sailboat/velero) if any(k in tmpl_l for k in OFFSHORE_KWORDS) and is_sail and not is_offshore: continue # Skip sailboat templates for offshore/salvage searches if any(k in tmpl_l for k in SAIL_KWORDS) and (is_offshore or is_salvage) and not is_sail: continue q = tmpl.replace("{query}", base_query) if not q.startswith("site:") and (price_ctx or loa_ctx): q += loa_ctx + price_ctx queries.append(q) return queries SEARCH_ENGINES = [ { "name": "DuckDuckGo", "url": "https://html.duckduckgo.com/html/?q={query}", "link_sel": "a.result__a", "snippet_sel": "a.result__snippet", }, { "name": "Bing", "url": "https://www.bing.com/search?q={query}&count=20", "link_sel": "h2 a", "snippet_sel": ".b_caption p", }, ] def web_search(query: str, max_results: int = 8) -> list[dict]: """Search web engines for real listings.""" results = [] seen = set() skip = ["google.","bing.","duckduckgo.","yahoo.","wikipedia.","youtube.", "facebook.com/login","instagram.","twitter.","linkedin.", "pinterest.","reddit.com/r/",".pdf","amazon.com/s?"] for engine in SEARCH_ENGINES: try: url = engine["url"].format(query=requests.utils.quote(query)) time.sleep(1.0) r = requests.get(url, headers=get_headers(), timeout=20, verify=False) if r.status_code != 200: continue soup = BeautifulSoup(r.text, "html.parser") links = soup.select(engine["link_sel"]) snippets = soup.select(engine["snippet_sel"]) for i, link in enumerate(links[:max_results*2]): href = link.get("href","") # Clean DDG redirect if "duckduckgo.com" in href: m = re.search(r'uddg=([^&]+)', href) if m: href = requests.utils.unquote(m.group(1)) if not href.startswith("http"): continue if any(s in href for s in skip): continue if href in seen: continue seen.add(href) title = link.get_text(strip=True) snippet = snippets[i].get_text(strip=True) if i < len(snippets) else "" try: source = href.split("/")[2].replace("www.","") except: source = "web" results.append({ "url": href, "title": title, "snippet": snippet, "price_text": "", "img_url": "", "location": "", "source": source, "source_type": "broker", "category": "Web Search" }) if len(results) >= max_results: break except Exception as e: pass if len(results) >= max_results: break return results def scrape_direct_source(source: dict, query: str, filters: dict = None) -> list[dict]: if filters is None: filters = {} """AI-powered scraper — no CSS selectors, reads HTML like a human.""" results = [] try: # Build URL — expand filter placeholders if source supports them raw_url = source["search_url"] if source.get("supports_filters"): min_loa_m = float(filters.get("min_loa") or 0) max_price = filters.get("max_price") or "" min_price = filters.get("min_price") or "" loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else "" loa_max_ft = "" # no max LOA filter in current UI raw_url = raw_url.replace("{loa_min_ft}", str(loa_min_ft)) raw_url = raw_url.replace("{loa_max_ft}", str(loa_max_ft)) raw_url = raw_url.replace("{price_min}", str(min_price)) raw_url = raw_url.replace("{price_max}", str(max_price)) # Clean query - remove duplicate "for sale" # Clean query - remove duplicates clean_q = query.strip() for phrase in [" for sale for sale", "for sale for sale", " velero velero", " sailboat sailboat"]: clean_q = clean_q.replace(phrase, phrase.split()[0] + " " + phrase.split()[1]) clean_q = ' '.join(dict.fromkeys(clean_q.split())) # remove duplicate words url = raw_url.format(query=requests.utils.quote(clean_q.replace(' for sale for sale',' for sale'))) time.sleep(1.0) domain = url.split('/')[2] headers = get_headers(referer=f"https://{domain}/") r = requests.get(url, headers=headers, timeout=25, verify=False) # Retry with different UA if blocked if r.status_code in [403, 429, 503]: time.sleep(2) headers = get_headers() r = requests.get(url, headers=headers, timeout=25, verify=False) if r.status_code not in [200, 206]: print(f"[{source['name']}] HTTP {r.status_code}") return [] soup = BeautifulSoup(r.text, "html.parser") for tag in soup(["script","style","nav","footer","header","aside","noscript","meta","link"]): tag.decompose() base_url = "/".join(url.split("/")[:3]) raw_links = [] skip_words = ["login","register","signup","about","contact","help", "privacy","terms","facebook.com","twitter.com","instagram.com"] for a in soup.find_all("a", href=True)[:80]: href = a["href"].strip() if not href or href.startswith("#") or href.startswith("javascript"): continue if not href.startswith("http"): href = base_url + ("" if href.startswith("/") else "/") + href if any(s in href.lower() for s in skip_words): continue text = a.get_text(strip=True)[:150] parent = a.find_parent() price = "" img = "" if parent: ptxt = parent.get_text(" ", strip=True) pm = re.search(r'[\d,]+(?:\.\d+)?\s*(?:USD|EUR|GBP|CAD|\$|€|£)', ptxt) if pm: price = pm.group() # Traverse up to 4 levels to find a thumbnail image node = parent for _ in range(4): if node is None: break im = node.find("img") if im: src = _extract_best_src(im) if src: # Convert relative to absolute if src.startswith("//"): src = "https:" + src elif src.startswith("/"): src = base_url + src if src.startswith("http") and len(src) > 20: img = src break node = node.parent if text and len(text) > 8: raw_links.append({"url":href,"title":text,"price":price,"img":img}) if not raw_links: print(f"[{source['name']}] No links found") return [] seen = set() unique = [] for lnk in raw_links: if lnk["url"] not in seen: seen.add(lnk["url"]) unique.append(lnk) # ── Heuristic listing filter (no AI needed) ────────────────────────── # Score each link — higher = more likely to be an actual vessel listing BOAT_KW = ["boat","yacht","vessel","sail","ketch","sloop","cutter","schooner", "yawl","catamaran","trimaran","motor","tug","barge","cruiser","skiff", "fishing","trawler","offshore","cabin","dinghy","pontoon","runabout"] def listing_score(lnk): url_l = lnk["url"].lower() title_l = lnk["title"].lower() sc = 0 if lnk["price"]: sc += 4 # price is strong signal if lnk["img"]: sc += 1 # has photo if re.search(r'/\d{5,}', url_l): sc += 3 # 5+ digit ID if re.search(r'/(view|detail|listing|item|vessel|boat|ship|for-sale)[-/]', url_l): sc += 2 if re.search(r'-for-sale[/-]?$', url_l): sc += 2 if re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title_l): sc += 3 # year in title if re.search(r'\d{2,3}\s*(?:\'|ft|feet|meter)', title_l): sc += 2 # size if any(k in title_l for k in BOAT_KW): sc += 1 if re.search(r'\b(for sale|en vente|vendre|en venta)\b', title_l): sc += 1 if len(lnk["title"]) > 15: sc += 1 # nav links are short return sc scored = [(listing_score(lnk), lnk) for lnk in unique[:30]] scored.sort(key=lambda x: x[0], reverse=True) # Keep links with score >= 3, or fall back to top-5 if nothing qualifies good = [lnk for sc, lnk in scored if sc >= 3] if not good: good = [lnk for _, lnk in scored[:5]] # best guesses from this source for lnk in good[:20]: results.append({ "url": lnk["url"], "title": lnk["title"], "snippet": f"Price: {lnk['price']}", "price_text": lnk["price"], "img_url": lnk["img"], "location": "", "source": source["name"], "source_type": source["type"], "category": source["category"], }) print(f"[{source['name']}] {len(results)} listings found") except Exception as e: print(f"[{source['name']}] Error: {e}") return results # Interleave queue for polite scraping _interleave_lock = threading.Lock() _interleave_sites = [ "https://miami.craigslist.org", "https://www.seaboats.net", "https://www.barcos.net", "https://www.ebay.com", "https://boston.craigslist.org", "https://seattle.craigslist.org", ] _interleave_idx = 0 def polite_pause(source_name: str): """ Between pages of the same site, make a quick request to a different site so we look like a human browsing — not a bot hammering one server. """ global _interleave_idx with _interleave_lock: site = _interleave_sites[_interleave_idx % len(_interleave_sites)] _interleave_idx += 1 try: requests.get(site, headers=get_headers(), timeout=5, verify=False) except Exception: pass # Random human-like delay: 2-5 seconds time.sleep(random.uniform(2.0, 5.0)) print(f"[{source_name}] Polite pause done — continuing...") def scrape_sailboatlistings(query: str, filters: dict, max_pages: int = 8) -> list[dict]: """ Multi-page scraper for SailboatListings.com. Captures MAIN listings (sailboat=XXXXX) with full structured data, plus SIDEBAR featured listings (/view/XXXXX) as bonus. """ results = [] seen_urls = set() min_loa_m = float(filters.get("min_loa") or 0) max_loa_m = float(filters.get("max_loa") or 0) max_price = filters.get("max_price") or "" loa_min_ft = int(min_loa_m / 0.3048) if min_loa_m else "" loa_max_ft = int(max_loa_m / 0.3048) if max_loa_m else "" vessel_type = filters.get("type","").lower() sbl_type_map = { "sailboat": "Sail", "sail": "Sail", "yacht": "cruiser", "motor": "powerboat", "motorboat": "powerboat", "fishing": "fishing", "tug": "", "barge": "", "offshore": "", "ferry": "", "commercial": "", } # Default "" → search ALL types on SailboatListings sbl_type = sbl_type_map.get(vessel_type, "") hull = filters.get("hull","").lower() sbl_hull_map = { "fiberglass":"fiberglass","steel":"steel", "aluminum":"aluminum","wood":"wood", } sbl_material = sbl_hull_map.get(hull, "") year_min = filters.get("year_min","---") year_max = filters.get("year_max","---") if not year_min: year_min = "---" if not year_max: year_max = "---" base_url = ( "https://www.sailboatlistings.com/cgi-bin/saildata/db.cgi" "?db=default&uid=default&sb=33&so=descend&websearch=1" f"&manufacturer=&model=" f"&length-gt={loa_min_ft}&length-lt={loa_max_ft}" f"&year-lt={year_max}&year-gt={year_min}&price-lt={max_price}" f"&type={sbl_type}&material={sbl_material}&hull=&state=" f"&keyword={requests.utils.quote(query)}" f"&view_records=+Show+Matching+Boats+" ) for page in range(1, max_pages + 1): if page > 1: polite_pause("SailboatListings") try: url = base_url if page == 1 else base_url + f"&nh={page}" r = requests.get(url, headers=get_headers(), timeout=25, verify=False) if r.status_code == 429: print(f"[SailboatListings] Rate limited on page {page} — stopping") break if r.status_code != 200: print(f"[SailboatListings] Page {page} HTTP {r.status_code}") break soup = BeautifulSoup(r.text, "html.parser") body_text = soup.get_text() if "no records" in body_text.lower() or "0 matches" in body_text.lower(): print(f"[SailboatListings] No more results at page {page}") break page_results = 0 # ── MAIN LISTINGS (sailboat=XXXXX) — full structured data ── for header_link in soup.find_all("a", class_="sailheader"): href = header_link.get("href", "") m = re.search(r'sailboat=(\d+)', href) if not m: continue sid = m.group(1) canonical = f"https://www.sailboatlistings.com/view/{sid}" if canonical in seen_urls: continue seen_urls.add(canonical) title = header_link.get_text(strip=True) # Parent table contains all structured sailvb/sailvk spans listing_table = header_link.find_parent("table") if not listing_table: continue # Extract structured fields fields = {} for label_span in listing_table.find_all("span", class_="sailvb"): label = label_span.get_text(strip=True).rstrip(":").strip() value_span = label_span.find_next("span", class_="sailvk") if value_span: fields[label] = value_span.get_text(strip=True) price_text = fields.get("Asking", "") location = fields.get("Location", "") # Build context string from structured fields context_parts = [f"{k}: {v}" for k, v in fields.items()] context = " | ".join(context_parts) # Extract image — upgrade thumbnail to full-size img_src = "" img_tag = listing_table.find("img") if img_tag: img_src = img_tag.get("src", "") or img_tag.get("data-src", "") if img_src and not img_src.startswith("http"): img_src = "https://www.sailboatlistings.com" + img_src # Upgrade /sailimg/t/ (thumbnail) or /sailimg/m/ (medium) → /sailimg/ (full) for thumb in ["/sailimg/t/", "/sailimg/m/"]: if thumb in img_src: img_src = img_src.replace(thumb, "/sailimg/") break if not img_src: img_src = f"https://www.sailboatlistings.com/sailimg/{sid}/photo1.jpg" results.append({ "url": canonical, "title": title or context[:80], "snippet": context, "price_text": price_text, "img_url": img_src, "location": location, "source": "SailboatListings", "source_type": "broker", "category": "Veleros Global", "fields": fields, # pass structured fields for direct extraction }) page_results += 1 # ── SIDEBAR FEATURED (/view/XXXXX) — less data but more listings ── for a in soup.find_all("a", class_="featured"): href = a.get("href", "") view_m = re.search(r'/view/(\d+)', href) if not view_m: continue sid = view_m.group(1) canonical = f"https://www.sailboatlistings.com/view/{sid}" if canonical in seen_urls: continue seen_urls.add(canonical) link_text = a.get_text(" ", strip=True) # Extract price from link text: "45' Alden 45 Falmouth, Maine Asking $355,000" price_m = re.search(r'Asking\s*\$([\d,]+)', link_text) price_text = f"${price_m.group(1)}" if price_m else "" # Extract location from featurespec span spec_span = a.find("span", class_="featurespec") location = "" if spec_span: spec_text = spec_span.get_text(" ", strip=True) # Location is before "Asking" loc_m = re.search(r'^(.+?)\s*Asking', spec_text) if loc_m: location = loc_m.group(1).strip() img_src = "" img_tag = a.find("img") if img_tag: img_src = img_tag.get("src", "") or "" if img_src and not img_src.startswith("http"): img_src = "https://www.sailboatlistings.com" + img_src for thumb in ["/sailimg/t/", "/sailimg/m/"]: if thumb in img_src: img_src = img_src.replace(thumb, "/sailimg/") break if not img_src: img_src = f"https://www.sailboatlistings.com/sailimg/{sid}/photo1.jpg" results.append({ "url": canonical, "title": link_text.split("Asking")[0].strip() if "Asking" in link_text else link_text, "snippet": link_text, "price_text": price_text, "img_url": img_src, "location": location, "source": "SailboatListings", "source_type": "broker", "category": "Veleros Global", "fields": {}, # no structured fields for sidebar listings }) page_results += 1 print(f"[SailboatListings] Page {page}: {page_results} listings (total: {len(results)})") if page_results == 0: break except Exception as e: print(f"[SailboatListings] Error page {page}: {e}") break print(f"[SailboatListings] Done — {len(results)} listings total") return results def scrape_and_extract_sailboatlistings(query: str, filters: dict, search_id: str, max_pages: int = 8): """ Runs SailboatListings scraping + AI extraction inline. Saves each vessel to DB immediately so it appears in dashboard in real-time. """ print(f"[SBL-Thread] Starting SailboatListings extraction...") raw_results = scrape_sailboatlistings(query, filters, max_pages) if not raw_results: print("[SBL-Thread] No results from SailboatListings") return sbl_min_loa = float(filters.get("min_loa") or 0) sbl_max_price = float(filters.get("max_price") or 0) saved = 0 for raw in raw_results: if search_state.get('search_id') != search_id or search_state.get('cancelled'): print("[SBL-Thread] Search cancelled — stopping") return try: snippet = raw.get("snippet", "") title = raw.get("title", "") fields = raw.get("fields", {}) # structured fields from main listings src = snippet + " " + title # ── Helper to parse feet values like "30'" or "5.25'" ── def parse_ft(val): if not val: return None m = re.match(r'([\d.]+)', val) return float(m.group(1)) if m else None # ── Use structured fields directly when available (main listings) ── if fields: loa_ft = parse_ft(fields.get("Length")) beam_ft = parse_ft(fields.get("Beam")) draft_ft = parse_ft(fields.get("Draft")) year_val = fields.get("Year", "") year_m = re.search(r'(\d{4})', year_val) asking = fields.get("Asking", "") price_r = re.search(r'\$\s*([\d,]{3,})', asking) location = fields.get("Location", "") hull_val = fields.get("Hull", "").lower() else: # Fallback: regex for sidebar/featured listings length_r = re.search(r'Length:\s*([\d.]+)', src, re.IGNORECASE) beam_r = re.search(r'Beam:\s*([\d.]+)', src, re.IGNORECASE) draft_r = re.search(r'Draft:\s*([\d.]+)', src, re.IGNORECASE) year_r = re.search(r'Year:\s*(\d{4})', src, re.IGNORECASE) price_r = re.search(r'(?:Asking|Price):?\s*\$\s*([\d,]{3,})', src, re.IGNORECASE) if not price_r: price_r = re.search(r'\$\s*([\d,]{4,})', src) loa_ft = float(length_r.group(1)) if length_r else None beam_ft = float(beam_r.group(1)) if beam_r else None draft_ft = float(draft_r.group(1)) if draft_r else None year_m = year_r location = raw.get("location", "") hull_val = "" loc_r = re.search(r'Location:\s*([^\n\r]{3,60}?)(?:\s{2,}|$)', src, re.IGNORECASE) if loc_r: location = loc_r.group(1).strip() hull_r2 = re.search(r'Hull:\s*([^\n\r]{3,50}?)(?:\s{2,}|$)', src, re.IGNORECASE) if hull_r2: hull_val = hull_r2.group(1).lower() # Fallback: extract LOA from title e.g. "35' Pearson 35" if not loa_ft: tm = re.search(r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', title, re.IGNORECASE) if tm: loa_ft = float(tm.group(1)) loa_m = round(loa_ft * 0.3048, 1) if loa_ft else None beam_m = round(beam_ft * 0.3048, 1) if beam_ft else None draft_m = round(draft_ft * 0.3048, 1) if draft_ft else None year = int(year_m.group(1)) if year_m else None location = location or raw.get("location", "") price_usd = None if price_r: try: price_usd = float(price_r.group(1).replace(",", "")) except: pass if not price_usd and raw.get("price_text"): pm = re.search(r'[\d,]+', raw["price_text"].replace("$","")) if pm: try: price_usd = float(pm.group().replace(",","")) except: pass # Skip only if absolutely no data if not loa_m and not year and not price_usd: continue # Apply filters if sbl_min_loa and loa_m and loa_m < (sbl_min_loa - 0.15): continue if sbl_max_price and price_usd and price_usd > sbl_max_price * 1.01: continue # Hull normalisation hull_txt = hull_val hull = ("Fiberglass" if "fiber" in hull_txt or "glass" in hull_txt else "Steel" if "steel" in hull_txt else "Aluminum" if "alum" in hull_txt else "Wood" if "wood" in hull_txt else "Composite" if "comp" in hull_txt else "Unknown") # Algorithmic score (fast, no AI) score = 50 if loa_m: score += min(15, int((loa_m - 13) * 1.5)) if loa_m >= 13 else 0 if year: score += min(10, max(0, (year - 1980) // 3)) if price_usd and loa_m: price_per_ft = price_usd / (loa_m / 0.3048) if price_per_ft < 500: score += 15 elif price_per_ft < 1000: score += 8 flags = [] if price_usd and loa_m and (price_usd / (loa_m / 0.3048)) < 600: flags.append("below_market") data = { "name": title or "SailboatListings boat", "vessel_type": "Sailboat", "loa_m": loa_m, "beam_m": beam_m, "draft_m": draft_m, "year_built": year, "hull": hull, "propulsion": "Sail", "status": "active", "price_usd": price_usd, "currency": "USD", "location": location, "country": "US", "description": f"Velero {title}. LOA: {loa_ft}ft. {location}".strip("."), "flags": flags, "score": min(100, score), "images": [raw["img_url"]] if raw.get("img_url") else [], "source_url": raw["url"], "source_name": "SailboatListings", } vid = save_vessel(data) if vid > 0: search_state['found'] += 1 saved += 1 msg = f"✓ {title} ({loa_ft}ft, ${price_usd:,.0f}) — SailboatListings" if price_usd else f"✓ {title} ({loa_ft}ft) — SailboatListings" print(f"[SBL-Thread] {msg}") search_state['log'].append(msg) except Exception as e: print(f"[SBL-Thread] Error on {raw.get('title','?')}: {e}") print(f"[SBL-Thread] Done — {saved}/{len(raw_results)} vessels saved") def stealth_fetch(url: str, max_chars: int = 3000) -> tuple: """ Fetch a Cloudflare-protected page using Playwright with human-like behavior. Returns (text, [image_urls]) Techniques used: - Realistic viewport and user agent - Random mouse movements before scrolling - Human-like delays - Accept cookies automatically - Disable webdriver flags """ text = "" images = [] try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=[ '--disable-blink-features=AutomationControlled', '--disable-dev-shm-usage', '--no-sandbox', '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process', ] ) context = browser.new_context( viewport={'width': 1366, 'height': 768}, user_agent=random.choice(USER_AGENTS), locale='en-US', timezone_id='America/New_York', java_script_enabled=True, ignore_https_errors=True, extra_http_headers={ 'Accept-Language': 'en-US,en;q=0.9', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', } ) # Remove webdriver property context.add_init_script(""" Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]}); window.chrome = {runtime: {}}; """) page = context.new_page() # Navigate with realistic timeout page.goto(url, timeout=30000, wait_until='domcontentloaded') # Random delay like a human reading page.wait_for_timeout(random.randint(1500, 3000)) # Accept cookies if button exists for selector in ['button:has-text("Accept")', 'button:has-text("Accept All")', '#onetrust-accept-btn-handler', '.cookie-accept']: try: page.click(selector, timeout=1000) page.wait_for_timeout(500) break except: pass # Scroll down naturally page.evaluate("window.scrollBy(0, 300)") page.wait_for_timeout(random.randint(500, 1200)) page.evaluate("window.scrollBy(0, 300)") page.wait_for_timeout(random.randint(300, 800)) # Get content html = page.content() browser.close() from bs4 import BeautifulSoup as BS soup = BS(html, 'html.parser') # Extract images — check all lazy-load attributes skip_words = ['logo','icon','avatar','banner','pixel','sprite','ad', 'placeholder','blank','loading','spacer','1x1'] seen_imgs = set() for img in soup.find_all('img'): src = _extract_best_src(img) if src and src not in seen_imgs: if not any(s in src.lower() for s in skip_words): seen_imgs.add(src) images.append(src) if len(images) >= 12: break for tag in soup(['script','style','nav','footer','header','aside']): tag.decompose() text = ' '.join(soup.get_text(' ', strip=True).split())[:max_chars] except Exception as e: print(f"[Stealth] Error: {e}") return text, images # Sites that need stealth scraping (Cloudflare protected) STEALTH_REQUIRED = [ 'yachtworld.com', 'boats.com', 'boattrader.com', 'rightboat.com', 'boat24.com', 'yachtall.com', 'botentekoop.nl', 'leboncoin.fr', 'annoncesbateau.com', 'thehulltruth.com', 'cruisersforum.com', ] def smart_fetch(url: str, max_chars: int = 3000) -> tuple: """Use stealth for protected sites, regular fetch for others.""" domain = url.split('/')[2].replace('www.','') if '//' in url else '' needs_stealth = any(s in domain for s in STEALTH_REQUIRED) if needs_stealth: print(f"[Fetch] Using stealth for {domain}") return stealth_fetch(url, max_chars) else: return fetch_page_with_images(url) def scrape_yachtworld(query: str, filters: dict, max_pages: int = 5) -> list: """ Dedicated YachtWorld stealth scraper. Builds filtered URL and navigates with human-like behavior. """ results = [] seen = set() # Build YachtWorld filtered URL vessel_type = filters.get("type","").lower() yw_type = "sail" if vessel_type in ["sailboat","sail","velero","yacht",""] else "power" min_loa = filters.get("min_loa","") max_price = filters.get("max_price","") base_url = f"https://www.yachtworld.com/boats-for-sale/type-{yw_type}/" if vessel_type in ["sailboat","sail","velero",""]: base_url = "https://www.yachtworld.com/boats-for-sale/type-sail/class-sail-cruiser/" if min_loa: ft = int(float(min_loa) / 0.3048) base_url += f"length-{ft}/" if max_price: base_url += f"price-0,{max_price}/" print(f"[YachtWorld] Scraping: {base_url}") try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=['--disable-blink-features=AutomationControlled','--no-sandbox'] ) context = browser.new_context( viewport={'width': 1920, 'height': 1080}, user_agent=random.choice(USER_AGENTS), locale='en-US', timezone_id='America/New_York', ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});" "window.chrome = {runtime: {}};" ) for page_num in range(1, max_pages + 1): if search_state.get('cancelled'): break page_url = base_url if page_num == 1 else base_url + f"?page={page_num}" page = context.new_page() try: page.goto(page_url, timeout=30000, wait_until='domcontentloaded') page.wait_for_timeout(random.randint(2000, 4000)) # Scroll to load lazy content for _ in range(3): page.evaluate("window.scrollBy(0, 400)") page.wait_for_timeout(random.randint(400, 800)) html = page.content() page.close() from bs4 import BeautifulSoup as BS soup = BS(html, 'html.parser') # YachtWorld listing cards page_count = 0 for a in soup.find_all('a', href=True): href = a['href'] if '/boat-details/' in href or '/yacht/' in href: if not href.startswith('http'): href = 'https://www.yachtworld.com' + href if href in seen: continue seen.add(href) title = a.get_text(strip=True) parent = a.find_parent() or a ctx = parent.get_text(' ', strip=True)[:300] img = "" for im in parent.find_all('img'): src = im.get('src') or im.get('data-src','') if src and 'rendered_yacht' in src: img = src break results.append({ "url": href, "title": title, "snippet": ctx, "price_text": "", "img_url": img, "location": "", "source": "YachtWorld", "source_type": "broker", "category": "Brokers Especializados", }) page_count += 1 print(f"[YachtWorld] Page {page_num}: {page_count} listings") if page_count == 0: break # Polite pause between pages if page_num < max_pages: polite_pause("YachtWorld") except Exception as e: print(f"[YachtWorld] Page {page_num} error: {e}") try: page.close() except: pass break browser.close() except Exception as e: print(f"[YachtWorld] Fatal error: {e}") print(f"[YachtWorld] Total: {len(results)} listings") return results def fetch_page_text(url: str, max_chars: int = 2000) -> str: """Fetch plain text from a page.""" try: r = requests.get(url, headers=get_headers(), timeout=15, verify=False) if r.status_code != 200: return "" soup = BeautifulSoup(r.text, "html.parser") for tag in soup(["script","style","nav","footer","header","aside","noscript"]): tag.decompose() return " ".join(soup.get_text(" ", strip=True).split())[:max_chars] except Exception: return "" def _extract_best_src(img_tag) -> str: """Extract the best image URL from an tag, handling lazy-load patterns.""" candidates = [ img_tag.get("src",""), img_tag.get("data-src",""), img_tag.get("data-lazy-src",""), img_tag.get("data-original",""), img_tag.get("data-lazy",""), img_tag.get("data-image",""), img_tag.get("data-full",""), img_tag.get("data-url",""), img_tag.get("data-hi-res-src",""), ] # Also check srcset — take the largest variant srcset = img_tag.get("srcset","") or img_tag.get("data-srcset","") if srcset: parts = [p.strip().split()[0] for p in srcset.split(",") if p.strip()] candidates.extend(parts) for c in candidates: c = c.strip() if c and c.startswith("http") and not c.startswith("data:"): return c return "" def fetch_page_with_images(url: str) -> tuple: """Fetch page text AND images. Returns (text, [image_urls])""" text = "" images = [] base_url = "/".join(url.split("/")[:3]) try: r = requests.get(url, headers=get_headers(referer=url), timeout=18, verify=False) if r.status_code != 200: return fetch_page_text(url), [] soup = BeautifulSoup(r.text, "html.parser") # Extract images before stripping tags skip_words = ["logo","icon","avatar","banner","pixel","track","ad","sprite","button", "placeholder","blank","loading","spacer","1x1","transparent"] seen_imgs = set() for img in soup.find_all("img"): src = _extract_best_src(img) if not src: continue # Normalise relative URLs if src.startswith("//"): src = "https:" + src elif src.startswith("/"): src = base_url + src if not src.startswith("http"): continue if any(s in src.lower() for s in skip_words): continue if src in seen_imgs: continue try: w = int(str(img.get("width","0")).replace("px","") or 0) if 0 < w < 100: continue except: pass seen_imgs.add(src) images.append(src) if len(images) >= 10: break for tag in soup(["script","style","nav","footer","header","aside","noscript"]): tag.decompose() text = " ".join(soup.get_text(" ", strip=True).split())[:3000] except Exception: text = fetch_page_text(url) return text, images # ══════════════════════════════════════════════════════════════════════════════ # DEDICATED SOURCE SCRAPERS # Each function handles one site's quirks. scrape_source_router dispatches here. # ══════════════════════════════════════════════════════════════════════════════ def scrape_ebay(src: dict, query: str, filters: dict) -> list[dict]: """ eBay Marine scraper — uses Playwright (Akamai blocks plain requests). Handles all eBay entries: Marine, Auction, Sail, Salvage, etc. New eBay layout (2024+) uses: - for item links - Text title in nearby spans/divs - with i.ebayimg.com CDN URLs (s-l500 quality) """ results = [] seen = set() raw_url = src.get("search_url", "") if not raw_url: return [] clean_q = " ".join(dict.fromkeys(query.strip().split())) url = raw_url.replace("{query}", requests.utils.quote(clean_q)) # ── Adjust eBay category based on vessel type filter ────────────────────── # 26429=All Boats 36431=Sailboats 36432=Powerboats 26430=PWC 63613=Kayaks vtype = filters.get("type","").lower() if filters else "" EBAY_CAT = { "sailboat": "36431", "sail": "36431", "velero": "36431", "motor": "36432", "motorboat": "36432", "yacht": "36432", "fishing": "36432", "tug": "36432", "barge": "36432", "offshore": "36432", "ferry": "36432", } if vtype and vtype in EBAY_CAT: url = re.sub(r'_sacat=\d+', f'_sacat={EBAY_CAT[vtype]}', url) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"] ) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=30000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(1500, 2500)) # Scroll a bit to trigger lazy images page.evaluate("window.scrollBy(0,600)") page.wait_for_timeout(800) html = page.content() except Exception as e: print(f"[{src['name']}] Playwright nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") # ── New layout (2024+): li.s-card ───────────────────────────────────── cards = soup.find_all("li", class_="s-card") # ── Old layout fallback: li.s-item ──────────────────────────────────── if not cards: return _parse_ebay_old_layout(soup, src) for card in cards: try: # Title + URL — a.s-card__link WITHOUT image-treatment class title_link = None for a in card.find_all("a", class_="s-card__link"): if "image-treatment" in (a.get("class") or []): continue t = a.get_text(strip=True) if t and not t.lower().startswith("shop on ebay"): title_link = a break if not title_link: continue href = title_link.get("href", "") if "/itm/" not in href: continue m = re.search(r'(https?://(?:www\.)?ebay\.com/itm/\d+)', href) if not m: continue href = m.group(1) if href in seen: continue seen.add(href) # Clean title — strip eBay UI noise appended to link text title = title_link.get_text(strip=True) title = re.sub(r'\s*Opens in a new window or tab.*', '', title, flags=re.IGNORECASE).strip() # Price ── .s-card__price price_tag = (card.find(class_="s-card__price") or card.find(class_="s-item__price")) price = price_tag.get_text(strip=True) if price_tag else "" # Image ── img inside a.s-card__link.image-treatment img = "" img_link = card.find("a", class_="image-treatment") if img_link: im = img_link.find("img") if im: raw = (_extract_best_src(im) or im.get("src","") or im.get("data-src","")) if raw: img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', raw) # Fallback: any ebayimg.com src in the card if not img: for im in card.find_all("img"): raw = (_extract_best_src(im) or im.get("src","")) if raw and "ebayimg.com" in raw: img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', raw) break # Location ── "Located in: XXX" — stop before "Delivery" location = "" card_text = card.get_text(" ", strip=True) lm = re.search( r'[Ll]ocated in[:\s]+([A-Za-z][^,\|•\n$\d]{2,30})', card_text) if lm: loc_raw = lm.group(1).strip() # Trim trailing noise like "Delivery or pickup..." loc_raw = re.split(r'\s+[Dd]elivery|\s+[Ss]hipping', loc_raw)[0].strip() location = loc_raw results.append({ "url": href, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "eBay"), "source_type": src.get("type", "classifieds"), "category": src.get("category", "Clasificados USA"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings (new layout)") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def _parse_ebay_old_layout(soup, src: dict) -> list[dict]: """Fallback for the classic eBay li.s-item layout.""" results = [] seen = set() for item in soup.find_all("li", class_="s-item"): try: link_tag = item.find("a", class_="s-item__link") if not link_tag: continue href = link_tag.get("href","") if "/itm/" not in href: continue m = re.search(r'(https?://www\.ebay\.com/itm/\d+)', href) if m: href = m.group(1) if href in seen: continue seen.add(href) title_tag = (item.find("span", class_="BOLD") or item.find("div", class_="s-item__title") or item.find("span", class_="s-item__title")) title = (title_tag or link_tag).get_text(strip=True) if not title or title.lower().startswith("shop on ebay"): continue price_tag = item.find("span", class_="s-item__price") price = price_tag.get_text(strip=True) if price_tag else "" img = "" img_tag = item.find("img") if img_tag: img = (_extract_best_src(img_tag) or img_tag.get("src","")) if img: img = re.sub(r's-l\d+\.(jpg|webp|jpeg)', r's-l500.\1', img) loc_tag = (item.find("span", class_="s-item__location") or item.find("span", class_="s-item__itemLocation")) location = "" if loc_tag: location = (loc_tag.get_text(strip=True) .replace("Located in: ","").strip()) results.append({ "url": href, "title": title, "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name","eBay"), "source_type": src.get("type","classifieds"), "category": src.get("category","Clasificados USA"), }) except Exception: continue print(f"[{src.get('name','eBay')}] {len(results)} listings (old layout)") return results def scrape_boattrader(src: dict, query: str, filters: dict) -> list[dict]: """ BoatTrader scraper — uses Playwright (Cloudflare Turnstile on plain requests). Card structure (stable classes): li.lib-card — card root a[href^="/boat/...-/"] — listing URL [class*=listingTitle] — title element [class*=listingPrice] — price element img — photo city, STATE ZIP pattern in text — location """ results = [] seen = set() raw_url = src.get("search_url", "") if not raw_url: return [] clean_q = " ".join(dict.fromkeys(query.strip().split())) url = raw_url.replace("{query}", requests.utils.quote(clean_q)) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"] ) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") # BoatTrader needs time to hydrate React and load listing cards page.wait_for_timeout(random.randint(4000, 6000)) page.evaluate("window.scrollBy(0, 600)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[{src['name']}] Playwright nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") # ── Card root: li.lib-card ───────────────────────────────────────────── cards = soup.find_all("li", class_="lib-card") if not cards: # Fallback: any element with lib-card class cards = soup.find_all(class_=re.compile(r'\blib-card\b')) for card in cards: try: # Link ── /boat/YEAR-MAKE-...-ID/ link_tag = card.find( "a", href=re.compile(r'^/boat/[\w-]+-\d+/$')) if not link_tag: continue href = "https://www.boattrader.com" + link_tag["href"] if href in seen: continue seen.add(href) # Title ── element whose class contains 'listingTitle' title_el = card.find( class_=re.compile(r'listingTitle', re.I)) if title_el: title = title_el.get_text(strip=True) else: # Fallback: build from URL slug (2026-catalina-34-123 → 2026 Catalina 34) slug = link_tag["href"].strip("/").split("/")[-1] parts = slug.rsplit("-", 1)[0].replace("-", " ").title() title = parts if not title: continue # Price ── element whose class contains 'listingPrice' price_el = card.find( class_=re.compile(r'listingPrice', re.I)) price = "" if price_el: raw_price = price_el.get_text(" ", strip=True) # Extract only the first dollar amount — ignore "/mo*" noise pm = re.search(r'\$\s*([\d,]+)', raw_price) if pm: price = f"${pm.group(1)}" # Image ── first with a boatsgroup or boattrader CDN src img = "" for im in card.find_all("img"): raw = (_extract_best_src(im) or im.get("src","") or im.get("data-src","")) if raw and raw.startswith("http") and not raw.endswith(".svg"): img = raw break # Location ── "City, ST ZIP" pattern in card text # Use listingCaption element if available (more precise) location = "" caption_el = card.find(class_=re.compile(r'listingCaption|listingLocation', re.I)) search_text = caption_el.get_text(" ", strip=True) if caption_el else card.get_text(" ", strip=True) lm = re.search( r'\b([A-Z][a-zA-Z\s]{2,20},\s+[A-Z]{2}(?:\s+\d{5})?)', search_text) if lm: location = lm.group(1).strip() results.append({ "url": href, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "BoatTrader"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_apolloduck(src: dict, query: str, filters: dict) -> list[dict]: """ Apollo Duck scraper — plain requests + BS4 (no JS needed). Two card types on the listing page: Sidebar cards: div.eastSDFPPanel → a.SidebarTitle, a.SidebarPrice, img Featured cards: div._FeatureAdPanel → a._FeatureTitle, span._FeaturePrice, img, td._PanelSpecData (location) Listing URL pattern: https://www.apolloduck.com/boat/{slug}/{id} """ results = [] seen = set() # Use Apollo Duck keyword search — returns results filtered by query. # Strip trailing "for sale" / "en venta" / "a vendre" since Apollo Duck # searches listing titles and those phrases rarely appear there. stripped_q = re.sub( r'\s*(for\s+sale|en\s+venta|à\s+vendre|zu\s+verkaufen)\s*$', '', query.strip(), flags=re.I).strip() clean_q = requests.utils.quote(stripped_q or query.strip()) if clean_q: url = f"https://www.apolloduck.com/search.phtml?search={clean_q}&sr=1&q=1" else: raw_url = src.get("search_url", "") or "https://www.apolloduck.com/boats/used-boats-for-sale" url = raw_url.replace("{query}", clean_q) is_search = bool(clean_q) # only featured cards are query-filtered try: headers = { "User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", } resp = requests.get(url, headers=headers, timeout=20, allow_redirects=True) resp.raise_for_status() resp.encoding = resp.apparent_encoding or "utf-8" soup = BeautifulSoup(resp.text, "html.parser") def _parse_card(card, title_sel, price_sel, is_featured=False): """Common extraction for both card types.""" title_el = card.select_one(title_sel) if not title_el: return title = title_el.get_text(strip=True) if not title: return # URL — from title link or image link href = title_el.get("href", "") if not href: a = card.find("a", href=re.compile(r'/boat/')) href = a["href"] if a else "" if not href: return full_url = ("https://www.apolloduck.com" + href if href.startswith("/") else href) if full_url in seen: return seen.add(full_url) # Price price_el = card.select_one(price_sel) price = price_el.get_text(strip=True) if price_el else "" # Image img = "" for im in card.find_all("img"): raw = (im.get("src") or im.get("data-src") or im.get("data-lazy-src") or "") if raw and raw.startswith("http") and not raw.endswith(".svg"): img = raw break # srcset fallback ss = im.get("srcset","") if ss: img = ss.split()[0] break # Location — only featured cards have it location = "" if is_featured: for lbl in card.select("td._PanelSpecLabel"): if "location" in lbl.get_text(strip=True).lower(): loc_td = lbl.find_next_sibling("td") if loc_td: location = loc_td.get_text(strip=True) break results.append({ "url": full_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "Apollo Duck"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) # Featured cards — always query-filtered on search results (~60-100/page) for card in soup.select("div._FeatureAdPanel"): _parse_card(card, "a._FeatureTitle", "span._FeaturePrice", is_featured=True) # Sidebar cards — only when browsing a category (NOT on keyword search, # because sidebar is always the same 101 generic listings regardless of query) if not is_search: for card in soup.select("div.eastSDFPPanel"): _parse_card(card, "a.SidebarTitle", "a.SidebarPrice") print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_boatsdotcom(src: dict, query: str, filters: dict) -> list[dict]: """ Boats.com scraper — uses Playwright (same Boats Group infrastructure as BoatTrader). Two card types: Sponsored/real: li[data-listing-id] → h2+div.year, div.price, div.img-container img, div.country OEM specs: li.enhanced.oem → h2+div.year, div.price, div.img-container img (no location) Listing URL pattern: https://www.boats.com/{type}/{year}-{make}-{id}/ """ results = [] seen = set() raw_url = src.get("search_url", "") or "https://www.boats.com/boats-for-sale/?query={query}" clean_q = requests.utils.quote(query.strip()) url = raw_url.replace("{query}", clean_q) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"] ) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", timezone_id="America/New_York", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000, 6000)) page.evaluate("window.scrollBy(0, 600)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[{src['name']}] Playwright nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") def _extract_card(card, has_location=True): # URL a = card.find("a", href=re.compile(r'^/')) if not a: return href = "https://www.boats.com" + a["href"] if href in seen: return seen.add(href) # Title = year + model name year_el = card.select_one("div.year") name_el = card.select_one("h2") year = year_el.get_text(strip=True) if year_el else "" name = name_el.get_text(strip=True) if name_el else "" title = f"{year} {name}".strip() if year else name if not title: return # Price price_el = card.select_one("div.price") price = "" if price_el: raw_p = price_el.get_text(" ", strip=True) pm = re.search(r'\$\s*([\d,]+)', raw_p) price = f"${pm.group(1)}" if pm else raw_p[:30] # Image img = "" img_container = card.select_one("div.img-container") if img_container: im = img_container.find("img") if im: img = (_extract_best_src(im) or im.get("src","") or im.get("data-src","")) # Location location = "" if has_location: loc_el = card.select_one("div.country") if loc_el: location = loc_el.get_text(strip=True) results.append({ "url": href, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "Boats.com"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) # Sponsored/real marketplace listings for card in soup.select("li[data-listing-id]"): _extract_card(card, has_location=True) # OEM spec sheets for card in soup.select("li.enhanced.oem"): _extract_card(card, has_location=False) print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_craigslist(src: dict, query: str, filters: dict) -> list[dict]: """ Craigslist boats scraper — plain requests + BS4. Card root : div[data-pid] (class="cl-search-result") Title : a.posting-title span.label URL : a.main[href] (full absolute URL with regional subdomain) Price : span.priceinfo Location : span.result-location Image : img[data-image-index="0"] inside div.cl-gallery """ results = [] seen = set() # Craigslist has no national search — scrape several major coastal cities CITIES = ["sfbay", "losangeles", "seattle", "miami", "boston", "newyork", "chicago", "houston", "dallas", "denver", "phoenix", "atlanta", "portland", "sandiego", "tampa", "minneapolis", "stlouis", "nashville", "raleigh", "saltlakecity"] qs = requests.utils.quote(query.strip()) try: from playwright.sync_api import sync_playwright all_html_parts = [] with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True, ) # Fetch 3 random cities to keep runtime reasonable for city in random.sample(CITIES, min(3, len(CITIES))): city_url = f"https://{city}.craigslist.org/search/boa?query={qs}&sort=rel" page = ctx.new_page() try: page.goto(city_url, timeout=25000, wait_until="domcontentloaded") page.wait_for_timeout(2500) all_html_parts.append(page.content()) except Exception: pass finally: try: page.close() except: pass browser.close() if not all_html_parts: return [] # Parse all city HTMLs for html in all_html_parts: soup = BeautifulSoup(html, "html.parser") for card in soup.find_all(attrs={"data-pid": True}): try: # URL — from the main image link (absolute) a_main = card.find("a", class_="main") if not a_main: continue listing_url = a_main.get("href", "") if not listing_url or listing_url in seen: continue seen.add(listing_url) # Title — from card title attr or span.label title = card.get("title", "") if not title: span = card.find("span", class_="label") title = span.get_text(strip=True) if span else "" if not title: continue # Price price_el = card.find("span", class_="priceinfo") price = price_el.get_text(strip=True) if price_el else "" # Location loc_el = card.find("span", class_="result-location") location = loc_el.get_text(strip=True) if loc_el else "" # Image — first img with data-image-index="0" img = "" im = card.find("img", attrs={"data-image-index": "0"}) if im: img = im.get("src", "") or im.get("data-src", "") if not img: im = card.find("img") if im: img = im.get("src", "") or im.get("data-src", "") results.append({ "url": listing_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "Craigslist Boats"), "source_type": src.get("type", "classifieds"), "category": src.get("category", "Clasificados Generales"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_rightboat(src: dict, query: str, filters: dict) -> list[dict]: """ Rightboat scraper — Playwright (JS-rendered, Tailwind CSS). Card root : div[data-tracking-bound="true"] Image : img.object-cover (first inside card) Title : first with href containing /boats-for-sale/ that has text Price : element containing fa-tag icon's sibling text Location : element containing fa-location-pin icon's sibling text """ results = [] seen = set() raw_url = (src.get("search_url", "") or "https://www.rightboat.com/boats-for-sale/?q={query}") clean_q = requests.utils.quote(query.strip()) url = raw_url.replace("{query}", clean_q) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"] ) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True, ) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" ) page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(5000, 7000)) page.evaluate("window.scrollBy(0, 800)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[{src['name']}] Playwright nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") # Cards are div[data-tracking-bound="true"] cards = soup.find_all(attrs={"data-tracking-bound": "true"}) for card in cards: try: # URL — the card ITSELF is the element href = card.get("href", "") if not href or "/boats-for-sale/" not in href: continue listing_url = ("https://www.rightboat.com" + href if href.startswith("/") else href) if listing_url in seen: continue seen.add(listing_url) # Image — first object-cover img (main photo) img = "" im = card.find("img", class_=re.compile(r'object-cover')) if im: img = im.get("src", "") or im.get("data-src", "") # Title — from img alt attribute (most reliable) or heading title = "" if im: title = im.get("alt", "").strip() if not title: h_el = card.find(re.compile(r'^h[1-4]$')) title = h_el.get_text(strip=True) if h_el else "" if not title: # Build from URL slug: /boats-for-sale/make/model/rbXXX parts = href.strip("/").split("/") if len(parts) >= 3: title = " ".join(parts[1:-1]).replace("-", " ").title() if not title: continue # Price —

or regex fallback price = "" price_el = card.find("p", class_=re.compile(r'font-bold')) if price_el: pt = price_el.get_text(strip=True) if re.search(r'[\$£€]', pt): price = pt if not price: pm = re.search(r'[\$£€]\s*[\d,]+', card.get_text()) if pm: price = pm.group(0) # Location — text inside same div as fa-location-pin icon location = "" pin_icon = card.find("i", class_=re.compile(r'fa-location')) if pin_icon: # Typically:

"City, State"
row = pin_icon.find_parent() if row: location = row.get_text(" ", strip=True).strip() results.append({ "url": listing_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "Rightboat"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_cooperss(src: dict, query: str, filters: dict) -> list[dict]: """ Cooper Capital Specialty Salvage (cooperss.com). Salvage / insurance-loss vessels. Structure (paired divs, same index): div.listing-thumb — image + link (assets/detail/?name=marine&id=N) div.listing-detail — h5.blue (name) + table (Year,Size,Location,Min Bid…) """ results = [] seen = set() base = "https://www.cooperss.com" try: headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"} resp = requests.get(base + "/", headers=headers, timeout=20) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") thumbs = [el for el in soup.find_all(class_="listing-thumb") if "slick-cloned" not in (el.get("class") or [])] details = [el for el in soup.find_all(class_="listing-detail") if "slick-cloned" not in (el.get("class") or [])] for thumb, detail in zip(thumbs, details): try: # URL a = thumb.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = base + "/" + href.lstrip("/") if href in seen: continue seen.add(href) # Image img_tag = thumb.find("img") img = img_tag.get("src", "") if img_tag else "" if img and not img.startswith("http"): img = base + "/" + img.lstrip("/") # Title — h5.blue (vessel name) h5 = detail.find("h5", class_="blue") title = h5.get_text(strip=True).split("\n")[0].strip() if h5 else "" # Remove video-button text artifact for tag in (h5.find_all("a") if h5 else []): tag.decompose() title = h5.get_text(strip=True) if h5 else title if not title: continue # Parse the detail table rows = {td.get_text(strip=True): tds[1].get_text(strip=True) for tr in detail.find_all("tr") if len(tds := tr.find_all("td")) == 2 for td in [tds[0]]} year = rows.get("Year", "") size = rows.get("Size", "") location = rows.get("Location", "") min_bid = rows.get("Minimum Bid", "") loss_type= rows.get("Type of Loss", "") deadline = rows.get("Bid Deadline", "") if year: title = f"{year} {title}".strip() price = f"Min Bid ${min_bid}" if min_bid else "" snippet_parts = [p for p in [price, loss_type, location, f"Deadline: {deadline}" if deadline else ""] if p] results.append({ "url": href, "title": title[:120], "snippet": " | ".join(snippet_parts), "price_text": price, "img_url": img, "location": location, "size_m": size, "source": src.get("name", "Cooper Salvage"), "source_type": "salvage", "category": src.get("category", "Salvage & Wrecks"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_inautia(src: dict, query: str, filters: dict) -> list[dict]: """ iNautia scraper — same Boats Group platform as BoatTrader/Boats.com. Card: div[data-grid-index] Link: a.grid-listing-link[href] → /boat/YEAR-MAKE-MODEL-ID/ Title: [class*=listingTitle] Price: data-ssr-meta="make|type|len||price_eur" (5th field) Location: [class*=listingBody] Image: first CDN img in card """ results = [] seen = set() raw_url = (src.get("search_url", "") or "https://www.inautia.com/boats/?q={query}") url = raw_url.replace("{query}", requests.utils.quote(query.strip())) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};") page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000, 6000)) page.evaluate("window.scrollBy(0,600)") page.wait_for_timeout(1500) html = page.content() except Exception as e: print(f"[{src['name']}] nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") cards = soup.find_all(attrs={"data-grid-index": True}) for card in cards: try: link_tag = card.find("a", class_=re.compile(r'grid-listing-link')) if not link_tag: continue href = link_tag.get("href", "") if not href: continue full_url = ("https://www.inautia.com" + href if href.startswith("/") else href) if full_url in seen: continue seen.add(full_url) # Title title_el = card.find(class_=re.compile(r'listingTitle', re.I)) title = title_el.get_text(strip=True) if title_el else "" if not title: slug = href.strip("/").split("/")[-1] title = slug.rsplit("-", 1)[0].replace("-", " ").title() if not title: continue # Price from data-ssr-meta (make|type|length||price_eur) price = "" meta = link_tag.get("data-ssr-meta", "") if meta: parts = meta.split("|") if len(parts) >= 5 and parts[4]: try: price = f"€{int(float(parts[4])):,}" except ValueError: pass if not price: price_el = card.find(class_=re.compile(r'listingPrice', re.I)) if price_el: raw_p = price_el.get_text(" ", strip=True) pm = re.search(r'[\$€£]\s*[\d,]+', raw_p) price = pm.group(0) if pm else "" # Location — listingBody contains "Broker | City, Country" loc_el = card.find(class_=re.compile(r'listingBody', re.I)) location = loc_el.get_text(" ", strip=True) if loc_el else "" # Image img = "" for im in card.find_all("img"): raw = (_extract_best_src(im) or im.get("src","") or im.get("data-src","")) if raw and raw.startswith("http") and not raw.endswith(".svg"): img = raw break results.append({ "url": full_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "iNautia"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_boat24(src: dict, query: str, filters: dict) -> list[dict]: """ Boat24 scraper — European marketplace, plain requests. Card: div.blurb.blurb--strip Link: data-link attr (base64 → ROT13 → URL) Title: h3.blurb__title Price: p.blurb__price Location: p.blurb__location Image: lazy via slider — extract from li.slider__slide img[src] or data-src """ results = [] seen = set() BASE = "https://www.boat24.com" raw_url = (src.get("search_url", "") or "https://www.boat24.com/en/usedboats/") url = raw_url.replace("{query}", requests.utils.quote(query.strip())) _rot13 = str.maketrans( "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "NOPQRSTUVWXYZABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm") def _decode_link(encoded: str) -> str: try: import base64 rot = base64.b64decode(encoded).decode("utf-8", errors="ignore") return rot.translate(_rot13) except Exception: return "" try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});") page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000, 6000)) html = page.content() except Exception as e: print(f"[{src['name']}] nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") cards = soup.find_all("div", class_=re.compile(r'\bblurb\b')) for card in cards: try: encoded = card.get("data-link", "") if not encoded: continue listing_url = _decode_link(encoded) if not listing_url or not listing_url.startswith("http"): # Try building from title link a = card.find("a", href=re.compile(r'/en/')) if a: listing_url = (BASE + a["href"] if a["href"].startswith("/") else a["href"]) else: continue if listing_url in seen: continue seen.add(listing_url) title_el = card.select_one("h3.blurb__title, h2.blurb__title") title = title_el.get_text(strip=True) if title_el else "" if not title: continue price_el = card.select_one("p.blurb__price") price = price_el.get_text(strip=True) if price_el else "" loc_el = card.select_one("p.blurb__location") location = "" if loc_el: location = re.sub(r'\s+', ' ', loc_el.get_text(" ", strip=True)).strip() # Image — try slider slides or first img img = "" for im in card.find_all("img"): raw = (im.get("data-src") or im.get("data-lazy") or im.get("srcset","").split()[0] or im.get("src","")) if raw and raw.startswith("http") and "/alpha.gif" not in raw: img = raw break results.append({ "url": listing_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "Boat24"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_facebook_marketplace(src: dict, query: str, filters: dict) -> list[dict]: """ Facebook Marketplace scraper. Requires a saved session file: fb_session.json (cookies from a logged-in session). If not found, returns a single instructional result. Setup: POST /api/fb-setup → launches a visible browser for the user to log in. Session file is saved automatically after login. """ import json as _json results = [] seen = set() SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json") SEARCH_URL = ("https://www.facebook.com/marketplace/search/" f"?query={requests.utils.quote(query.strip())}" "&deliveryMethod=local_pick_up") if not os.path.exists(SESSION_FILE): return [{ "url": "https://www.facebook.com/marketplace/", "title": "⚠ Facebook Marketplace — Configuración requerida", "snippet": ("Para habilitar Facebook Marketplace, ve a Fuentes y " "haz clic en 'Configurar FB'. Solo se necesita una vez."), "price_text": "", "img_url": "", "location": "", "source": "Facebook Marketplace", "source_type": "setup_required", "category": src.get("category", "Clasificados Generales"), }] try: from playwright.sync_api import sync_playwright with open(SESSION_FILE) as f: cookies = _json.load(f) with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True) context.add_cookies(cookies) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};") page = context.new_page() try: page.goto(SEARCH_URL, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(5000, 7000)) page.evaluate("window.scrollBy(0,800)") page.wait_for_timeout(2000) html = page.content() except Exception as e: print(f"[Facebook Marketplace] nav error: {e}") html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") # FB Marketplace listing cards — data-testid or aria-label patterns # Each listing is usually an
with href /marketplace/item/ID/ listing_links = soup.find_all( "a", href=re.compile(r'/marketplace/item/\d+')) for a in listing_links: try: href = a.get("href", "") full_url = ("https://www.facebook.com" + href if href.startswith("/") else href) # Normalize: remove query params after item ID full_url = re.sub(r'(/marketplace/item/\d+/).*', r'\1', full_url) if full_url in seen: continue seen.add(full_url) # Title — span or div with listing title title_el = (a.find("span", style=re.compile(r'line-clamp')) or a.find("span", class_=re.compile(r'x1lliihq|xt0psk2')) or a.find("div", class_=re.compile(r'x1lliihq'))) title = title_el.get_text(strip=True) if title_el else "" if not title: # Try aria-label on the card title = a.get("aria-label", "") if not title: continue # Price price = "" for span in a.find_all("span"): t = span.get_text(strip=True) if re.match(r'[\$£€][\d,]+', t): price = t break # Image img = "" im = a.find("img") if im: img = im.get("src", "") or im.get("data-src", "") # Location — usually a second span below price location = "" spans = [s.get_text(strip=True) for s in a.find_all("span") if s.get_text(strip=True) and s.get_text(strip=True) != title] for s in spans: if re.search(r'[A-Z][a-z]+,\s+[A-Z]{2}', s) or ( not re.match(r'[\$£€\d]', s) and len(s) > 3 and s != price): location = s break results.append({ "url": full_url, "title": title[:120], "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": "Facebook Marketplace", "source_type": "classifieds", "category": src.get("category", "Clasificados Generales"), }) except Exception: continue print(f"[Facebook Marketplace] {len(results)} listings") except Exception as e: print(f"[Facebook Marketplace] Error: {e}") return results def scrape_hmy(src: dict, query: str, filters: dict) -> list[dict]: """ HMY Yachts — queries Algolia directly (app ECN3QX1VBL). Fast, no Playwright needed. """ results = [] seen = set() ALGOLIA_URL = "https://ecn3qx1vbl-dsn.algolia.net/1/indexes/*/queries" ALGOLIA_HEADERS = { "x-algolia-application-id": "ECN3QX1VBL", "x-algolia-api-key": "d86ccdd9ac0292ba76ee4755693d0c10", "content-type": "application/json", "referer": "https://www.hmy.com/", "user-agent": random.choice(USER_AGENTS), } import urllib.parse params_str = urllib.parse.urlencode({ "filters": "SalesStatus:Active", "facetFilters": '[["SaleClassCode:used"]]', "query": query, "hitsPerPage": 40, "page": 0, }) payload = { "requests": [{ "indexName": "production_oceanelite_yachts", "params": params_str, }] } try: resp = requests.post(ALGOLIA_URL, json=payload, headers=ALGOLIA_HEADERS, timeout=15) resp.raise_for_status() data = resp.json() hits = data.get("results", [{}])[0].get("hits", []) for h in hits: try: slug = h.get("Slug", "") url = h.get("URL") or (f"https://www.hmy.com/yachts-for-sale/{slug}" if slug else "") if not url or url in seen: continue seen.add(url) year = h.get("ModelYear", "") make = h.get("MakeStringExact", "") model = h.get("ModelExact", "") name = h.get("BoatName", "") title = f"{year} {make} {model}".strip() if name: title += f' "{name}"' price_raw = h.get("NormPrice", 0) price_text = f"${int(price_raw):,}" if price_raw else "" length = h.get("NominalLengthNormalized", "") country = h.get("country", "USA") location = f"{length}ft · {country}" if length else country img = h.get("mainImage", "") results.append({ "url": url, "title": title[:120], "snippet": f"{price_text} · {location}".strip(" ·"), "price_text": price_text, "img_url": img, "location": country, "source": src.get("name", "HMY Yachts"), "source_type": src.get("type", "broker"), "category": src.get("category", "Venta Especializada"), }) except Exception: continue print(f"[{src.get('name','HMY')}] {len(results)} listings") except Exception as e: print(f"[{src.get('name','HMY')}] Error: {e}") return results def scrape_boatcrazy(src: dict, query: str, filters: dict) -> list[dict]: """ BoatCrazy — US aggregator with 105+ listings per page. Card: div.boat-list-item Link: a[href*="/boat-for-sale/"] Image: div.item-img img or div.list-itemimg img Details: div.item-details URL pattern: /boat-for-sale/YEAR-MAKE-LOCATION-id """ results = [] seen = set() raw_url = src.get("search_url", "") or "https://boatcrazy.com/boats?q={query}" url = raw_url.replace("{query}", requests.utils.quote(query.strip())) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--disable-blink-features=AutomationControlled","--no-sandbox"]) context = browser.new_context( viewport={"width": 1280, "height": 900}, user_agent=random.choice(USER_AGENTS), locale="en-US", ignore_https_errors=True) context.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};") page = context.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(random.randint(4000, 6000)) html = page.content() except Exception as e: print(f"[{src['name']}] nav error: {e}"); html = "" finally: try: page.close() except: pass browser.close() if not html: return [] soup = BeautifulSoup(html, "html.parser") cards = soup.find_all(class_="boat-list-item") if not cards: # fallback: find by link pattern cards = [] for a in soup.find_all("a", href=re.compile(r'/boat-for-sale/')): parent = a.find_parent(class_=re.compile(r'boat|list|item|card')) if parent and parent not in cards: cards.append(parent) for card in cards: try: a = card.find("a", href=re.compile(r'/boat-for-sale/')) if not a: continue href = a["href"] full_url = href if href.startswith("http") else "https://boatcrazy.com" + href if full_url in seen: continue seen.add(full_url) # Title — prefer h3, then aria-label, then slug title = "" h3 = card.find("h3") if h3: title = h3.get_text(strip=True)[:80] if not title: al = card.find(attrs={"aria-label": True}) if al: title = al["aria-label"][:80] if not title: slug = href.rstrip("/").split("/")[-1] slug_clean = re.sub(r'-id[-\w]*$', '', slug).replace("-", " ") title = slug_clean.title()[:80] if not title: continue # Price price = "" price_el = card.find(class_=re.compile(r'\bprice\b')) if price_el: pm = re.search(r'\$[\d,]+', price_el.get_text()) if pm: price = pm.group(0) if not price: pm = re.search(r'\$[\d,]+', card.get_text(" ", strip=True)) if pm: price = pm.group(0) # Location location = "" loc_el = card.find(class_="location") if loc_el: location = loc_el.get_text(strip=True)[:60] if not location: lm = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)?,\s*[A-Z]{2})', card.get_text(" ", strip=True)) if lm: location = lm.group(1) # Image img = "" img_div = card.find(class_=re.compile(r'item.?img|list.?item.?img')) if img_div: im = img_div.find("img") if im: img = (_extract_best_src(im) or im.get("src","") or im.get("data-src","")) if not img: im = card.find("img") if im: img = im.get("src","") or im.get("data-src","") results.append({ "url": full_url, "title": title, "snippet": f"{price} {location}".strip(), "price_text": price, "img_url": img, "location": location, "source": src.get("name", "BoatCrazy"), "source_type": src.get("type", "classifieds"), "category": src.get("category", "Clasificados Generales"), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results def scrape_denison(src: dict, query: str, filters: dict) -> list: """ Denison Yachting — static HTML, 30 cards per page. Card: div.boat-item URL: a[href*=/yachts-for-sale/SLUG] (non-dashboard link) Title: boat_length + make/model + year + name Price: h4.boat_price[data-price] + [data-default_currency] Location: h3 text | Image: div.news_pic img Search: ?search={query} """ results = [] seen = set() base = "https://www.denisonyachtsales.com/yachts-for-sale/" url = f"{base}?search={requests.utils.quote(query.strip())}" LISTING_RE = re.compile(r'/yachts-for-sale/[a-z][a-z0-9-]{4,}$', re.I) CURRENCY_SYMBOLS = {"USD": "$", "EUR": "€", "GBP": "£", "AUD": "A$"} try: resp = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)}, timeout=20, verify=False) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") for card in soup.find_all(class_="boat-item"): try: a = card.find("a", href=LISTING_RE) if not a: continue href = a["href"] full_url = href if href.startswith("http") else "https://www.denisonyachtsales.com" + href if full_url in seen: continue seen.add(full_url) # Title: length + make/model year + "name" h2 = card.find("h2") if h2: length_el = h2.find(class_="boat_length") length_txt = length_el.get_text(strip=True) if length_el else "" if length_el: length_el.extract() name_el = h2.find("span") name_txt = name_el.get_text(strip=True) if name_el else "" if name_el: name_el.extract() rest = " ".join(h2.get_text(" ", strip=True).split()) parts = [p for p in [length_txt, rest, f'"{name_txt}"' if name_txt else ""] if p] title = " ".join(parts)[:100] else: title = (a.get("title", "") or "")[:100] if not title: continue # Price price_text = "" price_el = card.find(class_="boat_price") if price_el: raw_price = price_el.get("data-price", "") currency = price_el.get("data-default_currency", "USD") sym = CURRENCY_SYMBOLS.get(currency, currency + " ") if raw_price: try: price_text = f"{sym}{int(raw_price):,}" except ValueError: price_text = price_el.get_text(strip=True)[:30] # Location location = "" h3 = card.find("h3") if h3: location = h3.get_text(strip=True)[:80] # Image img = "" pic_div = card.find(class_="news_pic") if pic_div: im = pic_div.find("img") if im: img = im.get("src", "") or im.get("data-src", "") results.append({ "url": full_url, "title": title, "snippet": f"{price_text} · {location}".strip(" ·"), "price_text": price_text, "img_url": img, "location": location, "source": src.get("name", "Denison Yachting"), "source_type": src.get("type", "broker"), "category": src.get("category", "Brokers USA"), }) except Exception: continue print(f"[{src.get('name','Denison')}] {len(results)} listings") except Exception as e: print(f"[{src.get('name','Denison')}] Error: {e}") return results # ============================================================================= # SCRAPER: GovPlanet + IronPlanet (Ritchie Bros family — same HTML .sr_lot) # ============================================================================= def scrape_govplanet(src: dict, query: str, filters: dict) -> list[dict]: """ GovPlanet (recreational marine) and IronPlanet (commercial marine). Both share Ritchie Bros HTML: listing cards use .sr_lot selector. GovPlanet: https://www.govplanet.com/Recreational+Marine IronPlanet: https://www.ironplanet.com/Commercial+Marine+Vessels """ results = [] try: url = src["search_url"] base = "https://" + url.split("/")[2] headers = get_headers(referer=base + "/") time.sleep(1.0) r = requests.get(url, headers=headers, timeout=25, verify=False) if r.status_code not in (200, 206): print(f"[{src['name']}] HTTP {r.status_code}") return [] soup = BeautifulSoup(r.text, "html.parser") seen = set() for card in soup.select(".sr_lot, .lot-tile, article.lot, [class*=srItem]"): try: a = card.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = base + href if href in seen: continue seen.add(href) title = a.get_text(strip=True)[:100] or card.get_text(" ", strip=True)[:80] price_el = card.select_one(".price, .lot-price, span[class*=price]") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = card.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": card.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "", "img_url": img, "source": src["name"], "source_type": src.get("type", "auction"), "category": src.get("category", ""), }) except Exception: continue print(f"[{src['name']}] {len(results)} listings") except Exception as e: print(f"[{src['name']}] Error: {e}") return results # ============================================================================= # SCRAPER: HiBid (React SPA — Playwright required) # ============================================================================= def scrape_hibid(src: dict, query: str, filters: dict) -> list[dict]: """ HiBid online auction platform — React SPA requires Playwright. URL: https://www.hibid.com/lots?q={query}+boat Cards: .lot-tile Title: h3/.lot-title Price: .high-bid/.lot-price """ results = [] try: q = requests.utils.quote((query.strip() + " boat")) url = f"https://www.hibid.com/lots?q={q}" from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width": 1280, "height": 900}, locale="en-US", ignore_https_errors=True, ) ctx.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" ) page = ctx.new_page() try: page.goto(url, timeout=30000, wait_until="domcontentloaded") page.wait_for_timeout(4000) html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html, "html.parser") seen = set() for card in soup.select(".lot-tile, [class*=lot-item], [class*=LotTile], [class*=lotCard]"): try: a = card.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = "https://www.hibid.com" + href if href in seen: continue seen.add(href) title_el = card.select_one("h3, .lot-title, [class*=lot-title], [class*=lotTitle]") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = card.select_one(".high-bid, .lot-price, [class*=bid], [class*=price]") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = card.find("img") img = _extract_best_src(img_el) if img_el else "" if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": card.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "", "img_url": img, "source": src["name"], "source_type": src.get("type", "auction"), "category": src.get("category", ""), }) except Exception: continue print(f"[{src['name']}] {len(results)} lots") except Exception as e: print(f"[{src['name']}] Error: {e}") return results # ============================================================================= # SCRAPER: Copart salvage boats (heavy JS SPA — Playwright) # ============================================================================= def scrape_copart(src: dict, query: str, filters: dict) -> list[dict]: """ Copart salvage/insurance lots for watercraft. URL: https://www.copart.com/vehicleFinderSection/?searchStr={query}&vehicleType=BOAT Lots render in a React table after JS executes. """ results = [] try: q = requests.utils.quote(query.strip()) url = f"https://www.copart.com/vehicleFinderSection/?searchStr={q}&vehicleType=BOAT" from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--no-sandbox", "--disable-blink-features=AutomationControlled"] ) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width": 1280, "height": 900}, locale="en-US", ignore_https_errors=True, ) ctx.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = ctx.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(5000) try: page.wait_for_selector( ".lot-row, tr[data-lot], .lot-details, [class*=lottile], [class*=lot-card]", timeout=8000 ) except Exception: pass html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html, "html.parser") seen = set() for row in soup.select( "tr[data-lot], .lot-row, [class*=lot-card], [class*=lottile], [class*=lot-item]" ): try: a = row.find("a", href=re.compile(r"/lot/")) if not a: continue href = a["href"] if not href.startswith("http"): href = "https://www.copart.com" + href if href in seen: continue seen.add(href) title_el = row.select_one("[class*=title], [class*=desc], td.des") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = row.select_one("[class*=bid], [class*=price], td.bid") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = row.find("img") img = _extract_best_src(img_el) if img_el else "" if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": row.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "", "img_url": img, "source": src["name"], "source_type": "salvage", "category": src.get("category", ""), }) except Exception: continue print(f"[{src['name']}] {len(results)} lots") except Exception as e: print(f"[{src['name']}] Error: {e}") return results # ============================================================================= # SCRAPER: Trade a Boat AU (server-rendered Material-UI) # ============================================================================= def scrape_tradeaboat(src: dict, query: str, filters: dict) -> list[dict]: """ TradeABoat Australia — server-rendered with Material-UI CSS classes. Cards use jss* dynamic class names; fallback to /details/ link detection. URL: https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={query} """ results = [] try: q = requests.utils.quote(query.strip()) url = f"https://www.tradeaboat.com.au/search/Boats?category=Sail&keywords={q}" headers = get_headers(referer="https://www.tradeaboat.com.au/") time.sleep(1.0) r = requests.get(url, headers=headers, timeout=25, verify=False) if r.status_code not in (200, 206): print(f"[Trade a Boat AU] HTTP {r.status_code}") return [] soup = BeautifulSoup(r.text, "html.parser") base = "https://www.tradeaboat.com.au" seen = set() # MUI class names are dynamic (jss77, jss78 …) — find cards via /details/ links detail_links = soup.find_all("a", href=re.compile(r"/details/")) visited_parents = set() for a in detail_links: try: href = a["href"] if not href.startswith("http"): href = base + href if href in seen: continue seen.add(href) # Walk up to find card container card = a.find_parent("div") or a card_id = id(card) if card_id in visited_parents: continue visited_parents.add(card_id) title_el = card.select_one("h2, h3, [class*=title]") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = card.select_one("[class*=price], [class*=Price]") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = card.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": card.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "Australia", "img_url": img, "source": "Trade a Boat AU", "source_type": "broker", "category": src.get("category", ""), }) except Exception: continue print(f"[Trade a Boat AU] {len(results)} listings") except Exception as e: print(f"[Trade a Boat AU] Error: {e}") return results # ============================================================================= # SCRAPER: Galati Yachts (requests, WordPress / YSP plugin) # ============================================================================= def scrape_galati(src: dict, query: str, filters: dict) -> list[dict]: """ Galati Yachts — server-rendered WordPress with YachtSalesPlugin. URL: https://www.galatiyachts.com/yachts-for-sale/?keywords={query} """ results = [] try: q = requests.utils.quote(query.strip()) url = f"https://www.galatiyachts.com/yachts-for-sale/?keywords={q}" headers = get_headers(referer="https://www.galatiyachts.com/") time.sleep(1.0) r = requests.get(url, headers=headers, timeout=25, verify=False) if r.status_code not in (200, 206): print(f"[Galati Yachts] HTTP {r.status_code}") return [] soup = BeautifulSoup(r.text, "html.parser") base = "https://www.galatiyachts.com" seen = set() # YSP listing cards — try common selectors, fallback to /yachts/ links cards = soup.select(".ysp-listing, .listing-card, .yacht-card, [class*=yacht-listing]") if not cards: # fallback: group by /yachts/details/ anchor for a in soup.find_all("a", href=re.compile(r"/yachts/")): href = a["href"] if not href.startswith("http"): href = base + href if href in seen or "galatiyachts.com" not in href: continue if href.count("/") < 4: continue seen.add(href) card = a.find_parent("div") or a title_el = card.select_one("h2, h3, [class*=title]") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = card.select_one("[class*=price], .price") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = card.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": card.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "USA", "img_url": img, "source": "Galati Yachts", "source_type": "broker", "category": src.get("category", ""), }) else: for card in cards: try: a = card.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = base + href if href in seen: continue seen.add(href) title_el = card.select_one("h2, h3, [class*=title]") title = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))[:100] price_el = card.select_one("[class*=price], .price") price_txt = price_el.get_text(strip=True) if price_el else "" img_el = card.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img if title and len(title) > 4: results.append({ "title": title, "url": href, "snippet": card.get_text(" ", strip=True)[:200], "price_text": price_txt, "location": "USA", "img_url": img, "source": "Galati Yachts", "source_type": "broker", "category": src.get("category", ""), }) except Exception: continue print(f"[Galati Yachts] {len(results)} listings") except Exception as e: print(f"[Galati Yachts] Error: {e}") return results # ============================================================================= # SCRAPER: Luxury brokers (Fraser, Burgess, Worth Ave, Merle Wood, N&J) # Playwright — JS-heavy sites that won't render with plain requests # ============================================================================= def scrape_luxury_broker(src: dict, query: str, filters: dict) -> list[dict]: """ Generic Playwright scraper for luxury yacht broker sites. Covers: Fraser Yachts, Worth Ave Yachts, Merle Wood, Burgess, N&J. Follows internal links with /yacht/, /vessel/, /boat/, /listing/ in path. """ results = [] name = src.get("name", "Broker") try: raw_url = src["search_url"] url = raw_url.replace("{query}", requests.utils.quote(query.strip())) base = "https://" + url.split("/")[2] from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch( headless=True, args=["--no-sandbox", "--disable-blink-features=AutomationControlled"] ) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width": 1280, "height": 900}, locale="en-US", ignore_https_errors=True, ) ctx.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" "window.chrome={runtime:{}};" ) page = ctx.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(3000) page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") page.wait_for_timeout(1500) html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html, "html.parser") seen = set() LISTING_RE = re.compile( r'/(yacht[s]?|vessel[s]?|boat[s]?|listing[s]?|detail[s]?|sale|for-sale)/', re.I ) for a in soup.find_all("a", href=LISTING_RE): try: href = a["href"] if not href.startswith("http"): href = base + href if href in seen or len(href) < 25: continue path = href.split("?")[0].rstrip("/") if path.count("/") < 3: continue seen.add(href) parent = a.find_parent("div") or a.find_parent("li") or a title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80] title = " ".join(title.split())[:100] if len(title) < 5: continue ctx_txt = parent.get_text(" ", strip=True)[:300] pm = re.search(r'[\$€£]\s*[\d,\.]+(?:\s*[Mm]illion|M)?', ctx_txt) price_txt = pm.group() if pm else "" img_el = parent.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img results.append({ "title": title, "url": href, "snippet": ctx_txt[:200], "price_text": price_txt, "location": "", "img_url": img, "source": name, "source_type": src.get("type", "broker"), "category": src.get("category", ""), }) if len(results) >= 30: break except Exception: continue print(f"[{name}] {len(results)} listings") except Exception as e: print(f"[{name}] Error: {e}") return results # ============================================================================= # SCRAPER: EU/International brokers blocked on requests (Playwright) # Covers: Boat24, YachtAll, Annonces Bateau, Inautia ES, Boats&Outboards UK, # Boatsales AU, YachtMarket, Apollo Duck UK subdomain # ============================================================================= def scrape_eu_broker(src: dict, query: str, filters: dict) -> list[dict]: """ Generic Playwright scraper for EU/AU/UK broker sites that block plain requests (403/ECONNREFUSED). Navigates with real browser, extracts listings. """ results = [] name = src.get("name", "EU Broker") try: raw_url = src["search_url"] url = raw_url.replace("{query}", requests.utils.quote(query.strip())) base = "https://" + url.split("/")[2] domain = url.split("/")[2] from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width": 1280, "height": 900}, locale="en-US", ignore_https_errors=True, ) ctx.add_init_script( "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});" ) page = ctx.new_page() try: page.goto(url, timeout=35000, wait_until="domcontentloaded") page.wait_for_timeout(3000) html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html, "html.parser") seen = set() for a in soup.find_all("a", href=True): try: href = a["href"] if not href.startswith("http"): href = base + href if domain not in href or href in seen: continue path = href.split("?")[0].rstrip("/") if path.count("/") < 3: continue if any(s in href.lower() for s in [ "login","register","contact","about","help","privacy", "sitemap","category","search","tag","page=","lang=" ]): continue seen.add(href) parent = a.find_parent("div") or a.find_parent("li") or a title = a.get_text(strip=True) or parent.get_text(" ", strip=True)[:80] title = " ".join(title.split())[:100] if len(title) < 5: continue ctx_txt = parent.get_text(" ", strip=True)[:300] pm = re.search(r'[\$€£]\s*[\d,\.]+', ctx_txt) price_txt = pm.group() if pm else "" img_el = parent.find("img") img = _extract_best_src(img_el) if img_el else "" if img and img.startswith("/"): img = base + img results.append({ "title": title, "url": href, "snippet": ctx_txt[:200], "price_text": price_txt, "location": "", "img_url": img, "source": name, "source_type": src.get("type", "broker"), "category": src.get("category", ""), }) if len(results) >= 30: break except Exception: continue print(f"[{name}] {len(results)} listings") except Exception as e: print(f"[{name}] Error: {e}") return results # ============================================================================= # SCRAPER: Forum For-Sale sections (TheHullTruth, Cruisers Forum) # ============================================================================= def scrape_forum_fs(src: dict, query: str, filters: dict) -> list[dict]: """ Scrapes For-Sale classified threads from boating forums (Playwright). TheHullTruth: /boating-forum/search.php?do=process&query={query}&prefixid=FS Cruisers Forum: /forums/f152/ (Classifieds subforum) """ results = [] name = src.get("name", "Forum") try: raw_url = src["search_url"] url = raw_url.replace("{query}", requests.utils.quote(query.strip())) base = "https://" + url.split("/")[2] from playwright.sync_api import sync_playwright with sync_playwright() as p: browser = p.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={"width": 1280, "height": 900}, locale="en-US", ignore_https_errors=True, ) page = ctx.new_page() try: page.goto(url, timeout=30000, wait_until="domcontentloaded") page.wait_for_timeout(2000) html = page.content() finally: try: page.close() except: pass browser.close() soup = BeautifulSoup(html, "html.parser") seen = set() # vBulletin/XenForo thread rows for row in soup.select( "li.threadbit, div.threadbit, .thread-item, " "tr.odd, tr.even, .search-result, [class*=thread], " ".js-threadListItem, li[id*=thread]" ): try: a = row.find("a", href=re.compile( r'showthread|/thread[s]?/|/t/\d|/post', re.I )) if not a: a = row.find("a", href=True) if not a: continue href = a["href"] if not href.startswith("http"): href = base + href if href in seen: continue seen.add(href) title = a.get_text(strip=True)[:100] ctx_txt = row.get_text(" ", strip=True)[:200] pm = re.search(r'\$\s*[\d,]{3,}', ctx_txt) price_txt = pm.group() if pm else "" if title and len(title) > 5: results.append({ "title": title, "url": href, "snippet": ctx_txt, "price_text": price_txt, "location": "", "img_url": "", "source": name, "source_type": "classifieds", "category": src.get("category", ""), }) except Exception: continue print(f"[{name}] {len(results)} threads") except Exception as e: print(f"[{name}] Error: {e}") return results def scrape_source_router(src: dict, query: str, filters: dict, page: int = 1): """Central dispatcher — routes each source to its dedicated scraper.""" name = src.get("name", "") # ── Dedicated scrapers ──────────────────────────────────────────────────── if name == "YachtWorld": return scrape_yachtworld(query, filters, max_pages=1) if name.startswith("eBay"): # covers all 5 eBay entries return scrape_ebay(src, query, filters) if name == "BoatTrader": return scrape_boattrader(src, query, filters) if name in ("Apollo Duck", "Apollo Duck Workboats"): return scrape_apolloduck(src, query, filters) if name == "Boats.com": return scrape_boatsdotcom(src, query, filters) if name == "Craigslist": # single multi-city Craigslist entry return scrape_craigslist(src, query, filters) if name.startswith("Craigslist "): # individual city entries — one request each return scrape_direct_source(src, query, filters) if name in ("GovPlanet", "GovPlanet Recreational", "IronPlanet", "IronPlanet Marine"): return scrape_govplanet(src, query, filters) if name == "HiBid": return scrape_hibid(src, query, filters) if name in ("Copart Marine", "Copart Boats", "Copart Watercraft"): return scrape_copart(src, query, filters) if name == "Trade a Boat AU": return scrape_tradeaboat(src, query, filters) if name == "Galati Yachts": return scrape_galati(src, query, filters) if name in ("Fraser Yachts", "Burgess Yachts", "Northrop & Johnson", "Worth Ave Yachts"): return scrape_luxury_broker(src, query, filters) # Boat24 handled below by dedicated scrape_boat24; Inautia handled by scrape_inautia if name in ("Boat24 EU", "YachtAll", "Annonces Bateau", "Annonces Bateau FR", "Inautia ES", "Boats & Outboards UK", "Boats Outboards UK", "Apollo Duck UK", "Boatsales AU", "YachtMarket", "Boatpoint AU"): return scrape_eu_broker(src, query, filters) if name in ("TheHullTruth", "Cruisers Forum"): return scrape_forum_fs(src, query, filters) if name == "YachtWorld Commercial": return scrape_yachtworld(query, filters, max_pages=1) if name == "Rightboat": return scrape_rightboat(src, query, filters) if name in ("Cooper Salvage", "Cooper Capital Salvage"): return scrape_cooperss(src, query, filters) if name == "Inautia": return scrape_inautia(src, query, filters) if name == "Boat24": return scrape_boat24(src, query, filters) if name == "Facebook Marketplace": return scrape_facebook_marketplace(src, query, filters) if name == "HMY Yachts": return scrape_hmy(src, query, filters) if name == "BoatCrazy": return scrape_boatcrazy(src, query, filters) if name == "Denison Yachting": return scrape_denison(src, query, filters) # ── Generic HTML scraper (fallback) ────────────────────────────────────── return scrape_direct_source(src, query, filters) def extract_vessel_fast(raw: dict) -> dict | None: """ Pure-regex vessel extraction — no Ollama call. Used for results from known boat marketplaces (broker/classifieds/auction/etc.) Returns a data dict compatible with save_vessel(), or None if too sparse. """ title = (raw.get("title") or "").strip() snippet = (raw.get("snippet") or "") price_text = (raw.get("price_text") or "") location = (raw.get("location") or "") src_name = (raw.get("source") or "").lower() src_type = (raw.get("source_type") or "") category = (raw.get("category") or "").lower() if not title or len(title) < 5: return None combined = f"{title} {snippet} {price_text}" # ── Price ──────────────────────────────────────────────────────────────── price_usd = None currency_out = "USD" for txt in [price_text, snippet, title]: # USD m = re.search(r'\$\s*([\d,]{3,})', txt) if m: try: v = float(m.group(1).replace(",","")) if 500 < v < 50_000_000: price_usd = v; currency_out = "USD"; break except: pass # GBP m = re.search(r'£\s*([\d,]{3,})', txt) if m: try: v = float(m.group(1).replace(",","")) * 1.27 if 500 < v < 50_000_000: price_usd = round(v); currency_out = "GBP"; break except: pass # EUR m = re.search(r'€\s*([\d,]{3,})', txt) if m: try: v = float(m.group(1).replace(",","")) * 1.09 if 500 < v < 50_000_000: price_usd = round(v); currency_out = "EUR"; break except: pass # plain number + currency word m = re.search(r'([\d,]{4,})\s*(?:USD|usd|GBP|gbp|EUR|eur)', txt) if m: try: v = float(m.group(1).replace(",","")) if 500 < v < 50_000_000: price_usd = round(v); break except: pass # ── LOA ────────────────────────────────────────────────────────────────── loa_m = None for pat, in_meters in [ (r'(?:loa|length)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', False), (r'^(\d{2,3}(?:\.\d)?)\s*(?:\'|ft|feet)', False), # starts with size (r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet)\b', False), (r"(\d{2,3}(?:\.\d)?)'", False), (r'(?:loa|length)[:\s]+([\d.]+)\s*m\b', True), ]: m = re.search(pat, combined, re.IGNORECASE) if m: try: v = float(m.group(1)) if in_meters: if 5 < v < 200: loa_m = round(v, 1); break else: if 10 < v < 500: loa_m = round(v * 0.3048, 1); break except: pass # ── Year ───────────────────────────────────────────────────────────────── year = None ym = re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', title) if ym: year = int(ym.group(1)) # ── Vessel type ────────────────────────────────────────────────────────── cl = combined.lower() if any(k in src_name for k in ["sailboat","sail"]) or "veleros" in category: vtype = "Sailboat" elif any(k in src_name for k in ["workboat","commercial","osv","offshore"]): vtype = "Offshore" elif "tug" in src_name: vtype = "Tug" elif "barge" in src_name: vtype = "Barge" elif any(k in cl for k in ["sailboat","sailing","velero","ketch","sloop","schooner", "yawl","cutter","catamaran","trimaran","voilier"]): vtype = "Sailboat" elif any(k in cl for k in ["tugboat","tug boat","remolcador"]): vtype = "Tug" elif "barge" in cl or "barcaza" in cl: vtype = "Barge" elif any(k in cl for k in ["offshore","osv","supply vessel","crew boat"]): vtype = "Offshore" elif any(k in cl for k in ["fishing","trawler","seiner","pesquero"]): vtype = "Fishing" elif any(k in cl for k in ["yacht","motor yacht","motoryacht"]): vtype = "Yacht" else: vtype = "Motor" status = ("auction" if src_type == "auction" else "salvage" if src_type == "salvage" else "active") # Infer location from source name when missing (e.g. "Craigslist Houston" → "Houston") if not location and raw.get("source"): src_full = raw["source"] if re.search(r'[Cc]raigslist', src_full): city = re.sub(r'[Cc]raigslist\s*', '', src_full).strip() if city: location = city elif "Kijiji" in src_full: location = "Canada" elif "Gumtree" in src_full: location = "Australia" elif "LeBonCoin" in src_full: location = "France" elif "Subito" in src_full: location = "Italy" # For trusted marketplace sources keep the result even with partial data. # For web-search results require at least one data point to avoid garbage. is_trusted = src_type in ("broker", "classifieds", "salvage", "commercial", "auction") if not is_trusted and not (price_usd or loa_m or year or location): return None score = 50 if loa_m: score += min(10, int(loa_m - 10)) if year and year > 1990: score += min(10, (year - 1990) // 3) if price_usd and loa_m: pft = price_usd / max(loa_m / 0.3048, 1) if pft < 600: score += 15 elif pft < 1200: score += 8 score = min(100, max(0, score)) return { "_fast": True, # flag: skip unit-conversion block downstream "skip": False, "name": title[:100], "vessel_type": vtype, "loa_m": loa_m, "beam_m": None, "draft_m": None, "year_built": year, "hull": "Unknown", "propulsion": "Sail" if vtype == "Sailboat" else "Diesel", "status": status, "price_usd": price_usd, "currency": currency_out, "location": location, "country": None, "description": f"{title[:140]}", "flags": [], "score": score, } def search_with_ai(query: str, filters: dict) -> list: """ Hybrid search: direct scraping of open sources + web search to reach blocked sites (YachtWorld, Boats.com, Apollo Duck, etc.) """ vessel_type = filters.get("type", "") region = filters.get("region", "").lower() base = query if vessel_type and vessel_type.lower() not in query.lower(): base = f"{vessel_type} {base}" # Filter sources by region if specified # Load custom sources from DB and merge with built-in try: conn = get_db() custom = [dict(r) for r in conn.execute( "SELECT * FROM custom_sources WHERE active=1").fetchall()] conn.close() all_sources = DIRECT_SOURCES + [{ "name": c["name"], "category": c["category"], "search_url": c["search_url"], "result_sel": "a[href]", "price_sel": "", "img_sel": "img", "loc_sel": "", "type": c["source_type"], } for c in custom] except: all_sources = DIRECT_SOURCES sources_to_use = all_sources if region and region not in ["global", "todo", "all", ""]: region_map = { "usa": ["USA", "Clasificados USA", "Subastas Gobierno USA", "Subastas USA", "Subastas Gobierno", "Comercial Offshore"], "europa": ["Europa", "Brokers Europa", "Francia", "Italia", "Reino Unido", "España", "España / Global"], "caribe": ["Latinoamérica", "Latinoamérica / España", "España / Global"], "latin": ["Latinoamérica", "Latinoamérica / España", "España", "España / Global"], "asia": ["Australia / Pacífico"], "australia": ["Australia / Pacífico"], } allowed_cats = None for key, cats in region_map.items(): if key in region: allowed_cats = cats break if allowed_cats: sources_to_use = [s for s in all_sources if any(c in s["category"] for c in allowed_cats)] if not sources_to_use: sources_to_use = all_sources # Filter by status status = filters.get("status", "") if status == "auction": sources_to_use = [s for s in sources_to_use if s["type"] in ["auction", "salvage"]] or sources_to_use elif status == "salvage": sources_to_use = [s for s in sources_to_use if s["type"] == "salvage"] or sources_to_use elif status not in ("salvage",): # Exclude salvage-only sources unless explicitly searching for salvage sources_to_use = [s for s in sources_to_use if s["type"] != "salvage"] or sources_to_use # Vessel-type-aware source prioritization OFFSHORE_TYPES = {"offshore", "tug", "barge", "ferry", "fishing", "commercial", "salvage"} SAILBOAT_TYPES = {"sailboat", "sail", "velero", "ketch", "sloop", "cutter", "schooner"} COMMERCIAL_ONLY_SOURCES = { "Seaboats Tug", "Seaboats Barge", "Seaboats Offshore", "Seaboats Fishing", "OSV Broker", "OSVBroker", "WorkBoat Classifieds", "VT Halter Marine", "Maritime Connector", "ShipXchange", "Commercial Vessel", } SAILBOAT_ONLY_SOURCES = {"SailboatListings", "SailboatListings View", "Cruisers Forum", "Sailboat Listing"} vessel_type_lower = vessel_type.lower() if vessel_type else "" if vessel_type_lower in OFFSHORE_TYPES: # Skip sailboat-only sources, float commercial ones to front sources_to_use = [s for s in sources_to_use if s["name"] not in SAILBOAT_ONLY_SOURCES] commercial = [s for s in sources_to_use if s["type"] in ("commercial", "salvage", "auction")] rest = [s for s in sources_to_use if s["type"] not in ("commercial", "salvage", "auction")] sources_to_use = commercial + rest elif vessel_type_lower in SAILBOAT_TYPES or "sail" in base.lower() or "velero" in base.lower(): # Skip commercial-only offshore sources for sailboat searches sources_to_use = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES] elif not vessel_type_lower: # Generic search: keep all but put commercial sources after general ones commercial = [s for s in sources_to_use if s["name"] in COMMERCIAL_ONLY_SOURCES] rest = [s for s in sources_to_use if s["name"] not in COMMERCIAL_ONLY_SOURCES] sources_to_use = rest + commercial print(f"[Search] Querying {len(sources_to_use)} sources for: {base}") search_state['total_sources'] = len(sources_to_use) search_state['log'].append(f"Consultando {len(sources_to_use)} fuentes...") def get_query_for_source(src): """Match query language to source region.""" cat = src.get("category","").lower() if any(x in cat for x in ["france","franc","veleros franc"]): return base elif any(x in cat for x in ["spain","españa","espana","mexico","colombia","latin"]): return base else: return f"{base} for sale" if "for sale" not in base.lower() else base # Build web search queries targeting specific sites web_queries = build_web_queries(base, filters) total = len(sources_to_use) + len(web_queries) search_state['total_sources'] = total search_state['log'].append(f"Consultando {len(sources_to_use)} sitios directos + {len(web_queries)} búsquedas web...") print(f"[Search] {len(sources_to_use)} direct + {len(web_queries)} web searches for: {base}") # Run BOTH direct scraping AND web searches in parallel all_raw = [] # ── SailboatListings: dedicated parallel thread (handles its own AI extraction) ── # Only for sailboat/velero or generic searches, not for offshore/tug/barge/etc. sbl_thread = None if vessel_type_lower not in OFFSHORE_TYPES and vessel_type_lower not in {"motor", "motorboat"}: sbl_thread = threading.Thread( target=scrape_and_extract_sailboatlistings, args=(query, filters, search_state.get('search_id', ''), 8), daemon=True, ) sbl_thread.start() search_state['log'].append("SailboatListings: iniciado en paralelo (hilo dedicado)...") print("[Search] SailboatListings dedicated thread started") # ── Breadth-First Search across all sources ────────────────────────────── # Round 1: page 1 of all sources simultaneously # Round 2: page 2 of sources that had results # Round 3: page 3, etc. # Between rounds, a natural pause occurs as we process results # This avoids hammering any single source with consecutive requests MAX_ROUNDS = 6 # max pages per source active_srcs = {src["name"]: {"src": src, "page": 1, "has_more": True} for src in sources_to_use} # Web searches only run once (no pagination) web_done = False for round_num in range(1, MAX_ROUNDS + 1): if search_state.get("cancelled"): break round_sources = {name: info for name, info in active_srcs.items() if info["has_more"]} if not round_sources: break search_state['log'].append(f"Ronda {round_num}: consultando {len(round_sources)} fuentes...") print(f"[Search] Round {round_num}: {len(round_sources)} active sources") round_raw = [] with ThreadPoolExecutor(max_workers=12) as executor: futures = {} # Submit page N of all active sources for name, info in round_sources.items(): src = info["src"] q = get_query_for_source(src) # Add page parameter to URL if supported and page > 1 src_with_page = dict(src) if round_num > 1: url = src["search_url"] # Common pagination patterns if "craigslist.org" in url: src_with_page["search_url"] = url + f"&s={round_num * 25 - 25}" elif "ebay.com" in url: src_with_page["search_url"] = url + f"&_pgn={round_num}" elif "seaboats.net" in url: src_with_page["search_url"] = url + f"&page={round_num}" elif "kijiji.ca" in url: src_with_page["search_url"] = url.rstrip('/') + f"/page-{round_num}/" else: # Most sites don't support pagination via URL params we know # Mark as done after page 1 active_srcs[name]["has_more"] = False continue futures[executor.submit(scrape_source_router, src_with_page, q, filters, round_num)] = name # Web searches on round 1 only if round_num == 1 and not web_done: for wq in web_queries: futures[executor.submit(web_search, wq, 6)] = f"Web:{wq[:20]}" web_done = True # Collect results for this round for future in as_completed(futures, timeout=90): name = futures[future] try: results = future.result() count = len(results) round_raw.extend(results) search_state['sources_done'] += 1 if name.startswith("Web:"): if count: search_state['log'].append(f"🌐 Web: {count} resultados") else: if count: search_state['log'].append(f"✓ {name} p{round_num}: {count}") print(f"[Round {round_num}] {name}: {count} listings") else: # No results this round — remove from future rounds if name in active_srcs: active_srcs[name]["has_more"] = False except Exception as e: search_state['sources_done'] += 1 if name in active_srcs: active_srcs[name]["has_more"] = False all_raw.extend(round_raw) print(f"[Search] Round {round_num} complete: {len(round_raw)} new results (total: {len(all_raw)})") # Small pause between rounds — natural break if round_num < MAX_ROUNDS and not search_state.get("cancelled"): polite_pause("BFS-round") print(f"[Search] Got {len(all_raw)} raw results, extracting vessel data...") if not all_raw: return [] # Extract vessel data — parallel with dedup and real-time save vessels = [] lock = threading.Lock() max_price = float(filters.get("max_price") or 0) min_loa = float(filters.get("min_loa") or 0) query_words = [w.lower() for w in query.split() if len(w) > 2] # Deduplicate raw results by URL seen_urls = set() unique_raw = [] for r in all_raw: if r["url"] not in seen_urls: seen_urls.add(r["url"]) unique_raw.append(r) print(f"[Extract] Processing {len(unique_raw)} unique URLs...") SYNONYMS = { "sailboat":["sail","velero","vela","ketch","sloop","schooner","yawl","voilier"], "velero": ["sail","sailboat","vela","ketch","sloop"], "tug": ["tugboat","remolcador","tug boat","schlepper"], "barge": ["barcaza","chaland","ponton","landing craft","lct"], "fishing": ["pesquero","trawler","seiner","longliner","fisher"], "offshore":["osv","supply vessel","supply boat","platform"], "yacht": ["yate","motoryacht","m/y"], "motor": ["motorboat","lancha","speedboat","cruiser"], } NON_VESSELS = ["outboard motor","engine only","motor only","parts only", "trailer only","propeller","honda bf","yamaha f","suzuki df", "life jacket","anchor","marine insurance","boat storage", # Land vehicles — never boats "ford expedition","ford explorer","ford f-1","ford ranger", "ford bronco","ford mustang","ford escape","ford transit", "chevy silverado","chevy tahoe","chevy suburban","chevy colorado", "chevrolet silverado","chevrolet tahoe","chevrolet suburban", "gmc sierra","gmc yukon","gmc terrain","gmc canyon", "dodge ram","ram 1500","ram 2500","ram 3500", "jeep wrangler","jeep cherokee","jeep grand","jeep gladiator", "toyota camry","toyota tacoma","toyota tundra","toyota 4runner", "toyota highlander","toyota rav4","toyota sienna", "subaru outback","subaru forester","subaru crosstrek", "honda cr-v","honda pilot","honda accord","honda civic","honda odyssey", "tesla model","bmw x","mercedes benz","audi q","volkswagen jetta", "cadillac escalade","cadillac xt","buick enclave","buick encore", # Non-vessel services "sailing lesson","sailing partner","sailing school","sailing class", "sailing instruction","boating lesson","boat lesson","boating class", "sailing instructor","boat rental","kayak rental","canoe rental", ] def expand_query(words): expanded = set(words) for w in words: for key, syns in SYNONYMS.items(): if w == key or w in syns: expanded.add(key) expanded.update(syns) return expanded expanded_query = expand_query(query_words) GENERIC_NAMES = { "sailboat","velero","barco","yacht","boat","vessel","embarcación", "sailboat for sale","velero en venta","boat for sale","barco en venta", "motor boat","motorboat","fishing boat","tug boat","tugboat", "within25 mi","within 25 mi","results","listing","listings", } def process_one(raw): try: if search_state.get("cancelled"): return # Quick title pre-check title_lower = raw["title"].lower() if any(kw in title_lower for kw in NON_VESSELS): return src_type = raw.get("source_type", "") all_images = [] data = None # ── FAST PATH: known boat marketplace → pure regex, no AI ──────── if src_type in ("broker","classifieds","auction","salvage","commercial"): data = extract_vessel_fast(raw) if data: img = raw.get("img_url","") if img: all_images = [img] else: # Derive thumbnail from URL (no page fetch needed) listing_url = raw.get("url","") ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url) if ebay_m: all_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"] cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url) if cl_m: all_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"] # ── Fast path: validate the listing is actually a boat ────────────── if data and data.get("_fast"): combined_text = (raw.get("title","") + " " + raw.get("snippet","")).lower() url_l = raw.get("url","").lower() # URLs that are guaranteed to be boat listings (trusted sections) BOAT_URLS = ("/boa","/boat","/sail","sailboatlistings","yachtworld", "boattrader","seaboats","apolloduck","rightboat","boat24", "annonces-bateau","barcos.net","tradeaboat","marinetraffic") is_boat_url = any(k in url_l for k in BOAT_URLS) # General auction sites (sell everything) need a boat keyword in the text BOAT_WORDS = ["boat","sail","yacht","vessel","ketch","sloop","catamaran", "trimaran","mast","hull","marina","keel","watercraft","cruiser", "trawler","dinghy","skiff","pontoon","motorboat","powerboat", "sailboat","barge","tugboat","outboard","inboard","nautical", "marine","stern","bow","aft","draft","beam","knot","starboard"] has_boat_word = any(k in combined_text for k in BOAT_WORDS) if not is_boat_url and not has_boat_word: return # Cars, furniture, etc. from general auction sites — skip # ── SLOW PATH: web-search results → fetch page + AI ────────────── if not data: page_text, page_images = "", [] try: fut = ThreadPoolExecutor(max_workers=1).submit(fetch_page_with_images, raw["url"]) page_text, page_images = fut.result(timeout=12) except Exception: page_text = (f"Title: {raw['title']} " f"| Location: {raw.get('location','')} | {raw.get('snippet','')}") if not page_images and raw.get("img_url"): page_images = [raw["img_url"]] if not page_images: listing_url = raw.get("url", "") ebay_m = re.search(r'ebay\.com/itm/(\d+)', listing_url) if ebay_m: page_images = [f"https://i.ebayimg.com/images/g/{ebay_m.group(1)}/s-l500.jpg"] cl_m = re.search(r'craigslist\.org/.+/(\d{10})\.html', listing_url) if cl_m: page_images = [f"https://images.craigslist.org/{cl_m.group(1)}_600x450.jpg"] all_images = page_images status = ("auction" if src_type == "auction" else "salvage" if src_type == "salvage" else "active") context = ("URL: " + raw["url"] + "\nTitle: " + raw["title"] + "\nPrice: " + raw.get("price_text","") + "\n" + page_text[:1500]) prompt = ( "Analyze this boat listing from " + str(raw.get('source','')) + ". Search was: " + query + "\n" "TEXT: " + context + "\n\n" "If NOT a boat for sale respond {skip:true}. " "If IS a boat respond JSON with: skip=false, name, vessel_type " "(Yacht|Motor|Sailboat|Fishing|Tug|Barge|Offshore|Ferry|Other), " "loa_m, beam_m, draft_m (ALWAYS in METERS — detect unit from text; " "if feet multiply by 0.3048, e.g. 45ft=13.7m, 60ft=18.3m, 100ft=30.5m), " "year_built, hull, propulsion, " "status=" + status + ", price_usd, currency, location, country, " "description (Spanish max 150 chars), flags=[], score 0-100." ) response = ollama_generate(prompt, model=MODELS['classify'], json_mode=True) m = re.search(r'\{.*\}', response or '', re.DOTALL) if not m: return data = json.loads(m.group()) if data.get("skip") or not data.get("name"): return # Override AI loa_m with regex (AI misses feet→m conversion) loa_from_ctx = None for pat in [ r'(?:length|loa|eslora)[:\s]+([\d.]+)\s*(?:ft|\'|feet)', r'\b(\d{2,3}(?:\.\d)?)\s*(?:ft|feet|\')', r'^(\d{2,3}(?:\.\d)?)\s*\'', ]: lm = re.search(pat, context, re.IGNORECASE) if not lm: lm = re.search(pat, raw.get("title",""), re.IGNORECASE) if lm: try: ft = float(lm.group(1)) if 10 < ft < 500: loa_from_ctx = round(ft * 0.3048, 1) break except: pass if loa_from_ctx and not data.get("loa_m"): data["loa_m"] = loa_from_ctx elif loa_from_ctx and data.get("loa_m") and data["loa_m"] > 25: data["loa_m"] = round(data["loa_m"] * 0.3048, 1) # AI unit conversion guard (only needed for AI output) ctx_lower = (page_text + " " + raw.get("title","")).lower() has_feet = bool(re.search(r"\d+\s*(?:ft|feet|')\b|loa[:\s]+\d+\s*(?:ft|')", ctx_lower)) vtype_lower = data.get("vessel_type","").lower() MAX_M = {"sailboat":25,"yacht":35,"motor":30,"fishing":30, "tug":60,"barge":120,"offshore":90,"ferry":100,"other":50} max_reasonable = MAX_M.get(vtype_lower, 50) for dim in ["loa_m","beam_m","draft_m"]: val = data.get(dim) if not val or not isinstance(val,(int,float)): continue convert = False if dim == "loa_m" and (val > 100 or val > max_reasonable or (val > 25 and has_feet)): convert = True elif dim == "beam_m" and (val > 30 or (val > 8 and has_feet)): convert = True elif dim == "draft_m"and (val > 15 or (val > 5 and has_feet)): convert = True if convert: data[dim] = round(val * 0.3048, 1) # ── Shared post-processing (fast path + AI path) ────────────────── if not data or not data.get("name"): return # Query match check combined = (data.get("name","") + " " + data.get("description","") + " " + data.get("vessel_type","") + " " + raw.get("title","") + " " + raw.get("url","")).lower() if query_words: if not any(qw in combined for qw in expanded_query): # Skip query-match filter for results from direct scrapers (not web search). # Web search results have category="Web Search" and may return off-topic pages. # Direct scraper results already passed through a relevant search query. is_web_search = raw.get("category","").lower() == "web search" if is_web_search: source_lower = raw.get("source","").lower() if not any(kw in source_lower for kw in ["sailboat","yacht","workboat","offshore","tug","commercial", "boats","boattrader","apolloduck","rightboat","seaboats", "yachtworld","govplanet","govdeals","hibid","copart","ebay", "salvex","kijiji","craigslist","denison","galati","hmy"]): return # Non-vessel + generic name check if any(kw in data.get("name","").lower() for kw in NON_VESSELS): return if data.get("name","").lower().strip() in GENERIC_NAMES: return # Filters (price + LOA) if max_price and data.get("price_usd") and data["price_usd"] > max_price * 1.01: return if min_loa and data.get("loa_m") and data["loa_m"] < (min_loa - 0.15): return data["images"] = all_images[:8] data["source_url"] = raw["url"] data["source_name"] = raw["source"] vid = save_vessel(data) if vid > 0: with lock: search_state["found"] += 1 vessels.append(data) tag = "[Fast]" if data.get("_fast") else "[AI]" msg = f"✓ {data.get('name','?')} — {raw['source']}" print(f"{tag} {msg}") search_state["log"].append(msg) except Exception as e: print(f"[Extract] Error: {e}") # Fast path: more workers + more URLs since most results skip AI now with ThreadPoolExecutor(max_workers=16) as ex: futs = [ex.submit(process_one, r) for r in unique_raw[:300]] for f in as_completed(futs, timeout=180): if search_state.get("cancelled"): break try: f.result() except Exception: pass print(f"[Search] Done — {len(vessels)} vessels found") return vessels return vessels # ── Fingerprint ─────────────────────────────────────────────────────────────── def fingerprint(v: dict) -> str: raw = f"{v.get('name','').lower().strip()}|{round(v.get('loa_m') or 0)}|{v.get('year_built',0)}|{v.get('vessel_type','')}" return hashlib.sha256(raw.encode()).hexdigest()[:16] def save_vessel(v: dict) -> int: # Reject pure shells — need at least name + 1 real data field if not v.get("name") or v["name"].strip() in ("", "Unknown"): return -1 data_points = sum(1 for f in ['price_usd', 'loa_m', 'year_built', 'location'] if v.get(f)) if data_points < 1: return -1 fp = fingerprint(v) conn = get_db() c = conn.cursor() existing = c.execute("SELECT id FROM vessels WHERE fingerprint=?", (fp,)).fetchone() if existing: conn.close() return existing['id'] try: c.execute("""INSERT INTO vessels (name,vessel_type,loa_m,beam_m,draft_m,year_built,hull,propulsion, status,price_usd,currency,location,country,source_name,source_url, description,images,flags,score,fingerprint,raw_data) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", (v.get('name'), v.get('vessel_type'), v.get('loa_m'), v.get('beam_m'), v.get('draft_m'), v.get('year_built'), v.get('hull'), v.get('propulsion'), v.get('status','active'), v.get('price_usd'), v.get('currency','USD'), v.get('location'), v.get('country'), v.get('source_name'), v.get('source_url'), v.get('description'), json.dumps(v.get('images',[])), json.dumps(v.get('flags',[])), v.get('score',50), fp, json.dumps(v))) vid = c.lastrowid conn.commit() except Exception as e: print(f"[DB] Error: {e}") vid = -1 finally: conn.close() return vid # ── API Routes ──────────────────────────────────────────────────────────────── def hash_pw(pw): return _hashlib.sha256(pw.encode()).hexdigest() def seed_admin(): conn = get_db() existing = conn.execute("SELECT id FROM users WHERE username='admin'").fetchone() if not existing: conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)", ('admin', hash_pw('admin123'), 'admin')) conn.commit() print("[Auth] Default user created: admin / admin123") conn.close() @app.route('/api/login', methods=['POST']) def login(): body = request.json or {} username = body.get('username','').strip() password = body.get('password','') conn = get_db() user = conn.execute("SELECT * FROM users WHERE username=? AND password=?", (username, hash_pw(password))).fetchone() conn.close() if user: session['user_id'] = user['id'] session['username'] = user['username'] session['role'] = user['role'] return jsonify({'ok': True, 'username': user['username'], 'role': user['role']}) return jsonify({'ok': False, 'error': 'Usuario o contraseña incorrectos'}), 401 @app.route('/api/logout', methods=['POST']) def logout(): session.clear() return jsonify({'ok': True}) @app.route('/api/me') def me(): if 'user_id' not in session: return jsonify({'logged_in': False}), 401 return jsonify({'logged_in': True, 'username': session.get('username'), 'role': session.get('role')}) @app.route('/api/users', methods=['GET']) def list_users(): if session.get('role') != 'admin': return jsonify({'error': 'forbidden'}), 403 conn = get_db() rows = [dict(r) for r in conn.execute("SELECT id,username,role,created_at FROM users").fetchall()] conn.close() return jsonify({'users': rows}) @app.route('/api/users', methods=['POST']) def create_user(): if session.get('role') != 'admin': return jsonify({'error': 'forbidden'}), 403 body = request.json or {} username = body.get('username','').strip() password = body.get('password','') role = body.get('role','user') if not username or not password: return jsonify({'error': 'username and password required'}), 400 conn = get_db() try: conn.execute("INSERT INTO users (username,password,role) VALUES (?,?,?)", (username, hash_pw(password), role)) conn.commit() conn.close() return jsonify({'ok': True}) except: conn.close() return jsonify({'error': 'username already exists'}), 400 @app.route('/api/change_password', methods=['POST']) def change_password(): if 'user_id' not in session: return jsonify({'error': 'not logged in'}), 401 body = request.json or {} old_pw = body.get('old_password','') new_pw = body.get('new_password','') conn = get_db() user = conn.execute("SELECT * FROM users WHERE id=? AND password=?", (session['user_id'], hash_pw(old_pw))).fetchone() if not user: conn.close() return jsonify({'error': 'Contraseña actual incorrecta'}), 400 conn.execute("UPDATE users SET password=? WHERE id=?", (hash_pw(new_pw), session['user_id'])) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/') def index(): return send_from_directory('static', 'index.html') @app.route('/api/status') def status(): models = ollama_models() conn = get_db() counts = { 'vessels': conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0], 'saved': conn.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0], 'alerts': conn.execute("SELECT COUNT(*) FROM alerts WHERE active=1").fetchone()[0], } conn.close() return jsonify({ 'ok': True, 'ollama_models': models, 'active_model': MODELS['extract'], 'db_counts': counts, 'sources_count': len(DIRECT_SOURCES), 'categories': list(set(s['category'] for s in DIRECT_SOURCES)), }) @app.route('/api/vessels') def list_vessels(): conn = get_db() q = "SELECT * FROM vessels WHERE 1=1" params = [] if t := request.args.get('type'): q += " AND vessel_type=?"; params.append(t) if s := request.args.get('status'): q += " AND status=?"; params.append(s) if h := request.args.get('hull'): q += " AND hull=?"; params.append(h) if mp := request.args.get('max_price'): q += " AND price_usd <= ?"; params.append(float(mp)) if ml := request.args.get('min_loa'): q += " AND loa_m IS NOT NULL AND loa_m >= ?"; params.append(round(float(ml) - 0.15, 2)) if yr_min := request.args.get('year_min'): try: q += " AND year_built >= ?"; params.append(int(yr_min)) except: pass if yr_max := request.args.get('year_max'): try: q += " AND year_built <= ?"; params.append(int(yr_max)) except: pass sort = request.args.get('sort', 'score') sorts = { 'score':'score DESC', 'price_asc':'price_usd ASC', 'price_desc':'price_usd DESC', 'loa':'loa_m DESC', 'year':'year_built DESC', 'newest':'created_at DESC' } q += f" ORDER BY {sorts.get(sort,'score DESC')}" q += f" LIMIT {min(int(request.args.get('limit',200)),500)}" rows = [dict(r) for r in conn.execute(q, params).fetchall()] for r in rows: r['flags'] = json.loads(r.get('flags') or '[]') r['images'] = json.loads(r.get('images') or '[]') conn.close() return jsonify({'vessels': rows, 'count': len(rows)}) _PROXY_ALLOWED = [ 'sailboatlistings.com', 'yachtworld.com', 'boattrader.com', 'apolloduck.com', 'rightboat.com', 'boat24.com', 'seaboats.net', 'boats.com', 'iboats.com', 'yachtworld.co.uk', ] @app.route('/api/img_proxy') def img_proxy(): url = request.args.get('url', '') if not url: return '', 404 from urllib.parse import urlparse host = urlparse(url).hostname or '' if not any(d in host for d in _PROXY_ALLOWED): return '', 403 try: resp = requests.get(url, timeout=10, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Referer': f'https://{host}/', 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', }) if resp.status_code == 200: ct = resp.headers.get('content-type', 'image/jpeg') return Response(resp.content, content_type=ct, headers={'Cache-Control': 'public, max-age=86400'}) return '', resp.status_code except Exception as e: app.logger.debug(f"img_proxy error: {e}") return '', 502 # Global search state search_state = { 'running': False, 'cancelled': False, 'query': '', 'found': 0, 'total_sources': 0, 'sources_done': 0, 'log': [], } @app.route('/api/search', methods=['POST']) def search(): body = request.json or {} query = body.get('query', '') filters = body.get('filters', {}) if not query: return jsonify({'error': 'query requerido'}), 400 # Clear previous results immediately conn = get_db() conn.execute("DELETE FROM vessels") conn.execute("DELETE FROM saved_vessels") conn.execute("INSERT INTO search_history (query,filters) VALUES (?,?)", (query, json.dumps(filters))) conn.commit() conn.close() # Reset state search_state['running'] = True search_state['cancelled'] = False search_state['query'] = query search_state['found'] = 0 search_state['sources_done'] = 0 search_state['total_sources'] = len(DIRECT_SOURCES) search_state['log'] = [f"Iniciando búsqueda: {query}"] # Tag this search with a unique ID so old threads don't pollute new searches import uuid search_id = str(uuid.uuid4()) search_state['search_id'] = search_id # Run search in background thread def run_bg(sid): try: search_with_ai(query, filters) except Exception as e: search_state['log'].append(f"Error: {e}") print(f"[BG] Error: {e}") finally: if search_state.get('search_id') == sid: search_state['running'] = False total = search_state['found'] msg = f"✓ Búsqueda completa — {total} embarcaciones encontradas" search_state['log'].append(msg) print(f"[BG] {msg}") t = threading.Thread(target=run_bg, args=(search_id,), daemon=True) t.start() return jsonify({'ok': True, 'message': 'Búsqueda iniciada en background'}) @app.route('/api/search/status') def search_status(): return jsonify(search_state) @app.route('/api/search/cancel', methods=['POST']) def cancel_search(): import uuid search_state['cancelled'] = True search_state['running'] = False search_state['search_id'] = str(uuid.uuid4()) # invalidate any running thread search_state['log'].append('⏹ Búsqueda cancelada por el usuario') return jsonify({'ok': True}) @app.route('/api/fb-status') def fb_status(): SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json") return jsonify({"active": os.path.exists(SESSION_FILE)}) @app.route('/api/fb-setup', methods=['POST']) def fb_setup(): """ Launch a visible Chromium window so the user can log in to Facebook. After login is detected (marketplace URL is accessible), saves cookies to fb_session.json. """ SESSION_FILE = os.path.join(os.path.dirname(__file__), "fb_session.json") import json as _json try: from playwright.sync_api import sync_playwright result = {"ok": False, "msg": ""} with sync_playwright() as p: browser = p.chromium.launch( headless=False, args=["--disable-blink-features=AutomationControlled"]) context = browser.new_context( viewport={"width": 1100, "height": 800}, user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/122.0.0.0 Safari/537.36")) page = context.new_page() page.goto("https://www.facebook.com/login", timeout=30000, wait_until="domcontentloaded") # Wait up to 3 minutes for user to log in and reach marketplace try: page.wait_for_url( re.compile(r'facebook\.com/(marketplace|home|feed)'), timeout=180000) # Give extra time to fully load page.wait_for_timeout(3000) cookies = context.cookies() with open(SESSION_FILE, "w") as f: _json.dump(cookies, f) result = {"ok": True, "msg": f"Sesión guardada ({len(cookies)} cookies). " "Facebook Marketplace activado."} except Exception as e: result = {"ok": False, "msg": f"Tiempo agotado o error: {e}"} finally: try: page.close() except: pass browser.close() return jsonify(result) except Exception as e: return jsonify({"ok": False, "msg": str(e)}), 500 @app.route('/api/vessels/', methods=['GET']) def get_vessel(vid): conn = get_db() row = conn.execute("SELECT * FROM vessels WHERE id=?", (vid,)).fetchone() conn.close() if not row: return jsonify({'error': 'not found'}), 404 v = dict(row) v['flags'] = json.loads(v.get('flags') or '[]') v['images'] = json.loads(v.get('images') or '[]') return jsonify(v) @app.route('/api/vessels', methods=['POST']) def add_vessel(): v = request.json or {} v['source_name'] = v.get('source_name', 'Manual') vid = save_vessel(v) return jsonify({'id': vid, 'ok': True}) @app.route('/api/vessels/', methods=['PUT']) def update_vessel(vid): body = request.json or {} conn = get_db() fields = ['name','vessel_type','loa_m','beam_m','draft_m','year_built', 'hull','propulsion','status','price_usd','location','description','score'] updates = {k: body[k] for k in fields if k in body} if updates: set_clause = ', '.join(f"{k}=?" for k in updates) conn.execute(f"UPDATE vessels SET {set_clause}, updated_at=datetime('now') WHERE id=?", [*updates.values(), vid]) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/vessels/', methods=['DELETE']) def delete_vessel(vid): conn = get_db() conn.execute("DELETE FROM vessels WHERE id=?", (vid,)) conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/saved', methods=['GET']) def list_saved(): conn = get_db() rows = conn.execute(""" SELECT v.*, s.notes, s.saved_at FROM vessels v JOIN saved_vessels s ON v.id=s.vessel_id ORDER BY s.saved_at DESC """).fetchall() result = [] for r in rows: v = dict(r) v['flags'] = json.loads(v.get('flags') or '[]') v['images'] = json.loads(v.get('images') or '[]') result.append(v) conn.close() return jsonify({'vessels': result, 'count': len(result)}) @app.route('/api/saved/', methods=['POST']) def save_vessel_fav(vid): notes = (request.json or {}).get('notes', '') conn = get_db() existing = conn.execute("SELECT id FROM saved_vessels WHERE vessel_id=?", (vid,)).fetchone() if not existing: conn.execute("INSERT INTO saved_vessels (vessel_id, notes) VALUES (?,?)", (vid, notes)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/saved/', methods=['DELETE']) def unsave_vessel(vid): conn = get_db() conn.execute("DELETE FROM saved_vessels WHERE vessel_id=?", (vid,)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/alerts', methods=['GET']) def list_alerts(): conn = get_db() rows = [dict(r) for r in conn.execute("SELECT * FROM alerts WHERE active=1").fetchall()] conn.close() return jsonify({'alerts': rows}) @app.route('/api/alerts', methods=['POST']) def create_alert(): body = request.json or {} conn = get_db() conn.execute("INSERT INTO alerts (name, filters) VALUES (?,?)", (body.get('name','Alerta'), json.dumps(body.get('filters',{})))) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/alerts/', methods=['DELETE']) def delete_alert(aid): conn = get_db() conn.execute("UPDATE alerts SET active=0 WHERE id=?", (aid,)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/sources') def list_sources(): by_cat = {} for s in DIRECT_SOURCES: cat = s['category'] if cat not in by_cat: by_cat[cat] = [] by_cat[cat].append({'name': s['name'], 'url': s['search_url'].split('?')[0], 'type': s['type'], 'builtin': True}) # Add custom sources try: conn = get_db() custom = [dict(r) for r in conn.execute("SELECT * FROM custom_sources ORDER BY category").fetchall()] conn.close() for c in custom: cat = c['category'] or 'Custom' if cat not in by_cat: by_cat[cat] = [] by_cat[cat].append({ 'name': c['name'], 'url': c['search_url'].split('?')[0], 'type': c['source_type'], 'builtin': False, 'id': c['id'], 'active': bool(c['active']) }) except: pass return jsonify({'sources': by_cat, 'total': sum(len(v) for v in by_cat.values())}) @app.route('/api/history') def search_history(): conn = get_db() rows = [dict(r) for r in conn.execute( "SELECT * FROM search_history ORDER BY searched_at DESC LIMIT 50").fetchall()] conn.close() return jsonify({'history': rows}) @app.route('/api/analyze', methods=['POST']) def analyze_text(): body = request.json or {} text = body.get('text', '') source = body.get('source', 'Manual') if not text: return jsonify({'error': 'text requerido'}), 400 result = extract_vessel_from_text(text, source) if result: vid = save_vessel({**result, 'source_name': source}) result['id'] = vid return jsonify(result) @app.route('/api/collections', methods=['GET']) def list_collections(): conn = get_db() cols = [dict(r) for r in conn.execute( "SELECT c.*, COUNT(cv.vessel_id) as vessel_count FROM collections c " "LEFT JOIN collection_vessels cv ON c.id=cv.collection_id " "GROUP BY c.id ORDER BY c.created_at DESC").fetchall()] conn.close() return jsonify({'collections': cols}) @app.route('/api/collections', methods=['POST']) def create_collection(): body = request.json or {} name = body.get('name','').strip() if not name: return jsonify({'error': 'name required'}), 400 conn = get_db() conn.execute("INSERT INTO collections (name,description,color,icon) VALUES (?,?,?,?)", (name, body.get('description',''), body.get('color','#00b4ff'), body.get('icon','📁'))) conn.commit() cid = conn.execute("SELECT last_insert_rowid()").fetchone()[0] conn.close() return jsonify({'ok': True, 'id': cid}) @app.route('/api/collections/', methods=['DELETE']) def delete_collection(cid): conn = get_db() conn.execute("DELETE FROM collection_vessels WHERE collection_id=?", (cid,)) conn.execute("DELETE FROM collections WHERE id=?", (cid,)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/collections//vessels', methods=['GET']) def collection_vessels(cid): conn = get_db() rows = conn.execute(""" SELECT v.*, cv.notes, cv.added_at FROM vessels v JOIN collection_vessels cv ON v.id=cv.vessel_id WHERE cv.collection_id=? ORDER BY cv.added_at DESC""", (cid,)).fetchall() result = [] for r in rows: v = dict(r) v['flags'] = json.loads(v.get('flags') or '[]') v['images'] = json.loads(v.get('images') or '[]') result.append(v) conn.close() return jsonify({'vessels': result, 'count': len(result)}) @app.route('/api/collections//vessels', methods=['POST']) def add_to_collection(cid): body = request.json or {} vessel_ids = body.get('vessel_ids', []) notes = body.get('notes', '') conn = get_db() added = 0 for vid in vessel_ids: try: conn.execute("INSERT OR IGNORE INTO collection_vessels (collection_id,vessel_id,notes) VALUES (?,?,?)", (cid, vid, notes)) added += 1 except: pass conn.commit() conn.close() return jsonify({'ok': True, 'added': added}) @app.route('/api/collections//vessels/', methods=['DELETE']) def remove_from_collection(cid, vid): conn = get_db() conn.execute("DELETE FROM collection_vessels WHERE collection_id=? AND vessel_id=?", (cid, vid)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/custom_sources', methods=['GET']) def get_custom_sources(): conn = get_db() rows = [dict(r) for r in conn.execute( "SELECT * FROM custom_sources ORDER BY created_at DESC").fetchall()] conn.close() return jsonify({'sources': rows}) @app.route('/api/custom_sources', methods=['POST']) def add_custom_source(): body = request.json or {} name = body.get('name','').strip() url = body.get('search_url','').strip() if not name or not url: return jsonify({'error': 'name and search_url required'}), 400 # Ensure URL has {query} placeholder if '{query}' not in url: url = url.rstrip('/') + '?q={query}' conn = get_db() conn.execute("""INSERT INTO custom_sources (name,category,search_url,source_type,added_by) VALUES (?,?,?,?,?)""", (name, body.get('category','Custom'), url, body.get('source_type','broker'), session.get('username','admin'))) conn.commit() sid = conn.execute("SELECT last_insert_rowid()").fetchone()[0] conn.close() return jsonify({'ok': True, 'id': sid}) @app.route('/api/custom_sources/', methods=['PUT']) def update_custom_source(sid): body = request.json or {} conn = get_db() fields = ['name','category','search_url','source_type','active'] updates = {k: body[k] for k in fields if k in body} if updates: set_clause = ', '.join(f"{k}=?" for k in updates) conn.execute(f"UPDATE custom_sources SET {set_clause} WHERE id=?", [*updates.values(), sid]) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/custom_sources/', methods=['DELETE']) def delete_custom_source(sid): conn = get_db() conn.execute("DELETE FROM custom_sources WHERE id=?", (sid,)) conn.commit() conn.close() return jsonify({'ok': True}) @app.route('/api/stats') def stats(): conn = get_db() c = conn.cursor() data = { 'total': c.execute("SELECT COUNT(*) FROM vessels").fetchone()[0], 'saved': c.execute("SELECT COUNT(*) FROM saved_vessels").fetchone()[0], 'by_type': dict(c.execute("SELECT vessel_type, COUNT(*) FROM vessels GROUP BY vessel_type").fetchall()), 'by_status': dict(c.execute("SELECT status, COUNT(*) FROM vessels GROUP BY status").fetchall()), 'by_country':dict((k or 'Unknown', v) for k,v in c.execute("SELECT country, COUNT(*) FROM vessels WHERE country IS NOT NULL GROUP BY country ORDER BY COUNT(*) DESC LIMIT 10").fetchall()), 'avg_score': c.execute("SELECT AVG(score) FROM vessels").fetchone()[0] or 0, 'avg_price': c.execute("SELECT AVG(price_usd) FROM vessels WHERE price_usd > 0").fetchone()[0] or 0, 'top_opportunities': [dict(r) for r in c.execute( "SELECT id,name,vessel_type,price_usd,score,location FROM vessels ORDER BY score DESC LIMIT 5").fetchall()], } conn.close() return jsonify(data) # ── Seed sample data ────────────────────────────────────────────────────────── def seed_sample_data(): samples = [ {"name":"M/Y Stella Maris","vessel_type":"Yacht","loa_m":28.4,"beam_m":6.8,"draft_m":1.9,"year_built":2008,"hull":"Fiberglass","propulsion":"Diesel","status":"active","price_usd":189000,"location":"Fort Lauderdale, FL","country":"US","source_name":"YachtWorld","source_url":"https://yachtworld.com","description":"Yate motor bien mantenido, twin Volvo IPS, refit 2022.","flags":["below_market","motivated_seller"],"score":87}, {"name":"F/V Cape Hatteras","vessel_type":"Fishing","loa_m":19.2,"beam_m":5.1,"draft_m":1.4,"year_built":1997,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":22000,"location":"Gloucester, MA","country":"US","source_name":"GovDeals","source_url":"https://govdeals.com","description":"Ex buque NOAA, motor operativo, casco requiere trabajo.","flags":["rare","salvage_value","below_market"],"score":94}, {"name":"TUG Bravo Eagle","vessel_type":"Tug","loa_m":32.0,"beam_m":9.4,"draft_m":3.8,"year_built":1989,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":310000,"location":"New Orleans, LA","country":"US","source_name":"AuctionTime","source_url":"https://auctiontime.com","description":"Remolcador 2400HP, clase ABS, listo para operación comercial.","flags":["rare","auction","motivated_seller"],"score":91}, {"name":"OSV Pacific Ranger","vessel_type":"Offshore","loa_m":52.0,"beam_m":13.2,"draft_m":4.1,"year_built":2005,"hull":"Steel","propulsion":"Diesel","status":"auction","price_usd":890000,"location":"Port Fourchon, LA","country":"US","source_name":"GovPlanet","source_url":"https://govplanet.com","description":"Buque apoyo offshore DP1, 400T carga, documentación completa.","flags":["rare","auction","government_surplus"],"score":79}, {"name":"Barge RJ-440","vessel_type":"Barge","loa_m":44.0,"beam_m":12.0,"draft_m":1.8,"year_built":1978,"hull":"Steel","propulsion":"None","status":"active","price_usd":55000,"location":"Houston, TX","country":"US","source_name":"WorkBoat Classifieds","source_url":"https://workboat.com","description":"Barcaza cubierta, capacidad 800T, buen estado estructural.","flags":["below_market","rare"],"score":73}, {"name":"LCT Endeavour","vessel_type":"Barge","loa_m":61.0,"beam_m":14.6,"draft_m":1.5,"year_built":1968,"hull":"Steel","propulsion":"Diesel","status":"salvage","price_usd":38000,"location":"Manila, Filipinas","country":"PH","source_name":"Salvex","source_url":"https://salvex.com","description":"Landing craft, estructura sólida, motores requieren overhaul.","flags":["salvage_value","rare","below_market"],"score":82}, ] for s in samples: save_vessel(s) # ── Main ────────────────────────────────────────────────────────────────────── if __name__ == '__main__': import socket, signal, atexit, sys BASE_DIR = os.path.dirname(os.path.abspath(__file__)) PID_FILE = os.path.join(BASE_DIR, ".server.pid") # ── Handle existing instance ─────────────────────────────────────────────── def kill_pid(pid): try: import ctypes handle = ctypes.windll.kernel32.OpenProcess(1, False, pid) ctypes.windll.kernel32.TerminateProcess(handle, -1) ctypes.windll.kernel32.CloseHandle(handle) return True except: try: os.kill(pid, 9) return True except: return False def pid_running(pid): try: os.kill(pid, 0) return True except OSError: return False if os.path.exists(PID_FILE): try: old_pid = int(open(PID_FILE).read().strip()) if pid_running(old_pid): print(f"\n ⚠️ Ya hay una instancia corriendo (PID {old_pid})") resp = input(" ¿Cerrar la instancia anterior y continuar? [S/n]: ").strip().lower() if resp in ("", "s", "si", "sí", "y", "yes"): if kill_pid(old_pid): print(f" ✓ Instancia anterior (PID {old_pid}) cerrada.") import time; time.sleep(1) else: print(f" ✗ No se pudo cerrar. Ciérrala manualmente y vuelve a intentar.") sys.exit(1) else: print(" Saliendo sin cambios.") sys.exit(0) except (ValueError, IOError): pass # PID file corrupted — ignore # ── Write PID file ───────────────────────────────────────────────────────── with open(PID_FILE, "w") as f: f.write(str(os.getpid())) def cleanup_pid(): try: os.remove(PID_FILE) except: pass atexit.register(cleanup_pid) def handle_signal(sig, frame): print("\n\n 👋 Cerrando Boat&Ship-Finder...") cleanup_pid() sys.exit(0) signal.signal(signal.SIGINT, handle_signal) signal.signal(signal.SIGTERM, handle_signal) # ── Port selection ───────────────────────────────────────────────────────── def port_free(p): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: s.bind(("0.0.0.0", p)); return True except: return False desired = int(os.environ.get('MARINE_PORT', 8765)) port = desired if not port_free(desired): for p in range(desired + 1, desired + 20): if port_free(p): port = p break print(f"\n ⚠️ Puerto {desired} ocupado — usando {port}") # ── DB init ──────────────────────────────────────────────────────────────── print("\n" + "="*55) print(" Boat&Ship-Finder — Iniciando...") print("="*55) init_db() seed_admin() conn = get_db() count = conn.execute("SELECT COUNT(*) FROM vessels").fetchone()[0] conn.close() if count == 0: print("[DB] Base de datos vacía — lista para búsquedas reales") else: print(f"[DB] {count} embarcaciones en caché de sesión anterior") print(f"\n Local: http://localhost:{port}") print(f" Tailscale: http://:{port}") print(f" Fuentes directas: {len(DIRECT_SOURCES)}") print(f" Modelos Ollama: {list(MODELS.values())}") print(f" PID: {os.getpid()} (guardado en .server.pid)") print("\n [Ctrl+C para detener]\n") app.run(host='0.0.0.0', port=port, debug=False)