// scrape_marine_emails.js // Reads marine business directory Excel, searches for emails/websites, // writes results back to the same file with checkpoints every 25 rows. const xlsx = require('xlsx'); const axios = require('axios'); const cheerio = require('cheerio'); const path = require('path'); const fs = require('fs'); const EXCEL_PATH = path.resolve(__dirname, 'Directorio_Marino_FL_Acumulativo_4.xlsx'); const SHEETS = ['MARINAS', 'ASTILLEROS', 'ALMACENAMIENTO', 'SUMINISTROS']; const CHECKPOINT = 25; const DELAY_MS = 2000; const TIMEOUT_MS = 10000; const MAX_PAGES = 3; const COL = { NUM:0, NAME:1, ADDR:2, PHONE:3, EMAIL:4, CITY:5, COUNTY:6, WEBSITE:7 }; const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g; const summary = { processed:0, emails:0, websites:0, noEmail:[] }; const http = axios.create({ timeout: TIMEOUT_MS, headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5' }, maxRedirects: 5, validateStatus: s => s < 500 }); function sleep(ms) { return new Promise(r => setTimeout(r, ms)); } // Guard 1: sanitize names before using in search queries function sanitize(str) { if (!str) return ''; return String(str).replace(/[<>"'`;&|$\\{}]/g, '').trim().substring(0, 100); } function extractEmails(text) { const raw = text.match(EMAIL_RE) || []; return [...new Set(raw.filter(e => { const low = e.toLowerCase(); return !low.match(/\.(png|jpg|jpeg|gif|svg|webp|pdf|zip|js|css)$/) && !low.includes('example.com') && !low.includes('sentry.io') && low.includes('.'); }))]; } function scoreEmail(email) { const low = email.toLowerCase(); if (/dockmaster@/.test(low)) return 5; if (/marina@/.test(low)) return 4; if (/service@/.test(low)) return 3; if (/contact@/.test(low)) return 2; if (/info@/.test(low)) return 1; return 0; } function bestEmail(emails) { if (!emails.length) return null; return emails.sort((a, b) => scoreEmail(b) - scoreEmail(a))[0]; } async function fetchPage(url) { try { if (!url.startsWith('http')) url = 'https://' + url; const res = await http.get(url); if (res.status === 200 && res.data) return { html: String(res.data), url }; } catch { /* timeout or network error — skip */ } return null; } async function searchDDG(query) { const q = encodeURIComponent(sanitize(query)); try { const res = await http.get(`https://html.duckduckgo.com/html/?q=${q}&kl=us-en`); if (!res.data) return []; const $ = cheerio.load(res.data); const results = []; // Primary: result URL spans $('a.result__url').each((_, el) => { let href = $(el).attr('href') || $(el).text().trim(); if (href && !href.includes('duckduckgo.com')) { if (!href.startsWith('http')) href = 'https://' + href; results.push(href); } }); // Fallback: uddg= redirect links if (!results.length) { $('a[href*="uddg="]').each((_, el) => { const href = $(el).attr('href') || ''; try { const u = new URL(href, 'https://duckduckgo.com'); const dest = u.searchParams.get('uddg'); if (dest) results.push(decodeURIComponent(dest)); } catch {} }); } return [...new Set(results)].slice(0, 6); } catch { return []; } } function rootDomain(url) { try { return new URL(url).origin; } catch { return null; } } const SKIP_DOMAINS = /yelp|yellowpages|mapquest|tripadvisor|facebook|google|instagram|twitter|linkedin|bbb\.org|manta\.com|chamberofcommerce|indeed|glassdoor|angieslist|homeadvisor|findagrave/i; async function scrapeBusiness(name, city) { const query = `"${name}" ${city} Florida marina contact email`; let website = null; let email = null; let pagesFetched = 0; const visitedDomains = new Set(); const results = await searchDDG(query); await sleep(300); for (const url of results) { if (SKIP_DOMAINS.test(url)) continue; const domain = rootDomain(url); if (!domain || visitedDomains.has(domain)) continue; visitedDomains.add(domain); if (!website) website = url; // Fetch homepage if (pagesFetched >= MAX_PAGES) break; const home = await fetchPage(url); pagesFetched++; if (!home) continue; const e1 = extractEmails(home.html); if (e1.length) { email = bestEmail(e1); break; } // Fetch /contact page if (pagesFetched < MAX_PAGES) { const contactUrl = domain + '/contact'; const contact = await fetchPage(contactUrl) || await fetchPage(domain + '/contact-us'); pagesFetched++; if (contact) { const e2 = extractEmails(contact.html); if (e2.length) { email = bestEmail(e2); break; } } } // Fetch /about page if (pagesFetched < MAX_PAGES && !email) { const about = await fetchPage(domain + '/about') || await fetchPage(domain + '/about-us'); pagesFetched++; if (about) { const e3 = extractEmails(about.html); if (e3.length) { email = bestEmail(e3); break; } } } if (email) break; } return { website, email }; } function saveWorkbook(wb) { const tmp = EXCEL_PATH + '.tmp'; try { xlsx.writeFile(wb, tmp); if (fs.existsSync(tmp)) { fs.copyFileSync(tmp, EXCEL_PATH); fs.unlinkSync(tmp); } } catch (err) { // If original is locked by Excel, save to sidecar so no progress is lost const sidecar = EXCEL_PATH.replace('.xlsx', '_output.xlsx'); console.log(`\n ⚠️ File locked — saving to: ${path.basename(sidecar)}`); xlsx.writeFile(wb, sidecar); } } async function processSheet(wb, sheetName) { const ws = wb.Sheets[sheetName]; const data = xlsx.utils.sheet_to_json(ws, { header:1, defval:'' }); // Find the header row (contains "NOMBRE") — could be row 0 or 1 let headerIdx = 0; for (let r = 0; r < Math.min(3, data.length); r++) { if (data[r] && data[r].includes('NOMBRE')) { headerIdx = r; break; } } if (data[headerIdx] && !data[headerIdx][COL.WEBSITE]) data[headerIdx][COL.WEBSITE] = 'WEBSITE'; const dataStart = headerIdx + 1; console.log(`\n${'═'.repeat(60)}`); console.log(` Sheet: ${sheetName} (${data.length - dataStart} data rows, header at row ${headerIdx})`); console.log(`${'═'.repeat(60)}`); let rowsSinceCheckpoint = 0; for (let i = dataStart; i < data.length; i++) { const row = data[i]; const name = sanitize(row[COL.NAME]); const city = sanitize(row[COL.CITY]); if (!name) continue; // Skip rows already complete if (row[COL.EMAIL] && row[COL.WEBSITE]) { console.log(` row ${String(i).padStart(3)} SKIP (filled): ${name.substring(0,35)}`); summary.processed++; continue; } process.stdout.write(` row ${String(i).padStart(3)} ${name.substring(0,38).padEnd(38)} → `); let email = row[COL.EMAIL] || null; let website = row[COL.WEBSITE] || null; try { if (!email || !website) { const result = await scrapeBusiness(name, city); if (!email && result.email) email = result.email; if (!website && result.website) website = result.website; } } catch (err) { process.stdout.write(`ERR(${err.message.substring(0,30)})\n`); } if (email) row[COL.EMAIL] = email; if (website) row[COL.WEBSITE] = website; summary.processed++; if (email) summary.emails++; if (website) summary.websites++; if (!email) summary.noEmail.push(`[${sheetName}] ${name} — ${city}`); process.stdout.write(`${email || '(no email)'.padEnd(35)} ${website ? website.substring(0,40) : '(no site)'}\n`); const newWs = xlsx.utils.aoa_to_sheet(data); wb.Sheets[sheetName] = newWs; rowsSinceCheckpoint++; if (rowsSinceCheckpoint >= CHECKPOINT) { saveWorkbook(wb); console.log(`\n ★ CHECKPOINT: saved after ${summary.processed} total rows processed ★\n`); rowsSinceCheckpoint = 0; } await sleep(DELAY_MS); } const newWs = xlsx.utils.aoa_to_sheet(data); wb.Sheets[sheetName] = newWs; saveWorkbook(wb); console.log(`\n ✅ ${sheetName} complete — file saved.`); } async function main() { console.log(` ✅ GUARD 1 — Input: business names sanitized before search queries ✅ GUARD 2 — Network: ${TIMEOUT_MS/1000}s timeout, max ${MAX_PAGES} pages/domain, ${DELAY_MS/1000}s inter-domain delay ✅ GUARD 3 — Data: reads/writes only ${path.basename(EXCEL_PATH)}, no credentials stored 🚀 Starting marine directory email scraper... File: ${EXCEL_PATH} Sheets: ${SHEETS.join(', ')} `); if (!fs.existsSync(EXCEL_PATH)) { console.error('❌ Excel file not found:', EXCEL_PATH); process.exit(1); } const wb = xlsx.readFile(EXCEL_PATH); const start = Date.now(); for (const sheet of SHEETS) { if (!wb.SheetNames.includes(sheet)) { console.log(`Sheet "${sheet}" not found — skipping.`); continue; } await processSheet(wb, sheet); } saveWorkbook(wb); const elapsed = Math.round((Date.now() - start) / 1000); const noEmailList = summary.noEmail.length ? summary.noEmail.map(n => ' • ' + n).join('\n') : ' (all businesses have email)'; console.log(` ${'═'.repeat(60)} FINAL SUMMARY ${'═'.repeat(60)} Rows processed : ${summary.processed} Emails found : ${summary.emails} (${Math.round(summary.emails/summary.processed*100)||0}%) Websites found : ${summary.websites} (${Math.round(summary.websites/summary.processed*100)||0}%) Time elapsed : ${Math.floor(elapsed/60)}m ${elapsed%60}s Saved to : ${EXCEL_PATH} Businesses with NO email (${summary.noEmail.length}) — for manual follow-up: ${noEmailList} ${'═'.repeat(60)} `); } main().catch(err => { console.error('Fatal:', err); process.exit(1); });