316 lines
9.8 KiB
JavaScript
316 lines
9.8 KiB
JavaScript
// scrape_marine_emails.js
|
|
// Reads marine business directory Excel, searches for emails/websites,
|
|
// writes results back to the same file with checkpoints every 25 rows.
|
|
|
|
const xlsx = require('xlsx');
|
|
const axios = require('axios');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const fs = require('fs');
|
|
|
|
const EXCEL_PATH = path.resolve(__dirname, 'Directorio_Marino_FL_Acumulativo_4.xlsx');
|
|
const SHEETS = ['MARINAS', 'ASTILLEROS', 'ALMACENAMIENTO', 'SUMINISTROS'];
|
|
const CHECKPOINT = 25;
|
|
const DELAY_MS = 2000;
|
|
const TIMEOUT_MS = 10000;
|
|
const MAX_PAGES = 3;
|
|
|
|
const COL = { NUM:0, NAME:1, ADDR:2, PHONE:3, EMAIL:4, CITY:5, COUNTY:6, WEBSITE:7 };
|
|
|
|
const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
|
|
|
|
const summary = { processed:0, emails:0, websites:0, noEmail:[] };
|
|
|
|
const http = axios.create({
|
|
timeout: TIMEOUT_MS,
|
|
headers: {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
'Accept-Language': 'en-US,en;q=0.5'
|
|
},
|
|
maxRedirects: 5,
|
|
validateStatus: s => s < 500
|
|
});
|
|
|
|
function sleep(ms) { return new Promise(r => setTimeout(r, ms)); }
|
|
|
|
// Guard 1: sanitize names before using in search queries
|
|
function sanitize(str) {
|
|
if (!str) return '';
|
|
return String(str).replace(/[<>"'`;&|$\\{}]/g, '').trim().substring(0, 100);
|
|
}
|
|
|
|
function extractEmails(text) {
|
|
const raw = text.match(EMAIL_RE) || [];
|
|
return [...new Set(raw.filter(e => {
|
|
const low = e.toLowerCase();
|
|
return !low.match(/\.(png|jpg|jpeg|gif|svg|webp|pdf|zip|js|css)$/) &&
|
|
!low.includes('example.com') &&
|
|
!low.includes('sentry.io') &&
|
|
low.includes('.');
|
|
}))];
|
|
}
|
|
|
|
function scoreEmail(email) {
|
|
const low = email.toLowerCase();
|
|
if (/dockmaster@/.test(low)) return 5;
|
|
if (/marina@/.test(low)) return 4;
|
|
if (/service@/.test(low)) return 3;
|
|
if (/contact@/.test(low)) return 2;
|
|
if (/info@/.test(low)) return 1;
|
|
return 0;
|
|
}
|
|
|
|
function bestEmail(emails) {
|
|
if (!emails.length) return null;
|
|
return emails.sort((a, b) => scoreEmail(b) - scoreEmail(a))[0];
|
|
}
|
|
|
|
async function fetchPage(url) {
|
|
try {
|
|
if (!url.startsWith('http')) url = 'https://' + url;
|
|
const res = await http.get(url);
|
|
if (res.status === 200 && res.data) return { html: String(res.data), url };
|
|
} catch { /* timeout or network error — skip */ }
|
|
return null;
|
|
}
|
|
|
|
async function searchDDG(query) {
|
|
const q = encodeURIComponent(sanitize(query));
|
|
try {
|
|
const res = await http.get(`https://html.duckduckgo.com/html/?q=${q}&kl=us-en`);
|
|
if (!res.data) return [];
|
|
const $ = cheerio.load(res.data);
|
|
const results = [];
|
|
|
|
// Primary: result URL spans
|
|
$('a.result__url').each((_, el) => {
|
|
let href = $(el).attr('href') || $(el).text().trim();
|
|
if (href && !href.includes('duckduckgo.com')) {
|
|
if (!href.startsWith('http')) href = 'https://' + href;
|
|
results.push(href);
|
|
}
|
|
});
|
|
|
|
// Fallback: uddg= redirect links
|
|
if (!results.length) {
|
|
$('a[href*="uddg="]').each((_, el) => {
|
|
const href = $(el).attr('href') || '';
|
|
try {
|
|
const u = new URL(href, 'https://duckduckgo.com');
|
|
const dest = u.searchParams.get('uddg');
|
|
if (dest) results.push(decodeURIComponent(dest));
|
|
} catch {}
|
|
});
|
|
}
|
|
|
|
return [...new Set(results)].slice(0, 6);
|
|
} catch { return []; }
|
|
}
|
|
|
|
function rootDomain(url) {
|
|
try { return new URL(url).origin; } catch { return null; }
|
|
}
|
|
|
|
const SKIP_DOMAINS = /yelp|yellowpages|mapquest|tripadvisor|facebook|google|instagram|twitter|linkedin|bbb\.org|manta\.com|chamberofcommerce|indeed|glassdoor|angieslist|homeadvisor|findagrave/i;
|
|
|
|
async function scrapeBusiness(name, city) {
|
|
const query = `"${name}" ${city} Florida marina contact email`;
|
|
let website = null;
|
|
let email = null;
|
|
let pagesFetched = 0;
|
|
const visitedDomains = new Set();
|
|
|
|
const results = await searchDDG(query);
|
|
await sleep(300);
|
|
|
|
for (const url of results) {
|
|
if (SKIP_DOMAINS.test(url)) continue;
|
|
const domain = rootDomain(url);
|
|
if (!domain || visitedDomains.has(domain)) continue;
|
|
visitedDomains.add(domain);
|
|
|
|
if (!website) website = url;
|
|
|
|
// Fetch homepage
|
|
if (pagesFetched >= MAX_PAGES) break;
|
|
const home = await fetchPage(url);
|
|
pagesFetched++;
|
|
if (!home) continue;
|
|
|
|
const e1 = extractEmails(home.html);
|
|
if (e1.length) { email = bestEmail(e1); break; }
|
|
|
|
// Fetch /contact page
|
|
if (pagesFetched < MAX_PAGES) {
|
|
const contactUrl = domain + '/contact';
|
|
const contact = await fetchPage(contactUrl) ||
|
|
await fetchPage(domain + '/contact-us');
|
|
pagesFetched++;
|
|
if (contact) {
|
|
const e2 = extractEmails(contact.html);
|
|
if (e2.length) { email = bestEmail(e2); break; }
|
|
}
|
|
}
|
|
|
|
// Fetch /about page
|
|
if (pagesFetched < MAX_PAGES && !email) {
|
|
const about = await fetchPage(domain + '/about') ||
|
|
await fetchPage(domain + '/about-us');
|
|
pagesFetched++;
|
|
if (about) {
|
|
const e3 = extractEmails(about.html);
|
|
if (e3.length) { email = bestEmail(e3); break; }
|
|
}
|
|
}
|
|
|
|
if (email) break;
|
|
}
|
|
|
|
return { website, email };
|
|
}
|
|
|
|
function saveWorkbook(wb) {
|
|
const tmp = EXCEL_PATH + '.tmp';
|
|
try {
|
|
xlsx.writeFile(wb, tmp);
|
|
if (fs.existsSync(tmp)) {
|
|
fs.copyFileSync(tmp, EXCEL_PATH);
|
|
fs.unlinkSync(tmp);
|
|
}
|
|
} catch (err) {
|
|
// If original is locked by Excel, save to sidecar so no progress is lost
|
|
const sidecar = EXCEL_PATH.replace('.xlsx', '_output.xlsx');
|
|
console.log(`\n ⚠️ File locked — saving to: ${path.basename(sidecar)}`);
|
|
xlsx.writeFile(wb, sidecar);
|
|
}
|
|
}
|
|
|
|
async function processSheet(wb, sheetName) {
|
|
const ws = wb.Sheets[sheetName];
|
|
const data = xlsx.utils.sheet_to_json(ws, { header:1, defval:'' });
|
|
|
|
// Find the header row (contains "NOMBRE") — could be row 0 or 1
|
|
let headerIdx = 0;
|
|
for (let r = 0; r < Math.min(3, data.length); r++) {
|
|
if (data[r] && data[r].includes('NOMBRE')) { headerIdx = r; break; }
|
|
}
|
|
if (data[headerIdx] && !data[headerIdx][COL.WEBSITE]) data[headerIdx][COL.WEBSITE] = 'WEBSITE';
|
|
|
|
const dataStart = headerIdx + 1;
|
|
console.log(`\n${'═'.repeat(60)}`);
|
|
console.log(` Sheet: ${sheetName} (${data.length - dataStart} data rows, header at row ${headerIdx})`);
|
|
console.log(`${'═'.repeat(60)}`);
|
|
|
|
let rowsSinceCheckpoint = 0;
|
|
|
|
for (let i = dataStart; i < data.length; i++) {
|
|
const row = data[i];
|
|
const name = sanitize(row[COL.NAME]);
|
|
const city = sanitize(row[COL.CITY]);
|
|
|
|
if (!name) continue;
|
|
|
|
// Skip rows already complete
|
|
if (row[COL.EMAIL] && row[COL.WEBSITE]) {
|
|
console.log(` row ${String(i).padStart(3)} SKIP (filled): ${name.substring(0,35)}`);
|
|
summary.processed++;
|
|
continue;
|
|
}
|
|
|
|
process.stdout.write(` row ${String(i).padStart(3)} ${name.substring(0,38).padEnd(38)} → `);
|
|
|
|
let email = row[COL.EMAIL] || null;
|
|
let website = row[COL.WEBSITE] || null;
|
|
|
|
try {
|
|
if (!email || !website) {
|
|
const result = await scrapeBusiness(name, city);
|
|
if (!email && result.email) email = result.email;
|
|
if (!website && result.website) website = result.website;
|
|
}
|
|
} catch (err) {
|
|
process.stdout.write(`ERR(${err.message.substring(0,30)})\n`);
|
|
}
|
|
|
|
if (email) row[COL.EMAIL] = email;
|
|
if (website) row[COL.WEBSITE] = website;
|
|
|
|
summary.processed++;
|
|
if (email) summary.emails++;
|
|
if (website) summary.websites++;
|
|
if (!email) summary.noEmail.push(`[${sheetName}] ${name} — ${city}`);
|
|
|
|
process.stdout.write(`${email || '(no email)'.padEnd(35)} ${website ? website.substring(0,40) : '(no site)'}\n`);
|
|
|
|
const newWs = xlsx.utils.aoa_to_sheet(data);
|
|
wb.Sheets[sheetName] = newWs;
|
|
|
|
rowsSinceCheckpoint++;
|
|
if (rowsSinceCheckpoint >= CHECKPOINT) {
|
|
saveWorkbook(wb);
|
|
console.log(`\n ★ CHECKPOINT: saved after ${summary.processed} total rows processed ★\n`);
|
|
rowsSinceCheckpoint = 0;
|
|
}
|
|
|
|
await sleep(DELAY_MS);
|
|
}
|
|
|
|
const newWs = xlsx.utils.aoa_to_sheet(data);
|
|
wb.Sheets[sheetName] = newWs;
|
|
saveWorkbook(wb);
|
|
console.log(`\n ✅ ${sheetName} complete — file saved.`);
|
|
}
|
|
|
|
async function main() {
|
|
console.log(`
|
|
✅ GUARD 1 — Input: business names sanitized before search queries
|
|
✅ GUARD 2 — Network: ${TIMEOUT_MS/1000}s timeout, max ${MAX_PAGES} pages/domain, ${DELAY_MS/1000}s inter-domain delay
|
|
✅ GUARD 3 — Data: reads/writes only ${path.basename(EXCEL_PATH)}, no credentials stored
|
|
|
|
🚀 Starting marine directory email scraper...
|
|
File: ${EXCEL_PATH}
|
|
Sheets: ${SHEETS.join(', ')}
|
|
`);
|
|
|
|
if (!fs.existsSync(EXCEL_PATH)) {
|
|
console.error('❌ Excel file not found:', EXCEL_PATH);
|
|
process.exit(1);
|
|
}
|
|
|
|
const wb = xlsx.readFile(EXCEL_PATH);
|
|
const start = Date.now();
|
|
|
|
for (const sheet of SHEETS) {
|
|
if (!wb.SheetNames.includes(sheet)) {
|
|
console.log(`Sheet "${sheet}" not found — skipping.`);
|
|
continue;
|
|
}
|
|
await processSheet(wb, sheet);
|
|
}
|
|
|
|
saveWorkbook(wb);
|
|
const elapsed = Math.round((Date.now() - start) / 1000);
|
|
|
|
const noEmailList = summary.noEmail.length
|
|
? summary.noEmail.map(n => ' • ' + n).join('\n')
|
|
: ' (all businesses have email)';
|
|
|
|
console.log(`
|
|
${'═'.repeat(60)}
|
|
FINAL SUMMARY
|
|
${'═'.repeat(60)}
|
|
Rows processed : ${summary.processed}
|
|
Emails found : ${summary.emails} (${Math.round(summary.emails/summary.processed*100)||0}%)
|
|
Websites found : ${summary.websites} (${Math.round(summary.websites/summary.processed*100)||0}%)
|
|
Time elapsed : ${Math.floor(elapsed/60)}m ${elapsed%60}s
|
|
Saved to : ${EXCEL_PATH}
|
|
|
|
Businesses with NO email (${summary.noEmail.length}) — for manual follow-up:
|
|
${noEmailList}
|
|
${'═'.repeat(60)}
|
|
`);
|
|
}
|
|
|
|
main().catch(err => { console.error('Fatal:', err); process.exit(1); });
|