Files

316 lines
9.8 KiB
JavaScript

// scrape_marine_emails.js
// Reads marine business directory Excel, searches for emails/websites,
// writes results back to the same file with checkpoints every 25 rows.
const xlsx = require('xlsx');
const axios = require('axios');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const EXCEL_PATH = path.resolve(__dirname, 'Directorio_Marino_FL_Acumulativo_4.xlsx');
const SHEETS = ['MARINAS', 'ASTILLEROS', 'ALMACENAMIENTO', 'SUMINISTROS'];
const CHECKPOINT = 25;
const DELAY_MS = 2000;
const TIMEOUT_MS = 10000;
const MAX_PAGES = 3;
const COL = { NUM:0, NAME:1, ADDR:2, PHONE:3, EMAIL:4, CITY:5, COUNTY:6, WEBSITE:7 };
const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
const summary = { processed:0, emails:0, websites:0, noEmail:[] };
const http = axios.create({
timeout: TIMEOUT_MS,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5'
},
maxRedirects: 5,
validateStatus: s => s < 500
});
function sleep(ms) { return new Promise(r => setTimeout(r, ms)); }
// Guard 1: sanitize names before using in search queries
function sanitize(str) {
if (!str) return '';
return String(str).replace(/[<>"'`;&|$\\{}]/g, '').trim().substring(0, 100);
}
function extractEmails(text) {
const raw = text.match(EMAIL_RE) || [];
return [...new Set(raw.filter(e => {
const low = e.toLowerCase();
return !low.match(/\.(png|jpg|jpeg|gif|svg|webp|pdf|zip|js|css)$/) &&
!low.includes('example.com') &&
!low.includes('sentry.io') &&
low.includes('.');
}))];
}
function scoreEmail(email) {
const low = email.toLowerCase();
if (/dockmaster@/.test(low)) return 5;
if (/marina@/.test(low)) return 4;
if (/service@/.test(low)) return 3;
if (/contact@/.test(low)) return 2;
if (/info@/.test(low)) return 1;
return 0;
}
function bestEmail(emails) {
if (!emails.length) return null;
return emails.sort((a, b) => scoreEmail(b) - scoreEmail(a))[0];
}
async function fetchPage(url) {
try {
if (!url.startsWith('http')) url = 'https://' + url;
const res = await http.get(url);
if (res.status === 200 && res.data) return { html: String(res.data), url };
} catch { /* timeout or network error — skip */ }
return null;
}
async function searchDDG(query) {
const q = encodeURIComponent(sanitize(query));
try {
const res = await http.get(`https://html.duckduckgo.com/html/?q=${q}&kl=us-en`);
if (!res.data) return [];
const $ = cheerio.load(res.data);
const results = [];
// Primary: result URL spans
$('a.result__url').each((_, el) => {
let href = $(el).attr('href') || $(el).text().trim();
if (href && !href.includes('duckduckgo.com')) {
if (!href.startsWith('http')) href = 'https://' + href;
results.push(href);
}
});
// Fallback: uddg= redirect links
if (!results.length) {
$('a[href*="uddg="]').each((_, el) => {
const href = $(el).attr('href') || '';
try {
const u = new URL(href, 'https://duckduckgo.com');
const dest = u.searchParams.get('uddg');
if (dest) results.push(decodeURIComponent(dest));
} catch {}
});
}
return [...new Set(results)].slice(0, 6);
} catch { return []; }
}
function rootDomain(url) {
try { return new URL(url).origin; } catch { return null; }
}
const SKIP_DOMAINS = /yelp|yellowpages|mapquest|tripadvisor|facebook|google|instagram|twitter|linkedin|bbb\.org|manta\.com|chamberofcommerce|indeed|glassdoor|angieslist|homeadvisor|findagrave/i;
async function scrapeBusiness(name, city) {
const query = `"${name}" ${city} Florida marina contact email`;
let website = null;
let email = null;
let pagesFetched = 0;
const visitedDomains = new Set();
const results = await searchDDG(query);
await sleep(300);
for (const url of results) {
if (SKIP_DOMAINS.test(url)) continue;
const domain = rootDomain(url);
if (!domain || visitedDomains.has(domain)) continue;
visitedDomains.add(domain);
if (!website) website = url;
// Fetch homepage
if (pagesFetched >= MAX_PAGES) break;
const home = await fetchPage(url);
pagesFetched++;
if (!home) continue;
const e1 = extractEmails(home.html);
if (e1.length) { email = bestEmail(e1); break; }
// Fetch /contact page
if (pagesFetched < MAX_PAGES) {
const contactUrl = domain + '/contact';
const contact = await fetchPage(contactUrl) ||
await fetchPage(domain + '/contact-us');
pagesFetched++;
if (contact) {
const e2 = extractEmails(contact.html);
if (e2.length) { email = bestEmail(e2); break; }
}
}
// Fetch /about page
if (pagesFetched < MAX_PAGES && !email) {
const about = await fetchPage(domain + '/about') ||
await fetchPage(domain + '/about-us');
pagesFetched++;
if (about) {
const e3 = extractEmails(about.html);
if (e3.length) { email = bestEmail(e3); break; }
}
}
if (email) break;
}
return { website, email };
}
function saveWorkbook(wb) {
const tmp = EXCEL_PATH + '.tmp';
try {
xlsx.writeFile(wb, tmp);
if (fs.existsSync(tmp)) {
fs.copyFileSync(tmp, EXCEL_PATH);
fs.unlinkSync(tmp);
}
} catch (err) {
// If original is locked by Excel, save to sidecar so no progress is lost
const sidecar = EXCEL_PATH.replace('.xlsx', '_output.xlsx');
console.log(`\n ⚠️ File locked — saving to: ${path.basename(sidecar)}`);
xlsx.writeFile(wb, sidecar);
}
}
async function processSheet(wb, sheetName) {
const ws = wb.Sheets[sheetName];
const data = xlsx.utils.sheet_to_json(ws, { header:1, defval:'' });
// Find the header row (contains "NOMBRE") — could be row 0 or 1
let headerIdx = 0;
for (let r = 0; r < Math.min(3, data.length); r++) {
if (data[r] && data[r].includes('NOMBRE')) { headerIdx = r; break; }
}
if (data[headerIdx] && !data[headerIdx][COL.WEBSITE]) data[headerIdx][COL.WEBSITE] = 'WEBSITE';
const dataStart = headerIdx + 1;
console.log(`\n${'═'.repeat(60)}`);
console.log(` Sheet: ${sheetName} (${data.length - dataStart} data rows, header at row ${headerIdx})`);
console.log(`${'═'.repeat(60)}`);
let rowsSinceCheckpoint = 0;
for (let i = dataStart; i < data.length; i++) {
const row = data[i];
const name = sanitize(row[COL.NAME]);
const city = sanitize(row[COL.CITY]);
if (!name) continue;
// Skip rows already complete
if (row[COL.EMAIL] && row[COL.WEBSITE]) {
console.log(` row ${String(i).padStart(3)} SKIP (filled): ${name.substring(0,35)}`);
summary.processed++;
continue;
}
process.stdout.write(` row ${String(i).padStart(3)} ${name.substring(0,38).padEnd(38)} → `);
let email = row[COL.EMAIL] || null;
let website = row[COL.WEBSITE] || null;
try {
if (!email || !website) {
const result = await scrapeBusiness(name, city);
if (!email && result.email) email = result.email;
if (!website && result.website) website = result.website;
}
} catch (err) {
process.stdout.write(`ERR(${err.message.substring(0,30)})\n`);
}
if (email) row[COL.EMAIL] = email;
if (website) row[COL.WEBSITE] = website;
summary.processed++;
if (email) summary.emails++;
if (website) summary.websites++;
if (!email) summary.noEmail.push(`[${sheetName}] ${name}${city}`);
process.stdout.write(`${email || '(no email)'.padEnd(35)} ${website ? website.substring(0,40) : '(no site)'}\n`);
const newWs = xlsx.utils.aoa_to_sheet(data);
wb.Sheets[sheetName] = newWs;
rowsSinceCheckpoint++;
if (rowsSinceCheckpoint >= CHECKPOINT) {
saveWorkbook(wb);
console.log(`\n ★ CHECKPOINT: saved after ${summary.processed} total rows processed ★\n`);
rowsSinceCheckpoint = 0;
}
await sleep(DELAY_MS);
}
const newWs = xlsx.utils.aoa_to_sheet(data);
wb.Sheets[sheetName] = newWs;
saveWorkbook(wb);
console.log(`\n ✅ ${sheetName} complete — file saved.`);
}
async function main() {
console.log(`
✅ GUARD 1 — Input: business names sanitized before search queries
✅ GUARD 2 — Network: ${TIMEOUT_MS/1000}s timeout, max ${MAX_PAGES} pages/domain, ${DELAY_MS/1000}s inter-domain delay
✅ GUARD 3 — Data: reads/writes only ${path.basename(EXCEL_PATH)}, no credentials stored
🚀 Starting marine directory email scraper...
File: ${EXCEL_PATH}
Sheets: ${SHEETS.join(', ')}
`);
if (!fs.existsSync(EXCEL_PATH)) {
console.error('❌ Excel file not found:', EXCEL_PATH);
process.exit(1);
}
const wb = xlsx.readFile(EXCEL_PATH);
const start = Date.now();
for (const sheet of SHEETS) {
if (!wb.SheetNames.includes(sheet)) {
console.log(`Sheet "${sheet}" not found — skipping.`);
continue;
}
await processSheet(wb, sheet);
}
saveWorkbook(wb);
const elapsed = Math.round((Date.now() - start) / 1000);
const noEmailList = summary.noEmail.length
? summary.noEmail.map(n => ' • ' + n).join('\n')
: ' (all businesses have email)';
console.log(`
${'═'.repeat(60)}
FINAL SUMMARY
${'═'.repeat(60)}
Rows processed : ${summary.processed}
Emails found : ${summary.emails} (${Math.round(summary.emails/summary.processed*100)||0}%)
Websites found : ${summary.websites} (${Math.round(summary.websites/summary.processed*100)||0}%)
Time elapsed : ${Math.floor(elapsed/60)}m ${elapsed%60}s
Saved to : ${EXCEL_PATH}
Businesses with NO email (${summary.noEmail.length}) — for manual follow-up:
${noEmailList}
${'═'.repeat(60)}
`);
}
main().catch(err => { console.error('Fatal:', err); process.exit(1); });