Contact & Email Finder
v1 · Published — Extract public contact details from any company website: emails, phone numbers, and social profile links. Parameter: url.
Output & API
Preview the latest data, download it, or call this collector as an API.
| url | https://www.squarespace.com |
|---|---|
| counts | |
| emails | [] |
| phones | [] |
| social | |
| resolvedUrl | https://www.squarespace.com/ |
| scrapedPages | |
Parameters
--url (string, required): The company website domain or URL to scrape for contact details, e.g. example.com or https://www.example.com. Example value: "https://www.squarespace.com"
Marketplace
Publish this collector so others can deploy it — you keep ownership.
0 runs in 14d · published 5h ago
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
v1 · built · approved · current · 5h ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
// ---------------------------------------------------------------------------
// Command-line input
// ---------------------------------------------------------------------------
// A single string flag is accepted: --url.
const { values } = parseArgs({
  options: { url: { type: "string" } },
  strict: true,
});

// Trimmed --url value; an absent flag is treated exactly like an empty one.
const rawInput = typeof values.url === "string" ? values.url.trim() : "";
if (rawInput.length === 0) {
  console.error("Missing required parameter --url=<domain-or-url>");
  process.exit(1);
}
/**
 * Normalise user input into an absolute http(s) URL string.
 *
 * Accepts either a full http(s) URL or a bare domain / protocol-relative
 * value ("example.com", "//example.com/path"); the latter get "https://"
 * prepended.
 *
 * @param input - Domain or URL as supplied by the caller.
 * @returns The serialised URL (as produced by `URL.prototype.toString`).
 * @throws Error prefixed with "OUT_OF_SCOPE:" when the input uses a scheme
 *   other than http(s), cannot be parsed, or has a hostname without a dot.
 */
function normalizeUrl(input: string): string {
  const hasHttpScheme = /^https?:\/\//i.test(input);

  // An explicit foreign scheme (ftp://, file://, ...) must be rejected before
  // "https://" is prepended; otherwise the scheme name itself ends up being
  // parsed as the hostname and the caller gets a misleading "not a domain"
  // error instead of the protocol error.
  if (!hasHttpScheme && /^[a-z][a-z0-9+.\-]*:\/\//i.test(input)) {
    throw new Error(`OUT_OF_SCOPE: only http(s) URLs are supported, got "${input}"`);
  }

  // Leading slashes are dropped so protocol-relative input ("//host/path")
  // does not turn into "https:////host/path".
  const candidate = hasHttpScheme ? input : "https://" + input.replace(/^\/+/, "");

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    throw new Error(`OUT_OF_SCOPE: "${input}" is not a valid domain or URL`);
  }

  // Defensive: the candidate always starts with http(s):// at this point.
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    throw new Error(`OUT_OF_SCOPE: only http(s) URLs are supported, got "${input}"`);
  }

  // A bare hostname must contain a dot (e.g. example.com) to be a real site.
  if (!parsed.hostname.includes(".")) {
    throw new Error(`OUT_OF_SCOPE: "${input}" does not look like a public website domain`);
  }

  return parsed.toString();
}
// Firecrawl client used for every page fetch in this script; the API key is
// read from the FIRECRAWL_API_KEY environment variable.
const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });
// Global matcher for e-mail-shaped substrings; applied to the visible page
// text in harvest().
const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
// Static-asset file extensions. cleanEmail() uses this to reject candidates
// that end in one of them (e.g. "logo@2x.png").
const ASSET_EXT_RE = /\.(png|jpe?g|gif|svg|webp|ico|css|js|woff2?|ttf|eot|mp4|pdf)$/i;
// One entry per supported platform. Each `host` pattern is tested against a
// lower-cased hostname and accepts the listed domain itself or any subdomain
// of it (the "(^|\.)" prefix).
const SOCIAL_MATCHERS: { platform: keyof SocialLinks; host: RegExp }[] = [
{ platform: "linkedin", host: /(^|\.)linkedin\.com$/i },
{ platform: "twitter", host: /(^|\.)(twitter|x)\.com$/i },
{ platform: "facebook", host: /(^|\.)(facebook|fb)\.com$/i },
{ platform: "instagram", host: /(^|\.)instagram\.com$/i },
];
// Generic / non-profile social paths we never want to report as a profile.
// classifySocial() tests this against the full link, not just the path.
const SOCIAL_JUNK_RE = /\/(sharer|share|intent|dialog|plugins|tr|home|login|signup|hashtag|search)\b/i;
// Per-platform lists of social profile URLs. The keys double as the platform
// identifiers used by SOCIAL_MATCHERS and Accumulator.social.
interface SocialLinks {
// Links whose host is linkedin.com or a subdomain of it.
linkedin: string[];
// Links whose host is twitter.com / x.com or a subdomain of either.
twitter: string[];
// Links whose host is facebook.com / fb.com or a subdomain of either.
facebook: string[];
// Links whose host is instagram.com or a subdomain of it.
instagram: string[];
}
/**
 * Tidy a raw e-mail candidate and decide whether it is worth reporting.
 *
 * The candidate is trimmed, stripped of trailing punctuation and lower-cased.
 * It is then rejected when it is not shaped like an address, ends in a
 * static-asset extension, contains consecutive dots, or carries one of the
 * placeholder / tracking markers.
 *
 * @param raw - Candidate text taken from a mailto link or from page text.
 * @returns The cleaned, lower-cased address, or null when it is rejected.
 */
function cleanEmail(raw: string): string | null {
  const candidate = raw
    .trim()
    .replace(/[).,;:'"<>]+$/, "")
    .toLowerCase();

  const rejected =
    !/^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$/.test(candidate) ||
    ASSET_EXT_RE.test(candidate) ||
    candidate.includes("..") ||
    // Obvious placeholders / tracking noise.
    /(example|sentry|wixpress|\.png|\.jpg|domain)\b/i.test(candidate);

  return rejected ? null : candidate;
}
/**
 * Reduce a phone candidate to its dialable form.
 *
 * Everything except digits is dropped; a "+" is kept only when it is the first
 * character after optional leading whitespace and opening parentheses.
 *
 * @param raw - Candidate text from a tel: link or from page text.
 * @returns Digits with an optional leading "+", or null when the candidate has
 *   fewer than 7 or more than 15 digits.
 */
function cleanPhone(raw: string): string | null {
  const candidate = raw.trim();
  const digitsOnly = candidate.replace(/[^\d]/g, "");

  const plausibleLength = digitsOnly.length >= 7 && digitsOnly.length <= 15;
  if (!plausibleLength) {
    return null;
  }

  const withoutLeadIn = candidate.replace(/^[\s(]*/, "");
  const prefix = /^\+/.test(withoutLeadIn) ? "+" : "";
  return `${prefix}${digitsOnly}`;
}
/**
 * Read the lower-cased hostname out of an absolute URL.
 *
 * @param link - URL text to parse.
 * @returns The hostname, or null when `link` cannot be parsed as a URL.
 */
function hostnameOf(link: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(link);
  } catch {
    return null;
  }
  return parsed.hostname.toLowerCase();
}
// Mutable collection of every contact signal found so far; one instance is
// shared across all pages scraped in a run so results are de-duplicated.
interface Accumulator {
// Cleaned, lower-cased e-mail addresses.
emails: Set<string>;
phones: Map<string, string>; // normalized -> display
// Profile links per platform, keyed by the SocialLinks platform names.
social: Record<keyof SocialLinks, Set<string>>;
}
/**
 * Record `link` in the accumulator when it points at a social profile.
 *
 * A link is ignored when it cannot be parsed, matches the junk-path filter,
 * belongs to none of the known platforms, or has no path beyond the bare
 * domain. Accepted links are stored without their fragment and without
 * trailing slashes.
 *
 * @param link - Absolute URL to inspect.
 * @param acc - Accumulator whose per-platform set receives the link.
 */
function classifySocial(link: string, acc: Accumulator) {
  let parsed: URL;
  try {
    parsed = new URL(link);
  } catch {
    return;
  }

  if (SOCIAL_JUNK_RE.test(link)) return;

  const host = parsed.hostname.toLowerCase();
  const matcher = SOCIAL_MATCHERS.find((entry) => entry.host.test(host));
  if (!matcher) return;

  // Require an actual path segment beyond the bare domain for a profile.
  const barePath = parsed.pathname.replace(/\/+$/, "");
  if (barePath === "") return;

  const hashAt = link.indexOf("#");
  const withoutFragment = hashAt === -1 ? link : link.slice(0, hashAt);
  acc.social[matcher.platform].add(withoutFragment.replace(/\/+$/, ""));
}
/**
 * Percent-decode a URI component, returning the input unchanged when it
 * contains malformed escape sequences.
 */
function decodeUriPart(value: string): string {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Extract all contact signals from one scraped document into the accumulator.
 *
 * Sources, in order: anchor hrefs (mailto:, tel:, http(s) social links), the
 * separately supplied link list, then the visible body text (e-mails and
 * phone numbers).
 *
 * @param rawHtml - HTML of the page.
 * @param links - Links discovered on the page, checked for social profiles.
 * @param acc - Accumulator that is mutated in place.
 */
function harvest(rawHtml: string, links: string[], acc: Accumulator) {
  const $ = cheerio.load(rawHtml);

  // Anchor hrefs: mailto, tel, and social profiles.
  $("a[href]").each((_, el) => {
    const href = ($(el).attr("href") || "").trim();
    if (/^mailto:/i.test(href)) {
      // Drop the "?subject=..." header part first, then percent-decode so
      // that forms like "info%40example.com" are recognised. A mailto link
      // may list several comma-separated recipients; each one is checked on
      // its own instead of rejecting the whole list.
      const recipients = decodeUriPart(href.replace(/^mailto:/i, "").split("?")[0] ?? "");
      for (const addr of recipients.split(",")) {
        const c = cleanEmail(addr);
        if (c) acc.emails.add(c);
      }
    } else if (/^tel:/i.test(href)) {
      // Parameters after ";" (e.g. ";ext=12") are not part of the number, and
      // percent-escapes such as "%20" or "%2B" must be decoded before digits
      // are extracted, otherwise the escape's own digits leak into the result.
      const number = decodeUriPart(href.replace(/^tel:/i, "").split(";")[0] ?? "");
      const c = cleanPhone(number);
      if (c) acc.phones.set(c, c);
    } else if (/^https?:/i.test(href)) {
      classifySocial(href, acc);
    }
  });

  // Social profiles surfaced in Firecrawl's discovered link list.
  for (const link of links) classifySocial(link, acc);

  // Visible text: emails and phone numbers written out in the page body.
  // Script/style/noscript content is removed first so code is not scanned.
  $("script, style, noscript").remove();
  const text = $("body").text().replace(/\s+/g, " ");
  const emailMatches = text.match(EMAIL_RE) || [];
  for (const m of emailMatches) {
    const c = cleanEmail(m);
    if (c) acc.emails.add(c);
  }

  // Phone numbers in text. To avoid false positives (years, prices, IDs that
  // are merely space-separated digit runs), a text candidate must carry an
  // explicit telephone separator: a leading "+", parentheses, or dot/dash
  // separated digit groups.
  const phoneRe =
    /(?:\+\d[\d\s().\-]{6,}\d)|(?:\(\d{2,4}\)[\d\s().\-]{4,}\d)|(?:\d{2,4}[.\-]\d{2,4}[.\-]\d{2,4}(?:[.\-]\d{1,4})?)/g;
  const phoneMatches = text.match(phoneRe) || [];
  for (const m of phoneMatches) {
    const c = cleanPhone(m);
    if (c && !acc.phones.has(c)) acc.phones.set(c, c);
  }
}
/**
 * Fetch one page through Firecrawl, requesting its raw HTML and link list.
 *
 * Failures are logged to stderr and reported as null so that one bad page
 * does not abort the whole run.
 *
 * @param url - Absolute URL of the page to scrape.
 * @returns The page's raw HTML (empty string when absent) and its string
 *   links, or null when the scrape call throws.
 */
async function scrapePage(url: string): Promise<{ rawHtml: string; links: string[] } | null> {
  try {
    const doc: any = await firecrawl.scrape(url, {
      formats: ["rawHtml", "links"],
      onlyMainContent: false,
      integration: "prometheus",
    } as any);
    // The response is untyped here, so both fields are checked before use.
    const rawHtml: string = typeof doc?.rawHtml === "string" ? doc.rawHtml : "";
    const links: string[] = Array.isArray(doc?.links)
      ? doc.links.filter((l: unknown): l is string => typeof l === "string")
      : [];
    return { rawHtml, links };
  } catch (e) {
    // Non-Error values can be thrown too; avoid printing "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Failed to scrape ${url}: ${reason}`);
    return null;
  }
}
/**
 * Entry point: scrape the homepage plus up to four same-site contact/about
 * pages, collect e-mails, phone numbers and social profile links, and write
 * the result to stdout as a single JSON object.
 *
 * @throws Error when the input URL is rejected by normalizeUrl or the
 *   homepage cannot be scraped.
 */
async function main() {
  const startUrl = normalizeUrl(rawInput);

  // "example.com" and "www.example.com" are treated as the same site, so a
  // bare-domain input still finds contact pages linked on the www host.
  const siteOf = (host: string): string => host.toLowerCase().replace(/^www\./, "");
  // De-duplication key: site + path (no trailing slash) + query, no fragment.
  const pageKey = (u: URL): string => siteOf(u.hostname) + u.pathname.replace(/\/+$/, "") + u.search;

  const start = new URL(startUrl);
  const startSite = siteOf(start.hostname);

  const acc: Accumulator = {
    emails: new Set(),
    phones: new Map(),
    social: { linkedin: new Set(), twitter: new Set(), facebook: new Set(), instagram: new Set() },
  };

  const home = await scrapePage(startUrl);
  if (!home) {
    throw new Error(`could not scrape the homepage at ${startUrl}`);
  }
  const scrapedPages: string[] = [startUrl];
  harvest(home.rawHtml, home.links, acc);

  // Find up to 4 same-site contact/about pages to scrape for more details.
  const CONTACT_RE = /(contact|about|team|imprint|impressum|legal|support|company)/i;
  const candidates: string[] = [];
  const seen = new Set([pageKey(start)]);
  for (const link of home.links) {
    if (candidates.length >= 4) break;
    let parsed: URL;
    try {
      parsed = new URL(link);
    } catch {
      continue;
    }
    if (!parsed.hostname || siteOf(parsed.hostname) !== startSite) continue;
    // Match the keyword against the path only: testing the whole URL would
    // make every link a candidate on domains such as "aboutyou.de".
    if (!CONTACT_RE.test(parsed.pathname)) continue;
    const key = pageKey(parsed);
    if (seen.has(key)) continue;
    seen.add(key);
    candidates.push(link);
  }

  for (const url of candidates) {
    const page = await scrapePage(url);
    if (!page) continue;
    scrapedPages.push(url);
    harvest(page.rawHtml, page.links, acc);
  }

  const out = {
    url: rawInput,
    resolvedUrl: startUrl,
    scrapedPages,
    emails: [...acc.emails].sort(),
    phones: [...acc.phones.values()].sort(),
    social: {
      linkedin: [...acc.social.linkedin].sort(),
      twitter: [...acc.social.twitter].sort(),
      facebook: [...acc.social.facebook].sort(),
      instagram: [...acc.social.instagram].sort(),
    },
    counts: {
      emails: acc.emails.size,
      phones: acc.phones.size,
      social:
        acc.social.linkedin.size +
        acc.social.twitter.size +
        acc.social.facebook.size +
        acc.social.instagram.size,
    },
  };
  process.stdout.write(JSON.stringify(out));
}
// Run the collector; any failure is reported on stderr and turned into a
// non-zero exit code.
main().catch((err) => {
  const message = err instanceof Error ? err.message : String(err);
  console.error(message);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.