Competitor Integration Gap Finder
v1 · Published — Official company integration listings normalized into comparable company-integration rows or grouped company records.
Output & API
Preview the latest data, download it, or call this collector as an API.
| # | notes | confidence | company_name | company_domain | source_page_url | integration_name | integration_category | integration_detail_url |
|---|---|---|---|---|---|---|---|---|
| 0 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Google Sheets | null | https://zapier.com/apps/google-sheets/integrations |
| 1 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Gmail | null | https://zapier.com/apps/gmail/integrations |
| 2 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Slack | null | https://zapier.com/apps/slack/integrations |
Marketplace
Publish this collector so others can deploy it — you keep ownership.
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
// The Firecrawl API key is read from the environment; the script exits with status 1
// before doing any work when it is missing or empty.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
console.error("FIRECRAWL_API_KEY is not set");
process.exit(1);
}
// Single Firecrawl client shared by the scrape and search calls in this file.
const firecrawl = new Firecrawl({ apiKey });
// Number of seed URLs processed when --max-companies is not supplied.
const DEFAULT_MAX_COMPANIES = 5;
// Largest value accepted for --max-companies.
const MAX_COMPANIES = 10;
// Number of anchors on the seed page inspected while looking for candidate pages.
const MAX_DISCOVERY_LINKS_PER_SEED = 150;
// Result limit passed to the Firecrawl search call (and re-applied to its response).
const SEARCH_RESULT_LIMIT = 3;
// Number of candidate pages kept per company after sorting by priority.
const MAX_CANDIDATE_PAGES_PER_COMPANY = 4;
// Number of anchors inspected on each candidate page during extraction.
const MAX_EXTRACTION_LINKS_PER_PAGE = 500;
// Cap on integrations returned from one page; one best page is kept per company.
const MAX_INTEGRATIONS_PER_COMPANY = 100;
// Command-line flags. With `strict: true`, parseArgs throws on options that are not declared here.
const { values: flags } = parseArgs({
strict: true,
options: {
"seed-urls": { type: "string" },
"max-companies": { type: "string" },
"output-mode": { type: "string" },
},
});
// --seed-urls is mandatory; it is split into individual URLs further down.
if (!flags["seed-urls"]) {
console.error("--seed-urls is required");
process.exit(1);
}
// --max-companies arrives as a string; it must convert to an integer in [1, MAX_COMPANIES].
const maxCompanies = Number(flags["max-companies"] ?? String(DEFAULT_MAX_COMPANIES));
if (!Number.isInteger(maxCompanies) || maxCompanies < 1 || maxCompanies > MAX_COMPANIES) {
console.error(`OUT_OF_SCOPE: --max-companies must be an integer between 1 and ${MAX_COMPANIES}`);
process.exit(1);
}
// --output-mode selects flat rows (the default) or one record per company.
const outputMode = flags["output-mode"] ?? "integration_rows";
if (outputMode !== "integration_rows" && outputMode !== "grouped_by_company") {
console.error("OUT_OF_SCOPE: --output-mode must be integration_rows or grouped_by_company");
process.exit(1);
}
/**
 * One flat output row in `integration_rows` mode: a company paired with a single integration.
 * The integration fields are null on the placeholder row emitted for a company with no
 * extracted integrations.
 */
type IntegrationRow = {
company_name: string;
company_domain: string;
// Page the integrations were extracted from; null when no page yielded any.
source_page_url: string | null;
integration_name: string | null;
integration_category: string | null;
integration_detail_url: string | null;
confidence: "high" | "medium" | "low";
notes: string;
};
/** One integration extracted from a listing page, before company fields are attached. */
type Integration = {
integration_name: string;
// Category derived from nearby markup; null when none was recognised.
integration_category: string | null;
integration_detail_url: string | null;
confidence: "high" | "medium" | "low";
notes: string;
};
// Comma-separated seeds are trimmed, empty entries are dropped, and the list is truncated
// to maxCompanies; entries beyond that limit are silently ignored.
const seedUrls = String(flags["seed-urls"])
.split(",")
.map((value) => value.trim())
.filter(Boolean)
.slice(0, maxCompanies);
// A value made only of commas/whitespace leaves nothing to process.
if (seedUrls.length === 0) {
console.error("OUT_OF_SCOPE: --seed-urls must include at least one URL");
process.exit(1);
}
// Lowercase terms that mark a URL, link text, title, or container text as integration-related.
// hasIntegrationSignal matches them as plain substrings of the lowercased input.
const integrationPageTerms = [
"integration",
"integrations",
"apps",
"app",
"marketplace",
"partners",
"partner",
"connectors",
"plugins",
"extensions",
];
// Lowercase category keywords recognised by visibleCategory. Order matters: the first term
// found in the text wins and is returned title-cased.
const categoryTerms = [
"crm",
"payments",
"analytics",
"support",
"ecommerce",
"automation",
"productivity",
"developer tools",
"communication",
"collaboration",
"marketing",
"sales",
"finance",
"data",
"storage",
"security",
"hr",
"project management",
];
// Lowercase labels that are navigation or call-to-action text rather than product names.
// likelyIntegrationName rejects a candidate whose normalized, lowercased name is in this set.
const genericNames = new Set([
"apps",
"app",
"integrations",
"integration",
"marketplace",
"partners",
"partner",
"learn more",
"read more",
"view all",
"see all",
"get started",
"contact us",
"request demo",
"pricing",
"features",
"solutions",
"resources",
"blog",
"docs",
"documentation",
"api",
"login",
"sign in",
"sign up",
"book a demo",
]);
// Lowercase URL path segments that disqualify a link from being an integration detail page.
// likelyDetailUrl rejects a URL when any one of its path segments is in this set.
const nonIntegrationPathParts = new Set([
"category",
"categories",
"collection",
"collections",
"templates",
"template",
"blog",
"resources",
"resource",
"customers",
"customer",
"pricing",
"features",
"feature",
"solutions",
"solution",
"industries",
"industry",
"docs",
"documentation",
"developers",
"developer",
"login",
"signin",
"signup",
"contact",
"deals",
// NOTE(review): single-letter segment; presumably a landing-page prefix on some site — confirm.
"l",
]);
/**
 * Turns a user-supplied seed (bare domain or full URL) into a parsed URL without a fragment.
 *
 * Inputs that do not start with http:// or https:// are treated as https.
 *
 * @param input Raw seed value from the command line.
 * @returns The parsed URL with its hash cleared.
 * @throws Error whose message starts with "OUT_OF_SCOPE:" when the input cannot be parsed
 *         or its hostname contains no dot.
 */
function normalizeSeedUrl(input: string): URL {
  const hasHttpScheme = /^https?:\/\//i.test(input);
  const candidate = hasHttpScheme ? input : `https://${input}`;

  let url: URL | undefined;
  try {
    url = new URL(candidate);
  } catch {
    url = undefined;
  }
  if (url === undefined) {
    throw new Error(`OUT_OF_SCOPE: invalid URL "${input}"`);
  }

  // Single-label hosts such as "localhost" are not accepted as company domains.
  if (url.hostname.indexOf(".") === -1) {
    throw new Error(`OUT_OF_SCOPE: invalid company domain "${input}"`);
  }

  url.hash = "";
  return url;
}
/**
 * Lowercases a hostname and removes one leading "www." label, if present.
 *
 * @param hostname Hostname as reported by a parsed URL.
 * @returns The normalized domain used for same-site comparisons.
 */
function cleanDomain(hostname: string): string {
  const lowered = hostname.toLowerCase();
  const prefix = "www.";
  return lowered.startsWith(prefix) ? lowered.slice(prefix.length) : lowered;
}
/**
 * Reports whether a URL lives on the company's domain or one of its subdomains.
 *
 * @param candidate Absolute URL to check; unparseable values count as "not same domain".
 * @param domain Normalized company domain to compare against.
 * @returns True when the candidate's host equals the domain or ends with ".<domain>".
 */
function sameOfficialDomain(candidate: string, domain: string): boolean {
  let host: string;
  try {
    host = cleanDomain(new URL(candidate).hostname);
  } catch {
    return false;
  }
  if (host === domain) return true;
  return host.endsWith(`.${domain}`);
}
/**
 * Resolves an anchor's href against the page URL and strips the fragment.
 *
 * Only http(s) results are returned. The scheme is checked after resolution, so the filter
 * is case-insensitive and also rejects schemes such as javascript:, data:, or ftp: in
 * addition to mailto: and tel:.
 *
 * @param href Raw href attribute value; may be undefined when the attribute is absent.
 * @param baseUrl URL of the page the link was found on.
 * @returns The absolute URL without a hash, or null when the href is empty, fragment-only,
 *          not http(s), or cannot be resolved.
 */
function absoluteUrl(href: string | undefined, baseUrl: string): string | null {
  // Trim first so a fragment-only link with stray whitespace is not resolved to the page itself.
  const trimmed = href?.trim();
  if (!trimmed || trimmed.startsWith("#")) {
    return null;
  }
  try {
    const parsed = new URL(trimmed, baseUrl);
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      return null;
    }
    parsed.hash = "";
    return parsed.toString();
  } catch {
    return null;
  }
}
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param value Text to clean; null and undefined are treated as an empty string.
 * @returns The single-spaced text, possibly empty.
 */
function compactText(value: string | undefined | null): string {
  if (value == null) return "";
  return value
    .split(/\s+/)
    .filter((chunk) => chunk.length > 0)
    .join(" ");
}
/**
 * Derives a display name from a domain's first label.
 *
 * The label is split on hyphens and underscores, each piece gets an uppercase first
 * character, and the pieces are joined with spaces.
 *
 * @param domain Domain such as "my-app.io".
 * @returns A name such as "My App"; empty when the first label is empty.
 */
function titleCaseFromDomain(domain: string): string {
  const dotIndex = domain.indexOf(".");
  const label = dotIndex === -1 ? domain : domain.slice(0, dotIndex);

  const words: string[] = [];
  for (const piece of label.split(/[-_]/)) {
    if (piece.length === 0) continue;
    words.push(piece.charAt(0).toUpperCase() + piece.substring(1));
  }
  return words.join(" ");
}
/**
 * Reports whether the text contains any integration-related term.
 *
 * Matching is a case-insensitive substring test, so short terms also match inside longer words.
 *
 * @param value URL, path, title, or visible text to inspect.
 * @returns True when at least one term from integrationPageTerms occurs in the text.
 */
function hasIntegrationSignal(value: string): boolean {
  const haystack = value.toLowerCase();
  for (const term of integrationPageTerms) {
    if (haystack.includes(term)) return true;
  }
  return false;
}
/**
 * Produces a comparison-friendly form of a URL: no fragment and no single trailing slash
 * on a non-root path. The query string is left untouched.
 *
 * @param value Absolute URL.
 * @returns The canonical URL string.
 * @throws TypeError when the value is not a parseable absolute URL.
 */
function canonicalUrl(value: string): string {
  const url = new URL(value);
  url.hash = "";

  const { pathname } = url;
  const hasTrailingSlash = pathname.length > 1 && pathname.endsWith("/");
  if (hasTrailingSlash) {
    url.pathname = pathname.slice(0, -1);
  }
  return url.toString();
}
/**
 * Ranks a candidate page by how strongly its URL suggests an integrations listing.
 * Lower numbers are tried first.
 *
 * @param url Candidate page URL.
 * @param seedUrl Seed URL; an exact match always ranks 0.
 * @returns 0 for the seed itself, 1-5 for known path fragments, 9 otherwise.
 */
function candidatePriority(url: string, seedUrl: string): number {
  if (url === seedUrl) return 0;

  // Checked top to bottom; the first fragment found in the lowercased URL decides the rank.
  const ranking: ReadonlyArray<readonly [number, readonly string[]]> = [
    [1, ["/integrations"]],
    [2, ["/apps", "/app-marketplace"]],
    [3, ["/marketplace"]],
    [4, ["/partners"]],
    [5, ["/connectors"]],
  ];

  const haystack = url.toLowerCase();
  for (const [rank, fragments] of ranking) {
    if (fragments.some((fragment) => haystack.includes(fragment))) return rank;
  }
  return 9;
}
/**
 * HTML of pages already scraped successfully in this run, keyed by the exact URL requested.
 * The seed page can be requested once during discovery and again during extraction; the
 * cache turns the second request into a lookup instead of another Firecrawl call.
 */
const scrapedHtmlCache = new Map<string, string>();

/**
 * Fetches a page's HTML through Firecrawl.
 *
 * Failures are logged to stderr and reported as null so one bad page does not abort the run.
 * Only successful results are cached, so a failed URL is attempted again if requested again.
 *
 * @param url Absolute URL to scrape.
 * @returns The page HTML, or null when the scrape failed or returned no HTML string.
 */
async function scrapeHtml(url: string): Promise<string | null> {
  const cached = scrapedHtmlCache.get(url);
  if (cached !== undefined) return cached;

  try {
    const doc: unknown = await firecrawl.scrape(url, {
      formats: ["html"],
      integration: "prometheus",
    });
    const html = typeof doc === "object" && doc !== null ? (doc as { html?: unknown }).html : undefined;
    if (typeof html !== "string") return null;
    scrapedHtmlCache.set(url, html);
    return html;
  } catch (err) {
    console.error(`Skipping ${url}: ${err instanceof Error ? err.message : String(err)}`);
    return null;
  }
}
/**
 * Builds a short, prioritised list of same-domain pages likely to list a company's integrations.
 *
 * Candidates come from three sources: the seed plus a fixed set of conventional paths, links
 * on the seed page whose URL or text carries an integration term, and a site-restricted
 * Firecrawl search. Every URL is canonicalised and keeps the lowest (best) priority it was
 * seen with.
 *
 * @param seed Parsed seed URL for the company.
 * @param domain Company domain to stay on; URLs on other hosts are discarded.
 * @returns At most MAX_CANDIDATE_PAGES_PER_COMPANY canonical URLs, best priority first.
 */
async function discoverCandidatePages(seed: URL, domain: string): Promise<string[]> {
  const seedUrl = canonicalUrl(seed.toString());
  const candidates = new Map<string, number>();

  function add(url: string | null, priority: number): void {
    if (!url || !sameOfficialDomain(url, domain)) return;
    const canonical = canonicalUrl(url);
    const existing = candidates.get(canonical);
    if (existing === undefined || priority < existing) candidates.set(canonical, priority);
  }

  // A seed whose path already looks like an integrations page goes first; otherwise it is a late fallback.
  add(seedUrl, hasIntegrationSignal(seed.pathname) ? 0 : 8);

  // Conventional locations are guessed without checking that they exist; a failed scrape later skips them.
  for (const path of [
    "/integrations",
    "/apps",
    "/app-marketplace",
    "/marketplace",
    "/partners",
    "/connectors",
    "/product/integrations",
    "/solutions/integrations",
  ]) {
    const guess = `${seed.origin}${path}`;
    add(guess, candidatePriority(guess, seedUrl));
  }

  const seedHtml = await scrapeHtml(seedUrl);
  if (seedHtml) {
    const $ = cheerio.load(seedHtml);
    $("a[href]").slice(0, MAX_DISCOVERY_LINKS_PER_SEED).each((_, element) => {
      const href = absoluteUrl($(element).attr("href"), seedUrl);
      const text = compactText($(element).text());
      const signal = `${href ?? ""} ${text}`;
      if (href && hasIntegrationSignal(signal)) add(href, candidatePriority(href, seedUrl));
    });
  }

  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- response shape depends on the SDK version
    const response: any = await firecrawl.search(
      `site:${domain} official integrations apps marketplace partners`,
      { limit: SEARCH_RESULT_LIMIT, integration: "prometheus" },
    );
    // Accept a flat `data` array as well as results grouped under `web` (directly or inside
    // `data`). Reading only `data` yields no search candidates, without any error, when the
    // SDK groups results by source.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const hits: any[] = [response?.data, response?.web, response?.data?.web].find(Array.isArray) ?? [];
    for (const result of hits.slice(0, SEARCH_RESULT_LIMIT)) {
      const url = typeof result?.url === "string" ? result.url : null;
      const title = compactText(result?.title);
      const description = compactText(result?.description);
      if (url && sameOfficialDomain(url, domain) && hasIntegrationSignal(`${url} ${title} ${description}`)) {
        add(url, candidatePriority(url, seedUrl));
      }
    }
  } catch (err) {
    // Search is best-effort: the guessed paths and seed-page links still produce candidates.
    console.error(`Search failed for ${domain}: ${err instanceof Error ? err.message : String(err)}`);
  }

  return [...candidates.entries()]
    .sort((a, b) => a[1] - b[1])
    .slice(0, MAX_CANDIDATE_PAGES_PER_COMPANY)
    .map(([url]) => url);
}
/**
 * Maps free text (a tag, label, or heading) to the first known category term it contains.
 *
 * Terms are matched as whole words or phrases, case-insensitively. Plain substring matching
 * would label "Salesforce" as Sales and "Chrome" as Hr.
 *
 * @param value Text to inspect.
 * @param terms Category terms to look for, in priority order; defaults to categoryTerms.
 * @returns The matched term with each word's first letter capitalised (for example "Crm"
 *          or "Developer Tools"), or null when no term is present.
 */
function visibleCategory(value: string, terms: readonly string[] = categoryTerms): string | null {
  const lower = value.toLowerCase();
  const found = terms.find((term) => {
    const escaped = term.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // A term counts only when it is not glued to other letters or digits on either side.
    return new RegExp(`(?:^|[^a-z0-9])${escaped}(?:[^a-z0-9]|$)`).test(lower);
  });
  if (!found) return null;
  return found
    .split(" ")
    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
    .join(" ");
}
/**
 * Looks for a category associated with a link, trying progressively weaker signals:
 * a `data-category` attribute on the link or an ancestor, then category/tag-classed text
 * inside the nearest container, then a nearby heading.
 *
 * @param $ Cheerio instance for the page.
 * @param element The anchor element being examined.
 * @returns A recognised category name, or null when none of the signals maps to one.
 */
function nearestCategory($: cheerio.CheerioAPI, element: cheerio.Element): string | null {
  // Select on data-category only. Including [aria-label] in the selector let a nearer
  // aria-label element (often the anchor itself) stop the search before a data-category
  // ancestor, and the aria-label value was never read anyway.
  const tagged = $(element).closest("[data-category]").attr("data-category");
  const direct = visibleCategory(compactText(tagged));
  if (direct) return direct;

  const container = $(element).closest("article, li, div, section");
  const ownText = compactText(container.find(".category, [class*='category'], [class*='tag']").first().text());
  const ownCategory = visibleCategory(ownText);
  if (ownCategory) return ownCategory;

  // Fall back to the closest preceding sibling heading, then the first heading of the enclosing section.
  const heading =
    container.prevAll("h2,h3,h4").first().text() || container.closest("section").find("h2,h3,h4").first().text();
  return visibleCategory(compactText(heading));
}
/**
 * Heuristic filter deciding whether a piece of link text looks like a product name.
 *
 * The text is normalized first; the result is rejected when it is too short or long, a
 * generic navigation label, starts with a call-to-action verb, is a long all-caps string,
 * is just a count, contains markup-like characters, or has more than seven words.
 *
 * @param text Raw candidate text (image alt, aria-label, or link text).
 * @returns True when the normalized text passes every check and contains an ASCII letter or digit.
 */
function likelyIntegrationName(text: string): boolean {
  const name = normalizeIntegrationName(text);

  const badLength = name.length < 2 || name.length > 70;
  const isGeneric = genericNames.has(name.toLowerCase());
  const startsWithAction = /^(view|see|learn|read|explore|browse|connect|install|try|become|submit|add)\b/i.test(name);
  const isShouting = name.length > 8 && name === name.toUpperCase();
  const isCountOnly = /^\d[\d,]*\+?$/.test(name);
  const hasMarkupChars = /[{}<>]/.test(name);
  const tooManyWords = name.split(" ").length > 7;

  const rejected =
    badLength || isGeneric || startsWithAction || isShouting || isCountOnly || hasMarkupChars || tooManyWords;
  return !rejected && /[a-z0-9]/i.test(name);
}
/**
 * Cleans decoration off a candidate integration name.
 *
 * After whitespace compaction the following are removed, in this order: a leading
 * "logo of", a trailing number (digits, commas, optional plus), a trailing "logo", and a
 * trailing "integration(s)" / "app(s)". The result is trimmed after each step.
 *
 * NOTE(review): the trailing-number step also strips digits attached to the name itself
 * (for example a name ending in "0"); confirm whether that is intended.
 *
 * @param text Raw candidate text.
 * @returns The cleaned name, possibly empty.
 */
function normalizeIntegrationName(text: string): string {
  const strippers: readonly RegExp[] = [
    /^logo of\s+/i,
    /\s*\d[\d,]*\+?$/,
    /\s+logo$/i,
    /\s+(integration|integrations|app|apps)$/i,
  ];
  return strippers.reduce((name, pattern) => name.replace(pattern, "").trim(), compactText(text));
}
/**
 * Heuristic deciding whether a link points at a single integration's detail page.
 *
 * Links to the listing page itself, to the site root or a bare /apps, /integrations or
 * /marketplace index, or containing a segment from nonIntegrationPathParts are rejected.
 * Otherwise the first path segment selects the rule applied.
 *
 * @param href Absolute URL of the link.
 * @param pageUrl Absolute URL of the page the link was found on.
 * @returns True when the URL shape matches a detail page.
 * @throws TypeError when either argument is not a parseable absolute URL.
 */
function likelyDetailUrl(href: string, pageUrl: string): boolean {
  const target = new URL(href);
  const listing = new URL(pageUrl);
  const path = target.pathname.toLowerCase();

  if (canonicalUrl(href) === canonicalUrl(pageUrl)) return false;

  const indexPaths = ["/apps", "/integrations", "/marketplace"];
  if (target.pathname === "/" || indexPaths.includes(path)) return false;

  const parts = path.split("/").filter((segment) => segment.length > 0);
  for (const segment of parts) {
    if (nonIntegrationPathParts.has(segment)) return false;
  }

  switch (parts[0]) {
    case "apps":
      // Shape: /apps/<name>/integrations
      return parts.length >= 3 && parts.includes("integrations");
    case "marketplace":
      // Second segment must look like "a<alphanumerics>-..."; presumably an app-ID style slug — confirm.
      return parts.length >= 2 && /^a[a-z0-9]+-/i.test(parts[1]);
    case "integrations":
    case "integration":
    case "connectors":
    case "connector":
    case "partners":
    case "partner":
      return parts.length >= 2;
    default:
      // Unknown prefix: require an integration term in both the link path and the listing path.
      return hasIntegrationSignal(path) && hasIntegrationSignal(listing.pathname.toLowerCase()) && parts.length >= 2;
  }
}
/**
 * Extracts integration entries from a listing page's anchors.
 *
 * Only the first MAX_EXTRACTION_LINKS_PER_PAGE anchors are inspected. A link is kept when
 * it stays on the company domain, looks like a detail URL, yields a plausible name, and at
 * least one of page, link path, or surrounding container carries an integration term.
 * Entries are de-duplicated by lowercased name; a "high" entry replaces an earlier "medium" one.
 *
 * @param html Page HTML.
 * @param pageUrl URL the HTML was fetched from; used to resolve relative links.
 * @param domain Company domain links must belong to.
 * @returns Up to MAX_INTEGRATIONS_PER_COMPANY integrations in first-seen order.
 */
function extractFromPage(html: string, pageUrl: string, domain: string): Integration[] {
  const $ = cheerio.load(html);
  const titleText = compactText($("title").text());
  const pageHasSignal = hasIntegrationSignal(pageUrl) || hasIntegrationSignal(titleText);
  const found = new Map<string, Integration>();

  const anchors = $("a[href]").slice(0, MAX_EXTRACTION_LINKS_PER_PAGE).toArray();
  for (const anchor of anchors) {
    const link = $(anchor);

    const href = absoluteUrl(link.attr("href"), pageUrl);
    if (!href || !sameOfficialDomain(href, domain)) continue;
    if (!likelyDetailUrl(href, pageUrl)) continue;

    // Name sources in order of preference: logo alt text, aria-label, then visible link text.
    const nameSources = [
      compactText(link.find("img[alt]").first().attr("alt")),
      compactText(link.attr("aria-label")),
      compactText(link.text()),
    ];
    const rawName = nameSources.find(likelyIntegrationName);
    if (!rawName) continue;
    const name = normalizeIntegrationName(rawName);
    if (!name) continue;

    const pathHasSignal = hasIntegrationSignal(new URL(href).pathname);
    const containerHasSignal = hasIntegrationSignal(compactText(link.closest("article, li, div").text()));
    if (!(pageHasSignal || pathHasSignal || containerHasSignal)) continue;

    const category = nearestCategory($, anchor);
    const confidence: "high" | "medium" = pathHasSignal || pageHasSignal ? "high" : "medium";
    const notes = category
      ? "Official page listing with visible category signal."
      : "Official page listing; no category visible.";
    const entry: Integration = {
      integration_name: name,
      integration_category: category,
      integration_detail_url: href,
      confidence,
      notes,
    };

    const key = name.toLowerCase();
    const previous = found.get(key);
    const upgrades = previous !== undefined && previous.confidence === "medium" && entry.confidence === "high";
    if (!previous || upgrades) {
      found.set(key, entry);
    }
  }

  return Array.from(found.values()).slice(0, MAX_INTEGRATIONS_PER_COMPANY);
}
/**
 * Collects integrations for one company, starting from its seed URL.
 *
 * Candidate pages are scraped one at a time in priority order, and the page that yields the
 * most integrations wins. Scanning stops early once the best result has at least 10 entries
 * and the page just processed has an integration term in its URL.
 *
 * @param seedUrlText Raw seed URL or domain for the company.
 * @returns The derived company name, its domain, the winning page URL (null when no page
 *          produced integrations), and the integrations from that page.
 * @throws Error when the seed cannot be normalized into a valid URL.
 */
async function collectCompany(seedUrlText: string): Promise<{ companyName: string; domain: string; sourcePageUrl: string | null; integrations: Integration[] }> {
  const seed = normalizeSeedUrl(seedUrlText);
  const domain = cleanDomain(seed.hostname);
  const pages = await discoverCandidatePages(seed, domain);

  const best: { source: string | null; integrations: Integration[] } = { source: null, integrations: [] };
  for (const page of pages) {
    const html = await scrapeHtml(page);
    if (html) {
      const extracted = extractFromPage(html, page, domain);
      if (extracted.length > best.integrations.length) {
        best.integrations = extracted;
        best.source = page;
      }
      if (best.integrations.length >= 10 && hasIntegrationSignal(page)) break;
    }
  }

  return {
    companyName: titleCaseFromDomain(domain),
    domain,
    sourcePageUrl: best.source,
    integrations: best.integrations,
  };
}
/**
 * Runs the collector for every seed URL in order and writes one JSON document to stdout.
 *
 * In "grouped_by_company" mode the output is an array of company records, each with its own
 * integrations array. Otherwise it is a flat array of rows; a company with no integrations
 * contributes a single low-confidence placeholder row.
 */
async function main() {
  const companies: Awaited<ReturnType<typeof collectCompany>>[] = [];
  // Companies are processed sequentially, one seed at a time.
  for (const seedUrl of seedUrls) {
    const company = await collectCompany(seedUrl);
    companies.push(company);
  }

  if (outputMode === "grouped_by_company") {
    const grouped = companies.map(({ companyName, domain, sourcePageUrl, integrations }) => ({
      company_name: companyName,
      company_domain: domain,
      source_page_url: sourcePageUrl,
      integrations: integrations.map((item) => ({
        integration_name: item.integration_name,
        integration_category: item.integration_category,
        integration_detail_url: item.integration_detail_url,
        confidence: item.confidence,
        notes: item.notes,
      })),
    }));
    process.stdout.write(JSON.stringify(grouped));
    return;
  }

  const rows: IntegrationRow[] = [];
  for (const { companyName, domain, sourcePageUrl, integrations } of companies) {
    if (integrations.length === 0) {
      // Keep the company visible in the output even when nothing could be extracted.
      rows.push({
        company_name: companyName,
        company_domain: domain,
        source_page_url: sourcePageUrl,
        integration_name: null,
        integration_category: null,
        integration_detail_url: null,
        confidence: "low",
        notes: "No public official integrations page with extractable integration listings was found.",
      });
      continue;
    }
    for (const item of integrations) {
      rows.push({
        company_name: companyName,
        company_domain: domain,
        source_page_url: sourcePageUrl,
        integration_name: item.integration_name,
        integration_category: item.integration_category,
        integration_detail_url: item.integration_detail_url,
        confidence: item.confidence,
        notes: item.notes,
      });
    }
  }
  process.stdout.write(JSON.stringify(rows));
}
// Entry point: any error escaping main (including an invalid seed URL) is printed to stderr
// and the process exits with status 1.
main().catch((err) => {
console.error(err);
process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.