Public Case Study ROI Extractor collector facts

Publisher: bo-05 (@bo-05).

Version: 1. Last updated: 2026-07-03T09:50:54.511Z.

Run this collector on demand, as an API endpoint, or on a schedule with Firecrawl Prometheus.

Sample fields: notes, use_case, confidence, vendor_name, customer_name, vendor_domain, case_study_url, numeric_results, outcome_summary, source_page_url, case_study_title, customer_industry.

Parameters: seed-urls (string, required), max-companies (number), output-mode (string).

Public Case Study ROI Extractor

v1 · Published

Extracts compact proof points and visible ROI metrics from official vendor case study and customer story pages.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data
| # | notes | use_case | confidence | vendor_name | customer_name | vendor_domain | case_study_url | numeric_results | outcome_summary | source_page_url | case_study_title | customer_industry | customer_size_or_type |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | null | Automating customer support inquiries | high | HubSpot | Sticos | hubspot.com | https://www.hubspot.com/case-studies/sticos |  | Automated 41% of incoming support inquiries, giving accountants and auditors instant help while freeing the team to shift from reactive tickets to proactive customer success. | https://www.hubspot.com/case-studies | Sticos | Professional Services | null |
| 1 | null | Creating a new admissions blog | high | HubSpot | Morehouse College | hubspot.com | https://www.hubspot.com/case-studies/morehouse-college |  | Built a new admissions blog which resulted in faster updates, higher engagement, and a consistent brand voice across every page. | https://www.hubspot.com/case-studies | Morehouse | Education | null |
| 2 | null | Unified customer data management | high | HubSpot | Motorola Solutions | hubspot.com | https://www.hubspot.com/case-studies/motorola-solutions | [] | Unified 123,000+ customer records, providing real-time access to trusted data and uncovering cross-sell opportunities that generated millions in revenue. | https://www.hubspot.com/case-studies | Motorola Solutions | Software & Technology | null |
Parameters
--seed-urls (string, required): Comma-separated vendor websites, case-study pages, or customer-story hub URLs to search for official case studies. e.g. "https://www.hubspot.com"
--max-companies (number): Maximum number of seed companies to process, capped at 10 by the collector. Default: 1
--output-mode (string): Use case_study_rows for one object per case study or grouped_by_company for one vendor object with nested case studies. Default: "case_study_rows"

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers
bo-05 (@bo-05)
0 runs in 14d · published 2d ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author
v1 · built · approved · current · 2d ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";

// The collector cannot do anything without a Firecrawl credential, so bail out
// with a non-zero exit code before any other work happens.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey === "") {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Single SDK client shared by the search and scrape calls in this script.
const firecrawl = new Firecrawl({ apiKey });

// Number of web search results requested (and inspected) per vendor.
const SEARCH_LIMIT_PER_VENDOR = 4;
// Maximum number of distinct source pages scraped for one vendor.
const SOURCE_PAGE_LIMIT_PER_VENDOR = 3;
// Maximum number of case-study entries accepted from a single scraped page;
// also used as the schema's maxItems and quoted in the extraction prompt.
const CASE_STUDY_LIMIT_PER_SOURCE = 8;
// Hard ceiling applied to --max-companies regardless of the requested value.
const MAX_COMPANY_CAP = 10;

// Parse the three supported CLI flags. All of them are read as strings;
// --max-companies is converted to a number further down.
const flags = parseArgs({
  options: {
    "seed-urls": { type: "string" },
    "max-companies": { type: "string" },
    "output-mode": { type: "string" },
  },
  strict: true,
}).values;

// --seed-urls is the only mandatory flag (an empty value counts as missing).
if (!flags["seed-urls"]) {
  console.error("--seed-urls is required");
  process.exit(1);
}

// Output shape: flat rows by default, or one object per vendor.
const outputMode = flags["output-mode"] ?? "case_study_rows";
if (!(outputMode === "case_study_rows" || outputMode === "grouped_by_company")) {
  throw new Error("OUT_OF_SCOPE: --output-mode must be case_study_rows or grouped_by_company");
}

// Company limit: defaults to 1, must be a finite number >= 1, then is floored
// and clamped to MAX_COMPANY_CAP.
const requestedMaxCompanies = Number(flags["max-companies"] ?? "1");
if (!(Number.isFinite(requestedMaxCompanies) && requestedMaxCompanies >= 1)) {
  throw new Error("OUT_OF_SCOPE: --max-companies must be a positive number");
}
const maxCompanies = Math.min(MAX_COMPANY_CAP, Math.floor(requestedMaxCompanies));

/**
 * Normalizes one raw `--seed-urls` entry into an absolute http(s) URL string.
 *
 * Entries that do not start with a scheme get `https://` prepended. The scheme
 * test is anchored to the start of the entry, so a bare host whose path or
 * query embeds another URL (e.g. `example.com/go?next=https://other.com`) is
 * still treated as scheme-less instead of being rejected as unparseable.
 *
 * @param input Raw, possibly whitespace-padded, seed entry.
 * @returns The serialized URL with its fragment removed, or `null` when the
 *   entry is blank, cannot be parsed, or uses a scheme other than http/https.
 */
function normalizeSeedUrl(input: string): string | null {
  const trimmed = input.trim();
  if (!trimmed) return null;
  // Only a scheme at the very beginning counts; "://" later in the string
  // belongs to the path or query of a scheme-less entry.
  const hasLeadingScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const withProtocol = hasLeadingScheme ? trimmed : `https://${trimmed}`;
  try {
    const url = new URL(withProtocol);
    if (url.protocol !== "http:" && url.protocol !== "https:") return null;
    url.hash = "";
    return url.toString();
  } catch {
    return null;
  }
}

/**
 * Returns the lowercase hostname of `url` with one leading "www." removed.
 *
 * @throws TypeError (from the URL constructor) when `url` cannot be parsed.
 */
function hostnameFromUrl(url: string): string {
  const { hostname } = new URL(url);
  const lowered = hostname.toLowerCase();
  const wwwPrefix = "www.";
  if (lowered.startsWith(wwwPrefix)) {
    return lowered.slice(wwwPrefix.length);
  }
  return lowered;
}

/**
 * Reports whether `candidateUrl` is hosted on `vendorDomain` or one of its
 * subdomains. Unparseable URLs are treated as non-matching.
 */
function domainMatches(candidateUrl: string, vendorDomain: string): boolean {
  let candidateHost: string;
  try {
    candidateHost = hostnameFromUrl(candidateUrl);
  } catch {
    return false;
  }
  if (candidateHost === vendorDomain) return true;
  // The leading dot keeps "nothubspot.com" from matching "hubspot.com".
  return candidateHost.endsWith(`.${vendorDomain}`);
}

/**
 * Strips the fragment and known tracking query parameters from `url`.
 *
 * Removed parameters (matched case-insensitively): anything starting with
 * `utm_`, plus `hubs_content`, `hubs_content-cta`, `ref` and `source`.
 *
 * @throws TypeError (from the URL constructor) when `url` cannot be parsed.
 */
function cleanUrl(url: string): string {
  const parsed = new URL(url);
  parsed.hash = "";

  const exactTrackingNames = new Set(["hubs_content", "hubs_content-cta", "ref", "source"]);
  // Snapshot the keys first so deletion never happens while iterating the live params.
  const trackingKeys = Array.from(parsed.searchParams.keys()).filter((key) => {
    const lowered = key.toLowerCase();
    return lowered.startsWith("utm_") || exactTrackingNames.has(lowered);
  });

  for (const key of trackingKeys) {
    parsed.searchParams.delete(key);
  }
  return parsed.toString();
}

/**
 * Heuristically decides whether a page looks like a case-study / customer-story
 * page, based on its URL and (optionally) its title and description.
 *
 * The signals are hyphenated slugs. Each field is lowercased and has runs of
 * whitespace or underscores turned into a single hyphen before matching, so
 * natural-language text such as "Customer Stories" or "case studies" and
 * underscore slugs such as `success_stories` are recognised as well as the
 * hyphenated URL form. Fields are checked one at a time so a match can never
 * span the boundary between two fields.
 *
 * @param url Page URL.
 * @param title Optional page title.
 * @param description Optional page description or snippet.
 * @returns `true` when any field contains one of the signals.
 */
function hasCaseStudySignal(url: string, title?: string, description?: string): boolean {
  const signals = [
    "case-study",
    "case-studies",
    "customer-story",
    "customer-stories",
    "success-story",
    "success-stories",
    "customers",
    "testimonial",
    "testimonials",
  ];
  const toSlug = (value: string): string => value.toLowerCase().replace(/[\s_]+/g, "-");
  return [url, title ?? "", description ?? ""].some((field) => {
    const slug = toSlug(field);
    return signals.some((signal) => slug.includes(signal));
  });
}

/**
 * Builds a fallback display name from a domain: the first dot-separated label
 * with its first character upper-cased. Returns `domain` unchanged when that
 * label is empty.
 */
function deriveVendorName(domain: string): string {
  const [leadingLabel = domain] = domain.split(".");
  if (leadingLabel === "") return domain;
  const initial = leadingLabel.charAt(0).toUpperCase();
  return initial + leadingLabel.slice(1);
}

/**
 * Collapses a possibly multi-line string into one line and maps "no value"
 * placeholders to `null`.
 *
 * Each line is trimmed, blank lines are dropped, and the rest are joined with
 * single spaces. Non-strings, empty results, and the case-insensitive
 * placeholders "null", "n/a", "not visible" and "not specified" yield `null`.
 */
function compactText(value: unknown): string | null {
  if (typeof value !== "string") return null;

  const pieces: string[] = [];
  for (const line of value.split("\n")) {
    const piece = line.trim();
    if (piece) pieces.push(piece);
  }

  const collapsed = pieces.join(" ");
  if (collapsed === "") return null;

  const placeholders = ["null", "n/a", "not visible", "not specified"];
  return placeholders.includes(collapsed.toLowerCase()) ? null : collapsed;
}

/**
 * Converts an arbitrary value into a list of at most eight cleaned strings.
 * Non-arrays give an empty list; entries that `compactText` maps to `null`
 * are dropped.
 */
function normalizeStringArray(value: unknown): string[] {
  const cleaned: string[] = [];
  if (!Array.isArray(value)) return cleaned;

  for (const item of value) {
    const text = compactText(item);
    if (text) cleaned.push(text);
  }
  return cleaned.slice(0, 8);
}

/**
 * Coerces an extracted confidence value to one of the three allowed levels.
 * Matching is case-insensitive; anything unrecognised becomes "medium".
 */
function confidenceValue(value: unknown): "high" | "medium" | "low" {
  switch (compactText(value)?.toLowerCase()) {
    case "high":
      return "high";
    case "low":
      return "low";
    default:
      return "medium";
  }
}

/**
 * Builds the identity key used to spot duplicate rows: the cleaned case-study
 * URL, the customer name and the title, each lowercased and joined with "|".
 * Missing parts contribute an empty string.
 */
function dedupeKey(row: CaseStudyRow): string {
  const parts = [
    row.case_study_url ? cleanUrl(row.case_study_url).toLowerCase() : "",
    (row.customer_name ?? "").toLowerCase(),
    (row.case_study_title ?? "").toLowerCase(),
  ];
  return parts.join("|");
}

/** One flattened case-study record; the unit emitted in `case_study_rows` mode. */
type CaseStudyRow = {
  /** Vendor name from the extraction, or a name derived from the domain when the extraction has none. */
  vendor_name: string;
  /** Hostname of the seed URL, lowercased and without a leading "www.". */
  vendor_domain: string;
  /** URL of the page that was scraped to produce this row. */
  source_page_url: string;
  case_study_title: string | null;
  customer_name: string | null;
  customer_industry: string | null;
  customer_size_or_type: string | null;
  use_case: string | null;
  outcome_summary: string | null;
  /** Cleaned metric strings; empty when the extraction returned none. */
  numeric_results: string[];
  /** Link to the individual case study; the extractor falls back to the source page URL when no same-domain link is returned. */
  case_study_url: string | null;
  /** Extraction confidence; unrecognised values are normalised to "medium". */
  confidence: "high" | "medium" | "low";
  notes: string | null;
};

/** One vendor to process, derived from a single `--seed-urls` entry. */
type VendorInput = {
  /** Normalized absolute http(s) seed URL. */
  seedUrl: string;
  /** Hostname of `seedUrl`, lowercased and without a leading "www.". */
  vendorDomain: string;
};

// Turn the comma-separated --seed-urls value into a list of normalized URLs.
// Invalid entries are dropped, and exact duplicates are removed BEFORE the
// --max-companies cap is applied, so a repeated seed neither triggers a second
// identical crawl nor uses up one of the allowed company slots.
const seedUrls = Array.from(
  new Set(
    String(flags["seed-urls"])
      .split(",")
      .map(normalizeSeedUrl)
      .filter((url): url is string => Boolean(url)),
  ),
).slice(0, maxCompanies);

if (seedUrls.length === 0) {
  throw new Error("OUT_OF_SCOPE: --seed-urls must contain at least one valid http or https URL");
}

// Pair every seed with the vendor domain used for same-site filtering.
const vendors: VendorInput[] = seedUrls.map((seedUrl) => ({
  seedUrl,
  vendorDomain: hostnameFromUrl(seedUrl),
}));

/**
 * Builds the JSON schema passed to the scrape call for structured extraction:
 * a nullable `vendor_name` plus a `case_studies` array capped at
 * CASE_STUDY_LIMIT_PER_SOURCE items, where every listed property is required.
 */
function extractionSchema() {
  // Fresh object per call so no two properties share a reference.
  const nullableString = () => ({ type: ["string", "null"] });

  const caseStudyProperties = {
    case_study_title: nullableString(),
    customer_name: nullableString(),
    customer_industry: nullableString(),
    customer_size_or_type: nullableString(),
    use_case: nullableString(),
    outcome_summary: nullableString(),
    numeric_results: { type: "array", items: { type: "string" } },
    case_study_url: nullableString(),
    confidence: { type: "string", enum: ["high", "medium", "low"] },
    notes: nullableString(),
  };

  return {
    type: "object",
    properties: {
      vendor_name: nullableString(),
      case_studies: {
        type: "array",
        maxItems: CASE_STUDY_LIMIT_PER_SOURCE,
        items: {
          type: "object",
          properties: caseStudyProperties,
          // Every property is required, in declaration order.
          required: Object.keys(caseStudyProperties),
        },
      },
    },
    required: ["vendor_name", "case_studies"],
  };
}

/**
 * Finds up to SOURCE_PAGE_LIMIT_PER_VENDOR pages on the vendor's own domain
 * that look like case-study hubs or stories.
 *
 * The seed URL is taken first when it already carries a case-study signal.
 * Then a site-restricted web search adds same-domain hits whose URL, title or
 * description carries a signal. When nothing qualifies, the seed URL is the
 * only candidate. Results are cleaned and de-duplicated case-insensitively,
 * keeping first-seen order.
 */
async function discoverSourcePages(vendor: VendorInput): Promise<string[]> {
  const { seedUrl, vendorDomain } = vendor;
  const found: string[] = [];
  if (hasCaseStudySignal(seedUrl)) {
    found.push(cleanUrl(seedUrl));
  }

  const searchResults = await firecrawl.search(
    `site:${vendorDomain} case studies OR customer stories OR success stories OR testimonials`,
    {
      limit: SEARCH_LIMIT_PER_VENDOR,
      integration: "prometheus",
    },
  );

  const hits = Array.isArray(searchResults.web) ? searchResults.web : [];
  for (const hit of hits.slice(0, SEARCH_LIMIT_PER_VENDOR)) {
    const hitUrl = typeof hit.url === "string" ? hit.url : "";
    if (hitUrl === "") continue;
    if (domainMatches(hitUrl, vendorDomain) && hasCaseStudySignal(hitUrl, hit.title, hit.description)) {
      found.push(cleanUrl(hitUrl));
    }
  }

  // Nothing looked like a case-study page: fall back to the seed itself.
  if (found.length === 0) {
    found.push(cleanUrl(seedUrl));
  }

  // A Map keeps insertion order, so the first occurrence of each URL wins.
  const uniqueByKey = new Map<string, string>();
  for (const candidate of found) {
    const key = candidate.toLowerCase();
    if (!uniqueByKey.has(key)) uniqueByKey.set(key, candidate);
  }
  return Array.from(uniqueByKey.values()).slice(0, SOURCE_PAGE_LIMIT_PER_VENDOR);
}

/**
 * Scrapes one source page with structured JSON extraction and converts the
 * result into normalized rows.
 *
 * Every text field goes through `compactText`. A case-study link is kept only
 * when it is on the vendor's domain; otherwise the row points at the source
 * page. Entries with no title, customer name or outcome summary are discarded.
 *
 * @param vendor Vendor whose domain is used for link validation and naming.
 * @param sourcePageUrl Page to scrape.
 * @returns At most CASE_STUDY_LIMIT_PER_SOURCE rows for this page.
 */
async function extractFromSourcePage(vendor: VendorInput, sourcePageUrl: string): Promise<CaseStudyRow[]> {
  const instructions = [
    "Extract only visible official customer case study, customer story, success story, or testimonial entries from this vendor-owned page.",
    "Do not use third-party review snippets, generic blog commentary, or reposted stories.",
    "Do not invent ROI numbers, industries, sizes, or customer details; use null for missing text and an empty numeric_results array when no numeric metric is visible.",
    `Return at most ${CASE_STUDY_LIMIT_PER_SOURCE} compact entries with quoted outcomes, ROI statements, and numeric metrics when visible.`,
  ];

  const doc = await firecrawl.scrape(sourcePageUrl, {
    formats: [{ type: "json", schema: extractionSchema(), prompt: instructions.join(" ") }],
    onlyMainContent: true,
    integration: "prometheus",
  });

  // The extraction payload is untyped; treat anything that is not an object as empty.
  const payload: Record<string, unknown> =
    doc.json && typeof doc.json === "object" ? (doc.json as Record<string, unknown>) : {};
  const vendorName = compactText(payload.vendor_name) ?? deriveVendorName(vendor.vendorDomain);
  const rawStudies = Array.isArray(payload.case_studies) ? payload.case_studies : [];

  const rows: CaseStudyRow[] = [];
  for (const item of rawStudies.slice(0, CASE_STUDY_LIMIT_PER_SOURCE)) {
    const study: Record<string, unknown> =
      item && typeof item === "object" ? (item as Record<string, unknown>) : {};

    // Only same-domain links are trusted; everything else falls back to the source page.
    let caseUrl = sourcePageUrl;
    const rawCaseUrl = compactText(study.case_study_url);
    if (rawCaseUrl && domainMatches(rawCaseUrl, vendor.vendorDomain)) {
      caseUrl = cleanUrl(rawCaseUrl);
    }

    const row: CaseStudyRow = {
      vendor_name: vendorName,
      vendor_domain: vendor.vendorDomain,
      source_page_url: sourcePageUrl,
      case_study_title: compactText(study.case_study_title),
      customer_name: compactText(study.customer_name),
      customer_industry: compactText(study.customer_industry),
      customer_size_or_type: compactText(study.customer_size_or_type),
      use_case: compactText(study.use_case),
      outcome_summary: compactText(study.outcome_summary),
      numeric_results: normalizeStringArray(study.numeric_results),
      case_study_url: caseUrl,
      confidence: confidenceValue(study.confidence),
      notes: compactText(study.notes),
    };

    // Skip entries that identify nothing.
    if (row.case_study_title || row.customer_name || row.outcome_summary) {
      rows.push(row);
    }
  }
  return rows;
}

/**
 * Collects all case-study rows for one vendor: discovers source pages, scrapes
 * them one after another, and removes duplicate rows (first occurrence wins).
 * A page whose extraction throws is logged to stderr and skipped.
 */
async function collectVendor(vendor: VendorInput): Promise<CaseStudyRow[]> {
  const discovered = await discoverSourcePages(vendor);
  const pages = discovered.slice(0, SOURCE_PAGE_LIMIT_PER_VENDOR);

  const collected: CaseStudyRow[] = [];
  for (const pageUrl of pages) {
    try {
      collected.push(...(await extractFromSourcePage(vendor, pageUrl)));
    } catch (err) {
      // Best effort: one failing page must not abort the whole vendor.
      console.error(`Skipping ${pageUrl}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  // A Map keeps insertion order, so the earliest row for each key is kept.
  const firstByKey = new Map<string, CaseStudyRow>();
  for (const row of collected) {
    const key = dedupeKey(row);
    if (!firstByKey.has(key)) firstByKey.set(key, row);
  }
  return Array.from(firstByKey.values());
}

/**
 * Runs the collector end to end: gathers case-study rows for every vendor, one
 * vendor at a time, and writes a single JSON document to stdout in the
 * requested output mode.
 *
 * Rows are de-duplicated across the whole run, and grouped output is keyed by
 * vendor domain. Several seed URLs on the same domain therefore produce one
 * vendor object, instead of one object per seed with each object carrying the
 * union of that domain's rows.
 */
async function main(): Promise<void> {
  const activeVendors = vendors.slice(0, maxCompanies);
  const allRows: CaseStudyRow[] = [];
  const seenRowKeys = new Set<string>();

  for (const vendor of activeVendors) {
    const vendorRows = await collectVendor(vendor);
    for (const row of vendorRows) {
      // Scope the key by domain so identical stories from different vendors both survive.
      const key = `${row.vendor_domain}|${dedupeKey(row)}`;
      if (seenRowKeys.has(key)) continue;
      seenRowKeys.add(key);
      allRows.push(row);
    }
  }

  if (outputMode === "grouped_by_company") {
    // Seed every domain first so vendors with no rows still appear, in first-seen order.
    const rowsByDomain = new Map<string, CaseStudyRow[]>();
    for (const { vendorDomain } of activeVendors) {
      if (!rowsByDomain.has(vendorDomain)) rowsByDomain.set(vendorDomain, []);
    }
    for (const row of allRows) {
      rowsByDomain.get(row.vendor_domain)?.push(row);
    }

    const grouped = Array.from(rowsByDomain, ([vendorDomain, rows]) => ({
      vendor_name: rows[0]?.vendor_name ?? deriveVendorName(vendorDomain),
      vendor_domain: vendorDomain,
      // Vendor fields live on the parent object, so drop them from each nested study.
      case_studies: rows.map(({ vendor_name, vendor_domain, ...study }) => study),
    }));
    process.stdout.write(JSON.stringify(grouped));
    return;
  }

  process.stdout.write(JSON.stringify(allRows));
}

// Entry point: any failure not handled inside main is printed to stderr and
// turns into a non-zero exit code.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
Deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.

One person builds it. Everyone keeps it fresh.