Changelog Feature Tracker

Name: Changelog Feature Tracker Data Collector | Firecrawl Prometheus
Creator: bo-05
Published: 2026-07-04T06:44:13.837Z
License: https://opensource.org/licenses/MIT

v1 · Published

Collects recent product changelog entries from a company's official changelog, release-notes, updates, or product-news pages.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

#	notes	summary	entry_url	confidence	entry_title	company_name	product_area	publish_date	company_domain	source_page_url
0	null	Collect an up-front payment for monthly subscriptions with prebilling.	https://docs.stripe.com/billing/subscriptions/prebilling	0.82	Prebilling to optimize cash flow	Stripe	Billing	Jun 2026	stripe.com	https://stripe.com/shipped
1	No separate official entry URL was visible; source page used. Publish date not visible.	Calculate tax on NetSuite invoices and file directly from your Stripe Dashboard.	https://stripe.com/shipped	0.82	Stripe Tax for NetSuite	Stripe	Tax	null	stripe.com	https://stripe.com/shipped
2	Publish date not visible.	Track performance, pinpoint drop-offs, and re-engage users across your connected account onboarding funnel.	https://docs.stripe.com/connect/supported-embedded-components/account-onboarding	0.82	Onboarding insights	Stripe	Connect	null	stripe.com	https://stripe.com/shipped

Parameters

--seed-urls (string, required): Comma-separated company websites, changelog pages, release-note pages, or product update URLs to inspect. E.g. "https://stripe.com"

--max-items (number): Maximum total number of changelog entries to return across all seed URLs. Default: 10

--output-mode (string): Return flat entry rows with `entry_rows`, or company objects with nested entries with `grouped_by_company`. Default: "entry_rows"

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers

bo-05 (@bo-05)

0 runs in 14d · published 1d ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 1d ago

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";

// The Firecrawl client needs an API key, so stop immediately with a
// non-zero exit status when the environment does not provide one.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey.length === 0) {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Parse the command-line flags. All three are read as strings and converted
// or validated below; `strict: true` makes parseArgs reject undeclared flags.
const { values: flags } = parseArgs({
  strict: true,
  options: {
    "seed-urls": { type: "string" },
    "max-items": { type: "string" },
    "output-mode": { type: "string" },
  },
});

// --seed-urls is the only mandatory flag.
if (!flags["seed-urls"]) {
  console.error("--seed-urls is required");
  process.exit(1);
}

// Split the comma-separated list and drop blank items (e.g. "a.com,, b.com").
const seedUrls = String(flags["seed-urls"])
  .split(",")
  .map((value) => value.trim())
  .filter(Boolean);

if (seedUrls.length === 0) {
  console.error("OUT_OF_SCOPE: --seed-urls must include at least one URL or domain");
  process.exit(1);
}

// Any finite value >= 1 is accepted, exactly as before, but the limit is then
// floored to a whole number. A fractional limit such as 2.5 previously leaked
// into the extraction prompt ("up to 2.5 entries") and let the collection loop
// gather one row more than the final slice would keep.
const requestedMaxItems = Number(flags["max-items"] ?? "10");
if (!Number.isFinite(requestedMaxItems) || requestedMaxItems < 1) {
  console.error("OUT_OF_SCOPE: --max-items must be a positive number");
  process.exit(1);
}
const maxItems = Math.floor(requestedMaxItems);

// Only the two documented output shapes are supported.
const outputMode = String(flags["output-mode"] ?? "entry_rows");
if (!["entry_rows", "grouped_by_company"].includes(outputMode)) {
  console.error('OUT_OF_SCOPE: --output-mode must be "entry_rows" or "grouped_by_company"');
  process.exit(1);
}

// Shared Firecrawl client used for both search and scrape calls.
const firecrawl = new Firecrawl({ apiKey });

/** A page that may host a company's changelog, ranked by `score`. */
type Candidate = {
  /** URL of the page (the seed URL itself or a search hit). */
  url: string;
  /** Search-result title; empty string for the seed URL or when the hit has none. */
  title: string;
  /** Search-result description; empty string for the seed URL or when the hit has none. */
  description: string;
  /** Heuristic relevance from `scoreCandidate`; the seed URL gets an extra +10. */
  score: number;
};

/** One changelog entry as emitted in the collector's JSON output. */
type EntryRow = {
  /** Company name from the extraction, or a name derived from the domain as fallback. */
  company_name: string;
  /** Comparable domain derived from the seed URL's hostname. */
  company_domain: string;
  /** URL of the page the entry was extracted from. */
  source_page_url: string;
  /** Entry title with whitespace collapsed; rows without a title are never emitted. */
  entry_title: string;
  /** Publish date text as extracted, or null when none was returned. */
  publish_date: string | null;
  /** Short summary as extracted, or null when none was returned. */
  summary: string | null;
  /** Product area as extracted, or null when none was returned. */
  product_area: string | null;
  /** Entry-specific URL on the company's domain; falls back to `source_page_url`. */
  entry_url: string;
  /** Confidence clamped to the range 0..1 and rounded to two decimals. */
  confidence: number;
  /** Extraction notes plus remarks about a missing entry URL or date; null when empty. */
  notes: string | null;
};

/** Schema fragment for a value that is either the given primitive type or null. */
const nullableSchema = (primitive: "string" | "number") => ({ type: [primitive, "null"] });

/** Shape of a single changelog entry inside the extraction result. */
const entryItemSchema = {
  type: "object",
  properties: {
    entry_title: { type: "string" },
    publish_date: nullableSchema("string"),
    summary: nullableSchema("string"),
    product_area: nullableSchema("string"),
    entry_url: nullableSchema("string"),
    confidence: nullableSchema("number"),
    notes: nullableSchema("string"),
  },
  required: ["entry_title", "publish_date", "summary", "product_area", "entry_url", "confidence", "notes"],
};

/**
 * JSON schema handed to the scrape call's JSON format: a company name plus a
 * list of entries. Every field is required, but all except `entry_title` may
 * be null.
 */
const extractionSchema = {
  type: "object",
  properties: {
    company_name: nullableSchema("string"),
    entries: { type: "array", items: entryItemSchema },
  },
  required: ["company_name", "entries"],
};

/**
 * Turns a user-supplied seed (bare domain or full URL) into a URL object.
 * Inputs without an http(s) scheme are assumed to be https.
 *
 * @param input Seed exactly as given on the command line (already trimmed).
 * @returns The parsed URL.
 * @throws Error with an `OUT_OF_SCOPE:` message when the input cannot be parsed.
 */
function normalizeSeed(input: string): URL {
  let candidate = input;
  if (!/^https?:\/\//i.test(candidate)) {
    candidate = `https://${input}`;
  }

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    throw new Error(`OUT_OF_SCOPE: invalid seed URL "${input}"`);
  }
  return parsed;
}

/**
 * Lower-cases a hostname and removes one leading "www." label, if present.
 *
 * @param host Hostname to normalise.
 * @returns The normalised hostname.
 */
function cleanHost(host: string): string {
  const wwwPrefix = "www.";
  const lowered = host.toLowerCase();
  if (lowered.startsWith(wwwPrefix)) {
    return lowered.slice(wwwPrefix.length);
  }
  return lowered;
}

/**
 * Reduces a hostname to the domain used for comparing pages to a company:
 * normally the last two labels, or the last three when the host ends in a
 * two-letter top-level label preceded by a common registry label (such as
 * "co" or "com", e.g. "example.co.uk").
 *
 * @param host Hostname; it is cleaned with `cleanHost` first.
 * @returns The comparable domain. Hosts of two labels or fewer are returned whole.
 */
function comparableDomain(host: string): string {
  const labels = cleanHost(host)
    .split(".")
    .filter((label) => label.length > 0);
  if (labels.length <= 2) {
    return labels.join(".");
  }

  const [secondLevel, topLevel] = labels.slice(-2);
  const registryLabels = ["ac", "co", "com", "edu", "gov", "net", "org"];
  const hasRegistrySuffix = topLevel.length === 2 && registryLabels.includes(secondLevel);
  const keptLabels = hasRegistrySuffix ? 3 : 2;
  return labels.slice(-keptLabels).join(".");
}

/**
 * Tells whether a URL belongs to the company's own domain: the cleaned host
 * equals the domain, is a subdomain of it, or reduces to it via
 * `comparableDomain`.
 *
 * @param url URL to test.
 * @param companyDomain Comparable domain of the company.
 * @returns true for an official URL; false otherwise, including when `url` cannot be parsed.
 */
function isOfficialUrl(url: string, companyDomain: string): boolean {
  try {
    const host = cleanHost(new URL(url).hostname);
    if (host === companyDomain) return true;
    if (host.endsWith(`.${companyDomain}`)) return true;
    return comparableDomain(host) === companyDomain;
  } catch {
    return false;
  }
}

/**
 * Derives a display name from a domain: takes the first label, splits it on
 * hyphens and underscores, and capitalises each word
 * ("my-company.io" becomes "My Company").
 *
 * @param domain Domain such as "stripe.com".
 * @returns The derived name; an empty string when the first label is empty.
 */
function titleFromDomain(domain: string): string {
  const [firstLabel = domain] = domain.split(".");
  const words: string[] = [];
  for (const piece of firstLabel.split(/[-_]/)) {
    if (piece === "") continue;
    words.push(`${piece.charAt(0).toUpperCase()}${piece.slice(1)}`);
  }
  return words.join(" ");
}

/**
 * Heuristically scores how likely a page is to be a product changelog.
 *
 * Positive signals: changelog-style phrases anywhere in the URL, title, or
 * description (+8 each), a changelog-style path segment (+8), update/release
 * keywords in the URL (+3), and docs/help/support keywords in the URL (+1).
 * Negative signals: off-topic sections such as press or pricing (-7), and
 * blog posts with no product-update wording (-5).
 *
 * @param url Page URL.
 * @param title Page title, if known.
 * @param description Page description, if known.
 * @returns The summed score; it may be negative.
 */
function scoreCandidate(url: string, title = "", description = ""): number {
  const haystack = `${url} ${title} ${description}`.toLowerCase();
  let score = 0;
  const strongTerms = ["changelog", "release notes", "product updates", "what's new", "whats-new", "product news"];
  for (const term of strongTerms) {
    if (haystack.includes(term)) score += 8;
  }
  if (/\/(changelog|release-notes|product-updates|whats-new|shipped)(\/|$)/i.test(url)) score += 8;
  if (/(updates|releases|new-features)/i.test(url)) score += 3;
  if (/(docs|help|support)/i.test(url)) score += 1;
  // Penalise off-topic sections only when the keyword starts a token. A bare
  // substring test also fired inside unrelated words such as "wordpress" or
  // "express" (both contain "press"), pushing legitimate changelog pages
  // towards the rejection threshold.
  if (/(^|[^a-z0-9])(press|careers|legal|privacy|terms|pricing)/i.test(url)) score -= 7;
  if (/\/blog\//i.test(url) && !/(changelog|release|product|update|shipped)/i.test(haystack)) score -= 5;
  return score;
}

/**
 * Tells whether a URL contains a changelog-style keyword anywhere in it
 * (case-insensitive), e.g. "changelog", "release-notes", or "shipped".
 *
 * @param url URL to inspect.
 * @returns true when one of the keywords occurs in the URL.
 */
function looksLikeChangelogUrl(url: string): boolean {
  const changelogHint = /(changelog|release-notes|product-updates|updates|whats-new|shipped|releases|product-news)/i;
  return url.search(changelogHint) !== -1;
}

/**
 * Normalises an arbitrary value to a tidy string: whitespace runs are
 * collapsed to single spaces and the ends are trimmed.
 *
 * @param value Any value; only strings are accepted.
 * @returns The tidied string, or null for non-strings and blank strings.
 */
function compactString(value: unknown): string | null {
  if (typeof value === "string") {
    const collapsed = value.trim().replace(/\s+/g, " ");
    if (collapsed !== "") {
      return collapsed;
    }
  }
  return null;
}

/**
 * Resolves a possibly relative URL against the page it was found on.
 *
 * @param url Absolute or relative URL, or null.
 * @param sourceUrl Base URL used to resolve relative values.
 * @returns The absolute URL string, or null when `url` is empty/null or cannot be resolved.
 */
function normalizeUrlMaybe(url: string | null, sourceUrl: string): string | null {
  if (url === null || url === "") {
    return null;
  }

  let resolved: URL;
  try {
    resolved = new URL(url, sourceUrl);
  } catch {
    return null;
  }
  return resolved.href;
}

/**
 * Builds the key used to recognise duplicate entries: the company domain, the
 * title lower-cased with every run of non-word characters replaced by a
 * single space, and the publish date (empty when null), joined with "|".
 *
 * @param companyDomain Comparable domain of the company.
 * @param title Entry title.
 * @param publishDate Publish date text, or null.
 * @returns The dedupe key.
 */
function dedupeKey(companyDomain: string, title: string, publishDate: string | null): string {
  const normalizedTitle = title.toLowerCase().replace(/\W+/g, " ").trim();
  const datePart = publishDate ?? "";
  return [companyDomain, normalizedTitle, datePart].join("|");
}

/**
 * Finds up to two pages most likely to be the company's official changelog.
 *
 * The seed URL is included (with a +10 bonus) when it already looks like a
 * changelog URL. A site-restricted Firecrawl search supplies further pages;
 * hits that are off-domain or score below 4 are dropped. A failed search is
 * logged to stderr and does not discard the seed candidate.
 *
 * @param seed Parsed seed URL.
 * @param companyDomain Comparable domain used for the `site:` filter and the official-URL check.
 * @returns At most two candidates, highest score first.
 */
async function findCandidates(seed: URL, companyDomain: string): Promise<Candidate[]> {
  const byUrl = new Map<string, Candidate>();

  const seedHref = seed.toString();
  const seedBaseScore = scoreCandidate(seedHref, "", "");
  if (looksLikeChangelogUrl(seedHref)) {
    byUrl.set(seedHref, { url: seedHref, title: "", description: "", score: seedBaseScore + 10 });
  }

  const query = `site:${companyDomain} ("changelog" OR "release notes" OR "product updates" OR "what's new" OR "product news")`;
  try {
    const response = (await firecrawl.search(query, {
      limit: 6,
      integration: "prometheus",
    })) as any;

    // The hit list is read from `web` when that is an array, otherwise from `data`.
    let hits: any[] = [];
    if (Array.isArray(response.web)) {
      hits = response.web;
    } else if (Array.isArray(response.data)) {
      hits = response.data;
    }

    for (const hit of hits) {
      const hitUrl = compactString(hit.url);
      if (hitUrl === null || !isOfficialUrl(hitUrl, companyDomain)) continue;

      const hitTitle = compactString(hit.title) ?? "";
      const hitDescription = compactString(hit.description) ?? "";
      const hitScore = scoreCandidate(hitUrl, hitTitle, hitDescription);
      if (hitScore < 4) continue;

      // Keep only the best-scoring record per URL.
      const previous = byUrl.get(hitUrl);
      if (previous !== undefined && previous.score >= hitScore) continue;
      byUrl.set(hitUrl, { url: hitUrl, title: hitTitle, description: hitDescription, score: hitScore });
    }
  } catch (err) {
    console.error(`Search failed for ${companyDomain}: ${err}`);
  }

  const ranked = Array.from(byUrl.values());
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, 2);
}

/**
 * Scrapes one candidate page with Firecrawl's JSON extraction and converts
 * the result into output rows.
 *
 * Entries without a title are skipped. An entry URL is kept only when it
 * resolves to the company's own domain; otherwise the source page URL is used
 * and a note says so. When the extraction gives no usable numeric confidence,
 * 0.82 is used for candidates scoring 12 or more and 0.68 otherwise.
 *
 * @param candidate Page to scrape.
 * @param companyDomain Comparable domain of the company, used for the official-URL check.
 * @param fallbackCompanyName Name used when the extraction returns no company name.
 * @param remaining Number of rows still wanted; the prompt asks for at most min(remaining, 10).
 * @returns Rows built from the page; the model may return more than requested, so callers enforce their own limit.
 * @throws Whatever the scrape call throws; the caller is expected to handle it.
 */
async function extractEntries(candidate: Candidate, companyDomain: string, fallbackCompanyName: string, remaining: number): Promise<EntryRow[]> {
  const prompt = [
    `Extract up to ${Math.min(remaining, 10)} recent product changelog or release-note entries from this official company page only.`,
    "Use only entries visible or clearly represented on this page.",
    "Ignore press releases, hiring posts, pricing pages, generic documentation, and blog posts unrelated to product updates.",
    "Do not invent dates or summaries. If a publish date, summary, product area, or separate entry URL is not visible, return null for that field.",
    "Keep summaries short and factual.",
  ].join(" ");

  const result = await firecrawl.scrape(candidate.url, {
    formats: [{ type: "json", prompt, schema: extractionSchema }],
    integration: "prometheus",
    timeout: 30000,
  });

  // Optional chaining keeps a null/undefined scrape result from throwing here.
  const extracted = (result as any)?.json;
  const entries: unknown[] = Array.isArray(extracted?.entries) ? extracted.entries : [];
  const companyName = compactString(extracted?.company_name) ?? fallbackCompanyName;
  const rows: EntryRow[] = [];

  for (const rawEntry of entries) {
    // The payload is model-generated. A null or primitive element would throw
    // on property access and, via the caller's catch, discard every row from
    // this page, so skip such elements instead.
    if (typeof rawEntry !== "object" || rawEntry === null) continue;
    const entry = rawEntry as Record<string, unknown>;

    const title = compactString(entry.entry_title);
    if (!title) continue;
    const publishDate = compactString(entry.publish_date);
    const summary = compactString(entry.summary);
    const productArea = compactString(entry.product_area);
    const candidateEntryUrl = normalizeUrlMaybe(compactString(entry.entry_url), candidate.url);
    // Only links on the company's own domain count as an entry URL.
    const officialEntryUrl = candidateEntryUrl && isOfficialUrl(candidateEntryUrl, companyDomain) ? candidateEntryUrl : null;

    const notesParts: string[] = [];
    const extractedNotes = compactString(entry.notes);
    if (extractedNotes) notesParts.push(extractedNotes);
    if (!officialEntryUrl) notesParts.push("No separate official entry URL was visible; source page used.");
    if (!publishDate) notesParts.push("Publish date not visible.");

    // A non-finite number is treated like a missing confidence so that the
    // output never carries NaN (which JSON would serialise as null).
    const reportedConfidence = entry.confidence;
    const rawConfidence =
      typeof reportedConfidence === "number" && Number.isFinite(reportedConfidence)
        ? reportedConfidence
        : candidate.score >= 12
          ? 0.82
          : 0.68;
    const confidence = Math.max(0, Math.min(1, Number(rawConfidence.toFixed(2))));

    rows.push({
      company_name: companyName,
      company_domain: companyDomain,
      source_page_url: candidate.url,
      entry_title: title,
      publish_date: publishDate,
      summary,
      product_area: productArea,
      entry_url: officialEntryUrl ?? candidate.url,
      confidence,
      notes: notesParts.length > 0 ? notesParts.join(" ") : null,
    });
  }

  return rows;
}

/**
 * Adds `row` to the collected rows unless it duplicates an entry already
 * collected for the same company and title.
 *
 * Two rows are the same entry when their dates are equal or when either one
 * has no date; rows with two different dates stay separate. A duplicate
 * replaces the stored row only when it adds a summary, a date, or an
 * entry-specific URL, and never when that would drop a known date.
 *
 * @param allRows Output rows in collection order; updated in place.
 * @param rowsByTitle Index from the date-less dedupe key to the rows stored under it; updated in place.
 * @param row Newly extracted row.
 */
function upsertEntryRow(allRows: EntryRow[], rowsByTitle: Map<string, EntryRow[]>, row: EntryRow): void {
  const titleKey = dedupeKey(row.company_domain, row.entry_title, null);
  const sameTitle = rowsByTitle.get(titleKey);
  if (!sameTitle) {
    rowsByTitle.set(titleKey, [row]);
    allRows.push(row);
    return;
  }

  // Prefer an exact date match; otherwise an undated row on either side is
  // taken to be the same entry.
  const existing =
    sameTitle.find((known) => known.publish_date === row.publish_date) ??
    sameTitle.find((known) => known.publish_date === null || row.publish_date === null);
  if (!existing) {
    sameTitle.push(row);
    allRows.push(row);
    return;
  }

  const keepsDate = row.publish_date !== null || existing.publish_date === null;
  const isRicher = Boolean(
    (!existing.summary && row.summary) ||
      (!existing.publish_date && row.publish_date) ||
      (existing.entry_url === existing.source_page_url && row.entry_url !== row.source_page_url),
  );
  if (!keepsDate || !isRicher) return;

  sameTitle[sameTitle.indexOf(existing)] = row;
  const index = allRows.indexOf(existing);
  if (index >= 0) allRows[index] = row;
}

/**
 * Runs the collector: for each seed, finds candidate changelog pages,
 * extracts entries from them, de-duplicates the rows, and writes the result
 * to stdout as JSON, either flat or grouped by company.
 *
 * Duplicates are matched on company and title. Keying on the publish date as
 * well made the "upgrade an undated row with a dated one" branch unreachable,
 * so undated and dated copies of one entry were both emitted.
 *
 * Extraction failures are logged per page and do not stop the run. An invalid
 * seed makes `normalizeSeed` throw, which rejects the returned promise.
 */
async function main(): Promise<void> {
  const allRows: EntryRow[] = [];
  const rowsByTitle = new Map<string, EntryRow[]>();

  for (const seedInput of seedUrls) {
    if (allRows.length >= maxItems) break;
    const seed = normalizeSeed(seedInput);
    const companyDomain = comparableDomain(seed.hostname);
    const fallbackCompanyName = titleFromDomain(companyDomain);
    const candidates = await findCandidates(seed, companyDomain);

    if (candidates.length === 0) {
      console.error(`No likely official changelog page found for ${companyDomain}`);
      continue;
    }

    for (const candidate of candidates) {
      if (allRows.length >= maxItems) break;
      try {
        const rows = await extractEntries(candidate, companyDomain, fallbackCompanyName, maxItems - allRows.length);
        for (const row of rows) {
          upsertEntryRow(allRows, rowsByTitle, row);
          if (allRows.length >= maxItems) break;
        }
      } catch (err) {
        console.error(`Extraction failed for ${candidate.url}: ${err}`);
      }
    }
  }

  const limitedRows = allRows.slice(0, maxItems);

  if (outputMode === "grouped_by_company") {
    // Map iteration follows insertion order, so companies appear in the order
    // their first row was collected.
    const groups = new Map<string, { company_name: string; company_domain: string; entries: EntryRow[] }>();
    for (const row of limitedRows) {
      const group = groups.get(row.company_domain) ?? {
        company_name: row.company_name,
        company_domain: row.company_domain,
        entries: [],
      };
      group.entries.push(row);
      groups.set(row.company_domain, group);
    }
    process.stdout.write(JSON.stringify([...groups.values()]));
    return;
  }

  process.stdout.write(JSON.stringify(limitedRows));
}

// Start the run. Any failure that escapes main() is printed to stderr and
// reported through a non-zero exit status.
main().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.