Startup Funding And YC Batch Tracker

Name: Startup Funding And YC Batch Tracker Data Collector | Firecrawl Prometheus
Creator: bo-05
Published: 2026-07-03T08:40:35.923Z
License: https://opensource.org/licenses/MIT
v1 · Published
Tracks YC directory companies and publicly visible funding-database company profiles for a thesis query, with normalized company fields, funding rounds, source URLs, and snapshot metadata.
Output & API

Preview the latest data, download it, or call this collector as an API.
Author's sample data
metadata
companies
Parameters
--source-types (string, required): Comma-separated source types to use: yc_directory, funding_database, or both. e.g. "yc_directory,funding_database"
--query (string, required): Sector, geography, or thesis used for discovery; at least one of query or seed-urls must be provided. e.g. "AI infrastructure"
--seed-urls (string): Comma-separated YC directory pages, funding search pages, or company profile pages to prioritize before search discovery. default ""
--max-companies (number, required): Maximum number of unique companies to return; capped at 25 to keep collection bounded. e.g. 4
--output-mode (string, required): Output shape: company_rows for one company list, or grouped_by_source for source-grouped company lists. e.g. "company_rows"
--batch-filter (string): Optional YC batch filter such as S24 or W25. default ""
--funding-date-range (string): Funding recency phrase used in funding-database discovery queries. default "last 90 days"
--stage-filter (string): Optional comma-separated funding stage filter such as pre-seed,seed,series_a. default ""
--region (string): Region label to attach to the snapshot and company rows when a source-specific region is not available. default "global"
--include-founders (boolean): Whether to include visibly listed founders from source pages. default true
--include-investors (boolean): Whether to include visibly listed investors from funding pages. default true
--snapshot-label (string): Snapshot label for repeated runs; leave blank to use the current run timestamp. default ""
--sort-hint (string): Sort preference hint recorded in metadata and used in discovery phrasing. default "newest"
Marketplace

Publish this collector so others can deploy it — you keep ownership.
0 subscribers
bo-05 (@bo-05)
0 runs in 14d · published 2d ago
Versions

Every build and self-heal appends a version. Pin one to lock runs to it.
managed by author
v1 · built · approved · current · 2d ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";

// Fail fast when the Firecrawl credential is missing: print a message to
// stderr and exit with status 1 before any flag parsing or network call.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Single Firecrawl client used by the search and batch-scrape calls below.
const firecrawl = new Firecrawl({ apiKey });

// Parse the command-line flags. Every option is declared as a string here,
// including the numeric (--max-companies) and boolean (--include-*) ones;
// they are converted and validated after parsing.
const { values: flags } = parseArgs({
  strict: true,
  options: {
    "source-types": { type: "string" },
    query: { type: "string" },
    "seed-urls": { type: "string" },
    "max-companies": { type: "string" },
    "output-mode": { type: "string" },
    "batch-filter": { type: "string" },
    "funding-date-range": { type: "string" },
    "stage-filter": { type: "string" },
    region: { type: "string" },
    "include-founders": { type: "string" },
    "include-investors": { type: "string" },
    "snapshot-label": { type: "string" },
    "sort-hint": { type: "string" },
  },
});

// Upper bound accepted for --max-companies.
const MAX_COMPANIES_CAP = 25;
// Maximum number of --seed-urls entries accepted per run.
const MAX_SEED_URLS = 20;
// Maximum number of pages scraped (and documents inspected) during discovery.
const MAX_DISCOVERY_URLS = 12;
// Hard ceiling on company profile pages in the final scrape.
const MAX_FINAL_SCRAPE_URLS = 50;
// Cap on the candidate URL list produced by discovery.
const MAX_CANDIDATE_URLS = 2000;
// Maximum number of funding-round table rows parsed from one page.
const MAX_ROUNDS_PER_COMPANY = 5;

/**
 * Reads a mandatory CLI flag and returns its trimmed value.
 *
 * When the flag is absent or contains only whitespace, an error naming the
 * flag is printed to stderr and the process exits with status 1.
 *
 * @param name Flag name without the leading `--`.
 * @returns The trimmed flag value.
 */
function requiredFlag(name: string): string {
  const trimmed = ((flags[name] as string | undefined) ?? "").trim();
  if (trimmed.length === 0) {
    console.error(`--${name} is required`);
    process.exit(1);
  }
  return trimmed;
}

// Resolve every CLI flag into a module-level value. Optional flags fall back
// to the default written in their `??` expression.
const sourceTypes = splitCsv(requiredFlag("source-types"));
const query = cleanText((flags.query as string | undefined) ?? "");
const seedUrls = splitCsv((flags["seed-urls"] as string | undefined) ?? "");
const maxCompaniesRaw = requiredFlag("max-companies");
const outputMode = requiredFlag("output-mode");
const batchFilter = cleanText((flags["batch-filter"] as string | undefined) ?? "");
const fundingDateRange = cleanText((flags["funding-date-range"] as string | undefined) ?? "last 90 days");
const stageFilter = cleanText((flags["stage-filter"] as string | undefined) ?? "");
const requestedRegion = cleanText((flags.region as string | undefined) ?? "global");
const includeFounders = parseBool((flags["include-founders"] as string | undefined) ?? "true", "include-founders");
const includeInvestors = parseBool((flags["include-investors"] as string | undefined) ?? "true", "include-investors");
// `||` rather than `??`: an empty --snapshot-label also falls back to the run timestamp.
const snapshotLabel = cleanText((flags["snapshot-label"] as string | undefined) || new Date().toISOString());
const sortHint = cleanText((flags["sort-hint"] as string | undefined) ?? "newest");
// Taken from a separate `new Date()` call, so it can differ slightly from a defaulted snapshot label.
const collectedAt = new Date().toISOString();

// Validate the resolved flags. The checks run in this order, so the first
// failing rule decides which OUT_OF_SCOPE error is thrown.
const maxCompanies = Number(maxCompaniesRaw);
if (!Number.isInteger(maxCompanies) || maxCompanies < 1 || maxCompanies > MAX_COMPANIES_CAP) {
  throw new Error(`OUT_OF_SCOPE: --max-companies must be an integer from 1 to ${MAX_COMPANIES_CAP}`);
}
if (sourceTypes.length === 0) {
  throw new Error("OUT_OF_SCOPE: --source-types must include yc_directory and/or funding_database");
}
for (const sourceType of sourceTypes) {
  if (sourceType !== "yc_directory" && sourceType !== "funding_database") {
    throw new Error("OUT_OF_SCOPE: --source-types values must be yc_directory or funding_database");
  }
}
// --query is optional only when at least one seed URL is supplied.
if (!query && seedUrls.length === 0) {
  throw new Error("OUT_OF_SCOPE: provide --query or --seed-urls");
}
if (outputMode !== "company_rows" && outputMode !== "grouped_by_source") {
  throw new Error("OUT_OF_SCOPE: --output-mode must be company_rows or grouped_by_source");
}
if (seedUrls.length > MAX_SEED_URLS) {
  throw new Error(`OUT_OF_SCOPE: --seed-urls supports at most ${MAX_SEED_URLS} URLs per run`);
}
for (const seedUrl of seedUrls) {
  validateHttpUrl(seedUrl);
}

/** The two kinds of source this collector reads from. */
type SourceType = "yc_directory" | "funding_database";
/** One funding round parsed from a table row on a funding-database page. */
type FundingRound = {
  round_type: string | null;
  announced_date: string | null;
  /** Amount as written on the page. */
  amount_raised_text: string | null;
  /** Numeric form of `amount_raised_text` produced by `normalizeAmount`. */
  amount_raised_normalized: number | null;
  investors: string[];
  /** URL of the page the row was read from. */
  source_url: string;
  /** The table row with markdown stripped, kept for context. */
  raw_text: string | null;
};
/** One normalized company record in the output. */
type CompanyRow = {
  /** `<source_type>:<slug>` key used to merge duplicate rows. */
  tracking_key: string;
  snapshot_label: string;
  collected_at: string;
  source_type: SourceType;
  company_name: string | null;
  company_url: string | null;
  /** Page the row was parsed from. */
  source_page_url: string;
  /** Pages of duplicate rows that were merged into this one. */
  secondary_source_urls: string[];
  batch: string | null;
  founders: string[];
  company_description: string | null;
  industry_tags: string[];
  region: string | null;
  hq_location: string | null;
  funding_rounds: FundingRound[];
  /** The `latest_*` fields mirror the first entry of `funding_rounds` at parse time. */
  latest_round_type: string | null;
  latest_round_date: string | null;
  latest_amount_raised_text: string | null;
  latest_amount_raised_normalized: number | null;
  investors: string[];
  confidence: "high" | "medium" | "low";
  notes: string | null;
};

/**
 * The subset of a scraped document that this script reads. Scrape results
 * are cast to this shape; every field is optional.
 */
type ScrapeDoc = {
  markdown?: string;
  links?: string[];
  metadata?: {
    title?: string;
    description?: string;
    sourceURL?: string;
    url?: string;
    ogUrl?: string;
    ogTitle?: string;
    ogDescription?: string;
  };
};

/**
 * Splits a comma-separated string into trimmed, non-empty entries.
 *
 * @param value Raw comma-separated text.
 * @returns The entries in their original order; empty entries are dropped.
 */
function splitCsv(value: string): string[] {
  const entries: string[] = [];
  for (const piece of value.split(",")) {
    const trimmed = piece.trim();
    if (trimmed !== "") entries.push(trimmed);
  }
  return entries;
}

/**
 * Parses a boolean flag value, ignoring surrounding whitespace and case.
 *
 * @param value Raw flag text; must be "true" or "false".
 * @param name Flag name (without `--`) used in the error message.
 * @returns The parsed boolean.
 * @throws Error with an OUT_OF_SCOPE message for any other value.
 */
function parseBool(value: string, name: string): boolean {
  switch (value.trim().toLowerCase()) {
    case "true":
      return true;
    case "false":
      return false;
    default:
      throw new Error(`OUT_OF_SCOPE: --${name} must be true or false`);
  }
}

/**
 * Checks that a seed URL parses and uses the http or https scheme.
 *
 * @param value Candidate URL text.
 * @throws Error with an OUT_OF_SCOPE message when the text is not a valid
 *   URL or uses any other protocol.
 */
function validateHttpUrl(value: string): void {
  let protocol: string;
  try {
    protocol = new URL(value).protocol;
  } catch {
    throw new Error("OUT_OF_SCOPE: --seed-urls must contain valid URLs");
  }
  const isWebProtocol = protocol === "http:" || protocol === "https:";
  if (!isWebProtocol) {
    throw new Error("OUT_OF_SCOPE: --seed-urls only supports http and https URLs");
  }
}

/**
 * Collapses every run of whitespace to a single space and trims the ends.
 *
 * @param value Text to normalize; null, undefined and "" all yield "".
 * @returns The normalized text.
 */
function cleanText(value: string | null | undefined): string {
  return value ? value.split(/\s+/).filter(Boolean).join(" ") : "";
}

/**
 * Normalizes text and maps "no data" values to null.
 *
 * Returns null when the cleaned text is empty, is exactly one of the
 * placeholder words ("obfuscated", "obfuscation", "obf", "-"), or contains a
 * paywall phrase ("unlock " or "register for free"). Comparison is
 * case-insensitive.
 *
 * @param value Raw text from a scraped page.
 * @returns The whitespace-normalized text, or null when unavailable.
 */
function nullIfUnavailable(value: string | null | undefined): string | null {
  const text = cleanText(value);
  if (text === "") return null;

  const lowered = text.toLowerCase();
  const placeholders = ["obfuscated", "obfuscation", "obf", "-"];
  const paywallPhrases = ["unlock ", "register for free"];
  const unavailable =
    placeholders.includes(lowered) || paywallPhrases.some((phrase) => lowered.includes(phrase));
  return unavailable ? null : text;
}

/**
 * Removes common markdown decoration from one piece of text.
 *
 * The rules run in the listed order: images are dropped, links are reduced
 * to their text, `<br>` becomes a space, backslashes and `**` are removed,
 * and leading `#` heading markers are stripped. The result is then
 * whitespace-normalized.
 *
 * @param value Markdown text (typically one line or one table cell).
 * @returns Plain text.
 */
function stripMarkdown(value: string): string {
  const rules: Array<[RegExp, string]> = [
    [/!\[[^\]]*]\([^)]*\)/g, ""],
    [/\[([^\]]+)]\([^)]*\)/g, "$1"],
    [/<br>/gi, " "],
    [/\\+/g, ""],
    [/\*\*/g, ""],
    [/^#+\s*/, ""],
  ];
  let text = value;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return cleanText(text);
}

/**
 * De-duplicates strings case-insensitively, keeping first occurrences.
 *
 * Each value is first passed through `nullIfUnavailable`, so empty and
 * placeholder values are dropped and survivors are whitespace-normalized.
 *
 * @param values Input strings.
 * @returns Cleaned, unique values in first-seen order.
 */
function unique(values: string[]): string[] {
  const firstByKey = new Map<string, string>();
  for (const value of values) {
    const cleaned = nullIfUnavailable(value);
    if (!cleaned) continue;
    const key = cleaned.toLowerCase();
    if (!firstByKey.has(key)) firstByKey.set(key, cleaned);
  }
  return Array.from(firstByKey.values());
}

/**
 * Returns the URL a scraped document came from.
 *
 * Uses the first non-empty of `metadata.sourceURL`, `metadata.ogUrl` and
 * `metadata.url`, or "" when none is present.
 */
function sourceUrl(doc: ScrapeDoc): string {
  const meta = doc.metadata;
  if (!meta) return "";
  return meta.sourceURL || meta.ogUrl || meta.url || "";
}

/**
 * Extracts the hostname of a URL with a leading "www." removed.
 *
 * @param value URL text.
 * @returns The hostname, or "" when the text cannot be parsed as a URL.
 */
function hostOf(value: string): string {
  let hostname: string;
  try {
    hostname = new URL(value).hostname;
  } catch {
    return "";
  }
  return hostname.startsWith("www.") ? hostname.slice("www.".length) : hostname;
}

/**
 * Tells whether a URL is on ycombinator.com and contains "/companies"
 * anywhere in its text.
 */
function isYcUrl(value: string): boolean {
  if (!value.includes("/companies")) return false;
  return hostOf(value) === "ycombinator.com";
}

/**
 * Tells whether a URL's host (ignoring a leading "www.") is one of the
 * supported funding databases.
 */
function isFundingUrl(value: string): boolean {
  const fundingHosts = ["crunchbase.com", "tracxn.com", "dealroom.co"];
  return fundingHosts.includes(hostOf(value));
}

/**
 * Tells whether a URL looks like a single-company profile page on one of
 * the supported sites.
 *
 * - ycombinator.com: path is exactly `/companies/<slug>` (optional trailing slash)
 * - crunchbase.com: path starts with `/organization/`
 * - tracxn.com: path starts with `/d/companies/`
 * - dealroom.co: the URL text contains `/companies/`
 *
 * Any other host, or text that is not a URL, yields false.
 */
function isCompanyProfileUrl(value: string): boolean {
  const host = hostOf(value);
  // hostOf returns "" for unparseable input, so a non-empty host means the URL parses.
  if (!host) return false;
  const path = new URL(value).pathname;
  switch (host) {
    case "ycombinator.com":
      return /^\/companies\/[^/]+\/?$/.test(path);
    case "crunchbase.com":
      return path.startsWith("/organization/");
    case "tracxn.com":
      return path.startsWith("/d/companies/");
    case "dealroom.co":
      return value.includes("/companies/");
    default:
      return false;
  }
}

/**
 * Classifies a URL by source: YC directory URLs map to "yc_directory";
 * every other URL is treated as "funding_database".
 */
function inferredSourceType(value: string): SourceType {
  if (isYcUrl(value)) return "yc_directory";
  return "funding_database";
}

/**
 * Shortens a YC batch label such as "Summer 2024" to a code such as "S24".
 *
 * The literal "Y Combinator Logo" is removed first. When the remaining text
 * has at least two space-separated words and the first starts with a known
 * season, the result is the season prefix (S, W, Sp, F) followed by the last
 * two characters of the second word. Otherwise the remaining text is
 * returned unchanged.
 *
 * @param value Raw batch text from the page.
 * @returns The batch code, the cleaned text, or null when unavailable.
 */
function normalizeYcBatch(value: string | null): string | null {
  const text = nullIfUnavailable(value);
  if (!text) return null;

  const compact = text.replace("Y Combinator Logo", "").trim();
  const [seasonWord, yearWord] = compact.split(" ");
  if (yearWord === undefined) return compact;

  const seasonCodes: Array<[string, string]> = [
    ["summer", "S"],
    ["winter", "W"],
    ["spring", "Sp"],
    ["fall", "F"],
  ];
  const season = seasonWord.toLowerCase();
  const matched = seasonCodes.find(([prefix]) => season.startsWith(prefix));
  return matched ? `${matched[1]}${yearWord.slice(-2)}` : compact;
}

/**
 * Converts a human-readable money string into a rounded number.
 *
 * The first number in the text is used. Thousands separators are accepted
 * ("$1,500,000"), and an optional magnitude may follow the number, either
 * abbreviated (K, M, MM, Mn, Mil, B, Bn, T, Tn) or spelled out (thousand,
 * million, billion, trillion). A magnitude only counts when it is not the
 * start of a longer word, so "$10 to $20M" is read as 10 and "$5 thousand"
 * as 5,000.
 *
 * @param value Amount text as shown on the page.
 * @returns The amount, or null when the text is unavailable or has no number.
 */
function normalizeAmount(value: string | null): number | null {
  const text = nullIfUnavailable(value);
  if (!text) return null;

  // Group 1: digits with optional commas and decimals. Group 2: a magnitude
  // that must not be followed by another letter.
  const match = text.match(
    /([0-9][0-9,]*(?:\.[0-9]+)?)(?:\s*(thousands?|millions?|billions?|trillions?|mil|mn|mm|bn|tn|[kmbt])(?![a-z]))?/i
  );
  if (!match) return null;

  const amount = Number(match[1].replace(/,/g, ""));
  if (!Number.isFinite(amount)) return null;

  const unit = (match[2] || "").toLowerCase();
  let multiplier = 1;
  // "thousand" must be tested before the generic "t" (trillion) prefix.
  if (unit === "k" || unit.startsWith("th")) multiplier = 1_000;
  else if (unit.startsWith("m")) multiplier = 1_000_000;
  else if (unit.startsWith("b")) multiplier = 1_000_000_000;
  else if (unit.startsWith("t")) multiplier = 1_000_000_000_000;
  return Math.round(amount * multiplier);
}

/**
 * Builds the key used to merge duplicate rows: `<sourceType>:<slug>`.
 *
 * The slug is the company name (or the page URL when the name is missing),
 * lower-cased, with each run of characters outside a-z and 0-9 replaced by a
 * single "-" and no leading or trailing "-". An empty slug becomes "unknown".
 */
function trackingKey(sourceType: SourceType, name: string | null, sourcePageUrl: string): string {
  const label = cleanText(name || sourcePageUrl).toLowerCase();
  const slug = label
    .split(/[^a-z0-9]+/)
    .filter((token) => token.length > 0)
    .join("-");
  return `${sourceType}:${slug || "unknown"}`;
}

/**
 * Collects the text of markdown links whose target mentions `pathPart`.
 *
 * The markdown is split on "[" and only the first 300 segments are
 * inspected. A segment counts when it contains "](" and the text after that
 * marker includes `pathPart`.
 *
 * @param markdown Page markdown.
 * @param pathPart Substring to look for after the link's "](".
 * @returns Unique, markdown-stripped link texts in first-seen order.
 */
function extractLinkTextsByPath(markdown: string, pathPart: string): string[] {
  const texts = markdown
    .split("[")
    .slice(0, 300)
    .flatMap((segment) => {
      const textEnd = segment.indexOf("]");
      const targetStart = segment.indexOf("](");
      if (textEnd < 0 || targetStart < 0) return [];
      const afterMarker = segment.slice(targetStart + 2);
      return afterMarker.includes(pathPart) ? [stripMarkdown(segment.slice(0, textEnd))] : [];
    });
  return unique(texts);
}

/**
 * Picks the first outbound link that could be the company's own website.
 *
 * Only the first 100 links are inspected. A link is skipped when it is not
 * an absolute http(s) URL, or when its host is a known directory, social or
 * funding-database host or any subdomain of one (for example
 * news.crunchbase.com or account.ycombinator.com).
 *
 * @param doc Scraped document whose `links` are examined.
 * @returns The first acceptable link, or null when there is none.
 */
function pickExternalCompanyUrl(doc: ScrapeDoc): string | null {
  const blockedHosts = [
    "ycombinator.com",
    "bookface-images.s3.amazonaws.com",
    "linkedin.com",
    "twitter.com",
    "x.com",
    "github.com",
    "crunchbase.com",
    "tracxn.com",
    // Also a funding database per isFundingUrl, so never the company's own site.
    "dealroom.co",
  ];
  // Match the blocked domain itself and every subdomain of it.
  const isBlocked = (host: string): boolean =>
    blockedHosts.some((blocked) => host === blocked || host.endsWith(`.${blocked}`));

  for (const link of (doc.links || []).slice(0, 100)) {
    if (!link.startsWith("http://") && !link.startsWith("https://")) continue;
    const host = hostOf(link);
    if (!host || isBlocked(host)) continue;
    return link;
  }
  return null;
}

/**
 * Splits markdown into plain-text lines: each line is passed through
 * `stripMarkdown` and lines that end up empty are dropped.
 */
function linesOf(markdown: string): string[] {
  const kept: string[] = [];
  for (const rawLine of markdown.split("\n")) {
    const line = stripMarkdown(rawLine);
    if (line.length > 0) kept.push(line);
  }
  return kept;
}

/**
 * Builds a company row from a scraped YC directory profile page.
 *
 * Funding fields are always empty/null for YC rows. Returns null only when
 * a --batch-filter is set, a batch was detected, and the two differ
 * (case-insensitive); a page with no detectable batch is kept.
 */
function parseYcCompany(doc: ScrapeDoc): CompanyRow | null {
  const url = sourceUrl(doc);
  const markdown = doc.markdown || "";
  const lines = linesOf(markdown);
  const title = cleanText(doc.metadata?.title || doc.metadata?.ogTitle || "");
  // Name = title text before the first ":"; otherwise the title minus the site suffix.
  const nameFromTitle = title.includes(":") ? title.split(":")[0] : title.replace(" | Y Combinator", "");
  let companyName = nullIfUnavailable(nameFromTitle);
  if (!companyName) {
    // Fallback: the line after the first line containing "›". When no such
    // line exists, findIndex returns -1 and this reads the first line.
    const markerIndex = lines.findIndex((line) => line.includes("›"));
    companyName = nullIfUnavailable(lines[Math.min(markerIndex + 1, lines.length - 1)]);
  }

  const batchLine = lines.find((line) => line.includes("Y Combinator Logo")) || lines.find((line) => line.startsWith("Batch:"));
  // batchLine may be undefined here; normalizeYcBatch treats that like null.
  const batch = normalizeYcBatch(batchLine?.replace("Batch:", ""));
  if (batchFilter && batch && batch.toLowerCase() !== batchFilter.toLowerCase()) return null;

  // NOTE(review): `lines` comes from linesOf, which runs stripMarkdown and so
  // removes leading "#" markers. A line can therefore start with "### " only
  // when the heading was not at the very start of the raw line (e.g. leading
  // whitespace), so in practice the description appears to come from the
  // metadata fallback below. Confirm whether heading-based extraction is
  // still wanted.
  const descriptionHeadingIndex = lines.findIndex((line) => line.startsWith("### "));
  const descriptionParts: string[] = [];
  if (descriptionHeadingIndex >= 0) {
    // Take at most six lines starting at the heading, stopping at the next known section.
    const maxDescriptionLines = Math.min(lines.length, descriptionHeadingIndex + 6);
    for (let index = descriptionHeadingIndex; index < maxDescriptionLines; index += 1) {
      const line = lines[index].replace(/^###\s*/, "");
      if (line === "Active Founders" || line === "YC Photos") break;
      descriptionParts.push(line);
    }
  }
  const fallbackDescription = doc.metadata?.description || doc.metadata?.ogDescription || null;
  const description = nullIfUnavailable(descriptionParts.join(" ")) || nullIfUnavailable(fallbackDescription);

  const industryTags = extractLinkTextsByPath(markdown, "/companies/industry/");
  const locationLine = lines.find((line) => line.startsWith("Location:"));
  // Prefer an explicit "Location:" line, then the first location link text.
  const hqLocation = nullIfUnavailable(locationLine?.replace("Location:", "")) || extractLinkTextsByPath(markdown, "/companies/location/")[0] || null;
  const founders = includeFounders ? extractYcFounders(lines) : [];
  const companyUrl = pickExternalCompanyUrl(doc);
  const notes = "YC directory profile; funding fields are null unless visible on the source page.";

  return {
    tracking_key: trackingKey("yc_directory", companyName, url),
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    source_type: "yc_directory",
    company_name: companyName,
    company_url: companyUrl,
    source_page_url: url,
    secondary_source_urls: [],
    batch,
    founders,
    company_description: description,
    industry_tags: industryTags,
    region: requestedRegion || "global",
    hq_location: hqLocation,
    funding_rounds: [],
    latest_round_type: null,
    latest_round_date: null,
    latest_amount_raised_text: null,
    latest_amount_raised_normalized: null,
    investors: [],
    confidence: companyName ? "high" : "medium",
    notes,
  };
}

/**
 * Extracts founder names from the "Active Founders" section of a YC page.
 *
 * The scan starts after the "Active Founders" line and stops at the first
 * later "YC Photos" or "Hear from the founders" line, and never goes beyond
 * 80 lines past the section start. A line is kept as a name when one of the
 * next three lines (within the scan window) mentions "Founder", the line
 * itself is not a UI label, it contains a space, and it is at most 60
 * characters long.
 *
 * @param lines Plain-text page lines.
 * @returns Unique founder names, or [] when the section is missing.
 */
function extractYcFounders(lines: string[]): string[] {
  const start = lines.indexOf("Active Founders");
  if (start < 0) return [];

  const sectionEnds = [
    lines.indexOf("YC Photos", start + 1),
    lines.indexOf("Hear from the founders", start + 1),
  ].filter((at) => at > start);
  const limit = Math.min(lines.length, start + 80, ...sectionEnds);

  const names: string[] = [];
  for (let at = start + 1; at < limit; at += 1) {
    const candidate = lines[at];
    const followedByRole = lines
      .slice(at + 1, Math.min(at + 4, limit))
      .join(" ")
      .includes("Founder");
    const isUiLabel =
      candidate === "Founder" || candidate.includes("profile") || candidate.includes("account");
    const looksLikeName = candidate.length <= 60 && candidate.includes(" ");
    if (followedByRole && !isUiLabel && looksLikeName) names.push(candidate);
  }
  return unique(names);
}

/**
 * Builds a company row from a scraped funding-database profile page.
 *
 * Returns null only when a --stage-filter is set, at least one round was
 * parsed, and none of the rounds matches the filter. The first parsed round
 * supplies the `latest_*` fields.
 */
function parseFundingCompany(doc: ScrapeDoc): CompanyRow | null {
  const pageUrl = sourceUrl(doc);
  const markdown = doc.markdown || "";
  const lines = linesOf(markdown);
  const meta = doc.metadata;
  const title = cleanText(meta?.title || meta?.ogTitle || "");

  // Name: strip the known title suffixes; when that changes nothing (or
  // leaves nothing), use the line after "Overview" if there is one.
  let rawName = title
    .replace(" - Crunchbase Company Profile & Funding", "")
    .replace(/ - 20[0-9]{2}.*$/, "");
  if (!rawName || rawName === title) {
    const overviewAt = lines.indexOf("Overview");
    if (overviewAt >= 0) rawName = lines[overviewAt + 1] || rawName;
  }
  const companyName = nullIfUnavailable(rawName) || nullIfUnavailable(lines[0]) || null;

  const rounds = extractFundingRounds(markdown, pageUrl);
  if (stageFilter && rounds.length > 0) {
    const anyStageMatch = rounds.some((round) => stageMatches(round.round_type, stageFilter));
    if (!anyStageMatch) return null;
  }
  const latestRound = rounds[0] || null;

  // Investors from round rows plus the FAQ section, minus the company itself.
  let investors: string[] = [];
  if (includeInvestors) {
    const ownName = (companyName || "").toLowerCase();
    const fromRounds = rounds.flatMap((round) => round.investors);
    investors = unique(fromRounds.concat(extractFaqInvestors(markdown))).filter(
      (name) => name.toLowerCase() !== ownName
    );
  }

  const industryTags = extractLinkTextsByPath(markdown, "/categories/").concat(extractTracxnTags(lines));
  const companyUrl = pickExternalCompanyUrl(doc);
  const description =
    nullIfUnavailable(meta?.description || meta?.ogDescription) ||
    extractFundingDescription(lines, companyName);
  const hqLocation = extractHeadquarters(lines);

  const hasObfuscatedRound = rounds.some((round) => round.raw_text?.toLowerCase().includes("obfuscated"));
  const notes = [
    `Funding database profile discovered for ${query || "provided seed URL"}.`,
    `Funding date range used for discovery: ${fundingDateRange}.`,
    ...(hasObfuscatedRound
      ? ["Some funding fields were obfuscated by the source and are returned as null with raw context preserved."]
      : []),
  ].join(" ");

  return {
    tracking_key: trackingKey("funding_database", companyName, pageUrl),
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    source_type: "funding_database",
    company_name: companyName,
    company_url: companyUrl,
    source_page_url: pageUrl,
    secondary_source_urls: [],
    batch: null,
    founders: [],
    company_description: description,
    industry_tags: unique(industryTags),
    region: requestedRegion || "global",
    hq_location: hqLocation,
    funding_rounds: rounds,
    latest_round_type: latestRound?.round_type || null,
    latest_round_date: latestRound?.announced_date || null,
    latest_amount_raised_text: latestRound?.amount_raised_text || null,
    latest_amount_raised_normalized: latestRound?.amount_raised_normalized || null,
    investors,
    confidence: companyName && rounds.length > 0 ? "medium" : "low",
    notes,
  };
}

/**
 * Tells whether a funding round's type satisfies the --stage-filter.
 *
 * Both sides are lower-cased and stripped to a-z/0-9 before comparison, so
 * "series_a" matches "Series A". A filter entry matches when it occurs in
 * the round type, except that a seed filter does not match the "seed"
 * inside "pre-seed": asking for `seed` no longer returns pre-seed rounds,
 * while `pre-seed` still does.
 *
 * @param roundType Round type text, or null when unknown.
 * @param filter Comma-separated stage names.
 * @returns true when the round type is unknown or any filter entry matches.
 */
function stageMatches(roundType: string | null, filter: string): boolean {
  // Unknown round types are never filtered out.
  if (!roundType) return true;

  const squash = (text: string): string => text.toLowerCase().replace(/[^a-z0-9]/g, "");
  const normalizedRound = squash(roundType);
  const wantedStages = filter
    .split(",")
    .map(squash)
    .filter((stage) => stage.length > 0);

  return wantedStages.some((wanted) => {
    // Check every occurrence, so "Pre Seed / Seed" still matches `seed`.
    for (
      let at = normalizedRound.indexOf(wanted);
      at >= 0;
      at = normalizedRound.indexOf(wanted, at + 1)
    ) {
      const insidePreSeed = wanted.startsWith("seed") && normalizedRound.slice(0, at).endsWith("pre");
      if (!insidePreSeed) return true;
    }
    return false;
  });
}

/**
 * Parses funding rounds from markdown table rows.
 *
 * Only the first 80 lines that start with "|" are considered. A row is used
 * when it mentions a round keyword, is not a header or divider row, and has
 * at least five cells. Cells are read positionally: 0 = announced date,
 * 1 = transaction name, 3 = money raised, 4 = lead investors, 5 = funding
 * type (falling back to the transaction name). Parsing stops once
 * MAX_ROUNDS_PER_COMPANY rows have been collected; duplicates are then removed.
 *
 * @param markdown Page markdown.
 * @param url Page URL recorded on every round.
 * @returns Parsed rounds in table order.
 */
function extractFundingRounds(markdown: string, url: string): FundingRound[] {
  const roundKeywords = ["Funding Type", "Pre Seed Round", "Seed Round", "Series"];
  const skipMarkers = ["---", "Announced Date"];
  const tableLines = markdown
    .split("\n")
    .filter((line) => line.trim().startsWith("|"))
    .slice(0, 80);

  const rounds: FundingRound[] = [];
  for (const line of tableLines) {
    if (rounds.length >= MAX_ROUNDS_PER_COMPANY) break;

    const mentionsRound = roundKeywords.some((keyword) => line.includes(keyword));
    const isHeaderOrDivider = skipMarkers.some((marker) => line.includes(marker));
    if (!mentionsRound || isHeaderOrDivider) continue;

    const cells = line
      .split("|")
      .slice(1, -1)
      .map((cell) => stripMarkdown(cell));
    if (cells.length < 5) continue;

    const [dateCell, transactionCell, , amountCell, investorCell, typeCell] = cells;
    const rawTransaction = nullIfUnavailable(transactionCell);
    const moneyRaised = nullIfUnavailable(amountCell);
    rounds.push({
      round_type: nullIfUnavailable(typeCell || rawTransaction),
      announced_date: nullIfUnavailable(dateCell),
      amount_raised_text: moneyRaised,
      amount_raised_normalized: normalizeAmount(moneyRaised),
      investors: includeInvestors
        ? unique(investorCell.split(",").map((part) => stripMarkdown(part)))
        : [],
      source_url: url,
      raw_text: stripMarkdown(line) || null,
    });
  }
  return dedupeRounds(rounds);
}

/**
 * Removes duplicate funding rounds, keeping the first of each.
 *
 * Two rounds are duplicates when their round type, announced date, amount
 * text and comma-joined investor list are all equal (null counts as "").
 */
function dedupeRounds(rounds: FundingRound[]): FundingRound[] {
  const firstByFingerprint = new Map<string, FundingRound>();
  for (const round of rounds) {
    const fingerprint = [
      round.round_type || "",
      round.announced_date || "",
      round.amount_raised_text || "",
      round.investors.join(","),
    ].join("|");
    if (!firstByFingerprint.has(fingerprint)) firstByFingerprint.set(fingerprint, round);
  }
  return Array.from(firstByFingerprint.values());
}

/**
 * Reads investor names from the "Who invested in" FAQ answer.
 *
 * Looks at the 700 characters starting at the first "Who invested in" and
 * collects link texts pointing at `/person/` and then `/organization/`
 * paths, dropping any name that contains "crunchbase".
 *
 * @param markdown Page markdown.
 * @returns Unique investor names, or [] when the FAQ marker is absent.
 */
function extractFaqInvestors(markdown: string): string[] {
  const faqAt = markdown.indexOf("Who invested in");
  if (faqAt === -1) return [];

  const faqWindow = markdown.slice(faqAt, faqAt + 700);
  const linkedNames = ["/person/", "/organization/"].flatMap((pathPart) =>
    extractLinkTextsByPath(faqWindow, pathPart)
  );
  return unique(linkedNames).filter((name) => !name.toLowerCase().includes("crunchbase"));
}

/**
 * Finds a one-line company description among the first 120 lines.
 *
 * A line qualifies when it starts with "<companyName> is ",
 * "<companyName> offers " or "<companyName> develops ".
 *
 * @param lines Plain-text page lines.
 * @param companyName Company name, or null when unknown.
 * @returns The first qualifying line, or null.
 */
function extractFundingDescription(lines: string[], companyName: string | null): string | null {
  if (!companyName) return null;
  const openers = ["is", "offers", "develops"].map((verb) => `${companyName} ${verb} `);
  const hit = lines
    .slice(0, 120)
    .find((line) => openers.some((opener) => line.startsWith(opener)));
  return hit ?? null;
}

/**
 * Finds the headquarters location among the first 180 lines.
 *
 * The first line that either contains " is located in " (the text after
 * the phrase is used, minus a trailing period) or starts with
 * "Headquarters" (the following line is used) decides the result.
 *
 * @param lines Plain-text page lines.
 * @returns The location, or null when none is found or it is unavailable.
 */
function extractHeadquarters(lines: string[]): string | null {
  const scanned = Math.min(lines.length, 180);
  for (let at = 0; at < scanned; at += 1) {
    const line = lines[at];
    const pieces = line.split(" is located in ");
    if (pieces.length > 1) {
      return nullIfUnavailable(pieces[1].replace(/\.$/, ""));
    }
    if (line.startsWith("Headquarters")) {
      return nullIfUnavailable(lines[at + 1]);
    }
  }
  return null;
}

/**
 * Picks up to five lines, from the first 120, that mention one of a fixed
 * set of sector keywords ("API &", "Developer Tools", "FinTech", "SaaS").
 *
 * @param lines Plain-text page lines.
 * @returns The matching lines in page order.
 */
function extractTracxnTags(lines: string[]): string[] {
  const sectorMarkers = ["API &", "Developer Tools", "FinTech", "SaaS"];
  return lines
    .slice(0, 120)
    .filter((line) => sectorMarkers.some((marker) => line.includes(marker)))
    .slice(0, 5);
}

/**
 * Builds the candidate company-profile URL list from discovery results.
 *
 * Seed URLs come first, as given. Then, for each of the first
 * MAX_DISCOVERY_URLS documents, the document's own URL is added when it is a
 * profile URL, followed by every profile URL among its first 200 links with
 * the query string removed. The list is de-duplicated and capped at
 * MAX_CANDIDATE_URLS.
 */
function candidateUrlsFromDocs(docs: ScrapeDoc[]): string[] {
  const urls: string[] = [...seedUrls];
  for (const doc of docs.slice(0, MAX_DISCOVERY_URLS)) {
    const docUrl = sourceUrl(doc);
    if (docUrl && isCompanyProfileUrl(docUrl)) urls.push(docUrl);

    for (const link of (doc.links || []).slice(0, 200)) {
      if (isCompanyProfileUrl(link)) urls.push(link.split("?")[0]);
    }
  }
  return unique(urls).slice(0, MAX_CANDIDATE_URLS);
}

/**
 * Discovers candidate company-profile URLs.
 *
 * Starts from the seed URLs, adds search results for each requested source
 * type when a query is given, scrapes up to MAX_DISCOVERY_URLS of those pages
 * for markdown and links, and returns the profile URLs found in them.
 * Returns [] without scraping when there is nothing to scrape.
 */
async function discoverUrls(): Promise<string[]> {
  const discoveryUrls: string[] = seedUrls.slice(0, MAX_SEED_URLS);

  if (query && sourceTypes.includes("yc_directory")) {
    // Empty optional parts (e.g. no batch filter) are dropped from the search phrase.
    const ycQuery = ["site:ycombinator.com/companies", query, batchFilter, "Y Combinator company"].filter(Boolean).join(" ");
    // Between 4 and 6 results, scaled by the requested company count.
    const ycLimit = Math.min(6, Math.max(4, maxCompanies + 1));
    const ycResults = await firecrawl.v1.search(ycQuery, { limit: ycLimit, integration: "prometheus" });
    for (const item of (ycResults.data || []).slice(0, ycLimit)) {
      if (item.url && isYcUrl(item.url)) discoveryUrls.push(item.url);
    }
  }

  if (query && sourceTypes.includes("funding_database")) {
    const fundingLimit = Math.min(6, Math.max(4, maxCompanies + 1));
    const crunchbaseQuery = ["site:crunchbase.com/organization", query, fundingDateRange, stageFilter, "funding startup"].filter(Boolean).join(" ");
    const tracxnQuery = ["site:tracxn.com/d/companies", query, fundingDateRange, stageFilter, "funding investors startup"].filter(Boolean).join(" ");
    // The two searches run one after the other, Crunchbase first.
    const crunchbaseResults = await firecrawl.v1.search(crunchbaseQuery, { limit: fundingLimit, integration: "prometheus" });
    const tracxnResults = await firecrawl.v1.search(tracxnQuery, { limit: fundingLimit, integration: "prometheus" });
    for (const item of (crunchbaseResults.data || []).slice(0, fundingLimit)) {
      if (item.url && isFundingUrl(item.url)) discoveryUrls.push(item.url);
    }
    for (const item of (tracxnResults.data || []).slice(0, fundingLimit)) {
      if (item.url && isFundingUrl(item.url)) discoveryUrls.push(item.url);
    }
  }

  // Seeds sit at the front of the list, so they take priority within the cap.
  // Up to MAX_SEED_URLS seeds are accepted but only MAX_DISCOVERY_URLS pages
  // are scraped here; candidateUrlsFromDocs still lists every seed URL.
  const boundedDiscoveryUrls = unique(discoveryUrls).slice(0, MAX_DISCOVERY_URLS);
  if (boundedDiscoveryUrls.length === 0) return [];
  const discoveryScrape = await firecrawl.v1.batchScrapeUrls(boundedDiscoveryUrls, {
    formats: ["markdown", "links"],
    integration: "prometheus",
  });
  return candidateUrlsFromDocs((discoveryScrape.data || []) as ScrapeDoc[]);
}

/**
 * Orders candidate URLs for the final scrape.
 *
 * Seed URLs that are company profiles go first. The remaining URLs are
 * split by inferred source type and interleaved (YC, funding, YC, ...),
 * including only the requested source types, until the lists are exhausted
 * or MAX_FINAL_SCRAPE_URLS entries have been collected.
 *
 * @param urls Candidate URLs from discovery.
 * @returns The ordered, de-duplicated URL list.
 */
function prioritizeCandidateUrls(urls: string[]): string[] {
  const seeds = unique(seedUrls.filter((url) => isCompanyProfileUrl(url)));

  const yc: string[] = [];
  const funding: string[] = [];
  for (const url of urls) {
    if (seeds.includes(url)) continue;
    (inferredSourceType(url) === "yc_directory" ? yc : funding).push(url);
  }

  const wantYc = sourceTypes.includes("yc_directory");
  const wantFunding = sourceTypes.includes("funding_database");
  const ordered: string[] = seeds.slice(0, MAX_SEED_URLS);
  const rounds = Math.min(Math.max(yc.length, funding.length), MAX_FINAL_SCRAPE_URLS);
  for (let turn = 0; turn < rounds; turn += 1) {
    const nextYc = yc[turn];
    const nextFunding = funding[turn];
    if (wantYc && nextYc) ordered.push(nextYc);
    if (wantFunding && nextFunding) ordered.push(nextFunding);
    if (ordered.length >= MAX_FINAL_SCRAPE_URLS) break;
  }
  return unique(ordered);
}

/**
 * Merges rows that share a tracking key and caps the result at maxCompanies.
 *
 * Rows without a company name or source page URL are dropped. The first row
 * seen for a key is kept (and updated in place); later rows contribute:
 * - their page URL and secondary URLs, excluding the kept row's own page URL;
 * - funding rounds, investors, founders and industry tags (de-duplicated);
 * - the `latest_*` summary fields, when the kept row had no rounds of its own
 *   and the later row does, so the summary stays consistent with
 *   `funding_rounds`;
 * - company URL, description, HQ location and batch, only where the kept row
 *   has null;
 * - a higher confidence level, when the later row has one.
 *
 * @param rows Parsed rows in scrape order.
 * @returns Merged rows in first-seen order, at most maxCompanies of them.
 */
function mergeRows(rows: CompanyRow[]): CompanyRow[] {
  const confidenceRank = { low: 0, medium: 1, high: 2 };
  const byKey = new Map<string, CompanyRow>();
  for (const row of rows) {
    if (!row.company_name || !row.source_page_url) continue;
    const existing = byKey.get(row.tracking_key);
    if (!existing) {
      byKey.set(row.tracking_key, row);
      continue;
    }

    // Captured before merging, so it reflects what the kept row knew on its own.
    const lackedRounds = existing.funding_rounds.length === 0;

    existing.secondary_source_urls = unique(
      existing.secondary_source_urls.concat(row.source_page_url, row.secondary_source_urls)
    ).filter((url) => url !== existing.source_page_url);
    existing.funding_rounds = dedupeRounds(existing.funding_rounds.concat(row.funding_rounds));
    existing.investors = unique(existing.investors.concat(row.investors));
    existing.founders = unique(existing.founders.concat(row.founders));
    existing.industry_tags = unique(existing.industry_tags.concat(row.industry_tags));

    if (lackedRounds && row.funding_rounds.length > 0) {
      existing.latest_round_type = row.latest_round_type;
      existing.latest_round_date = row.latest_round_date;
      existing.latest_amount_raised_text = row.latest_amount_raised_text;
      existing.latest_amount_raised_normalized = row.latest_amount_raised_normalized;
    }

    // Fill gaps only; values the kept row already has are never overwritten.
    existing.company_url = existing.company_url ?? row.company_url;
    existing.company_description = existing.company_description ?? row.company_description;
    existing.hq_location = existing.hq_location ?? row.hq_location;
    existing.batch = existing.batch ?? row.batch;

    if (confidenceRank[row.confidence] > confidenceRank[existing.confidence]) {
      existing.confidence = row.confidence;
    }
  }
  return Array.from(byKey.values()).slice(0, maxCompanies);
}

/**
 * Runs the collector end to end: discover candidate URLs, scrape the
 * selected profile pages, parse and merge company rows, and write one JSON
 * document to stdout in the requested output shape.
 *
 * Throws when no candidate URLs are found, when none remain after
 * source-type filtering, or when no rows could be parsed.
 */
async function main() {
  const candidateUrls = await discoverUrls();
  if (candidateUrls.length === 0) {
    throw new Error("no candidate company URLs found from the provided sources");
  }

  // Scrape slightly more pages than requested to leave room for filtered or merged rows.
  const scrapeBudget = Math.min(MAX_FINAL_SCRAPE_URLS, maxCompanies + sourceTypes.length + 2);
  const finalUrls = prioritizeCandidateUrls(candidateUrls)
    .filter((url) => sourceTypes.includes(inferredSourceType(url)))
    .slice(0, scrapeBudget);
  if (finalUrls.length === 0) {
    throw new Error("no in-scope company URLs found after filtering source types");
  }

  const finalScrape = await firecrawl.v1.batchScrapeUrls(finalUrls, {
    formats: ["markdown", "links"],
    integration: "prometheus",
  });
  const docs = ((finalScrape.data || []) as ScrapeDoc[]).slice(0, MAX_FINAL_SCRAPE_URLS);

  const rowLimit = maxCompanies * 2;
  const rows: CompanyRow[] = [];
  for (const doc of docs) {
    const parse = inferredSourceType(sourceUrl(doc)) === "yc_directory" ? parseYcCompany : parseFundingCompany;
    const parsed = parse(doc);
    if (parsed) rows.push(parsed);
    if (rows.length >= rowLimit) break;
  }

  const companies = mergeRows(rows).slice(0, maxCompanies);
  if (companies.length === 0) {
    throw new Error("no company rows could be parsed from scraped pages");
  }

  const metadata = {
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    query: query || null,
    seed_urls: seedUrls,
    source_types: sourceTypes,
    batch_filter: batchFilter || null,
    funding_date_range: fundingDateRange,
    stage_filter: stageFilter || null,
    region: requestedRegion,
    include_founders: includeFounders,
    include_investors: includeInvestors,
    sort_hint: sortHint,
    max_companies: maxCompanies,
    result_count: companies.length,
  };

  let out: unknown;
  if (outputMode === "grouped_by_source") {
    const sources = sourceTypes.map((sourceType) => ({
      source_type: sourceType,
      companies: companies.filter((company) => company.source_type === sourceType),
    }));
    out = { metadata, sources };
  } else {
    out = { metadata, companies };
  }

  process.stdout.write(JSON.stringify(out));
}

// Entry point: run the collector; on any rejection, print the error to
// stderr and exit with status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
deploy to unlock
Deploy this collector to unlock schedules, the API endpoint, and destinations.