Startup Funding And YC Batch Tracker
v1 · Published — Tracks YC directory companies and publicly visible funding-database company profiles for a thesis query, with normalized company fields, funding rounds, source URLs, and snapshot metadata.
Output & API
Preview the latest data, download it, or call this collector as an API.
| metadata | |
|---|---|
| companies | |
Marketplace
Publish this collector so others can deploy it — you keep ownership.
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";
// Abort immediately when the Firecrawl credential is absent; the client below is built from it.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
console.error("FIRECRAWL_API_KEY is not set");
process.exit(1);
}
const firecrawl = new Firecrawl({ apiKey });
// CLI flags. Every flag is declared as a string; numeric and boolean flags are parsed further down.
const { values: flags } = parseArgs({
strict: true,
options: {
"source-types": { type: "string" },
query: { type: "string" },
"seed-urls": { type: "string" },
"max-companies": { type: "string" },
"output-mode": { type: "string" },
"batch-filter": { type: "string" },
"funding-date-range": { type: "string" },
"stage-filter": { type: "string" },
region: { type: "string" },
"include-founders": { type: "string" },
"include-investors": { type: "string" },
"snapshot-label": { type: "string" },
"sort-hint": { type: "string" },
},
});
// Largest value accepted for --max-companies.
const MAX_COMPANIES_CAP = 25;
// Largest number of entries accepted in --seed-urls.
const MAX_SEED_URLS = 20;
// Cap on discovery pages scraped, and on discovery documents inspected for links.
const MAX_DISCOVERY_URLS = 12;
// Cap on company pages considered for the final scrape.
const MAX_FINAL_SCRAPE_URLS = 50;
// Cap on the de-duplicated candidate URL list returned by discovery.
const MAX_CANDIDATE_URLS = 2000;
// Cap on funding rounds kept per company page.
const MAX_ROUNDS_PER_COMPANY = 5;
/**
 * Returns the trimmed value of a mandatory CLI flag.
 *
 * When the flag is missing or blank, prints `--<name> is required` to stderr
 * and terminates the process with exit code 1.
 *
 * @param name Flag name without the leading dashes.
 * @returns The flag value with surrounding whitespace removed.
 */
function requiredFlag(name: string): string {
  // `flags` has a closed set of keys; view it as a plain record so an arbitrary
  // name can index it without an implicit `any`, then narrow at runtime.
  const raw = (flags as Record<string, unknown>)[name];
  const value = typeof raw === "string" ? raw.trim() : "";
  if (!value) {
    console.error(`--${name} is required`);
    process.exit(1);
  }
  return value;
}
// Read and normalize flag values. requiredFlag exits the process when a mandatory flag is missing;
// optional flags fall back to the defaults shown here only when the flag was not supplied (`??`).
const sourceTypes = splitCsv(requiredFlag("source-types"));
const query = cleanText((flags.query as string | undefined) ?? "");
const seedUrls = splitCsv((flags["seed-urls"] as string | undefined) ?? "");
const maxCompaniesRaw = requiredFlag("max-companies");
const outputMode = requiredFlag("output-mode");
const batchFilter = cleanText((flags["batch-filter"] as string | undefined) ?? "");
const fundingDateRange = cleanText((flags["funding-date-range"] as string | undefined) ?? "last 90 days");
const stageFilter = cleanText((flags["stage-filter"] as string | undefined) ?? "");
const requestedRegion = cleanText((flags.region as string | undefined) ?? "global");
const includeFounders = parseBool((flags["include-founders"] as string | undefined) ?? "true", "include-founders");
const includeInvestors = parseBool((flags["include-investors"] as string | undefined) ?? "true", "include-investors");
// Uses `||`, so an empty --snapshot-label also falls back to the current ISO timestamp.
const snapshotLabel = cleanText((flags["snapshot-label"] as string | undefined) || new Date().toISOString());
const sortHint = cleanText((flags["sort-hint"] as string | undefined) ?? "newest");
// Timestamp stamped on every row and on the output metadata.
const collectedAt = new Date().toISOString();
const maxCompanies = Number(maxCompaniesRaw);
// Validation: each failure throws an Error whose message starts with "OUT_OF_SCOPE:".
if (!Number.isInteger(maxCompanies) || maxCompanies < 1 || maxCompanies > MAX_COMPANIES_CAP) {
throw new Error(`OUT_OF_SCOPE: --max-companies must be an integer from 1 to ${MAX_COMPANIES_CAP}`);
}
if (sourceTypes.length === 0) {
throw new Error("OUT_OF_SCOPE: --source-types must include yc_directory and/or funding_database");
}
for (const sourceType of sourceTypes) {
if (sourceType !== "yc_directory" && sourceType !== "funding_database") {
throw new Error("OUT_OF_SCOPE: --source-types values must be yc_directory or funding_database");
}
}
// At least one way of finding companies is required: a search query or explicit seed URLs.
if (!query && seedUrls.length === 0) {
throw new Error("OUT_OF_SCOPE: provide --query or --seed-urls");
}
if (outputMode !== "company_rows" && outputMode !== "grouped_by_source") {
throw new Error("OUT_OF_SCOPE: --output-mode must be company_rows or grouped_by_source");
}
if (seedUrls.length > MAX_SEED_URLS) {
throw new Error(`OUT_OF_SCOPE: --seed-urls supports at most ${MAX_SEED_URLS} URLs per run`);
}
for (const seedUrl of seedUrls) {
validateHttpUrl(seedUrl);
}
/** The two kinds of source page this script handles. */
type SourceType = "yc_directory" | "funding_database";
/** One funding round parsed from a markdown table row on a funding-database page. */
type FundingRound = {
// Funding-type cell when the row has one, otherwise the transaction cell.
round_type: string | null;
announced_date: string | null;
// Amount as displayed by the source.
amount_raised_text: string | null;
// Numeric value derived from amount_raised_text by normalizeAmount.
amount_raised_normalized: number | null;
investors: string[];
// URL of the page the row was read from.
source_url: string;
// The table row with markdown stripped.
raw_text: string | null;
};
/** One output row per company and source type. */
type CompanyRow = {
// `<source_type>:<slug>` built by trackingKey; mergeRows uses it as the de-duplication key.
tracking_key: string;
snapshot_label: string;
collected_at: string;
source_type: SourceType;
company_name: string | null;
company_url: string | null;
source_page_url: string;
// URLs of additional pages folded into this row by mergeRows.
secondary_source_urls: string[];
// Set by parseYcCompany; parseFundingCompany always emits null.
batch: string | null;
founders: string[];
company_description: string | null;
industry_tags: string[];
// Echoes the --region flag; it is not derived from the page.
region: string | null;
hq_location: string | null;
funding_rounds: FundingRound[];
// latest_* are copied from the first entry of funding_rounds by parseFundingCompany; null on YC rows.
latest_round_type: string | null;
latest_round_date: string | null;
latest_amount_raised_text: string | null;
latest_amount_raised_normalized: number | null;
investors: string[];
confidence: "high" | "medium" | "low";
notes: string | null;
};
/** The fields of a scrape result that this script reads. */
type ScrapeDoc = {
markdown?: string;
links?: string[];
metadata?: {
title?: string;
description?: string;
sourceURL?: string;
url?: string;
ogUrl?: string;
ogTitle?: string;
ogDescription?: string;
};
};
/**
 * Splits a comma-separated string into trimmed, non-empty items.
 *
 * @param value Raw comma-separated text.
 * @returns The items in their original order; empty for blank input.
 */
function splitCsv(value: string): string[] {
  const items: string[] = [];
  for (const piece of value.split(",")) {
    const trimmed = piece.trim();
    if (trimmed !== "") items.push(trimmed);
  }
  return items;
}
/**
 * Parses a boolean flag value, ignoring case and surrounding whitespace.
 *
 * @param value Raw flag text.
 * @param name Flag name, used in the error message.
 * @returns `true` for "true", `false` for "false".
 * @throws Error with an `OUT_OF_SCOPE:` message for any other value.
 */
function parseBool(value: string, name: string): boolean {
  switch (value.trim().toLowerCase()) {
    case "true":
      return true;
    case "false":
      return false;
    default:
      throw new Error(`OUT_OF_SCOPE: --${name} must be true or false`);
  }
}
/**
 * Checks that a seed URL parses and uses the http or https scheme.
 *
 * @param value Candidate URL from --seed-urls.
 * @throws Error with an `OUT_OF_SCOPE:` message when the value is not a
 *   parseable URL or uses another protocol.
 */
function validateHttpUrl(value: string): void {
  let protocol: string;
  try {
    protocol = new URL(value).protocol;
  } catch {
    throw new Error("OUT_OF_SCOPE: --seed-urls must contain valid URLs");
  }
  const isWeb = protocol === "http:" || protocol === "https:";
  if (!isWeb) {
    throw new Error("OUT_OF_SCOPE: --seed-urls only supports http and https URLs");
  }
}
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param value Text to normalize; null, undefined and "" all yield "".
 * @returns The normalized text.
 */
function cleanText(value: string | null | undefined): string {
  return value ? value.trim().replace(/\s+/g, " ") : "";
}
/**
 * Normalizes a scraped value and maps "no real data" to null.
 *
 * Returns null for blank input, for the exact placeholders "obfuscated",
 * "obfuscation", "obf" and "-", and for any text containing "unlock " or
 * "register for free" (all compared case-insensitively).
 *
 * @param value Raw scraped text.
 * @returns The whitespace-normalized text, or null.
 */
function nullIfUnavailable(value: string | null | undefined): string | null {
  const text = cleanText(value);
  if (text === "") return null;
  const lowered = text.toLowerCase();
  const placeholders = ["obfuscated", "obfuscation", "obf", "-"];
  const paywallPhrases = ["unlock ", "register for free"];
  const hidden = placeholders.includes(lowered) || paywallPhrases.some((phrase) => lowered.includes(phrase));
  return hidden ? null : text;
}
/**
 * Reduces a markdown fragment to plain text.
 *
 * Steps, in order: drop images, replace links with their label, turn HTML
 * line breaks into spaces, remove backslashes, remove `**`, strip leading `#`
 * characters at the start of the string, then normalize whitespace.
 *
 * @param value Markdown fragment (typically one line or one table cell).
 * @returns Plain text with collapsed whitespace.
 */
function stripMarkdown(value: string): string {
  return cleanText(
    value
      .replace(/!\[[^\]]*]\([^)]*\)/g, "")
      .replace(/\[([^\]]+)]\([^)]*\)/g, "$1")
      // Accept `<br>`, `<br/>` and `<br />`; table cells may use any of them.
      .replace(/<br\s*\/?>/gi, " ")
      .replace(/\\+/g, "")
      .replace(/\*\*/g, "")
      .replace(/^#+\s*/, ""),
  );
}
/**
 * De-duplicates strings case-insensitively, keeping the first spelling seen.
 *
 * Each value is passed through nullIfUnavailable first, so blanks and
 * placeholder values are dropped and the survivors are whitespace-normalized.
 *
 * @param values Raw strings.
 * @returns Cleaned, distinct values in first-seen order.
 */
function unique(values: string[]): string[] {
  const firstByKey = new Map<string, string>();
  for (const raw of values) {
    const kept = nullIfUnavailable(raw);
    if (kept === null) continue;
    const key = kept.toLowerCase();
    if (!firstByKey.has(key)) firstByKey.set(key, kept);
  }
  return Array.from(firstByKey.values());
}
/**
 * Picks the page URL of a scraped document.
 *
 * Preference order: `metadata.sourceURL`, then `metadata.ogUrl`, then
 * `metadata.url`. Empty values are skipped.
 *
 * @returns The first non-empty URL, or "" when none is present.
 */
function sourceUrl(doc: ScrapeDoc): string {
  const meta = doc.metadata;
  if (!meta) return "";
  for (const candidate of [meta.sourceURL, meta.ogUrl, meta.url]) {
    if (candidate) return candidate;
  }
  return "";
}
/**
 * Extracts the hostname of a URL with a leading "www." removed.
 *
 * @param value URL text.
 * @returns The hostname, or "" when the text does not parse as a URL.
 */
function hostOf(value: string): string {
  let hostname: string;
  try {
    hostname = new URL(value).hostname;
  } catch {
    return "";
  }
  return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
}
/**
 * Tells whether a URL is on ycombinator.com and contains "/companies"
 * anywhere in its text.
 */
function isYcUrl(value: string): boolean {
  if (hostOf(value) !== "ycombinator.com") return false;
  return value.includes("/companies");
}
/**
 * Tells whether a URL's host (after hostOf removes a leading "www.") is
 * exactly crunchbase.com, tracxn.com or dealroom.co.
 */
function isFundingUrl(value: string): boolean {
  const fundingHosts = ["crunchbase.com", "tracxn.com", "dealroom.co"];
  return fundingHosts.includes(hostOf(value));
}
/**
 * Tells whether a URL looks like a single-company profile page on one of the
 * supported sites.
 *
 * - ycombinator.com: path is exactly `/companies/<slug>` (optional trailing slash)
 * - crunchbase.com: path starts with `/organization/`
 * - tracxn.com: path starts with `/d/companies/`
 * - dealroom.co: the URL text contains `/companies/`
 *
 * Any other host, or an unparseable URL, yields false.
 */
function isCompanyProfileUrl(value: string): boolean {
  // A recognised host means hostOf already parsed the URL, so `new URL` below cannot throw.
  switch (hostOf(value)) {
    case "ycombinator.com":
      return /^\/companies\/[^/]+\/?$/.test(new URL(value).pathname);
    case "crunchbase.com":
      return new URL(value).pathname.startsWith("/organization/");
    case "tracxn.com":
      return new URL(value).pathname.startsWith("/d/companies/");
    case "dealroom.co":
      return value.includes("/companies/");
    default:
      return false;
  }
}
/**
 * Classifies a URL by source: YC directory URLs map to "yc_directory" and
 * everything else, including unparseable text, maps to "funding_database".
 */
function inferredSourceType(value: string): SourceType {
  if (isYcUrl(value)) return "yc_directory";
  return "funding_database";
}
/**
 * Converts a YC batch label such as "Summer 2024" into its short form ("S24").
 *
 * The literal text "Y Combinator Logo" is removed first. A label is
 * abbreviated only when its first word starts with summer, winter, spring or
 * fall and its second word begins with a 2- or 4-digit year; anything else is
 * returned unchanged, so an already short "S24" passes through.
 *
 * @param value Raw batch text from the page, or null.
 * @returns `S<yy>`, `W<yy>`, `Sp<yy>`, `F<yy>`, the cleaned label, or null
 *   when nothing usable remains.
 */
function normalizeYcBatch(value: string | null): string | null {
  const text = nullIfUnavailable(value);
  if (!text) return null;
  const compact = text.replace("Y Combinator Logo", "").trim();
  // A line holding only the logo text carries no batch information.
  if (!compact) return null;
  const [seasonWord = "", yearToken = ""] = compact.split(" ");
  // Capture the last two digits of a 2- or 4-digit year. Without this check
  // any second word would be sliced into a bogus year ("Summer batch" -> "Sch").
  const yearMatch = yearToken.match(/^(?:\d{2})?(\d{2})(?!\d)/);
  if (yearMatch) {
    const season = seasonWord.toLowerCase();
    const year = yearMatch[1];
    if (season.startsWith("summer")) return `S${year}`;
    if (season.startsWith("winter")) return `W${year}`;
    if (season.startsWith("spring")) return `Sp${year}`;
    if (season.startsWith("fall")) return `F${year}`;
  }
  return compact;
}
/**
 * Converts a displayed money amount into a plain number.
 *
 * Reads the first number in the text, allowing thousands separators
 * ("1,500,000") and a decimal part, followed by an optional unit: K / thousand,
 * M / mn / million, B / bn / billion, T / trillion (case-insensitive).
 * Currency symbols are ignored, so the result is in whatever currency the
 * source displayed.
 *
 * @param value Amount text as shown by the source, or null.
 * @returns The rounded numeric amount, or null when no number is present.
 */
function normalizeAmount(value: string | null): number | null {
  const text = nullIfUnavailable(value);
  if (!text) return null;
  // The lookahead stops the first letter of an unrelated word from being read
  // as a unit ("3 months" is 3, not 3 million). Word units are listed before
  // the single letters so "thousand" is not read as "t" for trillion.
  const match = text.match(
    /([0-9][0-9,]*(?:\.[0-9]+)?)\s*(thousand|million|billion|trillion|mn|bn|[kmbt])?(?![a-z])/i,
  );
  if (!match) return null;
  const amount = Number(match[1].replace(/,/g, ""));
  if (!Number.isFinite(amount)) return null;
  let multiplier = 1;
  switch ((match[2] ?? "").toLowerCase()) {
    case "k":
    case "thousand":
      multiplier = 1_000;
      break;
    case "m":
    case "mn":
    case "million":
      multiplier = 1_000_000;
      break;
    case "b":
    case "bn":
    case "billion":
      multiplier = 1_000_000_000;
      break;
    case "t":
    case "trillion":
      multiplier = 1_000_000_000_000;
      break;
    default:
      break;
  }
  return Math.round(amount * multiplier);
}
/**
 * Builds the key used to recognise the same company across pages of one source.
 *
 * The key is `<sourceType>:<slug>`. The slug is the company name (or the page
 * URL when the name is missing), lower-cased, with every run of characters
 * outside a-z and 0-9 replaced by "-" and one leading/trailing "-" removed.
 * An empty slug becomes "unknown".
 */
function trackingKey(sourceType: SourceType, name: string | null, sourcePageUrl: string): string {
  const seed = cleanText(name || sourcePageUrl).toLowerCase();
  const slug = seed.replace(/[^a-z0-9]+/g, "-").replace(/^-|-$/g, "");
  return [sourceType, slug || "unknown"].join(":");
}
/**
 * Collects the labels of markdown links whose target mentions `pathPart`.
 *
 * The markdown is split on "[" and only the first 300 pieces are examined.
 * For each piece containing both "]" and "](", the label is the text before
 * the first "]" and the match is tested against everything after the first
 * "](" in that piece.
 *
 * @param markdown Page markdown.
 * @param pathPart Substring to look for after the link's "](".
 * @returns Distinct, cleaned labels in order of appearance.
 */
function extractLinkTextsByPath(markdown: string, pathPart: string): string[] {
  const labels = markdown
    .split("[")
    .slice(0, 300)
    .flatMap((piece) => {
      const labelEnd = piece.indexOf("]");
      const targetStart = piece.indexOf("](");
      if (labelEnd < 0 || targetStart < 0) return [];
      const afterLabel = piece.slice(targetStart + 2);
      return afterLabel.includes(pathPart) ? [stripMarkdown(piece.slice(0, labelEnd))] : [];
    });
  return unique(labels);
}
/**
 * Guesses the company's own website from the links found on a scraped page.
 *
 * Scans at most the first 100 links and returns the first http(s) link whose
 * host is not a known directory, funding-database, social or code-hosting
 * site. Subdomains of blocked hosts are rejected too.
 *
 * @param doc Scraped page.
 * @returns The first acceptable link, or null when none qualifies.
 */
function pickExternalCompanyUrl(doc: ScrapeDoc): string | null {
  const blockedHosts = [
    "ycombinator.com",
    "bookface-images.s3.amazonaws.com",
    "linkedin.com",
    "twitter.com",
    "x.com",
    "github.com",
    "crunchbase.com",
    "tracxn.com",
    // Treated as a funding database by isFundingUrl, so it is never the company's own site.
    "dealroom.co",
  ];
  // The leading dot keeps "netflix.com" from matching "x.com" while still
  // catching real subdomains such as "news.ycombinator.com".
  const isBlocked = (host: string): boolean =>
    blockedHosts.some((blocked) => host === blocked || host.endsWith(`.${blocked}`));

  for (const link of (doc.links || []).slice(0, 100)) {
    if (!link.startsWith("http://") && !link.startsWith("https://")) continue;
    const host = hostOf(link);
    if (!host || isBlocked(host)) continue;
    return link;
  }
  return null;
}
/**
 * Splits markdown into plain-text lines, dropping lines that are empty once
 * their markdown has been stripped.
 */
function linesOf(markdown: string): string[] {
  const out: string[] = [];
  for (const rawLine of markdown.split("\n")) {
    const text = stripMarkdown(rawLine);
    if (text.length > 0) out.push(text);
  }
  return out;
}
/**
 * Builds a CompanyRow from a scraped YC directory company page.
 *
 * Name comes from the page title (text before ":" or with " | Y Combinator"
 * removed), falling back to the line after the first line containing "›".
 * Batch, description, industry tags, location, founders and website are read
 * from the markdown and links. Funding fields are always empty on YC rows.
 *
 * @param doc Scraped page.
 * @returns The row, or null when --batch-filter is set, the page's batch is
 *   known, and that batch is not one of the requested batches.
 */
function parseYcCompany(doc: ScrapeDoc): CompanyRow | null {
  const url = sourceUrl(doc);
  const markdown = doc.markdown || "";
  // Keep each raw line beside its cleaned text. Cleaning removes leading "#",
  // so headings can only be recognised on the raw line.
  const entries = markdown
    .split("\n")
    .map((raw) => ({ raw: raw.trim(), text: stripMarkdown(raw) }))
    .filter((entry) => entry.text.length > 0);
  const lines = entries.map((entry) => entry.text);

  const title = cleanText(doc.metadata?.title || doc.metadata?.ogTitle || "");
  const nameFromTitle = title.includes(":") ? title.split(":")[0] : title.replace(" | Y Combinator", "");
  let companyName = nullIfUnavailable(nameFromTitle);
  if (!companyName) {
    // With no "›" line, markerIndex is -1 and this falls back to the first line.
    const markerIndex = lines.findIndex((line) => line.includes("›"));
    companyName = nullIfUnavailable(lines[Math.min(markerIndex + 1, lines.length - 1)]);
  }

  const batchLine =
    lines.find((line) => line.includes("Y Combinator Logo")) || lines.find((line) => line.startsWith("Batch:"));
  const batch = normalizeYcBatch(batchLine?.replace("Batch:", "") ?? null);
  // Normalize the requested batches the same way as the page's batch so that
  // "Summer 2024" and "S24" compare equal; a comma-separated list is accepted.
  const wantedBatches = splitCsv(batchFilter).map((part) => (normalizeYcBatch(part) || part).toLowerCase());
  if (wantedBatches.length > 0 && batch && !wantedBatches.includes(batch.toLowerCase())) return null;

  // Description: the first level-3 heading plus up to five following lines,
  // stopping early at the founders or photos section.
  const headingIndex = entries.findIndex((entry) => /^###\s/.test(entry.raw));
  const descriptionParts: string[] = [];
  if (headingIndex >= 0) {
    for (const line of lines.slice(headingIndex, headingIndex + 6)) {
      const text = line.replace(/^#+\s*/, "");
      if (text === "Active Founders" || text === "YC Photos") break;
      descriptionParts.push(text);
    }
  }
  const fallbackDescription = doc.metadata?.description || doc.metadata?.ogDescription || null;
  const description = nullIfUnavailable(descriptionParts.join(" ")) || nullIfUnavailable(fallbackDescription);

  const industryTags = extractLinkTextsByPath(markdown, "/companies/industry/");
  const locationLine = lines.find((line) => line.startsWith("Location:"));
  const hqLocation =
    nullIfUnavailable(locationLine?.replace("Location:", "")) ||
    extractLinkTextsByPath(markdown, "/companies/location/")[0] ||
    null;
  const founders = includeFounders ? extractYcFounders(lines) : [];
  const companyUrl = pickExternalCompanyUrl(doc);
  const notes = "YC directory profile; funding fields are null unless visible on the source page.";

  return {
    tracking_key: trackingKey("yc_directory", companyName, url),
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    source_type: "yc_directory",
    company_name: companyName,
    company_url: companyUrl,
    source_page_url: url,
    secondary_source_urls: [],
    batch,
    founders,
    company_description: description,
    industry_tags: industryTags,
    region: requestedRegion || "global",
    hq_location: hqLocation,
    funding_rounds: [],
    latest_round_type: null,
    latest_round_date: null,
    latest_amount_raised_text: null,
    latest_amount_raised_normalized: null,
    investors: [],
    confidence: companyName ? "high" : "medium",
    notes,
  };
}
/**
 * Heuristically extracts founder names from the "Active Founders" section.
 *
 * The section starts after the line "Active Founders" and ends at the first
 * later "YC Photos" or "Hear from the founders" line, and never extends more
 * than 80 lines past the start. A line is kept as a name when one of the next
 * three lines inside the section mentions "Founder", and the line itself is
 * not exactly "Founder", does not contain "profile" or "account", is at most
 * 60 characters long, and contains a space.
 *
 * @param lines Cleaned page lines.
 * @returns Distinct names in page order; empty when the section is absent.
 */
function extractYcFounders(lines: string[]): string[] {
  const start = lines.indexOf("Active Founders");
  if (start < 0) return [];

  const sectionEnds = ["YC Photos", "Hear from the founders"]
    .map((marker) => lines.findIndex((line, index) => index > start && line === marker))
    .filter((index) => index > start);
  const hardStop = start + 80;
  const end = sectionEnds.length ? Math.min(...sectionEnds) : Math.min(lines.length, hardStop);
  const limit = Math.min(end, hardStop);

  const names = lines.slice(start + 1, limit).filter((line, offset) => {
    const index = start + 1 + offset;
    const lookahead = lines.slice(index + 1, Math.min(index + 4, limit)).join(" ");
    if (!lookahead.includes("Founder")) return false;
    if (line === "Founder" || line.includes("profile") || line.includes("account")) return false;
    return line.length <= 60 && line.includes(" ");
  });
  return unique(names);
}
/**
 * Builds a CompanyRow from a scraped funding-database company page.
 *
 * The name is taken from the page title with known suffixes removed, then
 * from the line after "Overview", then from the first line. Funding rounds
 * come from markdown tables; the first parsed round populates the latest_*
 * fields.
 *
 * @param doc Scraped page.
 * @returns The row, or null when --stage-filter is set, rounds were found,
 *   and none of them matches the filter.
 */
function parseFundingCompany(doc: ScrapeDoc): CompanyRow | null {
  const url = sourceUrl(doc);
  const markdown = doc.markdown || "";
  const lines = linesOf(markdown);
  const title = cleanText(doc.metadata?.title || doc.metadata?.ogTitle || "");

  // Kept as a plain string while candidates are tried; the nullable result is a separate binding.
  let nameCandidate = title.replace(" - Crunchbase Company Profile & Funding", "").replace(/ - 20[0-9]{2}.*$/, "");
  if (!nameCandidate || nameCandidate === title) {
    // No known suffix was stripped from the title, so prefer the line after "Overview" when present.
    const overviewIndex = lines.findIndex((line) => line === "Overview");
    if (overviewIndex >= 0) nameCandidate = lines[overviewIndex + 1] || nameCandidate;
  }
  const companyName: string | null = nullIfUnavailable(nameCandidate) || nullIfUnavailable(lines[0]) || null;

  const rounds = extractFundingRounds(markdown, url);
  if (stageFilter && rounds.length > 0 && !rounds.some((round) => stageMatches(round.round_type, stageFilter))) {
    return null;
  }
  const latestRound = rounds[0] ?? null;

  const ownName = (companyName || "").toLowerCase();
  const investors = includeInvestors
    ? unique(rounds.flatMap((round) => round.investors).concat(extractFaqInvestors(markdown))).filter(
        (name) => name.toLowerCase() !== ownName,
      )
    : [];
  const industryTags = extractLinkTextsByPath(markdown, "/categories/").concat(extractTracxnTags(lines));
  const companyUrl = pickExternalCompanyUrl(doc);
  const description =
    nullIfUnavailable(doc.metadata?.description || doc.metadata?.ogDescription) ||
    extractFundingDescription(lines, companyName);
  const hqLocation = extractHeadquarters(lines);

  const notesParts = [
    `Funding database profile discovered for ${query || "provided seed URL"}.`,
    `Funding date range used for discovery: ${fundingDateRange}.`,
  ];
  if (rounds.some((round) => round.raw_text?.toLowerCase().includes("obfuscated"))) {
    notesParts.push(
      "Some funding fields were obfuscated by the source and are returned as null with raw context preserved.",
    );
  }

  return {
    tracking_key: trackingKey("funding_database", companyName, url),
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    source_type: "funding_database",
    company_name: companyName,
    company_url: companyUrl,
    source_page_url: url,
    secondary_source_urls: [],
    batch: null,
    founders: [],
    company_description: description,
    industry_tags: unique(industryTags),
    region: requestedRegion || "global",
    hq_location: hqLocation,
    funding_rounds: rounds,
    latest_round_type: latestRound?.round_type ?? null,
    latest_round_date: latestRound?.announced_date ?? null,
    latest_amount_raised_text: latestRound?.amount_raised_text ?? null,
    // `??` rather than `||`: a parsed amount of 0 is a value, not "missing".
    latest_amount_raised_normalized: latestRound?.amount_raised_normalized ?? null,
    investors,
    confidence: companyName && rounds.length > 0 ? "medium" : "low",
    notes: notesParts.join(" "),
  };
}
/**
 * Tests a round label against a comma-separated stage filter.
 *
 * Both sides are lower-cased and reduced to a-z / 0-9 before comparing, and a
 * filter entry matches when it is a substring of the round label. A round
 * with no label always matches.
 *
 * @param roundType Round label, or null.
 * @param filter Comma-separated stage names.
 */
function stageMatches(roundType: string | null, filter: string): boolean {
  if (!roundType) return true;
  const squash = (text: string): string => text.toLowerCase().replace(/[^a-z0-9]/g, "");
  const haystack = squash(roundType);
  for (const part of splitCsv(filter)) {
    const needle = squash(part);
    if (needle && haystack.includes(needle)) return true;
  }
  return false;
}
/**
 * Parses funding rounds out of markdown table rows.
 *
 * Only the first 80 table lines are examined. A row is considered when it
 * mentions "Funding Type", "Pre Seed Round", "Seed Round" or "Series", is not
 * a separator or header row, and has at least five cells. Cells are read
 * positionally: 0 = announced date, 1 = transaction, 3 = money raised,
 * 4 = lead investors, 5 = funding type (falls back to the transaction cell).
 *
 * @param markdown Page markdown.
 * @param url Page URL recorded on each round.
 * @returns Up to MAX_ROUNDS_PER_COMPANY distinct rounds in table order.
 */
function extractFundingRounds(markdown: string, url: string): FundingRound[] {
  const rows: FundingRound[] = [];
  const tableLines = markdown
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.startsWith("|"))
    .slice(0, 80);

  for (const line of tableLines) {
    const looksLikeRound =
      line.includes("Funding Type") ||
      line.includes("Pre Seed Round") ||
      line.includes("Seed Round") ||
      line.includes("Series");
    if (!looksLikeRound) continue;
    if (line.includes("---") || line.includes("Announced Date")) continue;

    // Drop the leading pipe, and the trailing one only when present, so a row
    // written without a closing pipe keeps its last cell.
    const inner = line.slice(1, line.endsWith("|") ? -1 : undefined);
    const cells = inner.split("|").map((cell) => stripMarkdown(cell));
    if (cells.length < 5) continue;

    const [dateCell, transactionCell, , moneyCell, leadCell = "", typeCell] = cells;
    const rawTransaction = nullIfUnavailable(transactionCell);
    const moneyRaised = nullIfUnavailable(moneyCell);
    const leadInvestors = includeInvestors ? unique(leadCell.split(",").map((part) => stripMarkdown(part))) : [];

    rows.push({
      round_type: nullIfUnavailable(typeCell || rawTransaction),
      announced_date: nullIfUnavailable(dateCell),
      amount_raised_text: moneyRaised,
      amount_raised_normalized: normalizeAmount(moneyRaised),
      investors: leadInvestors,
      source_url: url,
      raw_text: stripMarkdown(line) || null,
    });
  }
  // De-duplicate before applying the cap so repeated rows cannot crowd out distinct rounds.
  return dedupeRounds(rows).slice(0, MAX_ROUNDS_PER_COMPANY);
}
/**
 * Removes repeated funding rounds, keeping the first occurrence.
 *
 * Two rounds are the same when round type, announced date, displayed amount
 * and the comma-joined investor list are all equal.
 *
 * @param rounds Rounds in priority order.
 * @returns A new array holding the first occurrence of each distinct round.
 */
function dedupeRounds(rounds: FundingRound[]): FundingRound[] {
  const seenKeys = new Set<string>();
  return rounds.filter((round) => {
    const key = [
      round.round_type || "",
      round.announced_date || "",
      round.amount_raised_text || "",
      round.investors.join(","),
    ].join("|");
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
/**
 * Pulls investor names from the "Who invested in" passage of a page.
 *
 * Looks at the 700 characters starting at the first "Who invested in" and
 * collects labels of links pointing at `/person/` or `/organization/`,
 * dropping any label that mentions "crunchbase".
 *
 * @returns Distinct names, person links first; empty when the passage is absent.
 */
function extractFaqInvestors(markdown: string): string[] {
  const start = markdown.indexOf("Who invested in");
  if (start < 0) return [];
  const passage = markdown.slice(start, start + 700);
  const linked = [
    ...extractLinkTextsByPath(passage, "/person/"),
    ...extractLinkTextsByPath(passage, "/organization/"),
  ];
  return unique(linked).filter((name) => !name.toLowerCase().includes("crunchbase"));
}
/**
 * Finds a one-line company description among the first 120 lines.
 *
 * A line qualifies when it starts with "<name> is ", "<name> offers " or
 * "<name> develops ".
 *
 * @param lines Cleaned page lines.
 * @param companyName Company name; a missing name yields null.
 * @returns The first qualifying line, or null.
 */
function extractFundingDescription(lines: string[], companyName: string | null): string | null {
  if (!companyName) return null;
  const openers = ["is", "offers", "develops"].map((verb) => `${companyName} ${verb} `);
  const hit = lines.slice(0, 120).find((line) => openers.some((opener) => line.startsWith(opener)));
  return hit ?? null;
}
/**
 * Finds the headquarters location among the first 180 lines.
 *
 * The first line that either contains " is located in " or starts with
 * "Headquarters" decides the result: for the former, the text after the
 * phrase with one trailing period removed; for the latter, the following line.
 *
 * @param lines Cleaned page lines.
 * @returns The location, or null when nothing usable is found.
 */
function extractHeadquarters(lines: string[]): string | null {
  const phrase = " is located in ";
  const hitIndex = lines
    .slice(0, 180)
    .findIndex((line) => line.includes(phrase) || line.startsWith("Headquarters"));
  if (hitIndex < 0) return null;

  const hit = lines[hitIndex];
  if (hit.includes(phrase)) {
    return nullIfUnavailable(hit.split(phrase)[1]?.replace(/\.$/, ""));
  }
  return nullIfUnavailable(lines[hitIndex + 1]);
}
/**
 * Returns up to five of the first 120 lines that mention one of a fixed set
 * of sector keywords ("API &", "Developer Tools", "FinTech", "SaaS").
 * The whole matching line is returned, not just the keyword.
 */
function extractTracxnTags(lines: string[]): string[] {
  const keywords = ["API &", "Developer Tools", "FinTech", "SaaS"];
  return lines
    .slice(0, 120)
    .filter((line) => keywords.some((keyword) => line.includes(keyword)))
    .slice(0, 5);
}
/**
 * Gathers candidate company-page URLs from the discovery scrape.
 *
 * Every seed URL is included as-is. From each of the first
 * MAX_DISCOVERY_URLS documents, the document's own URL is added when it is a
 * company profile, together with any profile links among its first 200 links
 * (with the query string removed).
 *
 * @param docs Documents returned by the discovery scrape.
 * @returns Distinct URLs, capped at MAX_CANDIDATE_URLS.
 */
function candidateUrlsFromDocs(docs: ScrapeDoc[]): string[] {
  const collected: string[] = [...seedUrls];
  for (const doc of docs.slice(0, MAX_DISCOVERY_URLS)) {
    const pageUrl = sourceUrl(doc);
    if (pageUrl && isCompanyProfileUrl(pageUrl)) collected.push(pageUrl);
    const profileLinks = (doc.links || [])
      .slice(0, 200)
      .filter((link) => isCompanyProfileUrl(link))
      .map((link) => link.split("?")[0]);
    collected.push(...profileLinks);
  }
  return unique(collected).slice(0, MAX_CANDIDATE_URLS);
}
/**
 * Finds candidate company-page URLs.
 *
 * Starts from the seed URLs, adds search hits for each requested source type
 * when a query was given (one YC search; one Crunchbase and one Tracxn search
 * for funding databases), scrapes up to MAX_DISCOVERY_URLS of those pages and
 * returns the company-profile URLs found on them.
 *
 * @returns Candidate URLs; empty when there was nothing to scrape.
 */
async function discoverUrls(): Promise<string[]> {
  const pages: string[] = seedUrls.slice(0, MAX_SEED_URLS);
  // Each search asks for between 4 and 6 results depending on --max-companies.
  const perSearchLimit = Math.min(6, Math.max(4, maxCompanies + 1));
  const joinTerms = (terms: string[]): string => terms.filter(Boolean).join(" ");

  if (query && sourceTypes.includes("yc_directory")) {
    const ycHits = await firecrawl.v1.search(
      joinTerms(["site:ycombinator.com/companies", query, batchFilter, "Y Combinator company"]),
      { limit: perSearchLimit, integration: "prometheus" },
    );
    for (const hit of (ycHits.data || []).slice(0, perSearchLimit)) {
      if (hit.url && isYcUrl(hit.url)) pages.push(hit.url);
    }
  }

  if (query && sourceTypes.includes("funding_database")) {
    const crunchbaseHits = await firecrawl.v1.search(
      joinTerms(["site:crunchbase.com/organization", query, fundingDateRange, stageFilter, "funding startup"]),
      { limit: perSearchLimit, integration: "prometheus" },
    );
    const tracxnHits = await firecrawl.v1.search(
      joinTerms(["site:tracxn.com/d/companies", query, fundingDateRange, stageFilter, "funding investors startup"]),
      { limit: perSearchLimit, integration: "prometheus" },
    );
    // Crunchbase hits are queued ahead of Tracxn hits.
    const fundingHits = [
      ...(crunchbaseHits.data || []).slice(0, perSearchLimit),
      ...(tracxnHits.data || []).slice(0, perSearchLimit),
    ];
    for (const hit of fundingHits) {
      if (hit.url && isFundingUrl(hit.url)) pages.push(hit.url);
    }
  }

  const toScrape = unique(pages).slice(0, MAX_DISCOVERY_URLS);
  if (toScrape.length === 0) return [];
  const scraped = await firecrawl.v1.batchScrapeUrls(toScrape, {
    formats: ["markdown", "links"],
    integration: "prometheus",
  });
  return candidateUrlsFromDocs((scraped.data || []) as ScrapeDoc[]);
}
/**
 * Orders candidate URLs for the final scrape.
 *
 * Seed URLs that are company profiles come first. The remaining YC and
 * funding-database URLs are then interleaved one pair at a time, skipping any
 * source type that was not requested, until MAX_FINAL_SCRAPE_URLS is reached.
 *
 * @param urls Candidate URLs from discovery.
 * @returns Distinct URLs in scrape priority order.
 */
function prioritizeCandidateUrls(urls: string[]): string[] {
  const profileSeeds = unique(seedUrls.filter((url) => isCompanyProfileUrl(url)));
  const notSeed = (url: string): boolean => !profileSeeds.includes(url);
  const ycQueue = urls.filter((url) => inferredSourceType(url) === "yc_directory" && notSeed(url));
  const fundingQueue = urls.filter((url) => inferredSourceType(url) === "funding_database" && notSeed(url));
  const wantYc = sourceTypes.includes("yc_directory");
  const wantFunding = sourceTypes.includes("funding_database");

  const ordered: string[] = profileSeeds.slice(0, MAX_SEED_URLS);
  const passes = Math.min(Math.max(ycQueue.length, fundingQueue.length), MAX_FINAL_SCRAPE_URLS);
  for (let pass = 0; pass < passes; pass += 1) {
    const ycUrl = ycQueue[pass];
    const fundingUrl = fundingQueue[pass];
    if (wantYc && ycUrl) ordered.push(ycUrl);
    if (wantFunding && fundingUrl) ordered.push(fundingUrl);
    // Checked after the pair is pushed, so the list may exceed the cap by one entry.
    if (ordered.length >= MAX_FINAL_SCRAPE_URLS) break;
  }
  return unique(ordered);
}
/**
 * Collapses rows that share a tracking key into one row per company.
 *
 * Rows without a company name or source page URL are dropped. The first row
 * for a key is kept and mutated in place: list fields from later rows are
 * unioned, scalar fields that are still null are filled from later rows, and
 * the latest_* fields are taken from the first later row that has funding
 * rounds when the kept row had none.
 *
 * @param rows Parsed rows in priority order.
 * @returns At most `maxCompanies` merged rows, in first-seen order.
 */
function mergeRows(rows: CompanyRow[]): CompanyRow[] {
  const byKey = new Map<string, CompanyRow>();
  for (const row of rows) {
    if (!row.company_name || !row.source_page_url) continue;
    const existing = byKey.get(row.tracking_key);
    if (!existing) {
      byKey.set(row.tracking_key, row);
      continue;
    }
    const hadRounds = existing.funding_rounds.length > 0;

    // The primary page must not be listed again as a secondary source.
    existing.secondary_source_urls = unique(
      existing.secondary_source_urls.concat(row.source_page_url, row.secondary_source_urls),
    ).filter((url) => url !== existing.source_page_url);
    existing.funding_rounds = dedupeRounds(existing.funding_rounds.concat(row.funding_rounds));
    existing.investors = unique(existing.investors.concat(row.investors));
    existing.founders = unique(existing.founders.concat(row.founders));
    existing.industry_tags = unique(existing.industry_tags.concat(row.industry_tags));

    // Keep what the first row knew; only fill gaps from the duplicate.
    existing.company_url = existing.company_url ?? row.company_url;
    existing.batch = existing.batch ?? row.batch;
    existing.company_description = existing.company_description ?? row.company_description;
    existing.hq_location = existing.hq_location ?? row.hq_location;

    // Copy the four latest_* fields together so they describe a single round.
    if (!hadRounds && row.funding_rounds.length > 0) {
      existing.latest_round_type = row.latest_round_type;
      existing.latest_round_date = row.latest_round_date;
      existing.latest_amount_raised_text = row.latest_amount_raised_text;
      existing.latest_amount_raised_normalized = row.latest_amount_raised_normalized;
    }
  }
  return Array.from(byKey.values()).slice(0, maxCompanies);
}
/**
 * Runs one collection: discover candidate URLs, scrape the selected company
 * pages, parse and merge them, and write a single JSON document to stdout.
 *
 * Output is `{ metadata, companies }`, or `{ metadata, sources }` with
 * companies grouped per requested source type when --output-mode is
 * "grouped_by_source".
 *
 * Throws when no candidate URLs are found, none remain after the source-type
 * filter, or no rows can be parsed.
 */
async function main() {
  const candidateUrls = await discoverUrls();
  if (candidateUrls.length === 0) {
    throw new Error("no candidate company URLs found from the provided sources");
  }

  // Scrape slightly more pages than requested so filtered or unparseable pages can be absorbed.
  const scrapeBudget = Math.min(MAX_FINAL_SCRAPE_URLS, maxCompanies + sourceTypes.length + 2);
  const finalUrls = prioritizeCandidateUrls(candidateUrls)
    .filter((url) => sourceTypes.includes(inferredSourceType(url)))
    .slice(0, scrapeBudget);
  if (finalUrls.length === 0) {
    throw new Error("no in-scope company URLs found after filtering source types");
  }

  const finalScrape = await firecrawl.v1.batchScrapeUrls(finalUrls, {
    formats: ["markdown", "links"],
    integration: "prometheus",
  });
  const scrapedDocs = ((finalScrape.data || []) as ScrapeDoc[]).slice(0, MAX_FINAL_SCRAPE_URLS);

  const rows: CompanyRow[] = [];
  for (const doc of scrapedDocs) {
    const isYcPage = inferredSourceType(sourceUrl(doc)) === "yc_directory";
    const parsed = isYcPage ? parseYcCompany(doc) : parseFundingCompany(doc);
    if (parsed) rows.push(parsed);
    // Stop once there are twice as many rows as requested.
    if (rows.length >= maxCompanies * 2) break;
  }

  const companies = mergeRows(rows).slice(0, maxCompanies);
  if (companies.length === 0) {
    throw new Error("no company rows could be parsed from scraped pages");
  }

  const metadata = {
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    query: query || null,
    seed_urls: seedUrls,
    source_types: sourceTypes,
    batch_filter: batchFilter || null,
    funding_date_range: fundingDateRange,
    stage_filter: stageFilter || null,
    region: requestedRegion,
    include_founders: includeFounders,
    include_investors: includeInvestors,
    sort_hint: sortHint,
    max_companies: maxCompanies,
    result_count: companies.length,
  };

  let out: unknown;
  if (outputMode === "grouped_by_source") {
    const sources = sourceTypes.map((sourceType) => ({
      source_type: sourceType,
      companies: companies.filter((company) => company.source_type === sourceType),
    }));
    out = { metadata, sources };
  } else {
    out = { metadata, companies };
  }
  process.stdout.write(JSON.stringify(out));
}

// Entry point: log any failure and exit with status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.