MEGA-llms.txt!!!!!
v1 · Published · All the knowledge in the world at your fingertips!
›Author's sample data
| count | 1994 |
|---|---|
| source | https://directory.llmstxt.cloud/ |
| entries | |
| scrapedAt | 2026-06-11T23:31:37.489Z |
| totalPages | 21 |
| pagesFetched | 21 |
›Publisher
1 subscriber · Every day at 6:30 AM · 0 runs in 14d · published 5h ago
›Parameters
--max-pages (number): Maximum number of directory pages to scrape (0 = all pages). Default: 0
›Versions
managed by author · v1 · built · approved · current · 5h ago
Schedules — deploy to enable
Run this collector on a cadence — daily, hourly, your call.
API endpoint — deploy to unlock
POST to run it on demand and get fresh data in the response.
Destinations — deploy to route
Deliver every run to S3, Postgres, a Google Sheet, or a webhook — automatically.
Activity — deploy to track
0 subscriber runs in the last 14 days.
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
// The script refuses to start without FIRECRAWL_API_KEY: a missing or empty
// value is reported on stderr and the process stops with exit status 1.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey === "") {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Single Firecrawl client shared by every scrape request in this script.
const firecrawl = new Firecrawl({ apiKey: apiKey });
/**
 * Converts the raw `--max-pages` flag value into a number.
 *
 * `Number("")` and `Number("   ")` both evaluate to 0, which would make a blank
 * value (e.g. `--max-pages=` produced by an unset shell variable) silently mean
 * "scrape every page". Blank values are therefore mapped to NaN so the
 * validation below rejects them.
 *
 * @param raw The flag value as given on the command line, or `undefined` when
 *   the flag was omitted.
 * @returns 0 when the flag is omitted, NaN for a blank value, otherwise
 *   `Number(raw)` (which is NaN for non-numeric text).
 */
function parseMaxPages(raw: string | undefined): number {
  if (raw === undefined) return 0;
  if (raw.trim() === "") return Number.NaN;
  return Number(raw);
}

// strict: true makes parseArgs throw on any flag other than --max-pages.
const { values: flags } = parseArgs({
  strict: true,
  options: {
    "max-pages": { type: "string" }, // --max-pages=5 (optional, default 0 = all pages)
  },
});

// 0 means "no limit"; anything that is not a non-negative integer is rejected.
const maxPages = parseMaxPages(flags["max-pages"]);
if (!Number.isInteger(maxPages) || maxPages < 0) {
  console.error("--max-pages must be a non-negative integer (0 = all pages)");
  process.exit(1);
}
/**
 * One directory row as extracted by parseEntries. Every field except `name`
 * is null when the corresponding link or text is missing from the row.
 */
interface Entry {
// Trimmed text of the first link in the row's first column.
name: string;
// href of that same name link.
siteUrl: string | null;
// href of the link whose label is exactly "/llms.txt".
llmsTxtUrl: string | null;
// Text of the first <span> inside the "/llms.txt" link's parent element, with
// parentheses removed — presumably a token count (TODO confirm against the page).
llmsTxtTokens: string | null;
// href of the link whose label is exactly "/llms-full.txt".
llmsFullTxtUrl: string | null;
// Same span text as above, taken next to the "/llms-full.txt" link.
llmsFullTxtTokens: string | null;
}
/**
 * Builds the directory listing URL for a 1-based page number.
 *
 * Page 1 is addressed without a `page` query parameter; every other page
 * number is appended as `&page=<n>`. All URLs carry `sort=name_asc`.
 *
 * @param page Page number to address.
 * @returns Absolute URL of that directory page.
 */
function pageUrl(page: number): string {
  if (page !== 1) {
    return `https://directory.llmstxt.cloud/?sort=name_asc&page=${page}`;
  }
  return "https://directory.llmstxt.cloud/?sort=name_asc";
}
/**
 * Downloads one directory page through Firecrawl and returns its HTML.
 *
 * @param page 1-based directory page number.
 * @returns The HTML string from the scrape result.
 * @throws Error when the scrape result carries no (or empty) HTML.
 */
async function fetchPage(page: number): Promise<string> {
  const target = pageUrl(page);
  const { html } = await firecrawl.scrape(target, {
    formats: ["html"],
    integration: "prometheus",
  });
  if (html) {
    return html;
  }
  throw new Error(`no HTML returned for directory page ${page}`);
}
/**
 * Extracts the directory entries from one page of listing HTML.
 *
 * Rows are the `.grid.grid-cols-12` elements under `#search-results` that
 * contain at least one link. A row whose first-column link has no text is
 * skipped. Within a row, links labelled exactly "/llms.txt" and
 * "/llms-full.txt" supply the file URLs; if a label occurs more than once the
 * last occurrence wins.
 *
 * @param html Page HTML to parse.
 * @param page Page number, used only in the error message.
 * @returns One Entry per named row, in document order.
 * @throws Error when the page contains no matching rows at all.
 */
function parseEntries(html: string, page: number): Entry[] {
  const $ = cheerio.load(html);

  const candidateRows = $("#search-results .grid.grid-cols-12");
  const rows = candidateRows.filter((_, el) => $(el).find("a").length !== 0);
  if (rows.length < 1) {
    throw new Error(`no directory rows found on page ${page}`);
  }

  const collected: Entry[] = [];
  rows.each((_, rowEl) => {
    const row = $(rowEl);
    const nameAnchor = row.children("div").eq(0).find("a").first();
    const name = nameAnchor.text().trim();
    if (name === "") return;

    // Start with every optional field empty; the link scan below fills them in.
    const entry: Entry = {
      name,
      siteUrl: nameAnchor.attr("href") ?? null,
      llmsTxtUrl: null,
      llmsTxtTokens: null,
      llmsFullTxtUrl: null,
      llmsFullTxtTokens: null,
    };

    row.find("a").each((_, anchorEl) => {
      const anchor = $(anchorEl);
      const label = anchor.text().trim();
      const isShort = label === "/llms.txt";
      const isFull = label === "/llms-full.txt";
      if (!isShort && !isFull) return;

      const href = anchor.attr("href") ?? "";
      // The first <span> under the link's parent holds the text shown next to
      // the link; parentheses are stripped and an empty result becomes null.
      const spanText = anchor.parent().find("span").first().text().trim();
      const tokens = spanText.replace(/[()]/g, "") || null;

      if (isShort) {
        entry.llmsTxtUrl = href;
        entry.llmsTxtTokens = tokens;
      } else {
        entry.llmsFullTxtUrl = href;
        entry.llmsFullTxtTokens = tokens;
      }
    });

    collected.push(entry);
  });

  return collected;
}
/**
 * Reads the total page count from the "Page X of Y" pagination text.
 *
 * The first occurrence in the HTML is used; matching is case-sensitive and
 * tolerates any whitespace between the words and numbers.
 *
 * @param html HTML of a directory page.
 * @returns The Y value from "Page X of Y".
 * @throws Error when no such text is present.
 */
function parseTotalPages(html: string): number {
  const pagination = /Page\s+\d+\s+of\s+(\d+)/.exec(html);
  if (pagination === null) {
    throw new Error("could not find 'Page X of Y' pagination text on page 1");
  }
  const [, total] = pagination;
  return Number(total);
}
/**
 * Scrapes the directory and writes the combined result to stdout as JSON.
 *
 * Page 1 is fetched first to learn the total page count; the remaining pages
 * (limited by `maxPages` when it is greater than 0) are fetched by up to four
 * concurrent workers. Progress messages go to stderr so stdout carries only
 * the JSON document.
 */
async function main() {
  console.error("Fetching page 1...");
  const firstHtml = await fetchPage(1);
  const totalPages = parseTotalPages(firstHtml);

  let pagesToFetch = totalPages;
  if (maxPages > 0) {
    pagesToFetch = Math.min(maxPages, totalPages);
  }
  console.error(`Directory has ${totalPages} pages; fetching ${pagesToFetch}.`);

  const firstPageEntries = parseEntries(firstHtml, 1);

  // Pages 2..pagesToFetch still have to be downloaded.
  const laterPages: number[] = [];
  let nextPage = 2;
  while (nextPage <= pagesToFetch) {
    laterPages.push(nextPage);
    nextPage += 1;
  }

  // Each slot of perPage lines up with the same index in laterPages, so the
  // merged output stays in page order no matter which worker finishes first.
  const perPage: Entry[][] = new Array(laterPages.length);
  const workerCount = Math.min(4, laterPages.length);
  let cursor = 0;

  // A worker keeps claiming the next unclaimed slot until none are left.
  const drainQueue = async (): Promise<void> => {
    for (let slot = cursor++; slot < laterPages.length; slot = cursor++) {
      const page = laterPages[slot];
      console.error(`Fetching page ${page}...`);
      const parsed = parseEntries(await fetchPage(page), page);
      perPage[slot] = parsed;
      console.error(`Page ${page}: ${parsed.length} entries`);
    }
  };

  const pool: Promise<void>[] = [];
  while (pool.length < workerCount) {
    pool.push(drainQueue());
  }
  await Promise.all(pool);

  const combined = firstPageEntries.concat(...perPage);

  // De-duplicate in case pagination boundaries shift between requests;
  // the first occurrence of each name/site pair is kept.
  const seenKeys = new Set<string>();
  const entries: Entry[] = [];
  for (const candidate of combined) {
    const key = `${candidate.name}|${candidate.siteUrl}`;
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      entries.push(candidate);
    }
  }

  const payload = {
    source: "https://directory.llmstxt.cloud/",
    scrapedAt: new Date().toISOString(),
    totalPages,
    pagesFetched: pagesToFetch,
    count: entries.length,
    entries,
  };
  process.stdout.write(JSON.stringify(payload));
}
/** Logs whatever main() rejected with to stderr and ends the process with exit status 1. */
function reportAndExit(failure: unknown): never {
  console.error(failure);
  process.exit(1);
}

// Script entry point.
main().catch(reportAndExit);
Build prompt
MEGA-llms.txt!!!!!