MEGA-llms.txt!!!!!

Name: MEGA-llms.txt!!!!!
Creator: troycarboni
Published: 2026-06-11T23:41:28.556Z
License: https://opensource.org/licenses/MIT

v1 · Published

All the knowledge in the world at your fingertips!

›Author's sample data

count	1994
source	https://directory.llmstxt.cloud/
entries
scrapedAt	2026-06-11T23:31:37.489Z
totalPages	21
pagesFetched	21

›Publisher

1 subscriber

troycarboni (@troycarboni)

Every day at 6:30 AM · 0 runs in 14d · published 5h ago

›Parameters

--max-pages (number): Maximum number of directory pages to scrape (0 = all pages). Default: 0

›Versions

managed by author

v1 · built · approved · current · 5h ago

Schedules — deploy to enable

Run this collector on a cadence — daily, hourly, your call.

API endpoint — deploy to unlock

POST to run it on demand and get fresh data in the response.

Destinations — deploy to route

Deliver every run to S3, Postgres, a Google Sheet, or a webhook — automatically.

Activity — deploy to track

0 subscriber runs in the last 14 days.

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";

// Read the Firecrawl credential from the environment and abort with exit
// code 1 when it is missing or empty, before any client is constructed.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey === "") {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}
// Firecrawl client built from the validated key.
const firecrawl = new Firecrawl({ apiKey });

/**
 * Converts the raw `--max-pages` value to a number.
 *
 * A missing flag defaults to 0. A blank value becomes NaN: `Number("")` and
 * `Number("   ")` are both 0, so without this an empty value (for example
 * `--max-pages="$LIMIT"` with LIMIT unset) would silently mean "all pages".
 *
 * @param raw - Flag value as returned by parseArgs, or undefined when absent.
 * @returns The numeric value, or NaN when the input is blank or not numeric.
 */
function parseMaxPages(raw: string | undefined): number {
  const text = raw ?? "0";
  return text.trim() === "" ? Number.NaN : Number(text);
}

// Command-line flags; `strict: true` makes parseArgs throw on unknown flags.
const { values: flags } = parseArgs({
  strict: true,
  options: {
    "max-pages": { type: "string" }, // --max-pages=5 (optional, default 0 = all pages)
  },
});
// Upper bound on directory pages to fetch; 0 means "no limit".
const maxPages = parseMaxPages(flags["max-pages"]);
if (!Number.isInteger(maxPages) || maxPages < 0) {
  console.error("--max-pages must be a non-negative integer (0 = all pages)");
  process.exit(1);
}

/** One directory row as extracted by `parseEntries`. */
interface Entry {
  /** Trimmed text of the first link in the row's first column. */
  name: string;
  /** `href` of that first link, or null when the link has no `href`. */
  siteUrl: string | null;
  /** `href` of the link labelled "/llms.txt"; null when the row has no such link. */
  llmsTxtUrl: string | null;
  /**
   * Text of the first `<span>` under the "/llms.txt" link's parent, with
   * parentheses removed; null when that text is empty.
   * NOTE(review): presumably a token count shown by the directory — confirm.
   */
  llmsTxtTokens: string | null;
  /** `href` of the link labelled "/llms-full.txt"; null when the row has no such link. */
  llmsFullTxtUrl: string | null;
  /** Same as `llmsTxtTokens`, but taken next to the "/llms-full.txt" link. */
  llmsFullTxtTokens: string | null;
}

/**
 * Builds the directory listing URL for a page number, sorted by name
 * ascending. Page 1 uses the bare listing URL; any other page appends a
 * `page` query parameter.
 *
 * @param page - Page number; 1 is the first page.
 * @returns Absolute URL of that listing page.
 */
function pageUrl(page: number): string {
  const listingUrl = "https://directory.llmstxt.cloud/?sort=name_asc";
  if (page === 1) {
    return listingUrl;
  }
  return `${listingUrl}&page=${page}`;
}

/**
 * Scrapes one directory listing page through Firecrawl and returns its HTML.
 *
 * @param page - Page number, turned into a URL by `pageUrl`.
 * @returns The `html` field of the scrape result.
 * @throws Error when the scrape result carries no (or empty) HTML.
 */
async function fetchPage(page: number): Promise<string> {
  const { html } = await firecrawl.scrape(pageUrl(page), {
    formats: ["html"],
    integration: "prometheus",
  });
  if (html) {
    return html;
  }
  throw new Error(`no HTML returned for directory page ${page}`);
}

/**
 * Extracts directory entries from the HTML of one listing page.
 *
 * A row is any `.grid.grid-cols-12` element under `#search-results` that
 * contains at least one link. Rows whose first-column link has no text are
 * skipped.
 *
 * @param html - HTML of the listing page.
 * @param page - Page number, used only in the error message.
 * @returns One entry per named row, in document order.
 * @throws Error when the page contains no rows at all.
 */
function parseEntries(html: string, page: number): Entry[] {
  const $ = cheerio.load(html);
  const rows = $("#search-results .grid.grid-cols-12").filter(
    (_, el) => $(el).find("a").length > 0
  );
  if (rows.length === 0) {
    throw new Error(`no directory rows found on page ${page}`);
  }

  const entries: Entry[] = [];
  rows.each((_, row) => {
    const nameLink = $(row).children("div").eq(0).find("a").first();
    const name = nameLink.text().trim();
    // Returning undefined only skips this row; the iteration continues.
    if (!name) return;

    // Key order here is the key order of the serialized output.
    const entry: Entry = {
      name,
      siteUrl: nameLink.attr("href") ?? null,
      llmsTxtUrl: null,
      llmsTxtTokens: null,
      llmsFullTxtUrl: null,
      llmsFullTxtTokens: null,
    };

    $(row)
      .find("a")
      .each((_, a) => {
        const link = $(a);
        const label = link.text().trim();
        if (label !== "/llms.txt" && label !== "/llms-full.txt") return;

        // A missing href is reported as null (not ""), matching siteUrl and
        // the `string | null` contract of Entry.
        const href = link.attr("href") ?? null;
        // The label next to the link is the first <span> under the link's
        // parent; strip the surrounding parentheses, empty text becomes null.
        const tokens =
          link.parent().find("span").first().text().trim().replace(/[()]/g, "") ||
          null;

        if (label === "/llms.txt") {
          entry.llmsTxtUrl = href;
          entry.llmsTxtTokens = tokens;
        } else {
          entry.llmsFullTxtUrl = href;
          entry.llmsFullTxtTokens = tokens;
        }
      });

    entries.push(entry);
  });
  return entries;
}

/**
 * Reads the total page count from the first "Page X of Y" occurrence in the
 * given HTML (case-sensitive, any whitespace between the words).
 *
 * @param html - HTML of the first directory page.
 * @returns Y from the first "Page X of Y" match.
 * @throws Error when no such text is present.
 */
function parseTotalPages(html: string): number {
  const pagination = /Page\s+\d+\s+of\s+(\d+)/.exec(html);
  if (pagination === null) {
    throw new Error("could not find 'Page X of Y' pagination text on page 1");
  }
  const [, total] = pagination;
  return Number(total);
}

/**
 * Scrapes the directory and writes a single JSON document to stdout.
 *
 * Page 1 is fetched first to learn the total page count; the remaining pages
 * are fetched by up to four concurrent workers pulling from a shared queue.
 * Progress messages go to stderr so stdout carries only the JSON.
 */
async function main(): Promise<void> {
  console.error("Fetching page 1...");
  const firstHtml = await fetchPage(1);
  const totalPages = parseTotalPages(firstHtml);
  const pagesToFetch =
    maxPages > 0 ? Math.min(maxPages, totalPages) : totalPages;
  console.error(`Directory has ${totalPages} pages; fetching ${pagesToFetch}.`);

  // Parse page 1 before requesting anything else, so a broken layout fails fast.
  const collected: Entry[] = parseEntries(firstHtml, 1);

  const laterPages: number[] = [];
  let nextPage = 2;
  while (nextPage <= pagesToFetch) {
    laterPages.push(nextPage);
    nextPage += 1;
  }

  // Results are stored by queue position so the final order follows page
  // order regardless of which worker finishes first.
  const perPage: Entry[][] = new Array(laterPages.length);
  let cursor = 0;
  const drainQueue = async (): Promise<void> => {
    while (cursor < laterPages.length) {
      const slot = cursor;
      cursor += 1;
      const page = laterPages[slot];
      console.error(`Fetching page ${page}...`);
      const html = await fetchPage(page);
      const parsed = parseEntries(html, page);
      perPage[slot] = parsed;
      console.error(`Page ${page}: ${parsed.length} entries`);
    }
  };

  const workerCount = Math.min(4, laterPages.length);
  const workers: Promise<void>[] = [];
  for (let w = 0; w < workerCount; w += 1) {
    workers.push(drainQueue());
  }
  await Promise.all(workers);

  for (const pageEntries of perPage) {
    for (const entry of pageEntries) {
      collected.push(entry);
    }
  }

  // De-duplicate in case pagination boundaries shift between requests
  const seenKeys = new Set<string>();
  const entries: Entry[] = [];
  for (const entry of collected) {
    const key = `${entry.name}|${entry.siteUrl}`;
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    entries.push(entry);
  }

  const scrapedAt = new Date().toISOString();
  process.stdout.write(
    JSON.stringify({
      source: "https://directory.llmstxt.cloud/",
      scrapedAt,
      totalPages,
      pagesFetched: pagesToFetch,
      count: entries.length,
      entries,
    })
  );
}

// Entry point: run the scrape; on any failure, log the error to stderr and
// exit with status 1.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});

Build prompt

MEGA-llms.txt!!!!!