START.md LINK TERRORIZER!!
v1 · Published · You know a link hates to see me coming!
›Author's sample data
| pages | |
|---|---|
| source | https://start.me/pages |
| pageCount | 40 |
| scrapedAt | 2026-06-11T23:49:04.000Z |
| totalLinks | 16281 |
›Publisher
0 subscribers · Every day at 3:00 AM · 0 runs in 14d · published 4h ago
›Parameters
--max-pages · number · Maximum number of bookmark pages to scrape (BFS-expanded from the directory). Higher values collect more links but take longer. default 40 · e.g. 40
›Versions
managed by author · v1 · built · approved · current · 4h ago
Schedules · deploy to enable
Run this collector on a cadence — daily, hourly, your call.
API endpoint · deploy to unlock
POST to run it on demand and get fresh data in the response.
Activity · deploy to track
0 subscriber runs in the last 14 days.
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
// Read the Firecrawl credential from the environment and abort with exit
// code 1 before doing any work when it is missing or empty.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey === "") {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Shared Firecrawl client used for every scrape request in this script.
const firecrawl = new Firecrawl({ apiKey });
/**
 * Interpret the raw `--max-pages` flag value.
 *
 * @param raw The string supplied on the command line, or `undefined` when the
 *   flag was omitted (in which case the default of 40 applies).
 * @returns The page cap as a positive integer, or `null` when the value is not
 *   a whole number >= 1 (non-numeric text, empty string, zero, negatives,
 *   fractions and infinities are all rejected).
 */
function parseMaxPages(raw: string | undefined): number | null {
  const parsed = Number(raw ?? "40");
  // A page count has to be a whole number: a fractional cap such as 2.5 would
  // otherwise scrape 3 pages while reporting progress as "[1/2.5]".
  return Number.isInteger(parsed) && parsed >= 1 ? parsed : null;
}

// Parse CLI flags; `strict: true` makes parseArgs throw on unknown flags.
const { values: flags } = parseArgs({
  strict: true,
  options: {
    "max-pages": { type: "string" }, // cap on bookmark pages to scrape (default 40)
  },
});

const requestedMaxPages = parseMaxPages(flags["max-pages"]);
if (requestedMaxPages === null) {
  console.error("--max-pages must be a positive integer");
  process.exit(1);
}

// Upper bound on the number of successfully scraped bookmark pages.
const maxPages: number = requestedMaxPages;
// Matches an absolute https://start.me/p/<id>/<slug> URL. Capture group 1 is
// the alphanumeric page id; at least one slug character (anything except
// `"`, `#` or `?`) must follow the id.
const PAGE_RE = /^https:\/\/start\.me\/p\/([A-Za-z0-9]+)\/[^"#?]+/;

/**
 * Extract the page id from a start.me bookmark-page URL.
 *
 * @param url Candidate URL.
 * @returns The id segment captured by `PAGE_RE`, or `null` when the URL does
 *   not match that pattern.
 */
function pageId(url: string): string | null {
  const match = PAGE_RE.exec(url);
  if (match === null) {
    return null;
  }
  return match[1];
}
/**
 * Fetch the raw HTML of a page through the shared Firecrawl client.
 *
 * The HTML is read from `rawHtml` on the response, falling back to
 * `data.rawHtml` when the top-level field is null or undefined.
 *
 * @param url Page to scrape.
 * @returns The raw HTML string.
 * @throws Error when the response carries no (or empty) raw HTML.
 */
async function getRawHtml(url: string): Promise<string> {
  const scrapeOptions = {
    formats: ["rawHtml"],
    integration: "prometheus",
  };
  // Typed as `any` because both response layouts are probed below.
  const response: any = await firecrawl.scrape(url, scrapeOptions);
  const markup = response?.rawHtml ?? response?.data?.rawHtml ?? "";
  if (markup) {
    return markup;
  }
  throw new Error(`no rawHtml returned for ${url}`);
}
/**
 * Gather the start.me bookmark-page links present in a loaded HTML document.
 *
 * Only anchors whose `href` matches `PAGE_RE` are kept. Each kept link has its
 * fragment and query string removed, and the result is de-duplicated by the
 * resulting URL string, in first-seen order.
 *
 * @param $ Cheerio handle for the document.
 * @returns The unique, cleaned page URLs.
 */
function extractPageLinks($: cheerio.CheerioAPI): string[] {
  const unique = new Set<string>();
  for (const anchor of $("a[href]").toArray()) {
    const href = $(anchor).attr("href") || "";
    if (!PAGE_RE.test(href)) continue;
    // Cut at the first "#", then at the first "?" of what remains.
    const withoutFragment = href.split("#")[0];
    const withoutQuery = withoutFragment.split("?")[0];
    unique.add(withoutQuery);
  }
  return Array.from(unique);
}
/**
 * Read the bookmarks listed on a bookmark page.
 *
 * Every `a.bookmark-item__link` anchor with a non-empty (trimmed) `href`
 * contributes one entry; when the same URL appears more than once, only its
 * first occurrence is kept. The title is the anchor's `title` attribute,
 * falling back to its text content, then to an empty string, and is trimmed.
 *
 * @param $ Cheerio handle for the bookmark page.
 * @returns Bookmarks in document order.
 */
function extractBookmarks($: cheerio.CheerioAPI): { url: string; title: string }[] {
  // Map preserves insertion order, which gives first-seen ordering for free.
  const titleByUrl = new Map<string, string>();
  for (const anchor of $("a.bookmark-item__link").toArray()) {
    const link = $(anchor);
    const url = (link.attr("href") || "").trim();
    if (url === "" || titleByUrl.has(url)) continue;
    titleByUrl.set(url, (link.attr("title") || link.text() || "").trim());
  }
  return Array.from(titleByUrl, ([url, title]) => ({ url, title }));
}
/**
 * Return the document's title with the trailing "- Start.me" suffix removed.
 *
 * Uses the text of the first `<title>` element; the suffix match is
 * case-insensitive and tolerates surrounding whitespace.
 *
 * @param $ Cheerio handle for the page.
 * @returns The cleaned, trimmed title.
 */
function pageTitle($: cheerio.CheerioAPI): string {
  const rawTitle = $("title").first().text().trim();
  const withoutSuffix = rawTitle.replace(/\s*-\s*Start\.me\s*$/i, "");
  return withoutSuffix.trim();
}
/**
 * Collector entry point.
 *
 * Loads the start.me pages directory, then walks bookmark pages breadth-first
 * (seeded from the directory and extended with page links found on each
 * scraped page) until `maxPages` pages have been scraped successfully or the
 * queue is exhausted. The result is written to stdout as a single JSON
 * document; progress and diagnostics go to stderr.
 *
 * @throws Error when the directory cannot be fetched or yields no bookmark
 *   page links. Failures on individual bookmark pages are logged and skipped.
 */
async function main(): Promise<void> {
  const seedUrl = "https://start.me/pages";
  console.error(`Loading directory ${seedUrl}`);
  const dir = cheerio.load(await getRawHtml(seedUrl));
  const seeds = extractPageLinks(dir);

  // Seed the queue with one URL per page id. extractPageLinks only
  // de-duplicates by URL string, so the same page can show up under several
  // slugs; discovered pages below are de-duplicated by id, and the seeds must
  // follow the same rule or a page would be scraped more than once.
  const queue: string[] = [];
  const queuedIds = new Set<string>();
  for (const seed of seeds) {
    const id = pageId(seed);
    if (id && !queuedIds.has(id)) {
      queuedIds.add(id);
      queue.push(seed);
    }
  }
  console.error(`Found ${queue.length} featured bookmark pages`);
  if (queue.length === 0) {
    throw new Error("no bookmark page links found on start.me/pages directory");
  }

  const pages: {
    pageUrl: string;
    title: string;
    linkCount: number;
    links: { url: string; title: string }[];
  }[] = [];

  // BFS over bookmark pages, discovering more via each page's "pages-bar".
  // Only successful scrapes count toward maxPages; failed pages are skipped.
  while (queue.length > 0 && pages.length < maxPages) {
    const url = queue.shift()!;
    try {
      console.error(`[${pages.length + 1}/${maxPages}] scraping ${url}`);
      const $ = cheerio.load(await getRawHtml(url));
      const links = extractBookmarks($);
      pages.push({
        pageUrl: url,
        title: pageTitle($),
        linkCount: links.length,
        links,
      });
      // Discover more bookmark pages from this page.
      for (const disc of extractPageLinks($)) {
        const id = pageId(disc);
        if (id && !queuedIds.has(id)) {
          queuedIds.add(id);
          queue.push(disc);
        }
      }
    } catch (err) {
      // Anything can be thrown, so narrow before reading `.message`.
      const reason = err instanceof Error ? err.message : String(err);
      console.error(` skipped ${url}: ${reason}`);
    }
  }

  const totalLinks = pages.reduce((sum, page) => sum + page.linkCount, 0);
  const out = {
    source: seedUrl,
    scrapedAt: new Date().toISOString(),
    pageCount: pages.length,
    totalLinks,
    pages,
  };
  console.error(`Done: ${pages.length} pages, ${totalLinks} links`);
  process.stdout.write(JSON.stringify(out));
}
// Run the collector. Any rejection that escapes main() is printed to stderr
// and the process exits with status 1.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
Build prompt
START.md LINK TERRORIZER!!