Prometheus

Top LLM Models

v3 · Published

Top LLM leaderboards and model scoring

Output & API
Author's sample data
date: 2026-06-11
status: ✅ Prometheus Scraper completed — routing map + visualization data ready for In-Tune Layer at 2026-06-11T23:01:28.328Z
pipeline
top_models
routing_map
visualization_data
Marketplace
0 subscribers
metatron@metatron
0 runs in 14d · published 6h ago
Versions
managed by author
v3 · manual update · approved · current · 5h ago
v2 · manual update · approved · 6h ago
v1 · built · rejected · 11h ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { z } from "zod";

// ===========================================================================
// Daily LLM leaderboard aggregation snapshot
//   - normalized scores from 7 leaderboards (6 live, HF archived)
//   - weighted composite rankings
//   - task-routing map (emphasis on coding / agentic work)
//   - visualization data (rank deltas, radar, correlations, time-series seed)
// All network access goes through the Firecrawl SDK. Output is JSON on stdout;
// all diagnostics go to stderr.
// ===========================================================================

// Snapshot date (UTC, YYYY-MM-DD) stamped on the output and on the time-series
// seed. It is taken from the clock at start-up so that every scheduled run
// labels its data with the day it actually ran rather than a frozen date.
const SNAPSHOT_DATE = new Date().toISOString().slice(0, 10);

// Firecrawl credentials come from the environment. Without a key nothing can
// be fetched, so the problem is reported on stderr and the process exits with
// status 1 before the client is constructed.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
  process.stderr.write("FIRECRAWL_API_KEY is not set\n");
  process.exit(1);
}
// Shared SDK client used by getHtml() and parseScale().
const firecrawl = new Firecrawl({ apiKey });

/** Write one diagnostic line to stderr; stdout is reserved for the JSON result. */
function log(msg: string): void {
  process.stderr.write(`${msg}\n`);
}

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

// Scoring dimension a benchmark is filed under. Category scores are what the
// composite ranking and the task profiles are weighted over.
type Category =
  | "human_pref"
  | "reasoning"
  | "coding"
  | "agentic"
  | "math"
  | "efficiency"
  | "academic"
  | "safety";

// One scraped measurement: a single model on a single benchmark of a single
// leaderboard, before any normalization.
type RawScore = {
  leaderboard: string; // source name, as passed in by the parser
  benchmark: string; // benchmark label within that leaderboard
  category: Category;
  model_raw: string; // model name as scraped (not canonicalized)
  organization: string | null; // provider as scraped, when the source shows one
  value: number; // parsed numeric value
  display: string; // raw value as shown on the site
  higherIsBetter: boolean; // false for rank-style metrics where lower wins
};

// Per-leaderboard fetch outcome reported in the output's `leaderboards` list.
type LbStatus = {
  leaderboard: string;
  url: string;
  // "live": at least one row parsed; "error": exception or zero rows parsed;
  // "archived_no_fresh_data": source is listed but deliberately not scraped.
  status: "live" | "archived_no_fresh_data" | "error";
  benchmarks_fetched: number;
  rows_fetched: number;
  note?: string; // human-readable reason for a non-live status
};

// ---------------------------------------------------------------------------
// Model name canonicalization (fuzzy matching + alias table)
// ---------------------------------------------------------------------------

// Strip harness/effort qualifiers, dates and punctuation; split version dots;
// sort tokens so word order doesn't matter ("Claude 4.8 Opus" == "Claude Opus 4.8")
// Tokens dropped from a model name before building its canonical key.
// normKey() splits on "-" before filtering, so the hyphenated "non-thinking"
// entry can never match a token on its own; normKey() removes the
// "non thinking" pair explicitly. The entry is kept for documentation.
const STRIP_TOKENS = new Set([
  "thinking",
  "non-thinking",
  "nonthinking",
  "effort",
  "xhigh",
  "high",
  "medium",
  "low",
  "max",
  "preview",
  "new",
  "unreleased",
  "latest",
  "experimental",
  "chat",
  "instruct",
]);

// alias key (normalized, token-sorted) -> canonical normalized key
const ALIASES: Record<string, string> = {
  "5 fable": "5 claude fable", // Scale: "Fable-5 (Claude Code)"
  "4 7 opus": "4 7 claude opus", // Scale: "Opus-4.7 (Claude Code)"
  "4 8 opus": "4 8 claude opus", // Scale: "Opus 4.8 (Claude Code)"
  "5 mythos": "5 claude mythos",
};

/**
 * Build the canonical lookup key for a scraped model name.
 *
 * The name is lower-cased, stripped of markdown escapes, parenthesized
 * qualifiers, apostrophes and date stamps, split on punctuation and version
 * dots, filtered through STRIP_TOKENS and finally token-sorted, so spelling
 * variants such as "Claude 4.8 Opus" and "Claude Opus 4.8" share one key.
 *
 * @param raw model name as scraped
 * @returns the canonical key (after alias resolution); "" when nothing is left
 */
function normKey(raw: string): string {
  const s = raw
    .toLowerCase()
    .replace(/\\/g, "")
    .replace(/\*+/g, "")
    .replace(/\([^)]*\)/g, " ")
    .replace(/['’]/g, "")
    // Lookahead keeps the second digit available for the next match, so
    // "1.2.3" becomes "1 2 3" rather than "1 2.3".
    .replace(/(\d)\.(?=\d)/g, "$1 ") // 4.8 -> "4 8"
    .replace(/[-_/,:]/g, " ")
    // "non-thinking" / "non_thinking" are now "non thinking": drop the pair,
    // otherwise a stray "non" token would survive in the key.
    .replace(/\bnon\s+thinking\b/g, " ")
    .replace(/\b\d{8}\b/g, " ") // dates like 20250929
    .replace(/\b\d{4} \d{2} \d{2}\b/g, " "); // dates like 2025 12 11
  const tokens = s
    .split(/\s+/)
    .filter((t) => t && !STRIP_TOKENS.has(t))
    .sort();
  const key = tokens.join(" ");
  // Own-property check: a key such as "constructor" must not resolve to a
  // member inherited from Object.prototype.
  const alias = Object.prototype.hasOwnProperty.call(ALIASES, key) ? ALIASES[key] : undefined;
  return alias ?? key;
}

// Registry of canonical display names: key -> preferred display + provider
// canonical key -> display name currently preferred for that model
const displayNames = new Map<string, string>();
// canonical key -> priority of the source that supplied the display name
// (a lower number wins, see registerModel)
const displayPriority = new Map<string, number>();
// canonical key -> provider, either scraped or inferred from the key
const providers = new Map<string, string>();
// canonical key -> every cleaned display spelling registered under that key
const aliasLog = new Map<string, Set<string>>();

// Canonical provider name followed by every lower-case spelling that maps to it.
const PROVIDER_SPELLINGS: Array<[string, string[]]> = [
  ["Anthropic", ["anthropic"]],
  ["OpenAI", ["openai"]],
  ["Google", ["google", "google deepmind"]],
  ["Meta", ["meta"]],
  ["xAI", ["xai"]],
  ["DeepSeek", ["deepseek"]],
  ["Alibaba / Qwen", ["alibaba", "alibaba cloud / qwen team", "qwen"]],
  ["Moonshot AI", ["moonshot ai", "moonshotai"]],
  ["Mistral", ["mistral"]],
  ["Microsoft", ["microsoft"]],
  ["Tencent", ["tencent"]],
  ["ByteDance", ["bytedance"]],
  ["Zhipu AI", ["zhipu", "zhipu ai", "z.ai", "zai"]],
  ["MiniMax", ["minimax"]],
  ["NVIDIA", ["nvidia"]],
  ["Amazon", ["amazon"]],
  ["Cohere", ["cohere"]],
];

// Lookup table: lower-cased, trimmed organization text -> canonical provider.
const PROVIDER_CANON: Record<string, string> = Object.fromEntries(
  PROVIDER_SPELLINGS.flatMap(([canonical, spellings]) =>
    spellings.map((spelling): [string, string] => [spelling, canonical])
  )
);

// Ordered (pattern, provider) rules; the first pattern that matches decides.
const PROVIDER_PATTERNS: Array<[RegExp, string]> = [
  [/\bclaude\b|\bfable\b|\bmythos\b/, "Anthropic"],
  [/\bgpt\b|^o\d| o\d\b|\boss\b/, "OpenAI"],
  [/\bgemini\b|\bgemma\b/, "Google"],
  [/\bgrok\b/, "xAI"],
  [/\bdeepseek\b/, "DeepSeek"],
  [/\bqwen\b/, "Alibaba / Qwen"],
  [/\bkimi\b/, "Moonshot AI"],
  [/\bllama\b|\bmuse\b/, "Meta"],
  [/\bmistral\b|\bmagistral\b/, "Mistral"],
  [/\bminimax\b/, "MiniMax"],
  [/\bglm\b/, "Zhipu AI"],
  [/\bseed\b/, "ByteDance"],
  [/\bnemotron\b/, "NVIDIA"],
  [/\bnova\b/, "Amazon"],
  [/\bmimo\b/, "Xiaomi"],
];

/**
 * Guess a model's provider from family names appearing in its canonical key.
 *
 * @param key canonical (lower-case) model key
 * @returns the provider of the first matching rule, or null when none matches
 */
function inferProvider(key: string): string | null {
  const rule = PROVIDER_PATTERNS.find(([pattern]) => pattern.test(key));
  return rule ? rule[1] : null;
}

/**
 * Record a scraped model name in the registries and return its canonical key.
 *
 * Side effects, all keyed by the canonical key:
 *  - displayNames / displayPriority: the cleaned display name is stored when
 *    the key is new or when this source has a strictly lower priority number
 *    than the one that supplied the current name;
 *  - providers: set once, from `org` when given, otherwise from inferProvider;
 *  - aliasLog: the cleaned spelling is added to the key's set.
 *
 * @param raw model name as scraped
 * @param org organization as scraped, or null when the source shows none
 * @param sourcePriority priority of the calling source (lower number wins)
 * @returns the canonical key; "" when the name normalizes to nothing, in
 *          which case no registry is touched
 */
function registerModel(raw: string, org: string | null, sourcePriority: number): string {
  const key = normKey(raw);
  if (!key) return key;

  // Parenthesized groups are removed only when they hold a harness or effort
  // qualifier; any other parenthetical stays part of the display name.
  const dropQualifierGroup = (group: string, inner: string): string =>
    /fallback|claude code|thinking|x?high|x?low|medium|max\b|effort/i.test(inner) ? "" : group;

  let label = raw.replace(/\\/g, "");
  label = label.replace(/\*+/g, "");
  label = label.replace(/\(([^)]*)\)/g, dropQualifierGroup);
  label = label.replace(/\b(non-)?thinking\b/gi, "");
  label = label.replace(/\bx?(high|low|medium)\b(\s+effort)?/gi, "");
  label = label.replace(/\beffort\b/gi, "");
  label = label.replace(/\s+/g, " ").trim();
  const shown = label || raw.trim();

  // Display name: first-come within the same priority, replaced only by a
  // source with a strictly better (lower) priority.
  const current = displayNames.get(key);
  if (!current || sourcePriority < displayPriority.get(key)!) {
    displayNames.set(key, shown);
    displayPriority.set(key, sourcePriority);
  }

  // Provider: the first registration that can supply one wins.
  if (!providers.has(key)) {
    if (org) {
      providers.set(key, PROVIDER_CANON[org.toLowerCase().trim()] ?? org.trim());
    } else {
      const guessed = inferProvider(key);
      if (guessed) providers.set(key, guessed);
    }
  }

  const spellings = aliasLog.get(key) ?? new Set<string>();
  spellings.add(shown);
  aliasLog.set(key, spellings);
  return key;
}

// ---------------------------------------------------------------------------
// Generic helpers
// ---------------------------------------------------------------------------

// Every raw measurement collected by the parsers (appended through push()).
const scores: RawScore[] = [];
// One status row per leaderboard, appended by runSource() and main().
const statuses: LbStatus[] = [];

/**
 * Round `x` to `d` decimal places (default 2) by scaling, applying Math.round
 * and scaling back.
 */
function round(x: number, d = 2): number {
  const scale = 10 ** d;
  return Math.round(x * scale) / scale;
}

/**
 * Pull the first number out of a table cell.
 *
 * Commas and spaces are removed first, then the first optionally signed
 * integer or decimal is parsed; surrounding text such as "%" or "$" is ignored.
 *
 * @returns the parsed value, or null when there is no number or it is not finite
 */
function toNumber(s: string): number | null {
  const compact = String(s).replace(/[, ]/g, "");
  const hit = /-?\d+(\.\d+)?/.exec(compact);
  if (hit === null) return null;
  const parsed = Number(hit[0]);
  return Number.isFinite(parsed) ? parsed : null;
}

/**
 * Normalize scraped cell text: remove emoji and pictographic symbols, collapse
 * every whitespace run to a single space and trim both ends.
 *
 * The character class is written with explicit escapes so that the variation
 * selector (U+FE0F), which is invisible in most editors, cannot be lost or
 * altered unnoticed:
 *   U+1F000-U+1FFFF  emoji and pictograph planes
 *   U+2600-U+27BF    miscellaneous symbols and dingbats
 *   U+FE0F           emoji variation selector
 */
function cleanText(s: string): string {
  return s
    .replace(/[\u{1F000}-\u{1FFFF}\u2600-\u27BF\uFE0F]/gu, "") // emoji / symbols
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Fetch a page through Firecrawl and return its HTML.
 *
 * The whole page is requested (`onlyMainContent: false`). An empty string is
 * returned when the response carries no `html` field.
 */
async function getHtml(url: string): Promise<string> {
  const response = await firecrawl.scrape(url, {
    formats: ["html"],
    onlyMainContent: false,
    integration: "prometheus",
  });
  const { html } = response as { html?: string };
  return html ?? "";
}

// Known provider display prefixes that some sites glue onto the model id
// [prefix as displayed, canonical provider]. splitGluedOrg() takes the first
// entry that matches, so a longer spelling must stay ahead of its shorter form
// ("Google DeepMind" before "Google", "Moonshot AI" before "Moonshot",
// "Zhipu AI" before "Zhipu").
const ORG_PREFIXES: Array<[string, string]> = [
  ["Google DeepMind", "Google"],
  ["Anthropic", "Anthropic"],
  ["OpenAI", "OpenAI"],
  ["Google", "Google"],
  ["Meta", "Meta"],
  ["xAI", "xAI"],
  ["DeepSeek", "DeepSeek"],
  ["Alibaba", "Alibaba / Qwen"],
  ["Qwen", "Alibaba / Qwen"],
  ["Moonshot AI", "Moonshot AI"],
  ["Moonshot", "Moonshot AI"],
  ["Mistral", "Mistral"],
  ["Microsoft", "Microsoft"],
  ["Tencent", "Tencent"],
  ["ByteDance", "ByteDance"],
  ["Zhipu AI", "Zhipu AI"],
  ["Zhipu", "Zhipu AI"],
  ["MiniMax", "MiniMax"],
  ["NVIDIA", "NVIDIA"],
  ["Amazon", "Amazon"],
  ["Cohere", "Cohere"],
  ["Z.ai", "Zhipu AI"],
  ["Xiaomi", "Xiaomi"],
  ["Baidu", "Baidu"],
];

/**
 * Split a cell in which the provider name is glued onto the front of the
 * model name (for example "AnthropicClaude Opus 4.8").
 *
 * Prefixes are tried in ORG_PREFIXES order, case-insensitively. A prefix is
 * rejected when the remainder starts with a digit or a joiner ("-", ".", "_"):
 * in "Qwen3 Max", "DeepSeek-V3" or "MiniMax-M2" the matched word belongs to
 * the model name, and cutting it off would leave "3 Max", "-V3" or "-M2".
 *
 * NOTE(review): a provider word followed by a space ("Mistral Large") is still
 * treated as a prefix; confirm against the live page whether that is wanted.
 *
 * @param text raw cell text
 * @returns the canonical provider (or null when no prefix applies) and the
 *          cleaned model name
 */
function splitGluedOrg(text: string): { org: string | null; model: string } {
  const t = cleanText(text);
  const lower = t.toLowerCase();
  for (const [prefix, canon] of ORG_PREFIXES) {
    if (t.length <= prefix.length || !lower.startsWith(prefix.toLowerCase())) continue;
    const rest = t.slice(prefix.length);
    // The provider word is the start of a model token, not a separate label.
    if (/^[\d\-._]/.test(rest)) continue;
    return { org: canon, model: rest.trim() };
  }
  return { org: null, model: t };
}

/**
 * Append one raw measurement to the module-level `scores` list.
 *
 * The arguments map one-to-one onto the RawScore fields (`modelRaw` is stored
 * as `model_raw`).
 */
function push(
  leaderboard: string,
  benchmark: string,
  category: Category,
  modelRaw: string,
  organization: string | null,
  value: number,
  display: string,
  higherIsBetter: boolean
): void {
  const row: RawScore = {
    leaderboard,
    benchmark,
    category,
    model_raw: modelRaw,
    organization,
    value,
    display,
    higherIsBetter,
  };
  scores.push(row);
}

// ---------------------------------------------------------------------------
// Per-leaderboard parsers (one per source). Each is wrapped by runSource()
// so a single failure never aborts the snapshot.
// ---------------------------------------------------------------------------

// Description of one scrapeable leaderboard.
type SourceSpec = {
  name: string; // leaderboard name used in scores and status rows
  url: string; // page that is fetched
  priority: number; // display-name priority handed to registerModel (lower wins)
  // Runs the parser; resolves to the number of distinct benchmarks and rows kept.
  run: () => Promise<{ benchmarks: number; rows: number }>;
};

// Upper bound on the rows a parser keeps from one leaderboard.
const MAX_ROWS = 60;

// 1) LMArena (Chatbot Arena) -- human preference (rank-based)
/**
 * Parse the first table of the LMArena leaderboard page.
 *
 * The "overall" column (column 1 when no header matches) is recorded as a
 * human-preference rank and, when a "coding" header exists, that column as a
 * coding rank. Both are lower-is-better. The model cell is column 0 and may
 * carry a glued provider prefix.
 *
 * @returns the number of distinct benchmarks recorded and rows kept
 */
async function parseLmArena(name: string, url: string, priority: number) {
  const $ = cheerio.load(await getHtml(url));
  const table = $("table").first();
  const headers = table
    .find("thead th, tr")
    .first()
    .find("th, td")
    .toArray()
    .map((cell) => cleanText($(cell).text()));

  const overallFound = headers.findIndex((h) => /overall/i.test(h));
  const overallCol = overallFound < 0 ? 1 : overallFound;
  const codingCol = headers.findIndex((h) => /coding/i.test(h));

  const recorded = new Set<string>();
  let kept = 0;
  for (const tr of table.find("tbody tr").toArray()) {
    if (kept >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .toArray()
      .map((cell) => cleanText($(cell).text()));
    if (cells.length <= overallCol) continue;

    const { org, model } = splitGluedOrg(cells[0]);
    if (!model) continue;
    const overallRank = toNumber(cells[overallCol]);
    if (overallRank === null) continue;

    registerModel(model, org, priority);
    push(name, "Arena Overall (rank)", "human_pref", model, org, overallRank, cells[overallCol], false);
    recorded.add("Arena Overall (rank)");

    // The coding column is optional, both per page and per row.
    const codingRank =
      codingCol > 0 && cells.length > codingCol ? toNumber(cells[codingCol]) : null;
    if (codingRank !== null) {
      push(name, "Arena Coding (rank)", "coding", model, org, codingRank, cells[codingCol], false);
      recorded.add("Arena Coding (rank)");
    }
    kept += 1;
  }
  return { benchmarks: recorded.size, rows: kept };
}

// 2) Artificial Analysis -- intelligence index + output speed
/**
 * Parse the first table of the Artificial Analysis models leaderboard.
 *
 * Columns are read by fixed position: 0 = model, 2 = organization,
 * 3 = intelligence index, 5 = output speed. Rows with fewer than six cells,
 * no model name or no intelligence value are skipped; speed is optional.
 *
 * @returns the number of distinct benchmarks recorded and rows kept
 */
async function parseArtificialAnalysis(name: string, url: string, priority: number) {
  const $ = cheerio.load(await getHtml(url));
  const recorded = new Set<string>();
  let kept = 0;
  for (const tr of $("table").first().find("tbody tr").toArray()) {
    if (kept >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .toArray()
      .map((cell) => cleanText($(cell).text()));
    if (cells.length < 6) continue;

    const [model, , orgCell, intelligenceCell, , speedCell] = cells;
    const intelligence = toNumber(intelligenceCell);
    if (!model || intelligence === null) continue;
    const org = orgCell || null;

    registerModel(model, org, priority);
    push(name, "Intelligence Index", "reasoning", model, org, intelligence, intelligenceCell, true);
    recorded.add("Intelligence Index");

    const speed = toNumber(speedCell);
    if (speed !== null) {
      push(name, "Output Speed (tok/s)", "efficiency", model, org, speed, speedCell, true);
      recorded.add("Output Speed (tok/s)");
    }
    kept += 1;
  }
  return { benchmarks: recorded.size, rows: kept };
}

// 3) LiveBench -- multi-category averages
/**
 * Parse the first table of the LiveBench page.
 *
 * Score columns are discovered from the header text (first matching rule per
 * header); when no header matches, a fixed column layout is assumed. A row is
 * used when it has at least four cells, a model name in column 0 and a number
 * in column 2, and it counts as kept when at least one mapped column yields a
 * number.
 *
 * @returns the number of distinct benchmarks recorded and rows kept
 */
async function parseLiveBench(name: string, url: string, priority: number) {
  const $ = cheerio.load(await getHtml(url));
  const table = $("table").first();

  // Rule order matters: "agentic" is tested before "coding" so that an
  // "Agentic Coding" header lands in the agentic category.
  const headerRules: Array<{ pattern: RegExp; benchmark: string; category: Category }> = [
    { pattern: /reasoning/i, benchmark: "Reasoning Average", category: "reasoning" },
    { pattern: /agentic/i, benchmark: "Agentic Coding Average", category: "agentic" },
    { pattern: /coding/i, benchmark: "Coding Average", category: "coding" },
    { pattern: /math/i, benchmark: "Mathematics Average", category: "math" },
    { pattern: /data analysis/i, benchmark: "Data Analysis Average", category: "academic" },
    { pattern: /\bif\b|instruction/i, benchmark: "Instruction Following Average", category: "academic" },
  ];

  const columns: Array<{ idx: number; benchmark: string; category: Category }> = [];
  table
    .find("thead th, tr")
    .first()
    .find("th, td")
    .toArray()
    .forEach((cell, idx) => {
      const title = cleanText($(cell).text());
      const rule = headerRules.find((r) => r.pattern.test(title));
      if (rule) columns.push({ idx, benchmark: rule.benchmark, category: rule.category });
    });
  // Fallback to known LiveBench column order if header detection failed
  if (columns.length === 0) {
    columns.push(
      { idx: 3, benchmark: "Reasoning Average", category: "reasoning" },
      { idx: 4, benchmark: "Coding Average", category: "coding" },
      { idx: 5, benchmark: "Agentic Coding Average", category: "agentic" },
      { idx: 6, benchmark: "Mathematics Average", category: "math" }
    );
  }

  const recorded = new Set<string>();
  let kept = 0;
  for (const tr of table.find("tbody tr").toArray()) {
    if (kept >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .toArray()
      .map((cell) => cleanText($(cell).text()));
    if (cells.length < 4) continue;

    const model = cells[0].split("**")[0].trim();
    const org = cells[1] || null;
    if (!model || toNumber(cells[2]) === null) continue;
    registerModel(model, org, priority);

    let written = 0;
    for (const { idx, benchmark, category } of columns) {
      if (cells.length <= idx) continue;
      const value = toNumber(cells[idx]);
      if (value === null) continue;
      push(name, benchmark, category, model, org, value, cells[idx], true);
      recorded.add(benchmark);
      written += 1;
    }
    if (written > 0) kept += 1;
  }
  return { benchmarks: recorded.size, rows: kept };
}

// 4) Aider polyglot -- coding (percent correct)
/**
 * Parse the Aider polyglot leaderboard (percent of exercises solved).
 *
 * Every table row on the page is inspected. The first cell that is a bare
 * percentage is taken as the score and the cell directly before it as the
 * model name. Only the first row per canonical model key is kept.
 *
 * Changes from the previous version: the unused `modelCell` lookup is gone,
 * the selector is reduced to "table tr" (which already covers "table tbody
 * tr"), and iteration stops once MAX_ROWS rows have been kept.
 *
 * @returns the number of distinct benchmarks recorded and rows kept
 */
async function parseAider(name: string, url: string, priority: number) {
  const html = await getHtml(url);
  const $ = cheerio.load(html);
  let rows = 0;
  const benchmarks = new Set<string>();
  const seen = new Set<string>();
  $("table tr").each((_i, tr) => {
    if (rows >= MAX_ROWS) return false; // returning false stops the iteration
    const cells = $(tr).find("td, th").map((_j, c) => cleanText($(c).text())).get();
    // Real rows look like: [▶, model, "88.0%", "$29.08", command, "91.6%", "diff"]
    const pctIdx = cells.findIndex((c) => /^\$?\d+(\.\d+)?%$/.test(c));
    if (pctIdx < 1) return; // no percentage, or nothing before it to name the model
    const model = cleanText(cells[pctIdx - 1]);
    const pct = toNumber(cells[pctIdx]);
    if (!model || model.length > 80 || pct === null) return;
    const key = normKey(model);
    if (!key || seen.has(key)) return;
    seen.add(key);
    registerModel(model, null, priority);
    push(name, "Polyglot Percent Correct", "coding", model, null, pct, cells[pctIdx], true);
    benchmarks.add("Polyglot Percent Correct");
    rows += 1;
  });
  return { benchmarks: benchmarks.size, rows };
}

// 5) SWE-bench -- agentic software engineering (percent resolved)
/**
 * Parse the first table of the SWE-bench page (percent of issues resolved).
 *
 * The model, score and organization columns are located from the header text;
 * the model falls back to column 1 and the score to column 2 when no header
 * matches. The organization column is optional.
 *
 * Fix: an organization header in column 0 is now honoured. The old check
 * `orgIdx > 0` treated index 0 like the "not found" result, which is -1.
 * Iteration also stops once MAX_ROWS rows have been kept.
 *
 * @returns the number of distinct benchmarks recorded and rows kept
 */
async function parseSweBench(name: string, url: string, priority: number) {
  const html = await getHtml(url);
  const $ = cheerio.load(html);
  const table = $("table").first();
  const headers = table.find("thead th, tr").first().find("th, td")
    .map((_i, e) => cleanText($(e).text())).get();
  let modelIdx = headers.findIndex((h) => /model/i.test(h));
  let resolvedIdx = headers.findIndex((h) => /resolved|%/i.test(h));
  const orgIdx = headers.findIndex((h) => /^org/i.test(h));
  if (modelIdx < 0) modelIdx = 1;
  if (resolvedIdx < 0) resolvedIdx = 2;
  let rows = 0;
  const benchmarks = new Set<string>();
  table.find("tbody tr").each((_i, tr) => {
    if (rows >= MAX_ROWS) return false; // returning false stops the iteration
    const cells = $(tr).find("td, th").map((_j, c) => cleanText($(c).text())).get();
    if (cells.length <= resolvedIdx) return;
    const model = cleanText(cells[modelIdx] || "");
    const resolved = toNumber(cells[resolvedIdx]);
    if (!model || resolved === null) return;
    const org = orgIdx >= 0 && cells[orgIdx] ? cells[orgIdx] : null;
    registerModel(model, org, priority);
    push(name, "SWE-bench % Resolved", "agentic", model, org, resolved, cells[resolvedIdx], true);
    benchmarks.add("SWE-bench % Resolved");
    rows += 1;
  });
  return { benchmarks: benchmarks.size, rows };
}

// 6) Scale SEAL -- coding leaderboard (client-rendered; LLM-backed JSON mode)
// Shape requested from (and validated against) Firecrawl's JSON extraction for
// the Scale page: a list of models, each with an optional or nullable
// organization and score.
const scaleSchema = z.object({
  models: z.array(
    z.object({
      model: z.string(),
      organization: z.string().nullable().optional(),
      score: z.number().nullable().optional(),
    })
  ),
});

/**
 * Read the Scale SEAL coding leaderboard through Firecrawl's JSON extraction.
 *
 * The extraction result is validated with `scaleSchema`; an invalid payload
 * yields zero rows. Only the first MAX_ROWS entries are considered, and
 * entries without a model name or a numeric score are skipped.
 *
 * @returns `benchmarks` is 1 when at least one row was kept, otherwise 0
 */
async function parseScale(name: string, url: string, priority: number) {
  const doc = await firecrawl.scrape(url, {
    formats: [
      {
        type: "json",
        prompt:
          "Extract the leaderboard ranking table. Return an array 'models' where each entry has the model name, its organization/creator if shown, and its numeric score (the primary ranking metric).",
        schema: scaleSchema,
      },
    ],
    onlyMainContent: false,
    integration: "prometheus",
  });
  const result = scaleSchema.safeParse((doc as { json?: unknown }).json);
  if (!result.success) return { benchmarks: 0, rows: 0 };

  let kept = 0;
  const candidates = result.data.models.slice(0, MAX_ROWS);
  for (const { model, organization, score } of candidates) {
    if (!model || typeof score !== "number") continue;
    const org = organization ?? null;
    registerModel(model, org, priority);
    push(name, "SEAL Coding", "coding", model, org, score, String(score), true);
    kept += 1;
  }
  return { benchmarks: kept > 0 ? 1 : 0, rows: kept };
}

// ---------------------------------------------------------------------------
// Source registry (7 leaderboards: 6 live + HF archived)
// ---------------------------------------------------------------------------

// Signature shared by every per-leaderboard parser.
type LeaderboardParser = (
  name: string,
  url: string,
  priority: number
) => Promise<{ benchmarks: number; rows: number }>;

// Build a SourceSpec whose run() calls `parse` with the spec's own name, url
// and priority, so each value is written exactly once.
function defineSource(
  name: string,
  url: string,
  priority: number,
  parse: LeaderboardParser
): SourceSpec {
  return { name, url, priority, run: () => parse(name, url, priority) };
}

// Live sources, in the order they are fetched.
const SOURCES: SourceSpec[] = [
  defineSource("LMArena (Chatbot Arena)", "https://lmarena.ai/leaderboard", 2, parseLmArena),
  defineSource(
    "Artificial Analysis",
    "https://artificialanalysis.ai/leaderboards/models",
    1,
    parseArtificialAnalysis
  ),
  defineSource("LiveBench", "https://livebench.ai/", 1, parseLiveBench),
  defineSource("Aider Polyglot", "https://aider.chat/docs/leaderboards/", 1, parseAider),
  defineSource("SWE-bench", "https://www.swebench.com/", 1, parseSweBench),
  defineSource("Scale SEAL", "https://scale.com/leaderboard/coding", 3, parseScale),
];

// Listed in the status output only; main() records it as archived without
// fetching it.
const HF_SOURCE = {
  name: "HuggingFace Open LLM Leaderboard",
  url: "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
};

/**
 * Run one source and append its outcome to `statuses`.
 *
 * A run that parses at least one row is reported as "live"; a run with zero
 * rows, or one that throws, is reported as "error" with a short note. Errors
 * are caught here, so one failing source does not stop the others.
 */
async function runSource(spec: SourceSpec): Promise<void> {
  const { name, url } = spec;
  try {
    log(`Fetching ${name} ...`);
    const outcome = await spec.run();
    const parsedAny = outcome.rows > 0;
    statuses.push({
      leaderboard: name,
      url,
      status: parsedAny ? "live" : "error",
      benchmarks_fetched: outcome.benchmarks,
      rows_fetched: outcome.rows,
      note: parsedAny ? undefined : "no rows parsed",
    });
    log(`  ${name}: ${outcome.rows} rows / ${outcome.benchmarks} benchmarks`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    statuses.push({
      leaderboard: name,
      url,
      status: "error",
      benchmarks_fetched: 0,
      rows_fetched: 0,
      note: reason.slice(0, 200), // keep the status row compact
    });
    log(`  ${name}: ERROR ${reason}`);
  }
}

// ---------------------------------------------------------------------------
// Aggregation
// ---------------------------------------------------------------------------

// Weight of each category in the composite score (the weights sum to 1).
// compositeOf() renormalizes over the categories a model actually has, so a
// missing category does not pull the composite down. None of the parsers in
// this file emits "safety" scores.
const CATEGORY_WEIGHTS: Record<Category, number> = {
  coding: 0.22,
  agentic: 0.22,
  reasoning: 0.18,
  human_pref: 0.12,
  math: 0.1,
  academic: 0.08,
  efficiency: 0.05,
  safety: 0.03,
};

// Fixed iteration order over all categories. It determines the key order of
// `category_scores` and of the radar axes, and the pairing order of the
// correlation matrix.
const ALL_CATEGORIES: Category[] = [
  "human_pref",
  "reasoning",
  "coding",
  "agentic",
  "math",
  "efficiency",
  "academic",
  "safety",
];

// One raw score after normalization within its leaderboard/benchmark group.
type NormScore = {
  key: string; // canonical model key (normKey of the raw name)
  category: Category;
  normalized: number; // rounded to two decimals by normalizeScores
  leaderboard: string;
  benchmark: string;
};

// Normalize every raw score within its (leaderboard + benchmark) group to 0..100.
/**
 * Min-max normalize every raw score within its (leaderboard, benchmark) group
 * onto 0..100, so that 100 is always the best value in the group.
 *
 * Lower-is-better metrics are inverted. A group whose values are all equal
 * maps to 50. Rows whose model name yields an empty canonical key are
 * dropped. Groups are emitted in first-seen order and rows keep their
 * original order inside a group.
 */
function normalizeScores(): NormScore[] {
  const byGroup = new Map<string, RawScore[]>();
  for (const row of scores) {
    const groupId = `${row.leaderboard}::${row.benchmark}`;
    const bucket = byGroup.get(groupId);
    if (bucket) bucket.push(row);
    else byGroup.set(groupId, [row]);
  }

  const result: NormScore[] = [];
  for (const bucket of byGroup.values()) {
    const values = bucket.map((row) => row.value);
    const lowest = Math.min(...values);
    const highest = Math.max(...values);
    const range = highest - lowest;
    for (const row of bucket) {
      const key = normKey(row.model_raw);
      if (!key) continue;
      // Distance from the worst value in the group, in the metric's own direction.
      const fromWorst = row.higherIsBetter ? row.value - lowest : highest - row.value;
      const scaled = range === 0 ? 50 : (fromWorst / range) * 100;
      result.push({
        key,
        category: row.category,
        normalized: round(scaled, 2),
        leaderboard: row.leaderboard,
        benchmark: row.benchmark,
      });
    }
  }
  return result;
}

// Everything known about one canonical model after aggregation.
type ModelAgg = {
  key: string; // canonical model key
  model: string; // preferred display name (falls back to the key)
  provider: string | null; // registered or inferred provider
  categoryScores: Map<Category, number>; // mean normalized score per category
  leaderboards: Set<string>; // names of leaderboards the model appears on
  benchmarks: Set<string>; // "leaderboard::benchmark" identifiers
};

/**
 * Fold normalized scores into one record per canonical model.
 *
 * For each model this collects the leaderboards and benchmarks it appears on
 * and sets every category score to the plain mean of all normalized values
 * filed under that category, rounded to two decimals. Models are returned in
 * first-seen order.
 */
function aggregate(norm: NormScore[]): ModelAgg[] {
  const models = new Map<string, ModelAgg>();
  const samples = new Map<string, Map<Category, number[]>>();

  for (const { key, category, normalized, leaderboard, benchmark } of norm) {
    let entry = models.get(key);
    if (entry === undefined) {
      entry = {
        key,
        model: displayNames.get(key) ?? key,
        provider: providers.get(key) ?? inferProvider(key),
        categoryScores: new Map(),
        leaderboards: new Set(),
        benchmarks: new Set(),
      };
      models.set(key, entry);
    }
    entry.leaderboards.add(leaderboard);
    entry.benchmarks.add(`${leaderboard}::${benchmark}`);

    const perCategory = samples.get(key) ?? new Map<Category, number[]>();
    samples.set(key, perCategory);
    const values = perCategory.get(category) ?? [];
    perCategory.set(category, values);
    values.push(normalized);
  }

  for (const [key, perCategory] of samples) {
    const entry = models.get(key)!;
    perCategory.forEach((values, category) => {
      const total = values.reduce((sum, v) => sum + v, 0);
      entry.categoryScores.set(category, round(total / values.length, 2));
    });
  }
  return Array.from(models.values());
}

/**
 * Weighted mean of a model's category scores.
 *
 * Only categories the model has a score for take part, and the weights of
 * those categories are renormalized. Returns 0 when no weighted category is
 * covered; otherwise the result is rounded to two decimals.
 */
function compositeOf(agg: ModelAgg, weights: Record<Category, number>): number {
  let weightedTotal = 0;
  let weightTotal = 0;
  ALL_CATEGORIES.forEach((cat) => {
    const score = agg.categoryScores.get(cat);
    if (score !== undefined) {
      const weight = weights[cat] ?? 0;
      weightedTotal += score * weight;
      weightTotal += weight;
    }
  });
  return weightTotal === 0 ? 0 : round(weightedTotal / weightTotal, 2);
}

// ---------------------------------------------------------------------------
// Task routing (emphasis on coding / agentic work)
// ---------------------------------------------------------------------------

// Category weights per task type; each profile's weights sum to 1. The key
// order here is the order of the tasks in the `task_routing` output.
const TASK_PROFILES: Record<string, Partial<Record<Category, number>>> = {
  coding: { coding: 0.6, agentic: 0.25, reasoning: 0.15 },
  agentic_workflows: { agentic: 0.6, coding: 0.25, reasoning: 0.15 },
  reasoning_research: { reasoning: 0.5, math: 0.3, academic: 0.2 },
  math: { math: 0.7, reasoning: 0.3 },
  general_chat: { human_pref: 0.6, reasoning: 0.2, coding: 0.2 },
  cost_efficient_coding: { efficiency: 0.5, coding: 0.3, agentic: 0.2 },
};

/**
 * Score a model against one task profile.
 *
 * The result is the weighted mean over the profile categories the model has a
 * score for, with the weights renormalized over those categories and the
 * value rounded to two decimals.
 *
 * @returns null when the model covers none of the profile's categories or the
 *          covered weights add up to zero
 */
function scoreForProfile(agg: ModelAgg, profile: Partial<Record<Category, number>>): number | null {
  let weightedTotal = 0;
  let weightTotal = 0;
  for (const [cat, weight] of Object.entries(profile) as Array<[Category, number | undefined]>) {
    const score = agg.categoryScores.get(cat);
    if (score === undefined) continue;
    const w = weight ?? 0;
    weightedTotal += score * w;
    weightTotal += w;
  }
  // With no covered category the total weight is zero as well, so this single
  // check handles both "nothing covered" and "only zero weights covered".
  if (weightTotal === 0) return null;
  return round(weightedTotal / weightTotal, 2);
}

// ---------------------------------------------------------------------------
// Visualization helpers
// ---------------------------------------------------------------------------

/**
 * Pearson correlation coefficient of the given (x, y) pairs, rounded to three
 * decimals.
 *
 * @returns null when there are fewer than three pairs, when either series is
 *          constant, or when the input contains non-finite values
 *
 * The one-pass formula n*Sxx - Sx^2 can come out as a tiny negative number or
 * noise-level positive number for a (nearly) constant series. The old
 * `=== 0` guard let those through, which produced NaN (emitted as `null` for
 * `r` in the JSON) or a meaningless ratio. Variances are therefore compared
 * against a small relative threshold and the result is clamped to [-1, 1].
 */
function pearson(pairs: Array<[number, number]>): number | null {
  const n = pairs.length;
  if (n < 3) return null;
  let sx = 0;
  let sy = 0;
  let sxx = 0;
  let syy = 0;
  let sxy = 0;
  for (const [x, y] of pairs) {
    sx += x;
    sy += y;
    sxx += x * x;
    syy += y * y;
    sxy += x * y;
  }
  const cov = n * sxy - sx * sy;
  const varX = n * sxx - sx * sx;
  const varY = n * syy - sy * sy;
  // Relative threshold below which a variance is treated as zero. The negated
  // comparisons also reject NaN.
  const EPS = 1e-12;
  if (!(varX > EPS * Math.max(1, n * sxx))) return null;
  if (!(varY > EPS * Math.max(1, n * syy))) return null;
  const r = cov / (Math.sqrt(varX) * Math.sqrt(varY));
  if (!Number.isFinite(r)) return null;
  // Rounding error can push |r| marginally past 1.
  return round(Math.max(-1, Math.min(1, r)), 3);
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Entry point: fetch every source, aggregate, and print one JSON document on
 * stdout.
 *
 * Steps:
 *  1. run the live sources one after another (runSource records failures as
 *     status rows instead of throwing) and add the archived HuggingFace entry;
 *  2. normalize and aggregate the raw scores per canonical model;
 *  3. build the composite ranking (models seen on at least two leaderboards),
 *     the per-task routing lists and the visualization data;
 *  4. write the result with JSON.stringify.
 *
 * Fix: `normalized_scores` used to look each raw row up with `norm.find` on
 * (key, leaderboard, benchmark). That was quadratic, and whenever two raw
 * names on the same benchmark collapsed to the same canonical key, every such
 * row received the normalized value of the first one. Rows are now paired
 * through a per-identity FIFO queue.
 */
async function main(): Promise<void> {
  for (const spec of SOURCES) {
    await runSource(spec);
  }
  // HuggingFace Open LLM Leaderboard: archived, no fresh data published.
  statuses.push({
    leaderboard: HF_SOURCE.name,
    url: HF_SOURCE.url,
    status: "archived_no_fresh_data",
    benchmarks_fetched: 0,
    rows_fetched: 0,
    note: "Open LLM Leaderboard was archived; no fresh evaluations are published.",
  });

  const norm = normalizeScores();
  const aggs = aggregate(norm);

  // Composite ranking: models present on at least 2 leaderboards.
  const ranked = aggs
    .filter((a) => a.leaderboards.size >= 2)
    .map((a) => ({
      agg: a,
      composite: compositeOf(a, CATEGORY_WEIGHTS),
    }))
    .sort((a, b) => b.composite - a.composite || a.agg.key.localeCompare(b.agg.key));

  const models = ranked.map((r, i) => {
    const catScores: Record<string, number> = {};
    for (const cat of ALL_CATEGORIES) {
      const s = r.agg.categoryScores.get(cat);
      if (s !== undefined) catScores[cat] = s;
    }
    return {
      rank: i + 1,
      model: r.agg.model,
      provider: r.agg.provider,
      key: r.agg.key,
      composite: r.composite,
      coverage: {
        leaderboards: r.agg.leaderboards.size,
        benchmarks: r.agg.benchmarks.size,
        categories: r.agg.categoryScores.size,
      },
      category_scores: catScores,
    };
  });

  const compositeRankByKey = new Map<string, number>();
  models.forEach((m) => compositeRankByKey.set(m.key, m.rank));

  // Task routing: best models per task profile (top 8, ties broken by key).
  const taskRouting: Record<string, Array<{ rank: number; model: string; provider: string | null; score: number }>> = {};
  for (const [task, profile] of Object.entries(TASK_PROFILES)) {
    const scored = aggs
      .map((a) => ({ a, s: scoreForProfile(a, profile) }))
      .filter((x): x is { a: ModelAgg; s: number } => x.s !== null && x.a.leaderboards.size >= 1)
      .sort((x, y) => y.s - x.s || x.a.key.localeCompare(y.a.key))
      .slice(0, 8);
    taskRouting[task] = scored.map((x, i) => ({
      rank: i + 1,
      model: x.a.model,
      provider: x.a.provider,
      score: x.s,
    }));
  }

  // --- Visualization data ---

  // Rank deltas: composite rank vs human-preference rank.
  const humanPref = aggs
    .filter((a) => a.categoryScores.has("human_pref"))
    .sort(
      (a, b) =>
        (b.categoryScores.get("human_pref") ?? 0) - (a.categoryScores.get("human_pref") ?? 0) ||
        a.key.localeCompare(b.key)
    );
  const humanPrefRankByKey = new Map<string, number>();
  humanPref.forEach((a, i) => humanPrefRankByKey.set(a.key, i + 1));
  const rankDeltas = models
    .filter((m) => humanPrefRankByKey.has(m.key))
    .map((m) => {
      const hp = humanPrefRankByKey.get(m.key)!;
      return {
        model: m.model,
        provider: m.provider,
        composite_rank: m.rank,
        human_pref_rank: hp,
        delta: hp - m.rank, // positive: composite ranks higher than human preference
      };
    })
    .sort((a, b) => Math.abs(b.delta) - Math.abs(a.delta) || a.model.localeCompare(b.model))
    .slice(0, 20);

  // Radar: per-category normalized score for the top models.
  const radar = models.slice(0, 10).map((m) => {
    const axes: Record<string, number | null> = {};
    for (const cat of ALL_CATEGORIES) {
      axes[cat] = m.category_scores[cat] ?? null;
    }
    return { model: m.model, provider: m.provider, axes };
  });

  // Correlations: Pearson between category score vectors across shared models.
  const catVecByModel = new Map<string, Map<Category, number>>();
  for (const a of aggs) catVecByModel.set(a.key, a.categoryScores);
  const correlations: Array<{ a: Category; b: Category; r: number; n: number }> = [];
  for (let i = 0; i < ALL_CATEGORIES.length; i++) {
    for (let j = i + 1; j < ALL_CATEGORIES.length; j++) {
      const ca = ALL_CATEGORIES[i];
      const cb = ALL_CATEGORIES[j];
      const pairs: Array<[number, number]> = [];
      for (const [, cv] of catVecByModel) {
        const va = cv.get(ca);
        const vb = cv.get(cb);
        if (va !== undefined && vb !== undefined) pairs.push([va, vb]);
      }
      const r = pearson(pairs);
      if (r !== null) correlations.push({ a: ca, b: cb, r, n: pairs.length });
    }
  }
  correlations.sort((x, y) => Math.abs(y.r) - Math.abs(x.r));

  // Time-series seed: today's composite for the top models (first data point).
  const timeSeriesSeed = models.slice(0, 15).map((m) => ({
    date: SNAPSHOT_DATE,
    model: m.model,
    provider: m.provider,
    composite: m.composite,
  }));

  // Normalized raw scores for transparency.
  // normalizeScores() keeps the rows of one leaderboard/benchmark group in
  // their original relative order, so a FIFO queue per (key, leaderboard,
  // benchmark) identity pairs every raw row with its own normalized value,
  // even when several raw names share one canonical key.
  const identityOf = (key: string, leaderboard: string, benchmark: string): string =>
    JSON.stringify([key, leaderboard, benchmark]);
  const normQueues = new Map<string, number[]>();
  for (const n of norm) {
    const id = identityOf(n.key, n.leaderboard, n.benchmark);
    const queue = normQueues.get(id);
    if (queue) queue.push(n.normalized);
    else normQueues.set(id, [n.normalized]);
  }
  const normCursor = new Map<string, number>();
  const normalizedScores = scores.map((s) => {
    const key = normKey(s.model_raw);
    const id = identityOf(key, s.leaderboard, s.benchmark);
    const at = normCursor.get(id) ?? 0;
    normCursor.set(id, at + 1);
    // Rows dropped by normalizeScores() (empty key) have no queue.
    const normalized = normQueues.get(id)?.[at] ?? null;
    return {
      leaderboard: s.leaderboard,
      benchmark: s.benchmark,
      category: s.category,
      model: displayNames.get(key) ?? s.model_raw,
      key,
      provider: providers.get(key) ?? inferProvider(key),
      value: s.value,
      display: s.display,
      higher_is_better: s.higherIsBetter,
      normalized,
    };
  });

  const liveCount = statuses.filter((s) => s.status === "live").length;

  const output = {
    snapshot_date: SNAPSHOT_DATE,
    summary: {
      leaderboards_total: statuses.length,
      leaderboards_live: liveCount,
      leaderboards_archived: statuses.filter((s) => s.status === "archived_no_fresh_data").length,
      leaderboards_error: statuses.filter((s) => s.status === "error").length,
      raw_score_rows: normalizedScores.length,
      ranked_models: models.length,
    },
    category_weights: CATEGORY_WEIGHTS,
    leaderboards: statuses,
    normalized_scores: normalizedScores,
    composite_rankings: models,
    task_routing: taskRouting,
    visualization: {
      rank_deltas: rankDeltas,
      radar,
      correlations,
      time_series_seed: timeSeriesSeed,
    },
  };

  process.stdout.write(JSON.stringify(output));
}

// Last-resort handler for an error escaping main(): log it on stderr, then
// still emit a minimal valid JSON envelope so downstream consumers never
// break. The envelope carries the status rows collected so far and empty data
// sections; its summary counts every status row as an error.
function reportFatal(e: unknown): void {
  const detail = e instanceof Error ? e.stack || e.message : String(e);
  log("FATAL " + detail);
  const envelope = {
    snapshot_date: SNAPSHOT_DATE,
    summary: {
      leaderboards_total: statuses.length,
      leaderboards_live: 0,
      leaderboards_archived: 0,
      leaderboards_error: statuses.length,
      raw_score_rows: 0,
      ranked_models: 0,
    },
    category_weights: CATEGORY_WEIGHTS,
    leaderboards: statuses,
    normalized_scores: [],
    composite_rankings: [],
    task_routing: {},
    visualization: { rank_deltas: [], radar: [], correlations: [], time_series_seed: [] },
  };
  process.stdout.write(JSON.stringify(envelope));
}

main().catch(reportFatal);
deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.

One person builds it. Everyone keeps it fresh.
Top LLM Models — Prometheus