Job Listings Search

Name: Job Listings Search Data Collector | Firecrawl Prometheus
Creator: sideguide
Published: 2026-06-13T21:54:49.095Z
License: https://opensource.org/licenses/MIT

v1 · Published

Find current job listings for a query and location — title, company, location, salary, posting date, and URL. Parameters: query, location.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

jobs
count	50
query	software engineer
source	simplyhired.com
location	San Francisco, CA
retrievedAt	2026-06-13T21:54:35.647Z

Parameters

--query (string, required): Job title or keywords to search for, e.g. "software engineer".

--location (string, required): City/region to search in, e.g. "San Francisco, CA".

--max-results (number, optional): Maximum number of job listings to return (default 50).

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers

sideguide (@sideguide)

0 runs in 14d · published 5h ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 5h ago

How this script collects data

import { parseArgs } from "node:util";
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";

// ---------------------------------------------------------------------------
// CLI parameters and environment
// ---------------------------------------------------------------------------

/** Print `message` to stderr and terminate the process with exit code 1. */
function exitWithError(message: string): never {
  console.error(message);
  process.exit(1);
}

// Strict mode makes parseArgs reject any flag not declared below.
const { values } = parseArgs({
  options: {
    query: { type: "string" },
    location: { type: "string" },
    "max-results": { type: "string" },
  },
  strict: true,
});

// Both search terms are whitespace-trimmed; a blank value counts as missing.
const query = (values.query ?? "").trim();
const location = (values.location ?? "").trim();

// --max-results arrives as text. Anything that converts to NaN or 0 falls
// back to 50, and the final value is never allowed below 1.
const requestedMax = Number(values["max-results"] ?? "50");
const maxResults = Math.max(1, requestedMax || 50);

if (!query) exitWithError("Missing required --query parameter");
if (!location) exitWithError("Missing required --location parameter");

// The Firecrawl client cannot be built without an API key from the environment.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) exitWithError("Missing FIRECRAWL_API_KEY environment variable");

const firecrawl = new Firecrawl({ apiKey });

// Base origin used for both search-page URLs and absolute job links.
const ORIGIN = "https://www.simplyhired.com";

/** One job listing as it appears in the collector's JSON output. */
interface Job {
  /** Listing title. */
  title: string;
  /** Hiring company, or null when not available. */
  company: string | null;
  /** Job location text, or null when not available. */
  location: string | null;
  /** Salary text, or null when not available. */
  salary: string | null;
  /** Posting date, or null when not available. */
  datePosted: string | null;
  /** Link to the listing. */
  url: string;
}

/**
 * Compose the SimplyHired search URL for the configured query and location.
 *
 * SimplyHired paginates with an opaque `cursor` token that the previous page
 * exposes in its embedded state.
 *
 * @param cursor - Token for the page to request; null (or an empty string)
 *   leaves the `cursor` parameter off the URL.
 * @returns Absolute search URL with form-encoded `q`, `l` and optional `cursor`.
 */
function buildSearchUrl(cursor: string | null): string {
  const search = new URLSearchParams();
  search.set("q", query);
  search.set("l", location);
  if (cursor !== null && cursor !== "") {
    search.set("cursor", cursor);
  }
  return ORIGIN + "/search?" + search.toString();
}

// SimplyHired ships the full result set as JSON inside the Next.js
// `__NEXT_DATA__` <script> blob, so we parse that rather than scraping
// rendered markup. The blob survives only in rawHtml (html strips <script>).
/**
 * Extract the job listings and the next-page cursor from a search page.
 *
 * Entries that are not objects, or that carry neither a usable `botUrl` nor a
 * `jobKey`, are skipped: there is no way to build a real link for them, and
 * emitting a placeholder URL would make unrelated listings look like
 * duplicates of each other.
 *
 * @param rawHtml - Page HTML that still contains its <script> elements.
 * @returns The listings found on the page, plus the cursor for the following
 *   page (null when the page state has no string cursor for that page number).
 * @throws Error when the `__NEXT_DATA__` blob is missing or empty, is not
 *   valid JSON, or does not contain a `jobs` array.
 */
function parsePage(rawHtml: string): {
  jobs: Job[];
  nextCursor: string | null;
} {
  const $ = cheerio.load(rawHtml);
  const blob = $("#__NEXT_DATA__").first().html();
  if (!blob) {
    throw new Error(
      "no __NEXT_DATA__ blob found on SimplyHired search page",
    );
  }

  let parsed: any;
  try {
    parsed = JSON.parse(blob);
  } catch {
    throw new Error("failed to JSON.parse __NEXT_DATA__ blob");
  }

  const pageProps = parsed?.props?.pageProps;
  const rawJobs = pageProps?.jobs;
  if (!Array.isArray(rawJobs)) {
    throw new Error("no jobs array in SimplyHired page state");
  }

  const jobs: Job[] = [];
  for (const j of rawJobs) {
    // A null or primitive entry has no fields to read; skip it rather than
    // crash the whole page on a property access.
    if (j === null || typeof j !== "object") continue;

    // botUrl is the clean, share-safe relative path (e.g. /job/<key>);
    // fall back to constructing it from the jobKey.
    let path: string;
    if (typeof j.botUrl === "string" && j.botUrl.startsWith("/")) {
      path = j.botUrl;
    } else if (
      (typeof j.jobKey === "string" && j.jobKey !== "") ||
      typeof j.jobKey === "number"
    ) {
      path = `/job/${j.jobKey}`;
    } else {
      // Without either field the link would end in "/job/undefined".
      continue;
    }

    // NOTE(review): assumes dateOnIndeed is an epoch timestamp in
    // milliseconds (that is how `new Date(number)` reads it) — confirm.
    // The date is rendered as a UTC calendar day, YYYY-MM-DD.
    let datePosted: string | null = null;
    if (typeof j.dateOnIndeed === "number" && j.dateOnIndeed > 0) {
      const d = new Date(j.dateOnIndeed);
      if (!Number.isNaN(d.getTime())) {
        datePosted = d.toISOString().slice(0, 10);
      }
    }

    // Blank or whitespace-only salary text is reported as null.
    const salary =
      typeof j.salaryInfo === "string" && j.salaryInfo.trim()
        ? j.salaryInfo.trim()
        : null;

    jobs.push({
      title: typeof j.title === "string" ? j.title : "",
      company: typeof j.company === "string" ? j.company : null,
      location: typeof j.location === "string" ? j.location : null,
      salary,
      datePosted,
      url: `${ORIGIN}${path}`,
    });
  }

  // The next page's cursor is keyed by page number in pageCursors.
  const currentPage = Number(pageProps?.currentPageNumber ?? 1);
  const cursors = pageProps?.pageCursors ?? {};
  const nextKey = String(currentPage + 1);
  const nextCursor =
    typeof cursors?.[nextKey] === "string" ? cursors[nextKey] : null;

  return { jobs, nextCursor };
}

/**
 * Fetch a page through Firecrawl and return its raw HTML.
 *
 * @param url - Page to scrape.
 * @returns The non-empty raw HTML string from the scrape response.
 * @throws Error when the response carries no non-empty `rawHtml` string.
 */
async function scrapeRawHtml(url: string): Promise<string> {
  const response: any = await firecrawl.scrape(url, {
    formats: ["rawHtml"],
    onlyMainContent: false,
    integration: "prometheus",
  });

  // Accept rawHtml either at the top level of the response or under `data`.
  const rawHtml: unknown = response?.rawHtml ?? response?.data?.rawHtml;
  if (typeof rawHtml === "string" && rawHtml.length > 0) {
    return rawHtml;
  }
  throw new Error("SimplyHired scrape returned no rawHtml content");
}

/**
 * Walk the search result pages, collect unique listings up to `maxResults`,
 * and write the result as a single JSON document to stdout.
 *
 * Per-page progress goes to stderr so stdout carries only the JSON payload.
 */
async function main() {
  const uniqueJobs: Job[] = [];
  const seenUrls = new Set<string>();

  // Safety cap on page fetches so a never-ending cursor chain can't loop.
  const pageLimit = Math.min(15, Math.ceil(maxResults / 20) + 1);

  let cursor: string | null = null;
  for (let pageNumber = 1; pageNumber <= pageLimit; pageNumber++) {
    const pageUrl = buildSearchUrl(cursor);
    const pageHtml = await scrapeRawHtml(pageUrl);
    const { jobs: pageJobs, nextCursor } = parsePage(pageHtml);

    // Listings are deduplicated by URL across pages.
    for (const listing of pageJobs) {
      if (!seenUrls.has(listing.url)) {
        seenUrls.add(listing.url);
        uniqueJobs.push(listing);
      }
    }

    console.error(
      `page ${pageNumber}: +${pageJobs.length} jobs (total ${uniqueJobs.length})`,
    );

    // Stop once enough listings are gathered or there is no further page.
    if (uniqueJobs.length >= maxResults || !nextCursor) break;
    cursor = nextCursor;
  }

  // The last page may overshoot the requested count; trim to the limit.
  const jobs = uniqueJobs.slice(0, maxResults);

  const payload = {
    source: "simplyhired.com",
    query,
    location,
    retrievedAt: new Date().toISOString(),
    count: jobs.length,
    jobs,
  };

  process.stdout.write(JSON.stringify(payload));
}

// Run the collector; on any failure print the error message to stderr and
// exit with code 1.
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(message);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.