Business Reviews

Name: Business Reviews Data Collector | Firecrawl Prometheus
Creator: sideguide
Published: 2026-06-13T21:32:30.097Z
License: https://opensource.org/licenses/MIT

v1 · Published

Extract customer reviews from any business or product review page — reviewer, star rating, date, and full text. Parameter: url.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

reviews
pageTitle	Amazon.com Reviews from Real Customers
sourceUrl	https://www.consumeraffairs.com/online/amazon.html
reviewCount	10

Parameters

--url (string, required): The URL of the business or product review page to extract reviews from, e.g. "https://www.consumeraffairs.com/online/amazon.html"

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers

sideguide (@sideguide)

0 runs in 14d · published 6h ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 6h ago

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";

/**
 * Extract individual customer reviews from a business/product review page.
 *
 * Given any review page URL, this scrapes the page and uses Firecrawl's
 * schema-guided extraction to pull out each review's reviewer name, star
 * rating, review date, and full review text. LLM-backed extraction is used
 * deliberately: review markup differs wildly across sites (Trustpilot, G2,
 * ConsumerAffairs, Amazon, Shopify stores, etc.), and per-review ratings are
 * frequently encoded as images/aria-labels that no single CSS selector can
 * read generically.
 *
 * Output (stdout, JSON only):
 *   {
 *     sourceUrl: string,
 *     pageTitle: string | null,
 *     reviewCount: number,
 *     reviews: Array<{
 *       reviewerName: string | null,
 *       starRating: number | null,   // original scale (typically 1-5)
 *       reviewDate: string | null,   // as displayed on the page
 *       reviewText: string | null
 *     }>
 *   }
 */

/**
 * A single review entry as it arrives in the scrape response, before any
 * validation. Every field is optional and typed `unknown` because the
 * extraction result is not trusted to match the requested schema; the values
 * are narrowed by `toStringOrNull` / `toNumberOrNull` before being emitted.
 */
interface RawReview {
  reviewerName?: unknown;
  starRating?: unknown;
  reviewDate?: unknown;
  reviewText?: unknown;
}

/**
 * A normalized review as written to stdout. Every field is always present;
 * a value that was missing, blank, or of the wrong type is reported as `null`.
 */
interface Review {
  /** Reviewer's display name, trimmed. */
  reviewerName: string | null;
  /** Rating on the page's original scale (typically 1-5). */
  starRating: number | null;
  /** Review date as displayed on the page (not parsed or reformatted). */
  reviewDate: string | null;
  /** Body text of the review, trimmed. */
  reviewText: string | null;
}

// The extraction schema is written as literal JSON Schema instead of Zod: the
// Firecrawl SDK's automatic Zod->JSON Schema conversion is unreliable for
// nested arrays, whereas the extraction endpoint accepts a JSON Schema object
// directly.

// Per-review fields. Each one is nullable so a review that lacks a field is
// still returned rather than dropped or filled with invented data.
const REVIEW_ITEM_PROPERTIES = {
  reviewerName: {
    type: ["string", "null"],
    description: "Display name of the reviewer/customer who wrote the review",
  },
  starRating: {
    type: ["number", "null"],
    description:
      "The star/numeric rating the reviewer gave, on its original scale (e.g. 1-5). Null if the review shows no rating.",
  },
  reviewDate: {
    type: ["string", "null"],
    description: "The date the review was written/posted, exactly as shown on the page",
  },
  reviewText: {
    type: ["string", "null"],
    description: "The complete body text of the review",
  },
} as const;

// Top-level shape requested from the extractor: `{ reviews: [...] }`.
const REVIEW_SCHEMA = {
  type: "object",
  properties: {
    reviews: {
      type: "array",
      items: {
        type: "object",
        properties: REVIEW_ITEM_PROPERTIES,
        required: ["reviewerName", "starRating", "reviewDate", "reviewText"],
      },
    },
  },
  required: ["reviews"],
} as const;

// Natural-language instruction sent alongside the schema. It is kept as one
// sentence per array entry for readability and joined with single spaces.
const EXTRACTION_PROMPT = [
  "Extract every individual customer review listed on this page. For each review capture:",
  "the reviewer's display name, the star rating they gave (as a number on its original scale, e.g. 1-5),",
  "the review date exactly as displayed, and the full, complete review text.",
  "Only include genuine customer reviews — ignore navigation, ads, related products, and editorial copy.",
].join(" ");

/**
 * Normalizes an untrusted value to a trimmed, non-empty string.
 *
 * @param v - Any value taken from the extraction result.
 * @returns The trimmed string, or `null` when `v` is not a string or is
 *   empty/whitespace-only.
 */
function toStringOrNull(v: unknown): string | null {
  if (typeof v !== "string") {
    return null;
  }
  const trimmed = v.trim();
  return trimmed === "" ? null : trimmed;
}

/**
 * Normalizes an untrusted value to a finite number.
 *
 * Finite numbers pass through unchanged. For strings, the first run of digits
 * (optionally signed, optionally with a `.` fraction) is parsed, so text such
 * as "4.5 out of 5" yields 4.5.
 *
 * @param v - Any value taken from the extraction result.
 * @returns The number, or `null` when none can be obtained.
 */
function toNumberOrNull(v: unknown): number | null {
  if (typeof v === "number") {
    return Number.isFinite(v) ? v : null;
  }
  if (typeof v !== "string") {
    return null;
  }

  // Only the first numeric token counts; anything after it is ignored.
  const numericToken = v.match(/-?\d+(\.\d+)?/)?.[0];
  if (numericToken === undefined) {
    return null;
  }

  const value = Number(numericToken);
  return Number.isFinite(value) ? value : null;
}

/**
 * CLI entry point: validates `--url`, runs a schema-guided Firecrawl scrape of
 * that page, normalizes the extracted reviews, and writes a single JSON
 * document to stdout.
 *
 * Exits with status 1 (message on stderr) when `--url` or the
 * `FIRECRAWL_API_KEY` environment variable is missing.
 *
 * @throws Error prefixed with `OUT_OF_SCOPE:` when `--url` cannot be parsed or
 *   is not http(s); a plain Error when the scrape returns no structured data.
 */
async function main(): Promise<void> {
  const { values: cliArgs } = parseArgs({
    strict: true,
    options: { url: { type: "string" } },
  });

  const url = cliArgs.url;
  if (!url || url.trim() === "") {
    console.error("Missing required parameter: --url=<review page URL>");
    process.exit(1);
  }

  // Parse first, then restrict to web URLs; both failures are reported as
  // out-of-scope input rather than as a missing parameter.
  let target: URL;
  try {
    target = new URL(url.trim());
  } catch {
    throw new Error(`OUT_OF_SCOPE: not a valid URL: ${url}`);
  }
  const isHttp = target.protocol === "http:" || target.protocol === "https:";
  if (!isHttp) {
    throw new Error(`OUT_OF_SCOPE: URL must be http(s): ${url}`);
  }
  const targetUrl = target.toString();

  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const client = new Firecrawl({ apiKey });

  // Only the parts of the scrape response that this script reads.
  type ScrapeResult = {
    json?: { reviews?: RawReview[] };
    metadata?: { title?: string };
  };

  const scrapeOptions = {
    formats: [{ type: "json", schema: REVIEW_SCHEMA as unknown as Record<string, unknown>, prompt: EXTRACTION_PROMPT }],
    proxy: "auto",
    integration: "prometheus",
  } as Parameters<typeof client.scrape>[1];

  const res = (await client.scrape(targetUrl, scrapeOptions)) as ScrapeResult;

  if (!res || res.json == null) {
    throw new Error(
      `review extraction returned no structured data for ${targetUrl} (page may be bot-blocked, empty, or not a review page)`,
    );
  }

  const extracted = Array.isArray(res.json.reviews) ? res.json.reviews : [];

  const reviews: Review[] = [];
  for (const raw of extracted) {
    const review: Review = {
      reviewerName: toStringOrNull(raw?.reviewerName),
      starRating: toNumberOrNull(raw?.starRating),
      reviewDate: toStringOrNull(raw?.reviewDate),
      reviewText: toStringOrNull(raw?.reviewText),
    };
    // An entry with neither review text nor a reviewer name is discarded.
    if (review.reviewText !== null || review.reviewerName !== null) {
      reviews.push(review);
    }
  }

  const out = {
    sourceUrl: targetUrl,
    pageTitle: toStringOrNull(res.metadata?.title),
    reviewCount: reviews.length,
    reviews,
  };

  process.stdout.write(JSON.stringify(out));
}

// Top-level failure handler: print only the message (no stack trace) to
// stderr and signal failure through the exit status.
main().catch((err: unknown) => {
  const message = err instanceof Error ? err.message : String(err);
  console.error(message);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.