Product Data Extractor

Name: Product Data Extractor Data Collector | Firecrawl Prometheus
Creator: sideguide
Published: 2026-06-13T21:42:10.491Z
License: https://opensource.org/licenses/MIT

v1 · Published

Extract structured product data from any e-commerce product page — title, brand, price, list price, rating, review count, availability, and image. Parameter: url.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

url	https://www.amazon.com/dp/B0BDHWDR12
brand	Apple
title	Apple AirPods Pro (2nd Gen) Wireless Earbuds
currency	USD
starRating	4.7
reviewCount	57915
availability	In Stock
currentPrice	287.67
mainImageUrl	https://m.media-amazon.com/images/I/21ttIrgHhTL._AC_.jpg
originalPrice	null

Parameters

--url (string, required): The full http(s) URL of the e-commerce product page to extract, e.g. "https://www.amazon.com/dp/B0BDHWDR12"

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers

sideguide (@sideguide)

0 runs in 14d · published 6h ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 6h ago

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";

/**
 * Generic e-commerce product extractor.
 *
 * Given a single product page URL, this scrapes the page once (requesting both
 * the raw HTML and a schema-guided JSON extraction) and returns one normalized
 * product record:
 *
 *   {
 *     url, title, brand, currentPrice, originalPrice, currency,
 *     starRating, reviewCount, availability, mainImageUrl
 *   }
 *
 * Strategy: prefer deterministic schema.org Product JSON-LD when the page
 * embeds it (authoritative for the main product), and fall back per-field to an
 * LLM-backed extraction for the values JSON-LD omits or sites that ship no
 * structured data. LLM extraction is justified here because product-page markup
 * differs wildly across stores and there is no single CSS selector set that
 * works generically.
 */

/**
 * The normalized product record this script writes to stdout as JSON.
 *
 * Every field except `url` is `null` when neither the JSON-LD parse nor the
 * LLM extraction produced a usable value for it.
 */
interface Product {
  /** The scraped page URL, as serialized from the parsed `--url` argument. */
  url: string;
  /** Product name/title. */
  title: string | null;
  /** Brand or manufacturer name. */
  brand: string | null;
  /** Price the buyer pays now, as a bare number (no currency symbol). */
  currentPrice: number | null;
  /** List/struck-through price; populated from the LLM extraction only. */
  originalPrice: number | null;
  /** Currency of the prices (the extraction is asked for an ISO 4217 code). */
  currency: string | null;
  /** Average customer rating, kept on the page's own scale. */
  starRating: number | null;
  /** Number of customer reviews/ratings. */
  reviewCount: number | null;
  /** Readable stock status, e.g. "In Stock". */
  availability: string | null;
  /** URL of the primary product image. */
  mainImageUrl: string | null;
}

// Plain JSON Schema (the extraction endpoint accepts this directly and it is
// more reliable than SDK Zod->JSON-Schema conversion for mixed-null fields).
//
// Sent as the `schema` of the "json" format in the scrape request. The
// property names mirror the Product interface minus `url`. Each property is a
// `[<type>, "null"]` union and is also listed in `required`, so a missing
// value is expressed as an explicit null rather than an absent key.
const PRODUCT_SCHEMA = {
  type: "object",
  properties: {
    title: { type: ["string", "null"], description: "The product's name/title" },
    brand: { type: ["string", "null"], description: "Brand or manufacturer name" },
    currentPrice: {
      type: ["number", "null"],
      description: "The price the buyer pays now, as a number with no currency symbol",
    },
    originalPrice: {
      type: ["number", "null"],
      description:
        "The original/list/struck-through price as a number, only if a discount is shown; otherwise null",
    },
    currency: {
      type: ["string", "null"],
      description: "ISO 4217 currency code of the prices, e.g. USD, EUR, GBP",
    },
    starRating: {
      type: ["number", "null"],
      description: "Average customer star rating on its original scale (e.g. 4.5 out of 5)",
    },
    reviewCount: {
      type: ["number", "null"],
      description: "Number of customer reviews/ratings the product has",
    },
    availability: {
      type: ["string", "null"],
      description: "Stock/availability status text, e.g. 'In Stock', 'Out of Stock', 'Backordered'",
    },
    mainImageUrl: {
      type: ["string", "null"],
      description: "Absolute URL of the main/primary product image",
    },
  },
  // Every key must be present in the response; "no value" is spelled null.
  required: [
    "title",
    "brand",
    "currentPrice",
    "originalPrice",
    "currency",
    "starRating",
    "reviewCount",
    "availability",
    "mainImageUrl",
  ],
} as const;

// Natural-language instruction sent with PRODUCT_SCHEMA as the `prompt` of the
// "json" scrape format. It pins the extraction to the page's main product and
// restates, field by field, what each schema property should contain.
const EXTRACTION_PROMPT =
  "Extract the details of the MAIN product this page is about (the one named in the page title/URL), " +
  "NOT recommended, related, bundled, or 'you may also like' products. " +
  "currentPrice is the price the buyer pays right now. originalPrice is the struck-through/list/was price " +
  "ONLY when a discount is shown, otherwise null. currency as an ISO 4217 code. " +
  "starRating is the average customer rating on its original scale. reviewCount is how many reviews/ratings exist. " +
  "availability is the stock status text. mainImageUrl is the absolute URL of the primary product image.";

/**
 * Coerce a loosely-typed value to a non-empty string.
 *
 * Strings are trimmed and become `null` when nothing is left; finite numbers
 * are stringified; everything else (including NaN/Infinity) is `null`.
 */
function toStringOrNull(v: unknown): string | null {
  switch (typeof v) {
    case "string": {
      const trimmed = v.trim();
      return trimmed === "" ? null : trimmed;
    }
    case "number":
      return Number.isFinite(v) ? String(v) : null;
    default:
      return null;
  }
}

/**
 * Coerce a loosely-typed value (number or price/count text) to a finite number.
 *
 * Numbers pass through when finite. Strings are stripped of everything except
 * digits, `.`, `,` and `-`, then interpreted as follows:
 *
 * - Pure digit grouping is read as an integer: comma groups such as
 *   "57,915" or "1,234,567", and dot groups with at least two separators such
 *   as "1.234.567". Previously these were misread as 57.915 / 1234.567.
 * - Otherwise the last separator is the decimal point and all earlier
 *   separators are dropped ("$1,299.00" -> 1299, "1.299,50" -> 1299.5,
 *   "19,99" -> 19.99).
 *
 * A single dot followed by three digits ("1.234") stays a decimal, and a
 * leading zero group ("0,500") is never treated as grouping.
 *
 * @param v - Value from JSON-LD or the LLM extraction.
 * @returns The parsed number, or `null` when no finite number can be read.
 */
function toNumberOrNull(v: unknown): number | null {
  if (typeof v === "number") return Number.isFinite(v) ? v : null;
  if (typeof v !== "string") return null;

  // Keep digits, separators and minus; strip currency symbols/spaces.
  const cleaned = v.replace(/[^0-9.,-]/g, "");
  if (!cleaned) return null;

  // Unambiguous thousands grouping: 1-3 leading digits (not starting with 0)
  // followed only by 3-digit groups. A lone dot group is excluded because
  // "1.234" is far more often a decimal than one-thousand-odd.
  const commaGrouped = /^-?[1-9]\d{0,2}(?:,\d{3})+$/;
  const dotGrouped = /^-?[1-9]\d{0,2}(?:\.\d{3}){2,}$/;

  let normalized: string;
  if (commaGrouped.test(cleaned) || dotGrouped.test(cleaned)) {
    normalized = cleaned.replace(/[.,]/g, "");
  } else {
    // Treat the last separator as the decimal point; drop thousands separators.
    const lastSep = Math.max(cleaned.lastIndexOf("."), cleaned.lastIndexOf(","));
    if (lastSep === -1) {
      normalized = cleaned;
    } else {
      const intPart = cleaned.slice(0, lastSep).replace(/[.,]/g, "");
      const fracPart = cleaned.slice(lastSep + 1).replace(/[.,]/g, "");
      normalized = `${intPart}.${fracPart}`;
    }
  }

  const n = Number(normalized);
  return Number.isFinite(n) ? n : null;
}

// Readable labels for schema.org ItemAvailability tokens, keyed by the
// lower-cased token. A Map is used (not an object literal) so tokens such as
// "constructor" or "toString" cannot resolve to inherited Object.prototype
// members.
const AVAILABILITY_LABELS: ReadonlyMap<string, string> = new Map([
  ["instock", "In Stock"],
  ["outofstock", "Out of Stock"],
  ["preorder", "Pre-Order"],
  ["presale", "Pre-Sale"],
  ["backorder", "Backordered"],
  ["discontinued", "Discontinued"],
  ["soldout", "Sold Out"],
  ["limitedavailability", "Limited Availability"],
  ["onlineonly", "Online Only"],
  ["instoreonly", "In Store Only"],
]);

/**
 * Normalize an availability value to a readable label.
 *
 * schema.org availability can be a full URL ("https://schema.org/InStock"),
 * a bare token ("InStock") or free text. The last non-empty `/`- or
 * `#`-separated segment is matched case-insensitively against the known
 * tokens; anything unrecognized is returned as the original trimmed text.
 *
 * @param v - Raw availability value from JSON-LD or the LLM extraction.
 * @returns A readable label, the original text when it is not a known token,
 *   or `null` when there is no usable text.
 */
function normalizeAvailability(v: unknown): string | null {
  const s = toStringOrNull(v);
  if (!s) return null;
  // Skip empty segments so a trailing "/" ("https://schema.org/InStock/")
  // does not hide the token.
  const token =
    s
      .split(/[/#]/)
      .filter((part) => part.trim().length > 0)
      .pop()
      ?.trim() ?? s;
  return AVAILABILITY_LABELS.get(token.toLowerCase()) ?? s;
}

/**
 * Return the first usable image URL from a JSON-LD `image` value.
 *
 * A non-array value is resolved directly; for an array, entries are tried in
 * order and the first one that yields a URL wins. Returns `null` when none do.
 */
function firstString(v: unknown): string | null {
  if (!Array.isArray(v)) return firstImageUrl(v);

  for (const entry of v) {
    const url = firstImageUrl(entry);
    if (url) return url;
  }
  return null;
}

/**
 * Resolve a single JSON-LD image entry to a URL string.
 *
 * Image entries in JSON-LD may be a string, an array, or ImageObject(s); this
 * handles the string and object forms (arrays are unpacked by the caller).
 * For an object, `url` is preferred and `contentUrl` is the fallback. The two
 * keys are checked independently so a present-but-blank or non-string `url`
 * no longer masks a valid `contentUrl`.
 *
 * @param v - One image entry.
 * @returns The trimmed URL, or `null` when the entry carries none.
 */
function firstImageUrl(v: unknown): string | null {
  if (typeof v === "string") return toStringOrNull(v);
  if (v === null || typeof v !== "object") return null;

  const o = v as Record<string, unknown>;
  return toStringOrNull(o.url) ?? toStringOrNull(o.contentUrl);
}

/**
 * Read a brand name from a JSON-LD `brand`/`manufacturer` value.
 *
 * Accepts a plain string, an object with a `name`, or an array of either (only
 * the first element is considered). Returns `null` for anything else.
 */
function brandName(v: unknown): string | null {
  // Unwrap (possibly nested) arrays down to their first element.
  let candidate: unknown = v;
  while (Array.isArray(candidate)) {
    candidate = candidate[0];
  }

  if (typeof candidate === "string") return toStringOrNull(candidate);
  if (candidate !== null && typeof candidate === "object") {
    const { name } = candidate as Record<string, unknown>;
    return toStringOrNull(name);
  }
  return null;
}

/**
 * Append every object node found in a JSON-LD document to `out`.
 *
 * Arrays are expanded in order, each object is appended before the contents
 * of its `@graph` array (if it has one), and non-object values are ignored.
 * Only top-level arrays and `@graph` are descended into; other nested
 * properties are not.
 *
 * @param doc - A parsed JSON-LD document (object, array, or anything else).
 * @param out - Accumulator that receives the nodes.
 */
function collectNodes(doc: unknown, out: Record<string, unknown>[]): void {
  // Explicit stack; children are pushed in reverse so they pop in source order.
  const pending: unknown[] = [doc];

  while (pending.length > 0) {
    const current = pending.pop();

    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i -= 1) {
        pending.push(current[i]);
      }
      continue;
    }
    if (current === null || typeof current !== "object") continue;

    const node = current as Record<string, unknown>;
    out.push(node);

    const graph = node["@graph"];
    if (Array.isArray(graph)) pending.push(graph);
  }
}

/**
 * Tell whether a JSON-LD node declares the given `@type`.
 *
 * `@type` may be a single string or an array; the comparison is an exact,
 * case-sensitive match. Any other `@type` shape yields `false`.
 */
function hasType(node: Record<string, unknown>, type: string): boolean {
  const declared = node["@type"];
  return Array.isArray(declared)
    ? declared.includes(type)
    : typeof declared === "string" && declared === type;
}

/**
 * The fields `parseJsonLd` recovers from a schema.org Product node.
 *
 * Same shape as `Product` minus `url` and `originalPrice`; each field is
 * `null` when the JSON-LD does not carry a usable value for it.
 */
interface JsonLdProduct {
  /** From the Product node's `name`. */
  title: string | null;
  /** From `brand`, falling back to `manufacturer`. */
  brand: string | null;
  /** From the selected offer's price fields. */
  currentPrice: number | null;
  /** From the selected offer's `priceCurrency`. */
  currency: string | null;
  /** From `aggregateRating.ratingValue`. */
  starRating: number | null;
  /** From `aggregateRating.reviewCount`, falling back to `ratingCount`. */
  reviewCount: number | null;
  /** From the selected offer's `availability`, normalized to a label. */
  availability: string | null;
  /** From the Product node's `image`. */
  mainImageUrl: string | null;
}

/**
 * Extract the main product's fields from schema.org JSON-LD embedded in a page.
 *
 * Every `<script type="application/ld+json">` block is parsed (malformed
 * blocks are skipped), the resulting nodes are flattened, and the first node
 * typed `Product` is mapped to a `JsonLdProduct`.
 *
 * @param html - Page markup to search.
 * @returns The mapped fields (each `null` when absent), or `null` when the
 *   page has no Product node.
 */
function parseJsonLd(html: string): JsonLdProduct | null {
  // Narrow an unknown value to a plain record, or undefined if it is not an object.
  const asRecord = (x: unknown): Record<string, unknown> | undefined =>
    x !== null && typeof x === "object" ? (x as Record<string, unknown>) : undefined;

  const $ = cheerio.load(html);
  const nodes: Record<string, unknown>[] = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    const payload = $(el).contents().text().trim();
    if (payload.length === 0) return;
    try {
      collectNodes(JSON.parse(payload), nodes);
    } catch {
      // Malformed JSON-LD block: skip it and keep scanning the others.
    }
  });

  const product = nodes.find((node) => hasType(node, "Product"));
  if (product === undefined) return null;

  // offers can be an Offer, an array of Offers, or an AggregateOffer; for an
  // array the first object entry is used.
  const rawOffers = product.offers;
  const offer = Array.isArray(rawOffers)
    ? asRecord(rawOffers.find((entry) => asRecord(entry) !== undefined))
    : asRecord(rawOffers);

  let currentPrice: number | null = null;
  let currency: string | null = null;
  let availability: string | null = null;
  if (offer !== undefined) {
    currentPrice = toNumberOrNull(offer.price ?? offer.lowPrice ?? offer.highPrice);
    currency = toStringOrNull(offer.priceCurrency);
    availability = normalizeAvailability(offer.availability);

    // Only consult priceSpecification when the offer itself had no price.
    if (currentPrice === null) {
      const specSource = offer.priceSpecification;
      const spec = asRecord(Array.isArray(specSource) ? specSource[0] : specSource);
      if (spec !== undefined) {
        currentPrice = toNumberOrNull(spec.price);
        if (currency === null) currency = toStringOrNull(spec.priceCurrency);
      }
    }
  }

  const rating = asRecord(product.aggregateRating);
  const starRating = rating ? toNumberOrNull(rating.ratingValue) : null;
  const reviewCount = rating ? toNumberOrNull(rating.reviewCount ?? rating.ratingCount) : null;

  return {
    title: toStringOrNull(product.name),
    brand: brandName(product.brand) ?? brandName(product.manufacturer),
    currentPrice,
    currency,
    starRating,
    reviewCount,
    availability,
    mainImageUrl: firstString(product.image),
  };
}

/**
 * Resolve a possibly relative URL reference against the page URL.
 *
 * References that already carry a scheme (`https:`, `data:`, ...) are returned
 * untouched; relative and protocol-relative ones are resolved against `base`.
 * If resolution fails the original text is returned rather than dropped.
 */
function absolutizeUrl(ref: string | null, base: URL): string | null {
  if (ref === null) return null;
  if (/^[a-z][a-z0-9+.-]*:/i.test(ref)) return ref;
  try {
    return new URL(ref, base).toString();
  } catch {
    return ref;
  }
}

/**
 * CLI entry point: scrape one product page and print a `Product` as JSON.
 *
 * Steps: validate `--url`, require `FIRECRAWL_API_KEY`, scrape the page once
 * (raw HTML plus schema-guided JSON extraction), merge JSON-LD with the LLM
 * extraction, and write the record to stdout. Progress messages go to stderr
 * so stdout carries only the JSON.
 *
 * Exits with code 1 when `--url` or the API key is missing.
 *
 * @throws Error prefixed `OUT_OF_SCOPE:` when `--url` is not a valid http(s)
 *   URL, and a plain Error when no title, price or image could be extracted.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    strict: true,
    options: { url: { type: "string" } },
  });

  const rawUrl = values.url;
  if (!rawUrl || rawUrl.trim().length === 0) {
    console.error("Missing required parameter: --url=<product page URL>");
    process.exit(1);
  }

  let target: URL;
  try {
    target = new URL(rawUrl.trim());
  } catch {
    throw new Error(`OUT_OF_SCOPE: not a valid URL: ${rawUrl}`);
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    throw new Error(`OUT_OF_SCOPE: URL must use http or https: ${rawUrl}`);
  }

  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const firecrawl = new Firecrawl({ apiKey });

  // One request yields both inputs: raw HTML for the JSON-LD parse and the
  // schema-guided JSON extraction.
  console.error(`Scraping ${target.toString()}`);
  const res = (await firecrawl.scrape(target.toString(), {
    formats: [
      "rawHtml",
      { type: "json", schema: PRODUCT_SCHEMA as unknown as Record<string, unknown>, prompt: EXTRACTION_PROMPT },
    ],
    proxy: "auto",
    integration: "prometheus",
  } as Parameters<typeof firecrawl.scrape>[1])) as {
    rawHtml?: string;
    html?: string;
    json?: Record<string, unknown>;
  };

  const html = res.rawHtml ?? res.html ?? "";
  const ld = html ? parseJsonLd(html) : null;
  if (ld) console.error("Found schema.org Product JSON-LD");

  const llm = (res.json ?? {}) as Record<string, unknown>;

  // Merge: JSON-LD (deterministic, authoritative for the main product) wins;
  // LLM extraction fills any gaps and supplies fields JSON-LD rarely carries
  // (e.g. the original/list price).
  const product: Product = {
    url: target.toString(),
    title: ld?.title ?? toStringOrNull(llm.title),
    brand: ld?.brand ?? toStringOrNull(llm.brand),
    currentPrice: ld?.currentPrice ?? toNumberOrNull(llm.currentPrice),
    originalPrice: toNumberOrNull(llm.originalPrice),
    currency: ld?.currency ?? toStringOrNull(llm.currency),
    starRating: ld?.starRating ?? toNumberOrNull(llm.starRating),
    reviewCount: ld?.reviewCount ?? toNumberOrNull(llm.reviewCount),
    availability: ld?.availability ?? normalizeAvailability(llm.availability),
    // The schema promises an absolute image URL, but the value may be
    // relative or protocol-relative ("//cdn..."), so resolve it against the page.
    mainImageUrl: absolutizeUrl(ld?.mainImageUrl ?? toStringOrNull(llm.mainImageUrl), target),
  };

  // If absolutely nothing identifying came back, the page is very likely not a
  // product page, was bot-blocked, or rendered empty.
  if (product.title === null && product.currentPrice === null && product.mainImageUrl === null) {
    throw new Error(
      `could not extract product details from ${target.toString()} (page may not be a product page, was bot-blocked, or rendered empty)`,
    );
  }

  process.stdout.write(JSON.stringify(product));
}

// Run the script. Any failure is reported on stderr (message only for Error
// instances) and the process exits with status 1.
main().catch((failure: unknown) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.