Product Data Extractor
v1 (Published). Extract structured product data from any e-commerce product page — title, brand, price, list price, rating, review count, availability, and image. Parameter: url.
Output & API
Preview the latest data, download it, or call this collector as an API.
| url | https://www.amazon.com/dp/B0BDHWDR12 |
|---|---|
| brand | Apple |
| title | Apple AirPods Pro (2nd Gen) Wireless Earbuds |
| currency | USD |
| starRating | 4.7 |
| reviewCount | 57915 |
| availability | In Stock |
| currentPrice | 287.67 |
| mainImageUrl | https://m.media-amazon.com/images/I/21ttIrgHhTL._AC_.jpg |
| originalPrice | null |
Marketplace
Publish this collector so others can deploy it — you keep ownership.
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
/**
* Generic e-commerce product extractor.
*
* Given a single product page URL, this scrapes the page once (requesting both
* the raw HTML and a schema-guided JSON extraction) and returns one normalized
* product record:
*
* {
* url, title, brand, currentPrice, originalPrice, currency,
* starRating, reviewCount, availability, mainImageUrl
* }
*
* Strategy: prefer deterministic schema.org Product JSON-LD when the page
* embeds it (authoritative for the main product), and fall back per-field to an
* LLM-backed extraction for the values JSON-LD omits or sites that ship no
* structured data. LLM extraction is justified here because product-page markup
* differs wildly across stores and there is no single CSS selector set that
* works generically.
*/
/** The normalized product record this script serializes to stdout. Every field except `url` is null when it could not be extracted. */
interface Product {
/** The validated page URL that was scraped. */
url: string;
/** Product name/title. */
title: string | null;
/** Brand or manufacturer name. */
brand: string | null;
/** Price the buyer pays now, as a plain number. */
currentPrice: number | null;
/** List/struck-through price; filled from the LLM extraction only, never from JSON-LD. */
originalPrice: number | null;
/** Currency code of the prices (the extraction schema asks for ISO 4217). */
currency: string | null;
/** Average customer rating on the site's own scale. */
starRating: number | null;
/** Number of customer reviews/ratings. */
reviewCount: number | null;
/** Stock status as a readable label or free text. */
availability: string | null;
/** URL of the main product image. */
mainImageUrl: string | null;
}
// JSON Schema describing the fields requested from the LLM-backed extraction.
// It is written as plain JSON Schema because the extraction endpoint accepts
// it directly, which is more reliable than SDK Zod->JSON-Schema conversion for
// mixed-null fields. Every field is nullable and every field is required, so
// the extractor must answer each one explicitly (with null when unknown).
const NULLABLE_STRING = ["string", "null"] as const;
const NULLABLE_NUMBER = ["number", "null"] as const;
const PRODUCT_SCHEMA = {
  type: "object",
  properties: {
    title: { type: NULLABLE_STRING, description: "The product's name/title" },
    brand: { type: NULLABLE_STRING, description: "Brand or manufacturer name" },
    currentPrice: {
      type: NULLABLE_NUMBER,
      description: "The price the buyer pays now, as a number with no currency symbol",
    },
    originalPrice: {
      type: NULLABLE_NUMBER,
      description:
        "The original/list/struck-through price as a number, only if a discount is shown; otherwise null",
    },
    currency: {
      type: NULLABLE_STRING,
      description: "ISO 4217 currency code of the prices, e.g. USD, EUR, GBP",
    },
    starRating: {
      type: NULLABLE_NUMBER,
      description: "Average customer star rating on its original scale (e.g. 4.5 out of 5)",
    },
    reviewCount: {
      type: NULLABLE_NUMBER,
      description: "Number of customer reviews/ratings the product has",
    },
    availability: {
      type: NULLABLE_STRING,
      description: "Stock/availability status text, e.g. 'In Stock', 'Out of Stock', 'Backordered'",
    },
    mainImageUrl: {
      type: NULLABLE_STRING,
      description: "Absolute URL of the main/primary product image",
    },
  },
  required: [
    "title",
    "brand",
    "currentPrice",
    "originalPrice",
    "currency",
    "starRating",
    "reviewCount",
    "availability",
    "mainImageUrl",
  ],
} as const;
// Natural-language instructions sent alongside the schema. One sentence per
// entry; the sentences are joined with single spaces into the final prompt.
const EXTRACTION_PROMPT = [
  "Extract the details of the MAIN product this page is about (the one named in the page title/URL), NOT recommended, related, bundled, or 'you may also like' products.",
  "currentPrice is the price the buyer pays right now.",
  "originalPrice is the struck-through/list/was price ONLY when a discount is shown, otherwise null.",
  "currency as an ISO 4217 code.",
  "starRating is the average customer rating on its original scale.",
  "reviewCount is how many reviews/ratings exist.",
  "availability is the stock status text.",
  "mainImageUrl is the absolute URL of the primary product image.",
].join(" ");
/**
 * Coerces a loosely-typed value into a trimmed, non-empty string.
 *
 * @param v Any value, typically a field read from parsed JSON.
 * @returns The trimmed string, the decimal rendering of a finite number, or
 *   null for blank strings, non-finite numbers and every other type.
 */
function toStringOrNull(v: unknown): string | null {
  switch (typeof v) {
    case "string": {
      const trimmed = v.trim();
      return trimmed === "" ? null : trimmed;
    }
    case "number":
      return Number.isFinite(v) ? String(v) : null;
    default:
      return null;
  }
}
/**
 * Parses a number out of a loosely-formatted value such as "$1,299.00",
 * "1.299,50 €" or "57,915".
 *
 * Finite numbers pass through unchanged. For strings, everything except
 * digits, ".", "," and "-" is stripped, trailing separators (e.g. a sentence
 * period) are dropped, and the remaining separators are classified:
 *
 * - both "." and "," present: the last one is the decimal point, the others
 *   are grouping separators ("1,299.00", "1.299,00");
 * - one separator kind occurring several times: all are grouping separators
 *   ("1,234,567", "1.299.000");
 * - a single comma followed by exactly three digits after a 1-3 digit integer
 *   part not starting with 0: grouping separator ("57,915" -> 57915);
 * - otherwise the single separator is the decimal point ("12,5", "4.7").
 *
 * A single "." is always read as a decimal point ("1.299" stays 1.299), since
 * that is how structured data normally writes decimals.
 *
 * @param v Any value, typically a price/rating/count field from parsed JSON.
 * @returns The parsed finite number, or null when nothing numeric is found.
 */
function toNumberOrNull(v: unknown): number | null {
  if (typeof v === "number") return Number.isFinite(v) ? v : null;
  if (typeof v !== "string") return null;

  // Keep digits, separators and minus; strip currency symbols/spaces. A
  // trailing separator never carries information, so drop it as well.
  const cleaned = v.replace(/[^0-9.,-]/g, "").replace(/[.,]+$/, "");
  if (!cleaned) return null;

  const lastDot = cleaned.lastIndexOf(".");
  const lastComma = cleaned.lastIndexOf(",");
  const lastSep = Math.max(lastDot, lastComma);

  let normalized: string;
  if (lastSep === -1) {
    normalized = cleaned;
  } else {
    const sep = cleaned.charAt(lastSep);
    const head = cleaned.slice(0, lastSep);
    const tail = cleaned.slice(lastSep + 1);
    const mixed = lastDot !== -1 && lastComma !== -1;
    const repeated = head.includes(sep);
    // "57,915" / "-1,500": one comma, three trailing digits, short integer part.
    const loneGroupingComma =
      sep === "," && !repeated && tail.length === 3 && /^-?[1-9]\d{0,2}$/.test(head);
    const lastIsGrouping = !mixed && (repeated || loneGroupingComma);
    const intDigits = head.replace(/[.,]/g, "");
    normalized = lastIsGrouping ? `${intDigits}${tail}` : `${intDigits}.${tail}`;
  }

  const n = Number(normalized);
  return Number.isFinite(n) ? n : null;
}
/**
 * Normalizes an availability value to a readable label.
 *
 * schema.org availability can be a full URL ("https://schema.org/InStock"),
 * a bare token ("InStock") or free text. The last non-empty "/"- or
 * "#"-separated segment is matched case-insensitively against the known
 * schema.org ItemAvailability tokens; anything unrecognized is returned
 * unchanged (trimmed), so free text such as "Ships in 2 days" survives.
 *
 * @param v Raw availability value (string, number, or anything else).
 * @returns A readable label, the original text, or null when the value is
 *   empty or not string-like.
 */
function normalizeAvailability(v: unknown): string | null {
  const s = toStringOrNull(v);
  if (!s) return null;

  // A Map (rather than a plain object) keeps tokens such as "constructor"
  // from resolving to inherited Object.prototype members.
  const labels = new Map<string, string>([
    ["instock", "In Stock"],
    ["outofstock", "Out of Stock"],
    ["preorder", "Pre-Order"],
    ["presale", "Pre-Sale"],
    ["backorder", "Backordered"],
    ["discontinued", "Discontinued"],
    ["soldout", "Sold Out"],
    ["limitedavailability", "Limited Availability"],
    ["onlineonly", "Online Only"],
    ["instoreonly", "In Store Only"],
    ["madetoorder", "Made to Order"],
    ["reserved", "Reserved"],
  ]);

  // Ignore empty segments so a trailing slash ("…/InStock/") still resolves.
  const segments = s.split(/[/#]/).filter((part) => part.trim().length > 0);
  const token = (segments.pop() ?? s).trim().toLowerCase();
  return labels.get(token) ?? s;
}
/**
 * Returns the first usable image URL from a JSON-LD `image` value.
 *
 * @param v A single image entry or an array of entries.
 * @returns The first entry that resolves to a URL, or null if none does.
 */
function firstString(v: unknown): string | null {
  // Treat a lone value as a one-element list so both shapes share one loop.
  const candidates: unknown[] = Array.isArray(v) ? v : [v];
  for (const candidate of candidates) {
    const url = firstImageUrl(candidate);
    if (url) return url;
  }
  return null;
}
/**
 * Extracts a URL from one JSON-LD image entry.
 *
 * Image entries in JSON-LD may be a plain string or an ImageObject carrying
 * `url` and/or `contentUrl`. `url` is preferred; `contentUrl` is used whenever
 * `url` is missing or unusable (empty string, nested object, ...), not only
 * when it is null/undefined.
 *
 * @param v One image entry.
 * @returns The trimmed URL string, or null when the entry carries none.
 */
function firstImageUrl(v: unknown): string | null {
  if (typeof v === "string") return toStringOrNull(v);
  if (v && typeof v === "object") {
    const o = v as Record<string, unknown>;
    return toStringOrNull(o.url) ?? toStringOrNull(o.contentUrl);
  }
  return null;
}
/**
 * Reads a brand name from a JSON-LD `brand`/`manufacturer` value.
 *
 * @param v A plain string, an object with a `name`, or an array of either
 *   (only the first element of an array is considered).
 * @returns The trimmed name, or null when none is present.
 */
function brandName(v: unknown): string | null {
  if (Array.isArray(v)) return brandName(v[0]);
  if (typeof v === "string") return toStringOrNull(v);
  if (typeof v === "object" && v !== null) {
    const { name } = v as Record<string, unknown>;
    return toStringOrNull(name);
  }
  return null;
}
/**
 * Flattens one or more JSON-LD documents into a list of nodes so the caller
 * can scan for the Product node. Top-level arrays are unwrapped and each
 * object's `@graph` array is descended into; an object is appended before the
 * members of its graph. Non-object values are skipped.
 *
 * @param doc Parsed JSON-LD: an object, an array, or anything else.
 * @param out Accumulator that receives the nodes in document order.
 */
function collectNodes(doc: unknown, out: Record<string, unknown>[]): void {
  // Explicit work stack; array items are pushed in reverse so they are
  // popped (and therefore emitted) in their original order.
  const pending: unknown[] = [doc];
  while (pending.length > 0) {
    const current = pending.pop();
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i--) pending.push(current[i]);
      continue;
    }
    if (typeof current !== "object" || current === null) continue;
    const node = current as Record<string, unknown>;
    out.push(node);
    const graph = node["@graph"];
    if (Array.isArray(graph)) pending.push(graph);
  }
}
/**
 * Tells whether a JSON-LD node declares the given type.
 *
 * `@type` may be a single string or an array of strings, and each entry may be
 * a bare term ("Product"), a full IRI ("https://schema.org/Product") or a
 * compact IRI ("schema:Product"). An entry matches when it equals `type`
 * exactly or when its local name (the text after the last "/", "#" or ":")
 * does.
 *
 * @param node A JSON-LD node.
 * @param type The bare type name to look for, e.g. "Product".
 * @returns True when any declared type matches; false otherwise, including
 *   when `@type` is missing or not string-like.
 */
function hasType(node: Record<string, unknown>, type: string): boolean {
  const matches = (candidate: unknown): boolean => {
    if (typeof candidate !== "string") return false;
    if (candidate === type) return true;
    const localStart =
      Math.max(
        candidate.lastIndexOf("/"),
        candidate.lastIndexOf("#"),
        candidate.lastIndexOf(":"),
      ) + 1;
    return candidate.slice(localStart) === type;
  };

  const declared = node["@type"];
  return Array.isArray(declared) ? declared.some(matches) : matches(declared);
}
/** Product fields read from a schema.org Product JSON-LD node by parseJsonLd. Has no `url` or `originalPrice`; any field the node does not supply is null. */
interface JsonLdProduct {
/** From the Product node's `name`. */
title: string | null;
/** From `brand`, falling back to `manufacturer`. */
brand: string | null;
/** From the selected offer's `price`, `lowPrice`, `highPrice` or price specification. */
currentPrice: number | null;
/** From the selected offer's `priceCurrency` or its price specification. */
currency: string | null;
/** From `aggregateRating.ratingValue`. */
starRating: number | null;
/** From `aggregateRating.reviewCount`, falling back to `ratingCount`. */
reviewCount: number | null;
/** From the selected offer's `availability`, normalized to a label. */
availability: string | null;
/** First usable URL found in the Product node's `image`. */
mainImageUrl: string | null;
}
/**
 * Extracts product fields from the schema.org Product JSON-LD embedded in a
 * page, if any.
 *
 * Every `<script type="application/ld+json">` block is parsed (malformed
 * blocks are skipped), all nodes are flattened, and the first node typed
 * "Product" is used. Pricing and availability come from its first offer
 * object; rating data comes from `aggregateRating`.
 *
 * @param html Raw HTML of the product page.
 * @returns The fields found on the Product node (missing ones are null), or
 *   null when the page has no Product node.
 */
function parseJsonLd(html: string): JsonLdProduct | null {
  const $ = cheerio.load(html);

  // Gather the nodes of every JSON-LD script block on the page.
  const nodes: Record<string, unknown>[] = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    const raw = $(el).contents().text().trim();
    if (raw.length === 0) return;
    try {
      collectNodes(JSON.parse(raw), nodes);
    } catch {
      // Malformed JSON-LD blocks are deliberately skipped.
    }
  });

  const product = nodes.find((node) => hasType(node, "Product"));
  if (product === undefined) return null;

  // `offers` can be an Offer, an array of Offers, or an AggregateOffer; take
  // the first entry that is an object.
  const rawOffers = product.offers;
  const offerCandidates: unknown[] = Array.isArray(rawOffers) ? rawOffers : [rawOffers];
  const offer = offerCandidates.find(
    (candidate) => typeof candidate === "object" && candidate !== null,
  ) as Record<string, unknown> | undefined;

  let currentPrice: number | null = null;
  let currency: string | null = null;
  let availability: string | null = null;
  if (offer !== undefined) {
    currentPrice = toNumberOrNull(offer.price ?? offer.lowPrice ?? offer.highPrice);
    currency = toStringOrNull(offer.priceCurrency);
    availability = normalizeAvailability(offer.availability);

    // Some sites put the price only inside a price specification.
    const spec = offer.priceSpecification;
    if (currentPrice === null && spec) {
      const firstSpec: unknown = Array.isArray(spec) ? spec[0] : spec;
      if (typeof firstSpec === "object" && firstSpec !== null) {
        const specFields = firstSpec as Record<string, unknown>;
        currentPrice = toNumberOrNull(specFields.price);
        currency = currency ?? toStringOrNull(specFields.priceCurrency);
      }
    }
  }

  let starRating: number | null = null;
  let reviewCount: number | null = null;
  const rating = product.aggregateRating;
  if (typeof rating === "object" && rating !== null) {
    const ratingFields = rating as Record<string, unknown>;
    starRating = toNumberOrNull(ratingFields.ratingValue);
    reviewCount = toNumberOrNull(ratingFields.reviewCount ?? ratingFields.ratingCount);
  }

  const title = toStringOrNull(product.name);
  const brand = brandName(product.brand) ?? brandName(product.manufacturer);
  const mainImageUrl = firstString(product.image);

  return {
    title,
    brand,
    currentPrice,
    currency,
    starRating,
    reviewCount,
    availability,
    mainImageUrl,
  };
}
/**
 * Makes an image reference absolute by resolving it against the page URL.
 * References that already carry a scheme (https:, data:, ...) are returned
 * untouched; unresolvable references are returned as-is.
 */
function resolveImageUrl(candidate: string | null, pageUrl: URL): string | null {
  if (candidate === null) return null;
  if (/^[a-z][a-z0-9+.-]*:/i.test(candidate)) return candidate;
  try {
    return new URL(candidate, pageUrl).toString();
  } catch {
    return candidate;
  }
}

/**
 * CLI entry point: validates `--url`, scrapes the page once through Firecrawl
 * (raw HTML plus schema-guided JSON extraction), merges JSON-LD and LLM
 * results into one Product record and writes it to stdout as JSON.
 *
 * Exits with status 1 when `--url` or FIRECRAWL_API_KEY is missing.
 *
 * @throws Error prefixed with "OUT_OF_SCOPE:" when the URL is malformed or is
 *   not http(s).
 * @throws Error when no title, price or image could be extracted at all.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    strict: true,
    options: { url: { type: "string" } },
  });
  const rawUrl = values.url;
  if (!rawUrl || rawUrl.trim().length === 0) {
    console.error("Missing required parameter: --url=<product page URL>");
    process.exit(1);
  }

  let target: URL;
  try {
    target = new URL(rawUrl.trim());
  } catch {
    throw new Error(`OUT_OF_SCOPE: not a valid URL: ${rawUrl}`);
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    throw new Error(`OUT_OF_SCOPE: URL must use http or https: ${rawUrl}`);
  }

  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const firecrawl = new Firecrawl({ apiKey });
  console.error(`Scraping ${target.toString()}`);
  const res = (await firecrawl.scrape(target.toString(), {
    formats: [
      "rawHtml",
      { type: "json", schema: PRODUCT_SCHEMA as unknown as Record<string, unknown>, prompt: EXTRACTION_PROMPT },
    ],
    proxy: "auto",
    integration: "prometheus",
  } as Parameters<typeof firecrawl.scrape>[1])) as {
    rawHtml?: string;
    html?: string;
    json?: Record<string, unknown>;
  };

  const html = res.rawHtml ?? res.html ?? "";
  const ld = html ? parseJsonLd(html) : null;
  if (ld) console.error("Found schema.org Product JSON-LD");
  const llm = (res.json ?? {}) as Record<string, unknown>;

  // Merge: JSON-LD (deterministic, authoritative for the main product) wins;
  // LLM extraction fills any gaps and supplies fields JSON-LD rarely carries
  // (e.g. the original/list price).
  const currentPrice = ld?.currentPrice ?? toNumberOrNull(llm.currentPrice);

  // A list price only counts as a discount when it is above the current
  // price; an equal or lower value means no discount is shown.
  const listPrice = toNumberOrNull(llm.originalPrice);
  const originalPrice =
    listPrice !== null && currentPrice !== null && listPrice <= currentPrice ? null : listPrice;

  // Pages often reference images relatively or protocol-relatively
  // ("//cdn.example.com/x.jpg"); the output contract is an absolute URL.
  const mainImageUrl = resolveImageUrl(
    ld?.mainImageUrl ?? toStringOrNull(llm.mainImageUrl),
    target,
  );

  const product: Product = {
    url: target.toString(),
    title: ld?.title ?? toStringOrNull(llm.title),
    brand: ld?.brand ?? toStringOrNull(llm.brand),
    currentPrice,
    originalPrice,
    currency: ld?.currency ?? toStringOrNull(llm.currency),
    starRating: ld?.starRating ?? toNumberOrNull(llm.starRating),
    reviewCount: ld?.reviewCount ?? toNumberOrNull(llm.reviewCount),
    availability: ld?.availability ?? normalizeAvailability(llm.availability),
    mainImageUrl,
  };

  // If absolutely nothing identifying came back, the page is very likely not a
  // product page, was bot-blocked, or rendered empty.
  if (product.title === null && product.currentPrice === null && product.mainImageUrl === null) {
    throw new Error(
      `could not extract product details from ${target.toString()} (page may not be a product page, was bot-blocked, or rendered empty)`,
    );
  }

  process.stdout.write(JSON.stringify(product));
}
// Script entry point: run main and turn any rejection into a one-line message
// on stderr plus a non-zero exit status.
main().catch((err: unknown) => {
  const message = err instanceof Error ? err.message : String(err);
  console.error(message);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.