chore: ebay parser fix
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
@@ -40,6 +40,229 @@ export interface EbayListingDetails {
|
||||
}
|
||||
|
||||
// Matches text that starts (after optional whitespace) with a currency marker:
// "CA", "C" or "US" followed by "$", or a bare "$", "£", "€" or "¥".
// Only the start is anchored, so anything may follow the marker.
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£€¥])/u;
|
||||
// Matches absolute http(s) URLs on ebay.ca or ebay.com (with or without a
// "www." prefix) whose path begins with "/itm/". Other hosts are rejected.
const EBAY_ITEM_URL_RE = /^https?:\/\/(?:www\.)?ebay\.(?:ca|com)\/itm\//u;
|
||||
|
||||
/** Entities decoded by {@link decodeHtmlEntities}, keyed by lower-cased name. */
const HTML_ENTITY_REPLACEMENTS: Readonly<Record<string, string>> = {
  amp: "&",
  quot: '"',
  apos: "'",
  "#39": "'",
  "#x27": "'",
  lt: "<",
  gt: ">",
};

/**
 * Decodes the basic HTML entities (`&amp;`, `&quot;`, `&apos;`, `&#39;`,
 * `&#x27;`, `&lt;`, `&gt;`) and trims surrounding whitespace.
 *
 * All entities are replaced in a single pass, so text that is produced by one
 * replacement is never decoded a second time (`&amp;lt;` becomes `&lt;`, not
 * `<`). Entities that are not in the table are left untouched.
 *
 * @param value - Raw text or attribute value taken from HTML.
 * @returns The decoded, trimmed string.
 */
function decodeHtmlEntities(value: string): string {
  return value
    .replace(
      /&(amp|quot|apos|lt|gt|#39|#x27);/gi,
      (entity: string, name: string) =>
        HTML_ENTITY_REPLACEMENTS[name.toLowerCase()] ?? entity,
    )
    .trim();
}
|
||||
|
||||
/**
 * Reduces an HTML fragment to plain text.
 *
 * Every tag is replaced by a single space, runs of whitespace are collapsed
 * to one space, and the result is passed through `decodeHtmlEntities` (which
 * also trims it).
 *
 * @param value - HTML fragment.
 * @returns The text content of the fragment.
 */
function stripHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]*>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return decodeHtmlEntities(singleSpaced);
}
|
||||
|
||||
/**
 * Extracts the value of an attribute from the source text of a single tag.
 *
 * Double-quoted, single-quoted and unquoted values are supported, the name is
 * matched case-insensitively, and whitespace around `=` is tolerated. The
 * attribute name must be preceded by whitespace, so `href` does not match
 * inside `data-href`.
 *
 * @param tag - Source text of one opening tag, e.g. `<a href="/x">`.
 * @param attrName - Attribute name; treated as literal text, not as a pattern.
 * @returns The raw (still entity-encoded) value, or `null` when absent.
 */
function getHtmlAttr(tag: string, attrName: string): string | null {
  // Escape regex syntax characters so the name can never alter the pattern
  // or make the RegExp constructor throw.
  const literalName = attrName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const attrPattern = new RegExp(
    `\\s${literalName}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`,
    "iu",
  );
  const attrMatch = attrPattern.exec(tag);
  // Exactly one of the three capture groups participates in a match.
  return attrMatch?.[1] ?? attrMatch?.[2] ?? attrMatch?.[3] ?? null;
}
|
||||
|
||||
/**
 * Turns a possibly relative, entity-encoded href into an absolute eBay item
 * URL.
 *
 * The href is entity-decoded, resolved against `https://www.ebay.ca`, and
 * accepted only when the resulting absolute URL matches `EBAY_ITEM_URL_RE`.
 *
 * @param url - Raw href taken from markup or an embedded payload.
 * @returns The absolute URL, or `null` when it cannot be parsed or is not an
 *   eBay item link.
 */
function normalizeEbayUrl(url: string): string | null {
  const rawHref = decodeHtmlEntities(url);

  let absoluteHref: string;
  try {
    absoluteHref = new URL(rawHref, "https://www.ebay.ca").href;
  } catch {
    // `new URL` throws on input it cannot parse; treat that as "no link".
    return null;
  }

  if (!EBAY_ITEM_URL_RE.test(absoluteHref)) {
    return null;
  }
  return absoluteHref;
}
|
||||
|
||||
/**
 * Builds an `EbayListingDetails` record from raw url/title/price fragments.
 *
 * The url is normalized with `normalizeEbayUrl`, title and price are reduced
 * to plain text with `stripHtml`, and the price text is parsed with
 * `parseEbayPrice`.
 *
 * @param url - Raw item link (may be relative or entity-encoded).
 * @param title - Title text or HTML fragment.
 * @param priceText - Price text or HTML fragment.
 * @returns The listing, or `null` when the url is not an eBay item link, the
 *   title is empty or the "Shop on eBay" placeholder, or the price cannot be
 *   parsed.
 */
function toEbayListing(
  url: string,
  title: string,
  priceText: string,
): EbayListingDetails | null {
  const itemUrl = normalizeEbayUrl(url);
  const titleText = stripHtml(title);
  const priceLabel = stripHtml(priceText);
  const parsedPrice = parseEbayPrice(priceLabel);

  if (
    !itemUrl ||
    !titleText ||
    titleText === "Shop on eBay" ||
    !parsedPrice
  ) {
    return null;
  }

  const listingPrice = {
    amountFormatted: priceLabel,
    cents: parsedPrice.cents,
    currency: parsedPrice.currency,
  };

  return {
    url: itemUrl,
    title: titleText,
    listingPrice,
    listingType: "OFFER",
    listingStatus: "ACTIVE",
    address: null,
  };
}
|
||||
|
||||
/**
 * Returns the first non-blank string found under any of the given keys.
 *
 * Keys are tried in the order supplied; values that are not strings, or that
 * are empty after trimming, are skipped.
 *
 * @param value - Object to read from.
 * @param keys - Candidate property names, in priority order.
 * @returns The trimmed string, or `null` when no key holds usable text.
 */
function readObjectString(
  value: Record<string, unknown>,
  keys: string[],
): string | null {
  for (const name of keys) {
    const raw = value[name];
    if (typeof raw !== "string") continue;

    const text = raw.trim();
    if (text.length > 0) return text;
  }
  return null;
}
|
||||
|
||||
/**
 * Extracts a price string from a payload object.
 *
 * Lookup order:
 * 1. A non-blank string directly under `price`, `currentPrice` or
 *    `displayPrice`.
 * 2. For each of `price`, `currentPrice`, `displayPrice`, `priceInfo` that
 *    holds a non-array object: a non-blank string under `amount`,
 *    `formatted` or `text`; otherwise its `value` (non-blank string or
 *    number), prefixed with `currency`/`currencyCode` and a space when one
 *    is present.
 *
 * @param value - Candidate listing object from an embedded payload.
 * @returns The price text, or `null` when none of the shapes match.
 */
function readPayloadPrice(value: Record<string, unknown>): string | null {
  const flatPrice = readObjectString(value, [
    "price",
    "currentPrice",
    "displayPrice",
  ]);
  if (flatPrice) return flatPrice;

  for (const field of ["price", "currentPrice", "displayPrice", "priceInfo"]) {
    const nested = value[field];
    const isPlainObject =
      Boolean(nested) && typeof nested === "object" && !Array.isArray(nested);
    if (!isPlainObject) continue;

    const priceObject = nested as Record<string, unknown>;

    const label = readObjectString(priceObject, [
      "amount",
      "formatted",
      "text",
    ]);
    if (label) return label;

    const amount = priceObject.value;
    const currency = readObjectString(priceObject, [
      "currency",
      "currencyCode",
    ]);

    const hasAmount =
      typeof amount === "number" ||
      (typeof amount === "string" && amount.trim() !== "");
    if (hasAmount) {
      return currency ? `${currency} ${amount}` : String(amount);
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Walks an arbitrary parsed payload depth-first and appends every listing it
 * can recognise to `results`.
 *
 * An object counts as a listing when it has a url-like key, a title-like key
 * and a readable price, and `toEbayListing` accepts the three values. A
 * recognised listing is not searched further; every other object or array
 * has its children visited recursively.
 *
 * @param value - Any parsed JSON value.
 * @param results - Output array; matching listings are pushed onto it.
 */
function collectPayloadListings(
  value: unknown,
  results: EbayListingDetails[],
): void {
  if (typeof value !== "object" || value === null) return;

  if (Array.isArray(value)) {
    value.forEach((item) => collectPayloadListings(item, results));
    return;
  }

  const record = value as Record<string, unknown>;
  const url = readObjectString(record, [
    "itemWebUrl",
    "itemUrl",
    "url",
    "webUrl",
  ]);
  const title = readObjectString(record, ["title", "itemTitle", "name"]);
  const priceText = readPayloadPrice(record);

  const listing =
    url && title && priceText ? toEbayListing(url, title, priceText) : null;
  if (listing) {
    results.push(listing);
    return;
  }

  // Not a listing itself: keep looking inside its properties.
  Object.values(record).forEach((child) =>
    collectPayloadListings(child, results),
  );
}
|
||||
|
||||
/**
 * Parses the raw value of a `data-inlinepayload` attribute into JSON.
 *
 * The entity-decoded text is tried as JSON first, so payloads that are plain
 * JSON keep literal `%` characters and `%xx` sequences inside strings (for
 * example item URLs) intact. Only when that fails is the text additionally
 * URI-decoded and parsed again.
 *
 * @param rawPayload - Attribute value exactly as it appears in the HTML.
 * @returns The parsed value, or `undefined` when neither form is valid JSON.
 */
function parseEbayInlinePayload(rawPayload: string): unknown {
  const entityDecoded = decodeHtmlEntities(rawPayload);

  try {
    return JSON.parse(entityDecoded);
  } catch {
    // Not plain JSON; fall through to the URI-encoded form.
  }

  try {
    // decodeURIComponent throws URIError on malformed escape sequences.
    return JSON.parse(decodeURIComponent(entityDecoded));
  } catch {
    return undefined;
  }
}

/**
 * Extracts listings from JSON embedded in `data-inlinepayload` attributes.
 *
 * Each attribute value (double-quoted, single-quoted or unquoted) is parsed
 * with `parseEbayInlinePayload` and searched with `collectPayloadListings`.
 * Extraction is best-effort: payloads that are not JSON, or that fail while
 * being searched, are ignored and do not affect the other payloads.
 *
 * @param htmlString - Full page HTML.
 * @returns All listings found, in document order (not de-duplicated).
 */
function parseEmbeddedEbayListings(
  htmlString: HTMLString,
): EbayListingDetails[] {
  const results: EbayListingDetails[] = [];
  const payloadMatches = htmlString.matchAll(
    /data-inlinepayload=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/giu,
  );

  for (const match of payloadMatches) {
    const rawPayload = match[1] ?? match[2] ?? match[3];
    if (!rawPayload) continue;

    try {
      // An unparseable payload yields `undefined`, which the collector
      // ignores.
      collectPayloadListings(parseEbayInlinePayload(rawPayload), results);
    } catch {
      // eBay inline payloads vary by module; unusable payloads are ignored.
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Regex-based fallback that extracts listings from `s-card` result markup.
 *
 * The page is split into chunks that each start at a `<div>` whose class
 * list contains `s-card` and run up to the next such `<div>`, `</body>` or
 * `</html>`. Within a chunk, the first `<a>` whose href contains `/itm/`,
 * the first element with class `s-card__title` and the first element with
 * class `s-card__price` are combined via `toEbayListing`. Chunks missing any
 * of the three, or rejected by `toEbayListing`, are skipped.
 *
 * @param htmlString - Full page HTML.
 * @returns Listings in document order (not de-duplicated).
 */
function parseSCardHtmlListings(htmlString: HTMLString): EbayListingDetails[] {
  const cardPattern =
    /<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)[\s\S]*?(?=<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)|<\/body>|<\/html>)/giu;
  const linkPattern =
    /<a\b[^>]*\bhref=(?:"[^"]*\/itm\/[^"]*"|'[^']*\/itm\/[^']*'|[^\s>]*\/itm\/[^\s>]*)[^>]*>/iu;
  const titlePattern =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__title\b[^"]*"|'[^']*\bs-card__title\b[^']*'|[^\s>]*\bs-card__title\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;
  const pricePattern =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__price\b[^"]*"|'[^']*\bs-card__price\b[^']*'|[^\s>]*\bs-card__price\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;

  const listings: EbayListingDetails[] = [];

  for (const cardMatch of htmlString.matchAll(cardPattern)) {
    const cardHtml = cardMatch[0];

    const linkTag = cardHtml.match(linkPattern)?.[0];
    const titleHtml = cardHtml.match(titlePattern)?.[1];
    const priceHtml = cardHtml.match(pricePattern)?.[1];
    if (!linkTag || !titleHtml || !priceHtml) continue;

    const href = getHtmlAttr(linkTag, "href");
    if (!href) continue;

    const listing = toEbayListing(href, titleHtml, priceHtml);
    if (listing) listings.push(listing);
  }

  return listings;
}
|
||||
|
||||
/**
 * Removes listings that point at the same item.
 *
 * Two listings are considered duplicates when `canonicalizeEbayItemUrl`
 * returns the same string for their urls; the first occurrence is kept and
 * the input order is preserved.
 *
 * @param listings - Listings, possibly containing duplicates.
 * @returns A new array without duplicates.
 */
function dedupeEbayListings(
  listings: EbayListingDetails[],
): EbayListingDetails[] {
  const seenKeys = new Set<string>();

  return listings.filter((listing) => {
    const key = canonicalizeEbayItemUrl(listing.url);
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
|
||||
|
||||
function canonicalizeEbayItemUrl(url: string): string {
|
||||
try {
|
||||
@@ -124,6 +347,11 @@ function parseEbayListings(
|
||||
exclusions: string[],
|
||||
strictMode: boolean,
|
||||
): EbayListingDetails[] {
|
||||
const embeddedListings = parseEmbeddedEbayListings(htmlString);
|
||||
if (embeddedListings.length > 0) {
|
||||
return dedupeEbayListings(embeddedListings);
|
||||
}
|
||||
|
||||
const { document } = parseHTML(htmlString);
|
||||
const results: EbayListingDetails[] = [];
|
||||
const seenUrls = new Set<string>();
|
||||
@@ -359,7 +587,28 @@ function parseEbayListings(
|
||||
}
|
||||
}
|
||||
|
||||
if (results.length > 0) {
|
||||
return results;
|
||||
}
|
||||
|
||||
return dedupeEbayListings(
|
||||
parseSCardHtmlListings(htmlString).filter((listing) => {
|
||||
if (
|
||||
exclusions.some((exclusion) =>
|
||||
listing.title.toLowerCase().includes(exclusion.toLowerCase()),
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (
|
||||
!strictMode ||
|
||||
keywords.some((keyword) =>
|
||||
listing.title.toLowerCase().includes(keyword.toLowerCase()),
|
||||
)
|
||||
);
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// ----------------------------- Cookie Loading -----------------------------
|
||||
@@ -481,12 +730,14 @@ export default async function fetchEbayItems(
|
||||
// Use custom headers modeled after real browser requests to bypass bot detection
|
||||
const headers: Record<string, string> = {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Language": "en-CA,en-US;q=0.9,en;q=0.8",
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
Referer: "https://www.ebay.ca/",
|
||||
Connection: "keep-alive",
|
||||
"Cache-Control": "no-cache",
|
||||
Pragma: "no-cache",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
|
||||
@@ -7,6 +7,7 @@ import { logger } from "./logger";
|
||||
export interface Cookie {
|
||||
name: string;
|
||||
value: string;
|
||||
rawValue?: string;
|
||||
domain: string;
|
||||
path: string;
|
||||
secure?: boolean;
|
||||
@@ -55,6 +56,7 @@ export function parseCookieString(
|
||||
return {
|
||||
name: trimmedName,
|
||||
value: decodeURIComponent(trimmedValue),
|
||||
rawValue: trimmedValue,
|
||||
domain,
|
||||
path: "/",
|
||||
secure: true,
|
||||
@@ -95,7 +97,7 @@ export function formatCookiesForHeader(
|
||||
});
|
||||
|
||||
return validCookies
|
||||
.map((cookie) => `${cookie.name}=${cookie.value}`)
|
||||
.map((cookie) => `${cookie.name}=${cookie.rawValue ?? cookie.value}`)
|
||||
.join("; ");
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ const originalWarn = console.warn;
|
||||
|
||||
describe("eBay Scraper Cookie Handling", () => {
|
||||
beforeEach(() => {
|
||||
delete process.env.EBAY_COOKIE;
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
@@ -210,6 +211,81 @@ describe("eBay Scraper Cookie Handling", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
test("parses current eBay s-card markup with unquoted item links", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(`
|
||||
<html><body>
|
||||
<div class="s-card s-card--horizontal">
|
||||
<div class=su-card-container__header>
|
||||
<a class=s-card__link href=https://ebay.com/itm/1234567890?itmmeta=abc>
|
||||
<div role=heading aria-level=3 class=s-card__title>
|
||||
<span class="su-styled-text primary default">Apple MacBook Air M1 2020 8GB 256GB</span>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class=su-card-container__attributes>
|
||||
<span class="su-styled-text primary bold large-1 s-card__price">CA $599.00</span>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
`),
|
||||
}),
|
||||
) as unknown as typeof fetch;
|
||||
|
||||
const results = await fetchEbayItems("macbook", 1000);
|
||||
|
||||
expect(results).toEqual([
|
||||
expect.objectContaining({
|
||||
title: "Apple MacBook Air M1 2020 8GB 256GB",
|
||||
url: "https://ebay.com/itm/1234567890?itmmeta=abc",
|
||||
listingPrice: expect.objectContaining({ cents: 59_900 }),
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
test("parses embedded eBay payload listings before HTML fallback", async () => {
|
||||
const payload = encodeURIComponent(
|
||||
JSON.stringify({
|
||||
searchResults: [
|
||||
{
|
||||
title: "Apple MacBook Air M1 API Result",
|
||||
itemWebUrl: "https://www.ebay.ca/itm/9876543210?hash=item987",
|
||||
price: { value: "550.00", currency: "CAD" },
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(`
|
||||
<html><body>
|
||||
<script data-inlinepayload="${payload}"></script>
|
||||
</body></html>
|
||||
`),
|
||||
}),
|
||||
) as unknown as typeof fetch;
|
||||
|
||||
const results = await fetchEbayItems("macbook", 1000);
|
||||
|
||||
expect(results).toEqual([
|
||||
expect.objectContaining({
|
||||
title: "Apple MacBook Air M1 API Result",
|
||||
url: "https://www.ebay.ca/itm/9876543210?hash=item987",
|
||||
listingPrice: expect.objectContaining({
|
||||
amountFormatted: "CAD 550.00",
|
||||
cents: 55_000,
|
||||
currency: "CAD",
|
||||
}),
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
test("treats US dollar prices as USD", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
|
||||
@@ -70,6 +70,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
expect(result[0]).toEqual({
|
||||
name: "c_user",
|
||||
value: "123456789",
|
||||
rawValue: "123456789",
|
||||
domain: ".facebook.com",
|
||||
path: "/",
|
||||
secure: true,
|
||||
@@ -80,6 +81,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
expect(result[1]).toEqual({
|
||||
name: "xs",
|
||||
value: "abcdef123456",
|
||||
rawValue: "abcdef123456",
|
||||
domain: ".facebook.com",
|
||||
path: "/",
|
||||
secure: true,
|
||||
@@ -97,6 +99,16 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
expect(result[1]?.value).toBe("abc=def");
|
||||
});
|
||||
|
||||
test("should preserve raw encoded values when formatting cookie headers", () => {
|
||||
const cookieString = "c_user=123%2B456; xs=abc%3Ddef";
|
||||
const result = formatCookiesForHeader(
|
||||
parseFacebookCookieString(cookieString),
|
||||
"www.facebook.com",
|
||||
);
|
||||
|
||||
expect(result).toBe(cookieString);
|
||||
});
|
||||
|
||||
test("should filter out malformed cookies", () => {
|
||||
const cookieString = "c_user=123; invalid; xs=abc; =empty";
|
||||
const result = parseFacebookCookieString(cookieString);
|
||||
|
||||
Reference in New Issue
Block a user