chore: ebay parser fix

Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
2026-04-30 16:56:55 -04:00
parent 3a722a2d11
commit 84f17fbdfd
4 changed files with 345 additions and 4 deletions

View File

@@ -40,6 +40,229 @@ export interface EbayListingDetails {
} }
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£¥])/u; const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£¥])/u;
// Accepts only absolute http(s) URLs on ebay.ca / ebay.com (with or without
// a leading "www.") whose path starts with /itm/.
const EBAY_ITEM_URL_RE = /^https?:\/\/(?:www\.)?ebay\.(?:ca|com)\/itm\//u;
/**
 * Decodes the small set of HTML character references this parser cares about
 * and trims surrounding whitespace.
 *
 * Handles the named references `&amp;`, `&quot;`, `&lt;`, `&gt;`, `&apos;`
 * and decimal / hexadecimal numeric references (e.g. `&#39;`, `&#x27;`).
 *
 * Decoding happens in a single pass so that already-escaped text is only
 * unescaped once: `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The decoded, trimmed text. Unknown or out-of-range references are
 *   left untouched.
 */
function decodeHtmlEntities(value: string): string {
  const namedEntities: Record<string, string> = {
    amp: "&",
    quot: '"',
    lt: "<",
    gt: ">",
    apos: "'",
  };

  return value
    .replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|quot|lt|gt|apos));/g,
      (entity: string, decimal?: string, hex?: string, name?: string) => {
        if (name !== undefined) return namedEntities[name] ?? entity;

        const codePoint =
          decimal !== undefined
            ? Number.parseInt(decimal, 10)
            : Number.parseInt(hex ?? "", 16);

        // String.fromCodePoint throws on values outside the Unicode range,
        // so keep such references verbatim instead of crashing.
        const isValid =
          Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
        return isValid ? String.fromCodePoint(codePoint) : entity;
      },
    )
    .trim();
}
/**
 * Reduces an HTML fragment to plain text.
 *
 * Every tag is replaced by a space, runs of whitespace are collapsed to a
 * single space, and the result is passed through `decodeHtmlEntities`.
 *
 * @param value - HTML fragment.
 * @returns The text content as returned by `decodeHtmlEntities`.
 */
function stripHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]*>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return decodeHtmlEntities(singleSpaced);
}
/**
 * Reads an attribute value out of a single HTML tag string.
 *
 * Supports double-quoted, single-quoted and unquoted values; the attribute
 * name is matched case-insensitively and must be preceded by whitespace.
 *
 * @param tag - The raw tag text, e.g. `<a class=x href="/itm/1">`.
 * @param attrName - Attribute name to look up. It is matched literally, so
 *   regex metacharacters in the name carry no special meaning.
 * @returns The raw (still entity-encoded) attribute value, or `null` when the
 *   attribute is not present.
 */
function getHtmlAttr(tag: string, attrName: string): string | null {
  // Escape the name before interpolating it into the pattern; otherwise a
  // name such as "xlink.href" matches unrelated attributes and a name such as
  // "a(b" makes the RegExp constructor throw.
  const escapedName = attrName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const attrMatch = tag.match(
    new RegExp(`\\s${escapedName}=(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`, "iu"),
  );
  return attrMatch?.[1] ?? attrMatch?.[2] ?? attrMatch?.[3] ?? null;
}
/**
 * Turns a possibly relative, possibly entity-encoded link into an absolute
 * eBay item URL.
 *
 * The input is entity-decoded, resolved against `https://www.ebay.ca`, and
 * then checked against `EBAY_ITEM_URL_RE`.
 *
 * @param url - Link text as found in markup or payload data.
 * @returns The absolute URL, or `null` when it cannot be parsed or does not
 *   match `EBAY_ITEM_URL_RE`.
 */
function normalizeEbayUrl(url: string): string | null {
  const candidate = decodeHtmlEntities(url);

  let absoluteHref: string;
  try {
    absoluteHref = new URL(candidate, "https://www.ebay.ca").href;
  } catch {
    // Unparseable link text.
    return null;
  }

  if (!EBAY_ITEM_URL_RE.test(absoluteHref)) return null;
  return absoluteHref;
}
/**
 * Builds an `EbayListingDetails` record from raw url / title / price text.
 *
 * Title and price are stripped of markup first; the cleaned price string is
 * both parsed (via `parseEbayPrice`) and kept as `amountFormatted`.
 *
 * @param url - Item link; normalised with `normalizeEbayUrl`.
 * @param title - Title text or HTML.
 * @param priceText - Price text or HTML.
 * @returns The listing, or `null` when the URL is rejected, the title is
 *   empty or equals "Shop on eBay", or the price cannot be parsed.
 */
function toEbayListing(
  url: string,
  title: string,
  priceText: string,
): EbayListingDetails | null {
  const itemUrl = normalizeEbayUrl(url);
  const plainTitle = stripHtml(title);
  const amountFormatted = stripHtml(priceText);
  const parsedPrice = parseEbayPrice(amountFormatted);

  // "Shop on eBay" is presumably a placeholder/promotional card title —
  // TODO confirm; it is rejected outright either way.
  const isRejectedTitle = !plainTitle || plainTitle === "Shop on eBay";
  if (!itemUrl || isRejectedTitle || !parsedPrice) {
    return null;
  }

  const { cents, currency } = parsedPrice;
  return {
    url: itemUrl,
    title: plainTitle,
    listingPrice: { amountFormatted, cents, currency },
    listingType: "OFFER",
    listingStatus: "ACTIVE",
    address: null,
  };
}
/**
 * Returns the first non-blank string found under any of the given keys.
 *
 * Keys are tried in the order supplied; values that are not strings, or that
 * are empty after trimming, are skipped.
 *
 * @param value - Object to read from.
 * @param keys - Candidate property names, in priority order.
 * @returns The trimmed string, or `null` when no key yields one.
 */
function readObjectString(
  value: Record<string, unknown>,
  keys: string[],
): string | null {
  for (const key of keys) {
    const raw = value[key];
    if (typeof raw !== "string") continue;

    const trimmed = raw.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return null;
}
/**
 * Extracts a price string from a payload object.
 *
 * Lookup order:
 * 1. A non-blank string directly under `price`, `currentPrice` or
 *    `displayPrice`.
 * 2. For each of `price`, `currentPrice`, `displayPrice`, `priceInfo` that
 *    holds a plain (non-array) object:
 *    a. a non-blank string under `amount`, `formatted` or `text`;
 *    b. otherwise its `value` (non-blank string or number), prefixed with
 *       `currency` / `currencyCode` and a space when one is present.
 *
 * @param value - Payload object to inspect.
 * @returns The price text, or `null` when nothing usable is found.
 */
function readPayloadPrice(value: Record<string, unknown>): string | null {
  const flatKeys = ["price", "currentPrice", "displayPrice"];

  const flatPrice = readObjectString(value, flatKeys);
  if (flatPrice) return flatPrice;

  for (const key of [...flatKeys, "priceInfo"]) {
    const nested = value[key];
    const isPlainObject =
      typeof nested === "object" && nested !== null && !Array.isArray(nested);
    if (!isPlainObject) continue;

    const priceFields = nested as Record<string, unknown>;

    const preformatted = readObjectString(priceFields, [
      "amount",
      "formatted",
      "text",
    ]);
    if (preformatted) return preformatted;

    const rawAmount = priceFields.value;
    const currencyCode = readObjectString(priceFields, [
      "currency",
      "currencyCode",
    ]);

    const isUsableText =
      typeof rawAmount === "string" && rawAmount.trim() !== "";
    if (!isUsableText && typeof rawAmount !== "number") continue;

    // String amounts are used as-is (not trimmed); numbers are stringified.
    const amountText = String(rawAmount);
    return currencyCode ? `${currencyCode} ${amountText}` : amountText;
  }

  return null;
}
/**
 * Walks an arbitrary parsed JSON value depth-first and appends every object
 * that can be turned into a listing to `results`.
 *
 * An object is treated as a listing when it exposes a URL, a title and a
 * price under one of the recognised keys and `toEbayListing` accepts them.
 * Accepted objects are not searched further; every other object has all of
 * its property values visited.
 *
 * @param value - Any parsed JSON value; non-objects are ignored.
 * @param results - Output array, mutated in place.
 */
function collectPayloadListings(
  value: unknown,
  results: EbayListingDetails[],
): void {
  if (typeof value !== "object" || value === null) return;

  if (Array.isArray(value)) {
    value.forEach((item) => collectPayloadListings(item, results));
    return;
  }

  const record = value as Record<string, unknown>;
  const url = readObjectString(record, [
    "itemWebUrl",
    "itemUrl",
    "url",
    "webUrl",
  ]);
  const title = readObjectString(record, ["title", "itemTitle", "name"]);
  const priceText = readPayloadPrice(record);

  const listing =
    url && title && priceText ? toEbayListing(url, title, priceText) : null;
  if (listing) {
    results.push(listing);
    return;
  }

  // Not a listing itself: keep looking inside its properties.
  for (const child of Object.values(record)) {
    collectPayloadListings(child, results);
  }
}
/**
 * Extracts listings from JSON embedded in `data-inlinepayload` attributes.
 *
 * Each attribute value (double-quoted, single-quoted or unquoted) is
 * entity-decoded and parsed as JSON, then searched with
 * `collectPayloadListings`.
 *
 * The entity-decoded text is first parsed as-is and only URI-decoded when
 * that fails. Decoding unconditionally would make `decodeURIComponent` throw
 * on any payload containing a bare `%` (dropping the whole payload) and would
 * rewrite legitimate percent-escapes inside string values such as URLs.
 *
 * @param htmlString - Full page markup.
 * @returns Listings found in all payloads, in document order; empty when none
 *   parse.
 */
function parseEmbeddedEbayListings(
  htmlString: HTMLString,
): EbayListingDetails[] {
  const results: EbayListingDetails[] = [];
  const payloadMatches = htmlString.matchAll(
    /data-inlinepayload=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/giu,
  );

  for (const match of payloadMatches) {
    const rawPayload = match[1] ?? match[2] ?? match[3];
    if (!rawPayload) continue;

    try {
      const htmlDecoded = decodeHtmlEntities(rawPayload);

      let parsedPayload: unknown;
      try {
        // Payload is already JSON once HTML entities are resolved.
        parsedPayload = JSON.parse(htmlDecoded);
      } catch {
        // Otherwise assume it is URI-component-encoded JSON.
        parsedPayload = JSON.parse(decodeURIComponent(htmlDecoded));
      }

      collectPayloadListings(parsedPayload, results);
    } catch {
      // eBay inline payloads vary by module; non-JSON payloads are ignored.
    }
  }

  return results;
}
/**
 * Regex-based parser for "s-card" search result markup.
 *
 * The page is cut into cards, each starting at a `<div>` whose class list
 * contains `s-card`. Within a card the first `/itm/` link, the first
 * `s-card__title` element and the first `s-card__price` element are combined
 * through `toEbayListing`. Quoted and unquoted attribute values are both
 * recognised.
 *
 * A card ends at the next card, at `</body>` / `</html>`, or at the end of
 * the input. The end-of-input case keeps the last card from being dropped
 * when the markup is truncated or is a fragment without closing tags.
 *
 * @param htmlString - Full page markup.
 * @returns Listings for every card that yields a link, title and price
 *   accepted by `toEbayListing`.
 */
function parseSCardHtmlListings(htmlString: HTMLString): EbayListingDetails[] {
  // Patterns are built once per call rather than once per card.
  const cardRe =
    /<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)[\s\S]*?(?=<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)|<\/body>|<\/html>|$)/giu;
  const linkTagRe =
    /<a\b[^>]*\bhref=(?:"[^"]*\/itm\/[^"]*"|'[^']*\/itm\/[^']*'|[^\s>]*\/itm\/[^\s>]*)[^>]*>/iu;
  const titleRe =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__title\b[^"]*"|'[^']*\bs-card__title\b[^']*'|[^\s>]*\bs-card__title\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;
  const priceRe =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__price\b[^"]*"|'[^']*\bs-card__price\b[^']*'|[^\s>]*\bs-card__price\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;

  const results: EbayListingDetails[] = [];

  for (const cardMatch of htmlString.matchAll(cardRe)) {
    const cardHtml = cardMatch[0];

    const linkTag = cardHtml.match(linkTagRe)?.[0];
    const titleHtml = cardHtml.match(titleRe)?.[1];
    const priceHtml = cardHtml.match(priceRe)?.[1];
    if (!linkTag || !titleHtml || !priceHtml) continue;

    const href = getHtmlAttr(linkTag, "href");
    if (!href) continue;

    const listing = toEbayListing(href, titleHtml, priceHtml);
    if (listing) results.push(listing);
  }

  return results;
}
/**
 * Removes listings that point at the same item.
 *
 * Two listings are duplicates when `canonicalizeEbayItemUrl` returns the same
 * string for their URLs; the first occurrence wins and input order is kept.
 *
 * @param listings - Listings in priority order.
 * @returns A new array without duplicates; the input is not modified.
 */
function dedupeEbayListings(
  listings: EbayListingDetails[],
): EbayListingDetails[] {
  const seenKeys = new Set<string>();

  return listings.filter((listing) => {
    const key = canonicalizeEbayItemUrl(listing.url);
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
function canonicalizeEbayItemUrl(url: string): string { function canonicalizeEbayItemUrl(url: string): string {
try { try {
@@ -124,6 +347,11 @@ function parseEbayListings(
exclusions: string[], exclusions: string[],
strictMode: boolean, strictMode: boolean,
): EbayListingDetails[] { ): EbayListingDetails[] {
const embeddedListings = parseEmbeddedEbayListings(htmlString);
if (embeddedListings.length > 0) {
return dedupeEbayListings(embeddedListings);
}
const { document } = parseHTML(htmlString); const { document } = parseHTML(htmlString);
const results: EbayListingDetails[] = []; const results: EbayListingDetails[] = [];
const seenUrls = new Set<string>(); const seenUrls = new Set<string>();
@@ -359,7 +587,28 @@ function parseEbayListings(
} }
} }
if (results.length > 0) {
return results; return results;
}
return dedupeEbayListings(
parseSCardHtmlListings(htmlString).filter((listing) => {
if (
exclusions.some((exclusion) =>
listing.title.toLowerCase().includes(exclusion.toLowerCase()),
)
) {
return false;
}
return (
!strictMode ||
keywords.some((keyword) =>
listing.title.toLowerCase().includes(keyword.toLowerCase()),
)
);
}),
);
} }
// ----------------------------- Cookie Loading ----------------------------- // ----------------------------- Cookie Loading -----------------------------
@@ -481,12 +730,14 @@ export default async function fetchEbayItems(
// Use custom headers modeled after real browser requests to bypass bot detection // Use custom headers modeled after real browser requests to bypass bot detection
const headers: Record<string, string> = { const headers: Record<string, string> = {
"User-Agent": "User-Agent":
"Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5", "Accept-Language": "en-CA,en-US;q=0.9,en;q=0.8",
"Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Encoding": "gzip, deflate, br, zstd",
Referer: "https://www.ebay.ca/", Referer: "https://www.ebay.ca/",
Connection: "keep-alive", Connection: "keep-alive",
"Cache-Control": "no-cache",
Pragma: "no-cache",
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document", "Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate", "Sec-Fetch-Mode": "navigate",

View File

@@ -7,6 +7,7 @@ import { logger } from "./logger";
export interface Cookie { export interface Cookie {
name: string; name: string;
value: string; value: string;
rawValue?: string;
domain: string; domain: string;
path: string; path: string;
secure?: boolean; secure?: boolean;
@@ -55,6 +56,7 @@ export function parseCookieString(
return { return {
name: trimmedName, name: trimmedName,
value: decodeURIComponent(trimmedValue), value: decodeURIComponent(trimmedValue),
rawValue: trimmedValue,
domain, domain,
path: "/", path: "/",
secure: true, secure: true,
@@ -95,7 +97,7 @@ export function formatCookiesForHeader(
}); });
return validCookies return validCookies
.map((cookie) => `${cookie.name}=${cookie.value}`) .map((cookie) => `${cookie.name}=${cookie.rawValue ?? cookie.value}`)
.join("; "); .join("; ");
} }

View File

@@ -29,6 +29,7 @@ const originalWarn = console.warn;
describe("eBay Scraper Cookie Handling", () => { describe("eBay Scraper Cookie Handling", () => {
beforeEach(() => { beforeEach(() => {
delete process.env.EBAY_COOKIE;
global.fetch = mock(() => global.fetch = mock(() =>
Promise.resolve({ Promise.resolve({
ok: true, ok: true,
@@ -210,6 +211,81 @@ describe("eBay Scraper Cookie Handling", () => {
]); ]);
}); });
// Regression test for s-card result markup: the outer card div has a quoted
// class list while the inner link, title and container tags use unquoted
// attribute values.
test("parses current eBay s-card markup with unquoted item links", async () => {
// Stub fetch so the scraper receives a single-card results page.
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () =>
Promise.resolve(`
<html><body>
<div class="s-card s-card--horizontal">
<div class=su-card-container__header>
<a class=s-card__link href=https://ebay.com/itm/1234567890?itmmeta=abc>
<div role=heading aria-level=3 class=s-card__title>
<span class="su-styled-text primary default">Apple MacBook Air M1 2020 8GB 256GB</span>
</div>
</a>
</div>
<div class=su-card-container__attributes>
<span class="su-styled-text primary bold large-1 s-card__price">CA $599.00</span>
</div>
</div>
</body></html>
`),
}),
) as unknown as typeof fetch;
const results = await fetchEbayItems("macbook", 1000);
// Exactly one listing is expected, carrying the link, the title text and
// the "CA $599.00" price expressed in cents.
expect(results).toEqual([
expect.objectContaining({
title: "Apple MacBook Air M1 2020 8GB 256GB",
url: "https://ebay.com/itm/1234567890?itmmeta=abc",
listingPrice: expect.objectContaining({ cents: 59_900 }),
}),
]);
});
// Verifies that listings embedded as JSON in a `data-inlinepayload`
// attribute are picked up when the page contains no other result markup.
test("parses embedded eBay payload listings before HTML fallback", async () => {
// The payload is URI-component-encoded JSON; the price is given as an
// object with separate `value` and `currency` fields.
const payload = encodeURIComponent(
JSON.stringify({
searchResults: [
{
title: "Apple MacBook Air M1 API Result",
itemWebUrl: "https://www.ebay.ca/itm/9876543210?hash=item987",
price: { value: "550.00", currency: "CAD" },
},
],
}),
);
// Stub fetch with a page that only contains the payload-bearing script tag.
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () =>
Promise.resolve(`
<html><body>
<script data-inlinepayload="${payload}"></script>
</body></html>
`),
}),
) as unknown as typeof fetch;
const results = await fetchEbayItems("macbook", 1000);
// The formatted amount is expected to be the currency and value joined by
// a space.
expect(results).toEqual([
expect.objectContaining({
title: "Apple MacBook Air M1 API Result",
url: "https://www.ebay.ca/itm/9876543210?hash=item987",
listingPrice: expect.objectContaining({
amountFormatted: "CAD 550.00",
cents: 55_000,
currency: "CAD",
}),
}),
]);
});
test("treats US dollar prices as USD", async () => { test("treats US dollar prices as USD", async () => {
global.fetch = mock(() => global.fetch = mock(() =>
Promise.resolve({ Promise.resolve({

View File

@@ -70,6 +70,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(result[0]).toEqual({ expect(result[0]).toEqual({
name: "c_user", name: "c_user",
value: "123456789", value: "123456789",
rawValue: "123456789",
domain: ".facebook.com", domain: ".facebook.com",
path: "/", path: "/",
secure: true, secure: true,
@@ -80,6 +81,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(result[1]).toEqual({ expect(result[1]).toEqual({
name: "xs", name: "xs",
value: "abcdef123456", value: "abcdef123456",
rawValue: "abcdef123456",
domain: ".facebook.com", domain: ".facebook.com",
path: "/", path: "/",
secure: true, secure: true,
@@ -97,6 +99,16 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(result[1]?.value).toBe("abc=def"); expect(result[1]?.value).toBe("abc=def");
}); });
// Round-trip check: percent-encoded cookie values (%2B, %3D) must reach the
// Cookie header exactly as supplied, not in decoded form.
test("should preserve raw encoded values when formatting cookie headers", () => {
const cookieString = "c_user=123%2B456; xs=abc%3Ddef";
const result = formatCookiesForHeader(
parseFacebookCookieString(cookieString),
"www.facebook.com",
);
// The formatted header must be byte-identical to the original cookie string.
expect(result).toBe(cookieString);
});
test("should filter out malformed cookies", () => { test("should filter out malformed cookies", () => {
const cookieString = "c_user=123; invalid; xs=abc; =empty"; const cookieString = "c_user=123; invalid; xs=abc; =empty";
const result = parseFacebookCookieString(cookieString); const result = parseFacebookCookieString(cookieString);