chore: ebay parser fix
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
@@ -40,6 +40,229 @@ export interface EbayListingDetails {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£€¥])/u;
|
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£€¥])/u;
|
||||||
|
const EBAY_ITEM_URL_RE = /^https?:\/\/(?:www\.)?ebay\.(?:ca|com)\/itm\//u;
|
||||||
|
|
||||||
|
/**
 * Decodes the HTML character references that appear in scraped eBay markup
 * and trims surrounding whitespace from the result.
 *
 * Supported references: `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`, plus
 * decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric references.
 * Unknown names and invalid code points are left untouched.
 *
 * Decoding happens in a single pass, so already-escaped text such as
 * `&amp;lt;` becomes `&lt;` rather than being decoded twice into `<`.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The decoded, trimmed text.
 */
function decodeHtmlEntities(value: string): string {
  return value
    .replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-z]+));/g,
      (
        entity: string,
        decimal: string | undefined,
        hex: string | undefined,
        name: string | undefined,
      ): string => {
        if (name !== undefined) {
          // A switch (rather than an object lookup) avoids accidentally
          // resolving names such as "constructor" through the prototype chain.
          switch (name) {
            case "amp":
              return "&";
            case "quot":
              return '"';
            case "apos":
              return "'";
            case "lt":
              return "<";
            case "gt":
              return ">";
            default:
              return entity;
          }
        }

        const codePoint =
          decimal !== undefined
            ? Number.parseInt(decimal, 10)
            : Number.parseInt(hex ?? "", 16);
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;

        // String.fromCodePoint throws on out-of-range input, so guard first.
        return isValid ? String.fromCodePoint(codePoint) : entity;
      },
    )
    .trim();
}
|
||||||
|
|
||||||
|
/**
 * Reduces an HTML fragment to plain text.
 *
 * Every tag is replaced by a single space, runs of whitespace are collapsed
 * to one space, and the result is passed through `decodeHtmlEntities`.
 *
 * @param value - HTML fragment to flatten.
 * @returns The text returned by `decodeHtmlEntities` for the flattened input.
 */
function stripHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]*>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return decodeHtmlEntities(singleSpaced);
}
|
||||||
|
|
||||||
|
/**
 * Reads the value of an attribute from a single HTML start tag.
 *
 * The tag is scanned attribute by attribute, so text inside another
 * attribute's quoted value is never mistaken for the requested attribute,
 * and `attrName` is compared as plain text rather than being interpreted
 * as a regular expression. Attribute names are matched case-insensitively.
 * Double-quoted, single-quoted and unquoted values are supported, with
 * optional whitespace around `=`.
 *
 * @param tag - The raw start tag, e.g. `<a class=x href="/itm/1">`.
 * @param attrName - Name of the attribute to read.
 * @returns The raw (still entity-encoded) attribute value, or `null` when
 *   the attribute is absent or has no value.
 */
function getHtmlAttr(tag: string, attrName: string): string | null {
  const wantedName = attrName.toLowerCase();
  // Group 1: attribute name. Groups 2-4: double-quoted, single-quoted and
  // unquoted value respectively (all undefined for a valueless attribute).
  const attributePattern =
    /\s([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/gu;

  for (const attribute of tag.matchAll(attributePattern)) {
    if (attribute[1]?.toLowerCase() !== wantedName) continue;

    const attributeValue = attribute[2] ?? attribute[3] ?? attribute[4];
    if (attributeValue !== undefined) return attributeValue;
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Turns a scraped link into an absolute eBay item URL.
 *
 * The input is entity-decoded, resolved against `https://www.ebay.ca`
 * (so relative links become absolute), and accepted only when the
 * resulting href matches `EBAY_ITEM_URL_RE`.
 *
 * @param url - Raw link text, possibly relative or entity-encoded.
 * @returns The absolute href, or `null` when the link cannot be parsed or
 *   is not an eBay item URL.
 */
function normalizeEbayUrl(url: string): string | null {
  const candidate = decodeHtmlEntities(url);

  let absoluteHref: string;
  try {
    absoluteHref = new URL(candidate, "https://www.ebay.ca").href;
  } catch {
    // The URL constructor throws on input it cannot parse.
    return null;
  }

  if (!EBAY_ITEM_URL_RE.test(absoluteHref)) {
    return null;
  }
  return absoluteHref;
}
|
||||||
|
|
||||||
|
/**
 * Builds an `EbayListingDetails` record from raw url/title/price text.
 *
 * The URL goes through `normalizeEbayUrl`; title and price are flattened
 * with `stripHtml`; the flattened price is parsed by `parseEbayPrice`.
 *
 * @param url - Raw item link.
 * @param title - Raw title text or HTML.
 * @param priceText - Raw price text or HTML.
 * @returns The listing, or `null` when the URL is rejected, the title is
 *   empty or equals the placeholder "Shop on eBay", or the price cannot
 *   be parsed.
 */
function toEbayListing(
  url: string,
  title: string,
  priceText: string,
): EbayListingDetails | null {
  const itemUrl = normalizeEbayUrl(url);
  const titleText = stripHtml(title);
  const priceLabel = stripHtml(priceText);
  const parsedPrice = parseEbayPrice(priceLabel);

  const isPlaceholderTitle = titleText === "Shop on eBay";
  if (!itemUrl || !titleText || isPlaceholderTitle || !parsedPrice) {
    return null;
  }

  const { cents, currency } = parsedPrice;
  return {
    url: itemUrl,
    title: titleText,
    listingPrice: { amountFormatted: priceLabel, cents, currency },
    listingType: "OFFER",
    listingStatus: "ACTIVE",
    address: null,
  };
}
|
||||||
|
|
||||||
|
/**
 * Returns the first non-blank string found under any of the given keys.
 *
 * Keys are tried in order. A property counts only when it is a string
 * that still has content after trimming; other types are skipped.
 *
 * @param value - Object to read from.
 * @param keys - Candidate property names, in priority order. Declared
 *   `readonly` so that `as const` key lists can be passed without copying.
 * @returns The trimmed string, or `null` when no key holds a usable string.
 */
function readObjectString(
  value: Record<string, unknown>,
  keys: readonly string[],
): string | null {
  for (const key of keys) {
    const candidate = value[key];
    if (typeof candidate !== "string") continue;

    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Extracts a price string from a payload object.
 *
 * Lookup order:
 * 1. A string stored directly under `price`, `currentPrice` or
 *    `displayPrice`.
 * 2. For each of `price`, `currentPrice`, `displayPrice`, `priceInfo` that
 *    holds a non-array object: a string under `amount`, `formatted` or
 *    `text`; otherwise its `value` (string or number), prefixed with the
 *    `currency` / `currencyCode` string when one is present.
 *
 * @param value - Payload object to inspect.
 * @returns The price text, or `null` when none of the shapes match.
 */
function readPayloadPrice(value: Record<string, unknown>): string | null {
  const flatPrice = readObjectString(value, [
    "price",
    "currentPrice",
    "displayPrice",
  ]);
  if (flatPrice) return flatPrice;

  const nestedKeys = ["price", "currentPrice", "displayPrice", "priceInfo"];
  for (const nestedKey of nestedKeys) {
    const nested = value[nestedKey];
    const isPlainObject =
      Boolean(nested) && typeof nested === "object" && !Array.isArray(nested);
    if (!isPlainObject) continue;

    const priceFields = nested as Record<string, unknown>;

    const label = readObjectString(priceFields, ["amount", "formatted", "text"]);
    if (label) return label;

    const rawAmount = priceFields.value;
    const currencyCode = readObjectString(priceFields, [
      "currency",
      "currencyCode",
    ]);

    // Blank strings and non-string/non-number values fall through to the
    // next candidate key.
    let amountText: string | null = null;
    if (typeof rawAmount === "string" && rawAmount.trim()) {
      amountText = rawAmount;
    } else if (typeof rawAmount === "number") {
      amountText = String(rawAmount);
    }

    if (amountText !== null) {
      return currencyCode ? `${currencyCode} ${amountText}` : amountText;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Recursively walks a parsed payload and appends every listing it finds.
 *
 * Arrays are walked element by element. An object that carries a url
 * (`itemWebUrl`, `itemUrl`, `url`, `webUrl`), a title (`title`,
 * `itemTitle`, `name`) and a price, and that `toEbayListing` accepts, is
 * pushed onto `results` and not descended into. Any other object has all
 * of its property values walked.
 *
 * @param value - Any node of the parsed payload.
 * @param results - Output array; mutated in place.
 */
function collectPayloadListings(
  value: unknown,
  results: EbayListingDetails[],
): void {
  if (typeof value !== "object" || value === null) return;

  if (Array.isArray(value)) {
    value.forEach((element) => collectPayloadListings(element, results));
    return;
  }

  const record = value as Record<string, unknown>;
  const url = readObjectString(record, [
    "itemWebUrl",
    "itemUrl",
    "url",
    "webUrl",
  ]);
  const title = readObjectString(record, ["title", "itemTitle", "name"]);
  const priceText = readPayloadPrice(record);

  const listing =
    url && title && priceText ? toEbayListing(url, title, priceText) : null;
  if (listing) {
    results.push(listing);
    return;
  }

  // Not a listing itself (or rejected): keep searching its children.
  Object.values(record).forEach((child) =>
    collectPayloadListings(child, results),
  );
}
|
||||||
|
|
||||||
|
/**
 * Extracts listings from `data-inlinepayload` attributes embedded in a page.
 *
 * Each attribute value (double-quoted, single-quoted or unquoted) is
 * entity-decoded, URI-decoded, parsed as JSON and handed to
 * `collectPayloadListings`. Payloads that fail any of those steps are
 * skipped.
 *
 * @param htmlString - Full page HTML.
 * @returns Listings found across all payloads, in document order.
 */
function parseEmbeddedEbayListings(
  htmlString: HTMLString,
): EbayListingDetails[] {
  const listings: EbayListingDetails[] = [];
  const payloadPattern =
    /data-inlinepayload=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/giu;

  for (const [, doubleQuoted, singleQuoted, unquoted] of htmlString.matchAll(
    payloadPattern,
  )) {
    const encodedPayload = doubleQuoted ?? singleQuoted ?? unquoted;
    if (!encodedPayload) continue;

    try {
      const jsonText = decodeURIComponent(decodeHtmlEntities(encodedPayload));
      const parsedPayload: unknown = JSON.parse(jsonText);
      collectPayloadListings(parsedPayload, listings);
    } catch {
      // Inline payloads differ between eBay modules; anything that is not
      // decodable JSON is deliberately ignored.
    }
  }

  return listings;
}
|
||||||
|
|
||||||
|
/**
 * Regex-based fallback that extracts listings from `s-card` markup.
 *
 * The page is split into chunks that each start at a `<div>` whose class
 * contains `s-card` and end just before the next such `<div>`, `</body>`
 * or `</html>`. Within a chunk the first `<a>` whose href contains
 * `/itm/`, the first `s-card__title` element and the first
 * `s-card__price` element are combined through `toEbayListing`.
 * Quoted and unquoted attribute values are both recognised.
 *
 * @param htmlString - Full page HTML.
 * @returns Listings accepted by `toEbayListing`, in document order.
 */
function parseSCardHtmlListings(htmlString: HTMLString): EbayListingDetails[] {
  const cardPattern =
    /<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)[\s\S]*?(?=<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)|<\/body>|<\/html>)/giu;
  const itemLinkPattern =
    /<a\b[^>]*\bhref=(?:"[^"]*\/itm\/[^"]*"|'[^']*\/itm\/[^']*'|[^\s>]*\/itm\/[^\s>]*)[^>]*>/iu;
  const titlePattern =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__title\b[^"]*"|'[^']*\bs-card__title\b[^']*'|[^\s>]*\bs-card__title\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;
  const pricePattern =
    /<[^>]*\bclass=(?:"[^"]*\bs-card__price\b[^"]*"|'[^']*\bs-card__price\b[^']*'|[^\s>]*\bs-card__price\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu;

  const listings: EbayListingDetails[] = [];

  for (const [cardHtml] of htmlString.matchAll(cardPattern)) {
    const anchorTag = cardHtml.match(itemLinkPattern)?.[0];
    const titleHtml = cardHtml.match(titlePattern)?.[1];
    const priceHtml = cardHtml.match(pricePattern)?.[1];

    // A card needs all three pieces to describe a listing.
    if (!anchorTag || !titleHtml || !priceHtml) continue;

    const href = getHtmlAttr(anchorTag, "href");
    if (!href) continue;

    const listing = toEbayListing(href, titleHtml, priceHtml);
    if (listing) listings.push(listing);
  }

  return listings;
}
|
||||||
|
|
||||||
|
/**
 * Drops listings whose canonical item URL has already been seen.
 *
 * URLs are compared after `canonicalizeEbayItemUrl`; the first listing for
 * each canonical URL is kept and input order is preserved.
 *
 * @param listings - Listings that may contain duplicates.
 * @returns A new array without duplicates; the input is not modified.
 */
function dedupeEbayListings(
  listings: EbayListingDetails[],
): EbayListingDetails[] {
  const seenCanonicalUrls = new Set<string>();

  return listings.filter((listing) => {
    const canonicalUrl = canonicalizeEbayItemUrl(listing.url);
    if (seenCanonicalUrls.has(canonicalUrl)) return false;

    seenCanonicalUrls.add(canonicalUrl);
    return true;
  });
}
|
||||||
|
|
||||||
function canonicalizeEbayItemUrl(url: string): string {
|
function canonicalizeEbayItemUrl(url: string): string {
|
||||||
try {
|
try {
|
||||||
@@ -124,6 +347,11 @@ function parseEbayListings(
|
|||||||
exclusions: string[],
|
exclusions: string[],
|
||||||
strictMode: boolean,
|
strictMode: boolean,
|
||||||
): EbayListingDetails[] {
|
): EbayListingDetails[] {
|
||||||
|
const embeddedListings = parseEmbeddedEbayListings(htmlString);
|
||||||
|
if (embeddedListings.length > 0) {
|
||||||
|
return dedupeEbayListings(embeddedListings);
|
||||||
|
}
|
||||||
|
|
||||||
const { document } = parseHTML(htmlString);
|
const { document } = parseHTML(htmlString);
|
||||||
const results: EbayListingDetails[] = [];
|
const results: EbayListingDetails[] = [];
|
||||||
const seenUrls = new Set<string>();
|
const seenUrls = new Set<string>();
|
||||||
@@ -359,7 +587,28 @@ function parseEbayListings(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
if (results.length > 0) {
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
return dedupeEbayListings(
|
||||||
|
parseSCardHtmlListings(htmlString).filter((listing) => {
|
||||||
|
if (
|
||||||
|
exclusions.some((exclusion) =>
|
||||||
|
listing.title.toLowerCase().includes(exclusion.toLowerCase()),
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (
|
||||||
|
!strictMode ||
|
||||||
|
keywords.some((keyword) =>
|
||||||
|
listing.title.toLowerCase().includes(keyword.toLowerCase()),
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ----------------------------- Cookie Loading -----------------------------
|
// ----------------------------- Cookie Loading -----------------------------
|
||||||
@@ -481,12 +730,14 @@ export default async function fetchEbayItems(
|
|||||||
// Use custom headers modeled after real browser requests to bypass bot detection
|
// Use custom headers modeled after real browser requests to bypass bot detection
|
||||||
const headers: Record<string, string> = {
|
const headers: Record<string, string> = {
|
||||||
"User-Agent":
|
"User-Agent":
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0",
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Accept-Language": "en-US,en;q=0.5",
|
"Accept-Language": "en-CA,en-US;q=0.9,en;q=0.8",
|
||||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||||
Referer: "https://www.ebay.ca/",
|
Referer: "https://www.ebay.ca/",
|
||||||
Connection: "keep-alive",
|
Connection: "keep-alive",
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
Pragma: "no-cache",
|
||||||
"Upgrade-Insecure-Requests": "1",
|
"Upgrade-Insecure-Requests": "1",
|
||||||
"Sec-Fetch-Dest": "document",
|
"Sec-Fetch-Dest": "document",
|
||||||
"Sec-Fetch-Mode": "navigate",
|
"Sec-Fetch-Mode": "navigate",
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import { logger } from "./logger";
|
|||||||
export interface Cookie {
|
export interface Cookie {
|
||||||
name: string;
|
name: string;
|
||||||
value: string;
|
value: string;
|
||||||
|
rawValue?: string;
|
||||||
domain: string;
|
domain: string;
|
||||||
path: string;
|
path: string;
|
||||||
secure?: boolean;
|
secure?: boolean;
|
||||||
@@ -55,6 +56,7 @@ export function parseCookieString(
|
|||||||
return {
|
return {
|
||||||
name: trimmedName,
|
name: trimmedName,
|
||||||
value: decodeURIComponent(trimmedValue),
|
value: decodeURIComponent(trimmedValue),
|
||||||
|
rawValue: trimmedValue,
|
||||||
domain,
|
domain,
|
||||||
path: "/",
|
path: "/",
|
||||||
secure: true,
|
secure: true,
|
||||||
@@ -95,7 +97,7 @@ export function formatCookiesForHeader(
|
|||||||
});
|
});
|
||||||
|
|
||||||
return validCookies
|
return validCookies
|
||||||
.map((cookie) => `${cookie.name}=${cookie.value}`)
|
.map((cookie) => `${cookie.name}=${cookie.rawValue ?? cookie.value}`)
|
||||||
.join("; ");
|
.join("; ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ const originalWarn = console.warn;
|
|||||||
|
|
||||||
describe("eBay Scraper Cookie Handling", () => {
|
describe("eBay Scraper Cookie Handling", () => {
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
|
delete process.env.EBAY_COOKIE;
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
@@ -210,6 +211,81 @@ describe("eBay Scraper Cookie Handling", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("parses current eBay s-card markup with unquoted item links", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<div class="s-card s-card--horizontal">
|
||||||
|
<div class=su-card-container__header>
|
||||||
|
<a class=s-card__link href=https://ebay.com/itm/1234567890?itmmeta=abc>
|
||||||
|
<div role=heading aria-level=3 class=s-card__title>
|
||||||
|
<span class="su-styled-text primary default">Apple MacBook Air M1 2020 8GB 256GB</span>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<div class=su-card-container__attributes>
|
||||||
|
<span class="su-styled-text primary bold large-1 s-card__price">CA $599.00</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as unknown as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("macbook", 1000);
|
||||||
|
|
||||||
|
expect(results).toEqual([
|
||||||
|
expect.objectContaining({
|
||||||
|
title: "Apple MacBook Air M1 2020 8GB 256GB",
|
||||||
|
url: "https://ebay.com/itm/1234567890?itmmeta=abc",
|
||||||
|
listingPrice: expect.objectContaining({ cents: 59_900 }),
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("parses embedded eBay payload listings before HTML fallback", async () => {
|
||||||
|
const payload = encodeURIComponent(
|
||||||
|
JSON.stringify({
|
||||||
|
searchResults: [
|
||||||
|
{
|
||||||
|
title: "Apple MacBook Air M1 API Result",
|
||||||
|
itemWebUrl: "https://www.ebay.ca/itm/9876543210?hash=item987",
|
||||||
|
price: { value: "550.00", currency: "CAD" },
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<script data-inlinepayload="${payload}"></script>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as unknown as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("macbook", 1000);
|
||||||
|
|
||||||
|
expect(results).toEqual([
|
||||||
|
expect.objectContaining({
|
||||||
|
title: "Apple MacBook Air M1 API Result",
|
||||||
|
url: "https://www.ebay.ca/itm/9876543210?hash=item987",
|
||||||
|
listingPrice: expect.objectContaining({
|
||||||
|
amountFormatted: "CAD 550.00",
|
||||||
|
cents: 55_000,
|
||||||
|
currency: "CAD",
|
||||||
|
}),
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
test("treats US dollar prices as USD", async () => {
|
test("treats US dollar prices as USD", async () => {
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
expect(result[0]).toEqual({
|
expect(result[0]).toEqual({
|
||||||
name: "c_user",
|
name: "c_user",
|
||||||
value: "123456789",
|
value: "123456789",
|
||||||
|
rawValue: "123456789",
|
||||||
domain: ".facebook.com",
|
domain: ".facebook.com",
|
||||||
path: "/",
|
path: "/",
|
||||||
secure: true,
|
secure: true,
|
||||||
@@ -80,6 +81,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
expect(result[1]).toEqual({
|
expect(result[1]).toEqual({
|
||||||
name: "xs",
|
name: "xs",
|
||||||
value: "abcdef123456",
|
value: "abcdef123456",
|
||||||
|
rawValue: "abcdef123456",
|
||||||
domain: ".facebook.com",
|
domain: ".facebook.com",
|
||||||
path: "/",
|
path: "/",
|
||||||
secure: true,
|
secure: true,
|
||||||
@@ -97,6 +99,16 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
expect(result[1]?.value).toBe("abc=def");
|
expect(result[1]?.value).toBe("abc=def");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should preserve raw encoded values when formatting cookie headers", () => {
|
||||||
|
const cookieString = "c_user=123%2B456; xs=abc%3Ddef";
|
||||||
|
const result = formatCookiesForHeader(
|
||||||
|
parseFacebookCookieString(cookieString),
|
||||||
|
"www.facebook.com",
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result).toBe(cookieString);
|
||||||
|
});
|
||||||
|
|
||||||
test("should filter out malformed cookies", () => {
|
test("should filter out malformed cookies", () => {
|
||||||
const cookieString = "c_user=123; invalid; xs=abc; =empty";
|
const cookieString = "c_user=123; invalid; xs=abc; =empty";
|
||||||
const result = parseFacebookCookieString(cookieString);
|
const result = parseFacebookCookieString(cookieString);
|
||||||
|
|||||||
Reference in New Issue
Block a user