chore: ebay parser fix
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
@@ -40,6 +40,229 @@ export interface EbayListingDetails {
|
||||
}
|
||||
|
||||
// Matches text that begins (after optional whitespace) with a currency marker:
// "CA", "C" or "US" followed by optional whitespace and "$", or a bare $, £, € or ¥.
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£€¥])/u;
|
||||
// Matches absolute http(s) URLs on ebay.ca or ebay.com (with or without "www.")
// whose path starts with "/itm/".
const EBAY_ITEM_URL_RE = /^https?:\/\/(?:www\.)?ebay\.(?:ca|com)\/itm\//u;
|
||||
|
||||
/**
 * Decodes the handful of HTML character references that appear in scraped
 * attribute values and text (`&amp;`, `&quot;`, `&apos;`, `&#39;`, `&#x27;`,
 * `&lt;`, `&gt;`) and trims surrounding whitespace.
 *
 * All references are replaced in a single pass, so already-escaped text is
 * only unescaped one level: `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param value - Raw text that may contain HTML character references.
 * @returns The decoded, trimmed text. Unrecognised references are left as-is.
 */
function decodeHtmlEntities(value: string): string {
  return value
    .replace(/&(?:amp|quot|apos|lt|gt|#39|#x27);/gi, (entity) => {
      switch (entity.toLowerCase()) {
        case "&amp;":
          return "&";
        case "&quot;":
          return '"';
        case "&apos;":
        case "&#39;":
        case "&#x27;":
          return "'";
        case "&lt;":
          return "<";
        default:
          // Only "&gt;" remains among the alternatives the pattern can match.
          return ">";
      }
    })
    .trim();
}
|
||||
|
||||
/**
 * Reduces an HTML fragment to plain text.
 *
 * Each tag is replaced by a space, runs of whitespace are collapsed to a
 * single space, and the result is handed to `decodeHtmlEntities`. Entities
 * are decoded after tag removal, so escaped markup is never treated as a tag.
 *
 * @param value - HTML fragment to clean.
 * @returns The text produced by `decodeHtmlEntities` for the tag-free input.
 */
function stripHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]*>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return decodeHtmlEntities(singleSpaced);
}
|
||||
|
||||
/**
 * Extracts the value of one attribute from the text of a single HTML tag.
 *
 * Double-quoted, single-quoted and unquoted values are supported, the name is
 * matched case-insensitively, and optional whitespace around `=` is accepted.
 * The name must be preceded by whitespace, so `href` does not match
 * `data-href`.
 *
 * @param tag - Source text of one tag, e.g. `<a href="/itm/1">`.
 * @param attrName - Attribute name to look up; treated as literal text.
 * @returns The raw (still entity-encoded) value, `""` for an empty quoted
 *   value, or `null` when the attribute is not present.
 */
function getHtmlAttr(tag: string, attrName: string): string | null {
  // Escape regex metacharacters so names such as "data.x" match literally.
  const escapedName = attrName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const attrMatch = tag.match(
    new RegExp(
      `\\s${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`,
      "iu",
    ),
  );
  // Exactly one of the three capture groups is set when the pattern matches.
  return attrMatch?.[1] ?? attrMatch?.[2] ?? attrMatch?.[3] ?? null;
}
|
||||
|
||||
/**
 * Turns a scraped href into an absolute eBay item URL.
 *
 * The input is passed through `decodeHtmlEntities`, resolved against
 * `https://www.ebay.ca` (so relative links become absolute), and accepted
 * only when the resulting href matches `EBAY_ITEM_URL_RE`.
 *
 * @param url - Raw href, possibly relative or entity-encoded.
 * @returns The absolute href, or `null` when it cannot be parsed as a URL or
 *   does not match `EBAY_ITEM_URL_RE`.
 */
function normalizeEbayUrl(url: string): string | null {
  const candidate = decodeHtmlEntities(url);

  let absoluteHref: string;
  try {
    absoluteHref = new URL(candidate, "https://www.ebay.ca").href;
  } catch {
    // `new URL` throws on input it cannot parse.
    return null;
  }

  if (!EBAY_ITEM_URL_RE.test(absoluteHref)) return null;
  return absoluteHref;
}
|
||||
|
||||
/**
 * Builds an `EbayListingDetails` record from raw url/title/price fragments.
 *
 * The url goes through `normalizeEbayUrl`; title and price text go through
 * `stripHtml`; the cleaned price is then handed to `parseEbayPrice` (defined
 * outside this block — presumably returns `{ cents, currency }` or a falsy
 * value; confirm against its definition).
 *
 * @param url - Raw item href.
 * @param title - Raw title, may contain markup.
 * @param priceText - Raw price text, may contain markup.
 * @returns The listing, or `null` when the url is rejected, the title is
 *   empty or the literal "Shop on eBay", or the price cannot be parsed.
 */
function toEbayListing(
  url: string,
  title: string,
  priceText: string,
): EbayListingDetails | null {
  const listingUrl = normalizeEbayUrl(url);
  const plainTitle = stripHtml(title);
  const plainPrice = stripHtml(priceText);
  const parsedPrice = parseEbayPrice(plainPrice);

  const titleUnusable = !plainTitle || plainTitle === "Shop on eBay";
  if (!listingUrl || titleUnusable || !parsedPrice) {
    return null;
  }

  const { cents, currency } = parsedPrice;
  return {
    url: listingUrl,
    title: plainTitle,
    listingPrice: {
      amountFormatted: plainPrice,
      cents,
      currency,
    },
    // These three fields are set to fixed values for every listing built here.
    listingType: "OFFER",
    listingStatus: "ACTIVE",
    address: null,
  };
}
|
||||
|
||||
/**
 * Returns the first non-blank string found under any of the given keys.
 *
 * Keys are tried in the order supplied; values that are not strings, or that
 * are empty after trimming, are skipped.
 *
 * @param value - Object to read from.
 * @param keys - Candidate property names, in priority order.
 * @returns The trimmed string, or `null` when no key holds a usable string.
 */
function readObjectString(
  value: Record<string, unknown>,
  keys: string[],
): string | null {
  for (const key of keys) {
    const raw = value[key];
    if (typeof raw !== "string") continue;

    const trimmed = raw.trim();
    if (trimmed) return trimmed;
  }
  return null;
}
|
||||
|
||||
/**
 * Finds a price string on a payload object.
 *
 * Lookup order:
 * 1. A non-blank string directly under `price`, `currentPrice` or
 *    `displayPrice`.
 * 2. For each of `price`, `currentPrice`, `displayPrice`, `priceInfo` that
 *    holds a non-array object: a non-blank string under `amount`,
 *    `formatted` or `text`; otherwise its `value` (non-blank string or
 *    number), prefixed with `currency`/`currencyCode` and a space when one
 *    is present.
 *
 * @param value - Payload object to inspect.
 * @returns The price text, or `null` when nothing usable is found.
 */
function readPayloadPrice(value: Record<string, unknown>): string | null {
  const flatPrice = readObjectString(value, [
    "price",
    "currentPrice",
    "displayPrice",
  ]);
  if (flatPrice) return flatPrice;

  const nestedKeys = ["price", "currentPrice", "displayPrice", "priceInfo"];
  for (const nestedKey of nestedKeys) {
    const nested = value[nestedKey];
    const isPlainObject =
      Boolean(nested) && typeof nested === "object" && !Array.isArray(nested);
    if (!isPlainObject) continue;

    const priceFields = nested as Record<string, unknown>;

    const displayText = readObjectString(priceFields, [
      "amount",
      "formatted",
      "text",
    ]);
    if (displayText) return displayText;

    const rawAmount = priceFields.value;
    const currencyCode = readObjectString(priceFields, [
      "currency",
      "currencyCode",
    ]);

    let amountText: string | null = null;
    if (typeof rawAmount === "string") {
      // Blank strings are ignored; a usable string is passed on untrimmed.
      amountText = rawAmount.trim() ? rawAmount : null;
    } else if (typeof rawAmount === "number") {
      amountText = String(rawAmount);
    }

    if (amountText !== null) {
      return currencyCode ? `${currencyCode} ${amountText}` : amountText;
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Walks an arbitrary parsed payload depth-first and appends every listing it
 * can build to `results`.
 *
 * An object that yields a url, a title and a price which `toEbayListing`
 * accepts is recorded and not descended into; any other object has all of
 * its property values visited. Arrays are visited element by element, and
 * non-object values are ignored.
 *
 * @param value - Any parsed JSON value.
 * @param results - Output array, mutated in place.
 */
function collectPayloadListings(
  value: unknown,
  results: EbayListingDetails[],
): void {
  if (typeof value !== "object" || value === null) return;

  if (Array.isArray(value)) {
    value.forEach((entry) => collectPayloadListings(entry, results));
    return;
  }

  const node = value as Record<string, unknown>;
  const url = readObjectString(node, [
    "itemWebUrl",
    "itemUrl",
    "url",
    "webUrl",
  ]);
  const title = readObjectString(node, ["title", "itemTitle", "name"]);
  const priceText = readPayloadPrice(node);

  const listing =
    url && title && priceText ? toEbayListing(url, title, priceText) : null;
  if (listing) {
    results.push(listing);
    return;
  }

  // Not a listing itself, so keep looking inside its properties.
  Object.values(node).forEach((child) =>
    collectPayloadListings(child, results),
  );
}
|
||||
|
||||
/**
 * Extracts listings from JSON embedded in `data-inlinepayload` attributes.
 *
 * Each attribute value (double-quoted, single-quoted or unquoted) is passed
 * through `decodeHtmlEntities`, URI-decoded, parsed as JSON and handed to
 * `collectPayloadListings`.
 *
 * @param htmlString - Page HTML to scan.
 * @returns Every listing found across all payloads, in document order;
 *   duplicates are not removed here.
 */
function parseEmbeddedEbayListings(
  htmlString: HTMLString,
): EbayListingDetails[] {
  const results: EbayListingDetails[] = [];
  const payloadMatches = htmlString.matchAll(
    /data-inlinepayload=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/giu,
  );

  for (const match of payloadMatches) {
    const rawPayload = match[1] ?? match[2] ?? match[3];
    if (!rawPayload) continue;

    const entityDecoded = decodeHtmlEntities(rawPayload);

    // decodeURIComponent throws URIError on a stray "%" (e.g. "50% off" in a
    // title). Such a payload is not URI-encoded, so use the text as-is
    // instead of discarding an otherwise valid JSON document.
    let payloadText = entityDecoded;
    try {
      payloadText = decodeURIComponent(entityDecoded);
    } catch {
      payloadText = entityDecoded;
    }

    try {
      collectPayloadListings(JSON.parse(payloadText), results);
    } catch {
      // eBay inline payloads vary by module; non-JSON payloads are ignored.
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Regex-based extraction of listings from `s-card` result markup.
 *
 * The page is cut into chunks that each start at a `<div>` whose class
 * contains `s-card` and run up to the next such `<div>`, `</body>` or
 * `</html>`. Within a chunk, the first `<a>` whose href contains `/itm/`,
 * the first element with class `s-card__title` and the first element with
 * class `s-card__price` supply the url, title and price for `toEbayListing`.
 *
 * NOTE(review): because the chunk pattern ends with a lookahead, a final
 * card that is not followed by another card, `</body>` or `</html>` is not
 * matched.
 *
 * @param htmlString - Page HTML to scan.
 * @returns Listings in document order; chunks missing any of the three
 *   parts, or rejected by `toEbayListing`, are skipped.
 */
function parseSCardHtmlListings(htmlString: HTMLString): EbayListingDetails[] {
  const results: EbayListingDetails[] = [];

  const cardMatches = htmlString.matchAll(
    /<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)[\s\S]*?(?=<div\b[^>]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)|<\/body>|<\/html>)/giu,
  );

  for (const cardMatch of cardMatches) {
    const cardHtml = cardMatch[0];

    const anchorTag = cardHtml.match(
      /<a\b[^>]*\bhref=(?:"[^"]*\/itm\/[^"]*"|'[^']*\/itm\/[^']*'|[^\s>]*\/itm\/[^\s>]*)[^>]*>/iu,
    )?.[0];
    if (!anchorTag) continue;

    const titleHtml = cardHtml.match(
      /<[^>]*\bclass=(?:"[^"]*\bs-card__title\b[^"]*"|'[^']*\bs-card__title\b[^']*'|[^\s>]*\bs-card__title\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu,
    )?.[1];
    const priceHtml = cardHtml.match(
      /<[^>]*\bclass=(?:"[^"]*\bs-card__price\b[^"]*"|'[^']*\bs-card__price\b[^']*'|[^\s>]*\bs-card__price\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu,
    )?.[1];
    if (!titleHtml || !priceHtml) continue;

    const href = getHtmlAttr(anchorTag, "href");
    if (!href) continue;

    const listing = toEbayListing(href, titleHtml, priceHtml);
    if (listing) results.push(listing);
  }

  return results;
}
|
||||
|
||||
/**
 * Removes listings that point at the same item.
 *
 * Two listings are duplicates when `canonicalizeEbayItemUrl` returns the same
 * string for their urls. The first occurrence is kept and input order is
 * preserved; the input array is not modified.
 *
 * @param listings - Listings that may contain duplicates.
 * @returns A new array with later duplicates dropped.
 */
function dedupeEbayListings(
  listings: EbayListingDetails[],
): EbayListingDetails[] {
  const seenKeys = new Set<string>();

  return listings.filter((listing) => {
    const key = canonicalizeEbayItemUrl(listing.url);
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
|
||||
|
||||
function canonicalizeEbayItemUrl(url: string): string {
|
||||
try {
|
||||
@@ -124,6 +347,11 @@ function parseEbayListings(
|
||||
exclusions: string[],
|
||||
strictMode: boolean,
|
||||
): EbayListingDetails[] {
|
||||
const embeddedListings = parseEmbeddedEbayListings(htmlString);
|
||||
if (embeddedListings.length > 0) {
|
||||
return dedupeEbayListings(embeddedListings);
|
||||
}
|
||||
|
||||
const { document } = parseHTML(htmlString);
|
||||
const results: EbayListingDetails[] = [];
|
||||
const seenUrls = new Set<string>();
|
||||
@@ -359,7 +587,28 @@ function parseEbayListings(
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
if (results.length > 0) {
|
||||
return results;
|
||||
}
|
||||
|
||||
return dedupeEbayListings(
|
||||
parseSCardHtmlListings(htmlString).filter((listing) => {
|
||||
if (
|
||||
exclusions.some((exclusion) =>
|
||||
listing.title.toLowerCase().includes(exclusion.toLowerCase()),
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (
|
||||
!strictMode ||
|
||||
keywords.some((keyword) =>
|
||||
listing.title.toLowerCase().includes(keyword.toLowerCase()),
|
||||
)
|
||||
);
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// ----------------------------- Cookie Loading -----------------------------
|
||||
@@ -481,12 +730,14 @@ export default async function fetchEbayItems(
|
||||
// Use custom headers modeled after real browser requests to bypass bot detection
|
||||
const headers: Record<string, string> = {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Language": "en-CA,en-US;q=0.9,en;q=0.8",
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
Referer: "https://www.ebay.ca/",
|
||||
Connection: "keep-alive",
|
||||
"Cache-Control": "no-cache",
|
||||
Pragma: "no-cache",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
|
||||
Reference in New Issue
Block a user