diff --git a/packages/core/src/scrapers/ebay.ts b/packages/core/src/scrapers/ebay.ts index 25254d7..c479f7e 100644 --- a/packages/core/src/scrapers/ebay.ts +++ b/packages/core/src/scrapers/ebay.ts @@ -40,6 +40,229 @@ export interface EbayListingDetails { } const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C|US)\s*\$|\s*[$£€¥])/u; +const EBAY_ITEM_URL_RE = /^https?:\/\/(?:www\.)?ebay\.(?:ca|com)\/itm\//u; + +function decodeHtmlEntities(value: string): string { + return value + .replace(/&/g, "&") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/</g, "<") + .replace(/>/g, ">") + .trim(); +} + +function stripHtml(value: string): string { + return decodeHtmlEntities( + value.replace(/<[^>]*>/g, " ").replace(/\s+/g, " "), + ); +} + +function getHtmlAttr(tag: string, attrName: string): string | null { + const attrMatch = tag.match( + new RegExp(`\\s${attrName}=(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`, "iu"), + ); + return attrMatch?.[1] ?? attrMatch?.[2] ?? attrMatch?.[3] ?? null; +} + +function normalizeEbayUrl(url: string): string | null { + const decodedUrl = decodeHtmlEntities(url); + try { + const parsed = new URL(decodedUrl, "https://www.ebay.ca"); + return EBAY_ITEM_URL_RE.test(parsed.href) ? parsed.href : null; + } catch { + return null; + } +} + +function toEbayListing( + url: string, + title: string, + priceText: string, +): EbayListingDetails | null { + const normalizedUrl = normalizeEbayUrl(url); + const cleanedTitle = stripHtml(title); + const cleanedPrice = stripHtml(priceText); + const priceInfo = parseEbayPrice(cleanedPrice); + + if (!normalizedUrl || !cleanedTitle || cleanedTitle === "Shop on eBay") { + return null; + } + if (!priceInfo) return null; + + return { + url: normalizedUrl, + title: cleanedTitle, + listingPrice: { + amountFormatted: cleanedPrice, + cents: priceInfo.cents, + currency: priceInfo.currency, + }, + listingType: "OFFER", + listingStatus: "ACTIVE", + address: null, + }; +} + +function readObjectString( + value: Record, + keys: string[], +): string | null { + for (const key of keys) { + const candidate = value[key]; + if (typeof candidate === "string" && candidate.trim()) { + return candidate.trim(); + } + } + return null; +} + +function readPayloadPrice(value: Record): string | null { + const directPrice = readObjectString(value, [ + "price", + "currentPrice", + "displayPrice", + ]); + if (directPrice) return directPrice; + + for (const key of ["price", "currentPrice", "displayPrice", "priceInfo"]) { + const candidate = value[key]; + if ( + !candidate || + typeof candidate !== "object" || + Array.isArray(candidate) + ) { + continue; + } + + const priceObject = candidate as Record; + const formatted = readObjectString(priceObject, [ + "amount", + "formatted", + "text", + ]); + if (formatted) return formatted; + + const numericValue = priceObject.value; + const currency = readObjectString(priceObject, [ + "currency", + "currencyCode", + ]); + if (typeof numericValue === "string" && numericValue.trim()) { + return currency ? `${currency} ${numericValue}` : numericValue; + } + if (typeof numericValue === "number") { + return currency ? `${currency} ${numericValue}` : String(numericValue); + } + } + + return null; +} + +function collectPayloadListings( + value: unknown, + results: EbayListingDetails[], +): void { + if (!value || typeof value !== "object") return; + + if (Array.isArray(value)) { + for (const item of value) { + collectPayloadListings(item, results); + } + return; + } + + const objectValue = value as Record; + const url = readObjectString(objectValue, [ + "itemWebUrl", + "itemUrl", + "url", + "webUrl", + ]); + const title = readObjectString(objectValue, ["title", "itemTitle", "name"]); + const priceText = readPayloadPrice(objectValue); + + if (url && title && priceText) { + const listing = toEbayListing(url, title, priceText); + if (listing) { + results.push(listing); + return; + } + } + + for (const child of Object.values(objectValue)) { + collectPayloadListings(child, results); + } +} + +function parseEmbeddedEbayListings( + htmlString: HTMLString, +): EbayListingDetails[] { + const results: EbayListingDetails[] = []; + const payloadMatches = htmlString.matchAll( + /data-inlinepayload=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/giu, + ); + + for (const match of payloadMatches) { + const rawPayload = match[1] ?? match[2] ?? match[3]; + if (!rawPayload) continue; + + try { + const decodedPayload = decodeURIComponent(decodeHtmlEntities(rawPayload)); + collectPayloadListings(JSON.parse(decodedPayload), results); + } catch { + // eBay inline payloads vary by module; non-JSON payloads are ignored. + } + } + + return results; +} + +function parseSCardHtmlListings(htmlString: HTMLString): EbayListingDetails[] { + const results: EbayListingDetails[] = []; + const cardMatches = htmlString.matchAll( + /]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)[\s\S]*?(?=]*class=(?:"[^"]*\bs-card\b[^"]*"|'[^']*\bs-card\b[^']*'|[^\s>]*\bs-card\b[^\s>]*)|<\/body>|<\/html>)/giu, + ); + + for (const cardMatch of cardMatches) { + const cardHtml = cardMatch[0]; + const linkTag = cardHtml.match( + /]*\bhref=(?:"[^"]*\/itm\/[^"]*"|'[^']*\/itm\/[^']*'|[^\s>]*\/itm\/[^\s>]*)[^>]*>/iu, + )?.[0]; + const titleMatch = cardHtml.match( + /<[^>]*\bclass=(?:"[^"]*\bs-card__title\b[^"]*"|'[^']*\bs-card__title\b[^']*'|[^\s>]*\bs-card__title\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu, + ); + const priceMatch = cardHtml.match( + /<[^>]*\bclass=(?:"[^"]*\bs-card__price\b[^"]*"|'[^']*\bs-card__price\b[^']*'|[^\s>]*\bs-card__price\b[^\s>]*)[^>]*>([\s\S]*?)<\/[^>]+>/iu, + ); + + if (!linkTag || !titleMatch?.[1] || !priceMatch?.[1]) continue; + + const href = getHtmlAttr(linkTag, "href"); + if (!href) continue; + + const listing = toEbayListing(href, titleMatch[1], priceMatch[1]); + if (listing) results.push(listing); + } + + return results; +} + +function dedupeEbayListings( + listings: EbayListingDetails[], +): EbayListingDetails[] { + const results: EbayListingDetails[] = []; + const seenUrls = new Set(); + + for (const listing of listings) { + const canonicalUrl = canonicalizeEbayItemUrl(listing.url); + if (seenUrls.has(canonicalUrl)) continue; + seenUrls.add(canonicalUrl); + results.push(listing); + } + + return results; +} function canonicalizeEbayItemUrl(url: string): string { try { @@ -124,6 +347,11 @@ function parseEbayListings( exclusions: string[], strictMode: boolean, ): EbayListingDetails[] { + const embeddedListings = parseEmbeddedEbayListings(htmlString); + if (embeddedListings.length > 0) { + return dedupeEbayListings(embeddedListings); + } + const { document } = parseHTML(htmlString); const results: EbayListingDetails[] = []; const seenUrls = new Set(); @@ -359,7 +587,28 @@ function parseEbayListings( } } - return results; + if (results.length > 0) { + return results; + } + + return dedupeEbayListings( + parseSCardHtmlListings(htmlString).filter((listing) => { + if ( + exclusions.some((exclusion) => + listing.title.toLowerCase().includes(exclusion.toLowerCase()), + ) + ) { + return false; + } + + return ( + !strictMode || + keywords.some((keyword) => + listing.title.toLowerCase().includes(keyword.toLowerCase()), + ) + ); + }), + ); } // ----------------------------- Cookie Loading ----------------------------- @@ -481,12 +730,14 @@ export default async function fetchEbayItems( // Use custom headers modeled after real browser requests to bypass bot detection const headers: Record = { "User-Agent": - "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", + "Accept-Language": "en-CA,en-US;q=0.9,en;q=0.8", "Accept-Encoding": "gzip, deflate, br, zstd", Referer: "https://www.ebay.ca/", Connection: "keep-alive", + "Cache-Control": "no-cache", + Pragma: "no-cache", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", diff --git a/packages/core/src/utils/cookies.ts b/packages/core/src/utils/cookies.ts index 6fc70b6..678ee2c 100644 --- a/packages/core/src/utils/cookies.ts +++ b/packages/core/src/utils/cookies.ts @@ -7,6 +7,7 @@ import { logger } from "./logger"; export interface Cookie { name: string; value: string; + rawValue?: string; domain: string; path: string; secure?: boolean; @@ -55,6 +56,7 @@ export function parseCookieString( return { name: trimmedName, value: decodeURIComponent(trimmedValue), + rawValue: trimmedValue, domain, path: "/", secure: true, @@ -95,7 +97,7 @@ export function formatCookiesForHeader( }); return validCookies - .map((cookie) => `${cookie.name}=${cookie.value}`) + .map((cookie) => `${cookie.name}=${cookie.rawValue ?? cookie.value}`) .join("; "); } diff --git a/packages/core/test/ebay-core.test.ts b/packages/core/test/ebay-core.test.ts index 2016fdb..bcfa0d7 100644 --- a/packages/core/test/ebay-core.test.ts +++ b/packages/core/test/ebay-core.test.ts @@ -29,6 +29,7 @@ const originalWarn = console.warn; describe("eBay Scraper Cookie Handling", () => { beforeEach(() => { + delete process.env.EBAY_COOKIE; global.fetch = mock(() => Promise.resolve({ ok: true, @@ -210,6 +211,81 @@ describe("eBay Scraper Cookie Handling", () => { ]); }); + test("parses current eBay s-card markup with unquoted item links", async () => { + global.fetch = mock(() => + Promise.resolve({ + ok: true, + text: () => + Promise.resolve(` + + + + `), + }), + ) as unknown as typeof fetch; + + const results = await fetchEbayItems("macbook", 1000); + + expect(results).toEqual([ + expect.objectContaining({ + title: "Apple MacBook Air M1 2020 8GB 256GB", + url: "https://ebay.com/itm/1234567890?itmmeta=abc", + listingPrice: expect.objectContaining({ cents: 59_900 }), + }), + ]); + }); + + test("parses embedded eBay payload listings before HTML fallback", async () => { + const payload = encodeURIComponent( + JSON.stringify({ + searchResults: [ + { + title: "Apple MacBook Air M1 API Result", + itemWebUrl: "https://www.ebay.ca/itm/9876543210?hash=item987", + price: { value: "550.00", currency: "CAD" }, + }, + ], + }), + ); + + global.fetch = mock(() => + Promise.resolve({ + ok: true, + text: () => + Promise.resolve(` + + + + `), + }), + ) as unknown as typeof fetch; + + const results = await fetchEbayItems("macbook", 1000); + + expect(results).toEqual([ + expect.objectContaining({ + title: "Apple MacBook Air M1 API Result", + url: "https://www.ebay.ca/itm/9876543210?hash=item987", + listingPrice: expect.objectContaining({ + amountFormatted: "CAD 550.00", + cents: 55_000, + currency: "CAD", + }), + }), + ]); + }); + test("treats US dollar prices as USD", async () => { global.fetch = mock(() => Promise.resolve({ diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index 448870b..3fd4f63 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -70,6 +70,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => { expect(result[0]).toEqual({ name: "c_user", value: "123456789", + rawValue: "123456789", domain: ".facebook.com", path: "/", secure: true, @@ -80,6 +81,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => { expect(result[1]).toEqual({ name: "xs", value: "abcdef123456", + rawValue: "abcdef123456", domain: ".facebook.com", path: "/", secure: true, @@ -97,6 +99,16 @@ describe("Facebook Marketplace Scraper Core Tests", () => { expect(result[1]?.value).toBe("abc=def"); }); + test("should preserve raw encoded values when formatting cookie headers", () => { + const cookieString = "c_user=123%2B456; xs=abc%3Ddef"; + const result = formatCookiesForHeader( + parseFacebookCookieString(cookieString), + "www.facebook.com", + ); + + expect(result).toBe(cookieString); + }); + test("should filter out malformed cookies", () => { const cookieString = "c_user=123; invalid; xs=abc; =empty"; const result = parseFacebookCookieString(cookieString);