diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index f09ba38..0e23c6a 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -361,6 +361,9 @@ async function fetchHtml( await delay(DELAY_MS); return { html, responseUrl: res.url || url }; } catch (err) { + if (err instanceof HttpError) { + throw err; + } if (attempt >= maxRetries) throw err; await delay((attempt + 1) * retryBaseMs); } @@ -1286,7 +1289,7 @@ export async function fetchFacebookItem( return null; } - if (classification.unavailable || itemHtml.includes("This item has been sold")) { + if (classification.unavailable) { logExtractionMetrics(false, itemId); console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`); return null; @@ -1304,6 +1307,11 @@ export async function fetchFacebookItem( if (!itemData) { logExtractionMetrics(false, itemId); + if (itemHtml.includes("This item has been sold")) { + console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`); + return null; + } + console.warn( `No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`, ); diff --git a/packages/core/src/scrapers/kijiji.ts b/packages/core/src/scrapers/kijiji.ts index 87408d3..2006667 100644 --- a/packages/core/src/scrapers/kijiji.ts +++ b/packages/core/src/scrapers/kijiji.ts @@ -254,18 +254,31 @@ export function buildSearchUrl( let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`; - const sortParam = options.sortBy - ? `&sort=${SORT_MAPPINGS[options.sortBy]}` - : ""; + const sortValue = + options.sortBy && options.sortBy !== "relevancy" + ? SORT_MAPPINGS[options.sortBy] + : "relevancyDesc"; const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC"; const pageParam = options.page && options.page > 1 ? `&page=${options.page}` : ""; - url += `?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`; + url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`; return url; } +function findApolloListingKey( + apolloState: ApolloRecord, + predicate: (value: Record) => boolean, +): string | undefined { + return Object.keys(apolloState).find((key) => { + if (!key.startsWith("Listing:")) return false; + + const value = apolloState[key]; + return isRecord(value) && predicate(value); + }); +} + /** * Slugifies a string for Kijiji search URLs */ @@ -497,9 +510,9 @@ function _parseListing( const apolloState = extractApolloState(htmlString); if (!apolloState) return null; - // Find the listing root key - const listingKey = Object.keys(apolloState).find((k) => - k.includes("Listing"), + const listingKey = findApolloListingKey( + apolloState, + (value) => typeof value.url === "string" && typeof value.title === "string", ); if (!listingKey) return null; @@ -570,9 +583,12 @@ export async function parseDetailedListing( const apolloState = extractApolloState(htmlString); if (!apolloState) return null; - // Find the listing root key - const listingKey = Object.keys(apolloState).find((k) => - k.includes("Listing"), + const listingKey = findApolloListingKey( + apolloState, + (value) => + typeof value.url === "string" && + typeof value.title === "string" && + isRecord(value.price), ); if (!listingKey) return null; @@ -921,8 +937,12 @@ export default async function fetchKijijiItems( return true; }); - console.log(`\nParsed ${filteredListings.length} detailed listings.`); - return finalizeResults(filteredListings); + console.log( + `\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`, + ); + return finalizeResults( + unstableMode.hideUnstableResults ? allListings : filteredListings, + ); } // Re-export error classes for convenience diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index e685dd9..e3ab383 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -177,6 +177,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => { try { const result = await fetchFacebookItem("123"); expect(result).toBeNull(); + expect(global.fetch).toHaveBeenCalledTimes(1); expect(warnMock).toHaveBeenCalledWith( "Authentication error: Invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.", ); @@ -309,6 +310,54 @@ describe("Facebook Marketplace Scraper Core Tests", () => { expect(result?.listingStatus).toBe("SOLD"); }); + test("should still parse sold items when structured data exists", async () => { + const soldStructuredHtml = ` + +
This item has been sold
+ + + + `; + + global.fetch = mock(() => + Promise.resolve({ + ok: true, + text: () => Promise.resolve(soldStructuredHtml), + url: "https://www.facebook.com/marketplace/item/457/", + headers: { + get: () => null, + }, + }), + ); + + const result = await fetchFacebookItem("457"); + + expect(result).toEqual( + expect.objectContaining({ + title: "Structured Sold Item", + listingStatus: "SOLD", + }), + ); + }); + test("should handle successful item extraction", async () => { const mockData = { require: [ diff --git a/packages/core/test/kijiji-core.test.ts b/packages/core/test/kijiji-core.test.ts index e226dac..f71a8d6 100644 --- a/packages/core/test/kijiji-core.test.ts +++ b/packages/core/test/kijiji-core.test.ts @@ -4,6 +4,7 @@ import { default as fetchKijijiItems, type DetailedListing, NetworkError, + parseDetailedListing, ParseError, RateLimitError, resolveCategoryId, @@ -124,6 +125,7 @@ describe("URL Construction", () => { sortBy: "date", sortOrder: "asc", }); + expect(dateUrl.match(/sort=/g)?.length).toBe(1); expect(dateUrl).toContain("sort=DATE"); expect(dateUrl).toContain("order=ASC"); @@ -289,6 +291,141 @@ describe("fetchKijijiItems", () => { ]); }); + test("classifies unstable mode using all parsed listings before price filtering", async () => { + const searchHtml = ` + + + + `; + + const listingHtml = (title: string, amount: number, slug: string) => ` + + + + `; + + global.fetch = mock((input: string | URL | Request) => { + const url = typeof input === "string" ? input : input.toString(); + + if (url.includes("/k0c0l1700272")) { + return Promise.resolve({ + ok: true, + text: () => Promise.resolve(searchHtml), + headers: { get: () => null }, + url, + }); + } + + if (url.endsWith("/v-stable-one/k0l0")) { + return Promise.resolve({ + ok: true, + text: () => Promise.resolve(listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0")), + headers: { get: () => null }, + url, + }); + } + + if (url.endsWith("/v-stable-two/k0l0")) { + return Promise.resolve({ + ok: true, + text: () => Promise.resolve(listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0")), + headers: { get: () => null }, + url, + }); + } + + if (url.endsWith("/v-unstable/k0l0")) { + return Promise.resolve({ + ok: true, + text: () => Promise.resolve(listingHtml("Unstable Listing", 7000, "v-unstable/k0l0")), + headers: { get: () => null }, + url, + }); + } + + throw new Error(`Unexpected URL: ${url}`); + }) as typeof fetch; + + const results = await fetchKijijiItems( + "phone", + 1000, + "https://www.kijiji.ca", + { maxPages: 1, priceMin: 8000 }, + {}, + { hideUnstableResults: true }, + ); + + expect(results).toEqual({ + results: [ + expect.objectContaining({ title: "Stable Listing One" }), + expect.objectContaining({ title: "Stable Listing Two" }), + ], + unstableResults: [expect.objectContaining({ title: "Unstable Listing" })], + }); + }); + + test("parseDetailedListing ignores non-root listing-like entities", async () => { + const html = ` + + + + `; + + const result = await parseDetailedListing(html, "https://www.kijiji.ca"); + + expect(result).toEqual( + expect.objectContaining({ title: "Detailed Listing" }), + ); + }); + test("returns results and unstableResults when unstable mode is enabled", async () => { const searchHtml = `