diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index e472faf..9e2bc8e 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -75,13 +75,6 @@ interface FacebookEdge { [k: string]: unknown; } -interface FacebookMarketplaceSearch { - feed_units?: { - edges?: FacebookEdge[]; - }; - [k: string]: unknown; -} - interface FacebookMarketplaceItem { // Basic identification id: string; @@ -432,89 +425,108 @@ export function extractFacebookBootstrapCandidates( return candidates; } +function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] { + return ( + Array.isArray(value) && + value.length > 0 && + value.every( + (edge) => isRecord(edge) && isRecord(edge.node) && isRecord(edge.node.listing), + ) + ); +} + +function scoreSearchEdges(edges: FacebookEdge[], score: number): number { + return score + Math.min(edges.length, 3); +} + +function findSearchEdges( + candidate: unknown, + score = 0, +): { edges: FacebookEdge[]; score: number } | null { + if (Array.isArray(candidate)) { + let bestMatch: { edges: FacebookEdge[]; score: number } | null = null; + + for (const item of candidate) { + const result = findSearchEdges(item, score); + if (result && (!bestMatch || result.score > bestMatch.score)) { + bestMatch = result; + } + } + + return bestMatch; + } + + if (!isRecord(candidate)) { + return null; + } + + let bestMatch: { edges: FacebookEdge[]; score: number } | null = null; + + const feedUnits = candidate.feed_units; + if (isRecord(feedUnits) && isFacebookSearchEdgeArray(feedUnits.edges)) { + bestMatch = { + edges: feedUnits.edges, + score: scoreSearchEdges(feedUnits.edges, score + 2), + }; + } + + const resultGroups = candidate.resultGroups; + if (Array.isArray(resultGroups)) { + for (const group of resultGroups) { + if (isRecord(group) && isFacebookSearchEdgeArray(group.edges)) { + const result = { + edges: group.edges, + score: scoreSearchEdges(group.edges, score + 4), + }; + + if (!bestMatch || result.score > bestMatch.score) { + bestMatch = result; + } + } + } + } + + for (const [key, value] of Object.entries(candidate)) { + const result = findSearchEdges(value, score + (key === "payload" ? 1 : 0)); + if (result && (!bestMatch || result.score > bestMatch.score)) { + bestMatch = result; + } + } + + return bestMatch; +} + /** Extract marketplace search data from Facebook page script tags */ export function extractFacebookMarketplaceData( htmlString: HTMLString, ): FacebookAdNode[] | null { - const { document } = parseHTML(htmlString); - const scripts = document.querySelectorAll("script"); + const candidates = extractFacebookBootstrapCandidates(htmlString); + let bestEdges: FacebookEdge[] | null = null; + let bestScore = -1; - let marketplaceData: FacebookMarketplaceSearch | null = null; + for (const candidate of candidates) { + const result = findSearchEdges(candidate); + if (!result?.edges.length) { + continue; + } - // Find the script containing the require data with marketplace_search - for (const script of Array.from(scripts) as HTMLScriptElement[]) { - const scriptText = script.textContent; - if (!scriptText) continue; - - try { - const parsed = JSON.parse(scriptText); - - // First check if this is the direct data structure (like in examples) - if (parsed.require && Array.isArray(parsed.require)) { - // Try multiple navigation paths to find marketplace_search - const paths = [ - // Original path from example - () => - parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data - .marketplace_search, - // Alternative path structure - () => - parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search, - // Another variation - () => parsed.require[0][3][0].__bbox.result.data.marketplace_search, - // Direct access for some responses - () => { - for (const item of parsed.require) { - if (item && item.length >= 4 && item[3]) { - const bbox = item[3]?.__bbox?.result?.data?.marketplace_search; - if (bbox) return bbox; - } - } - return null; - }, - ]; - - for (const getData of paths) { - try { - const result = getData(); - if ( - result && - isRecord(result) && - (result as Record).feed_units?.edges?.length > 0 - ) { - marketplaceData = result as FacebookMarketplaceSearch; - break; - } - } catch {} - } - - if (marketplaceData) break; - } - - // Also check for direct marketplace_search in the parsed data - if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) { - const searchData = - parsed.marketplace_search as FacebookMarketplaceSearch; - const feedLength = searchData.feed_units?.edges?.length ?? 0; - if (feedLength > 0) { - marketplaceData = searchData; - break; - } - } - } catch {} + if (result.score > bestScore) { + bestScore = result.score; + bestEdges = result.edges; + } } - if (!marketplaceData?.feed_units?.edges?.length) { + if (!bestEdges?.length) { console.warn("No marketplace data found in HTML response"); return null; } console.log( - `Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`, + `Successfully parsed ${bestEdges.length} Facebook marketplace listings`, ); - return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node })); + return bestEdges.map((edge) => ({ node: edge.node })); } /** diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index f0a7bda..60684c1 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -727,6 +727,151 @@ describe("Facebook Marketplace Scraper Core Tests", () => { const candidates = extractFacebookBootstrapCandidates(html); expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]); }); + + test("extracts search results from Comet bootstrap candidates", () => { + const html = ` + + + + + `; + + const ads = extractFacebookMarketplaceData(html); + expect(ads).toHaveLength(1); + expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike"); + }); + + test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => { + const html = ` + + + + + `; + + const ads = extractFacebookMarketplaceData(html); + expect(ads).toHaveLength(1); + expect(ads?.[0].node.listing.id).toBe("right-1"); + }); + + test("rejects mixed edge arrays that contain non-listing entries", () => { + const html = ` + + + + + `; + + const ads = extractFacebookMarketplaceData(html); + expect(ads).toBeNull(); + }); }); }); diff --git a/packages/core/test/facebook-integration.test.ts b/packages/core/test/facebook-integration.test.ts index 3c4021c..22c7c60 100644 --- a/packages/core/test/facebook-integration.test.ts +++ b/packages/core/test/facebook-integration.test.ts @@ -27,77 +27,40 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { describe("Main Search Function", () => { test("should successfully fetch search results", async () => { - const mockSearchData = { - require: [ - [ - null, - null, - null, + const mockSearchHtml = ``; global.fetch = mock(() => Promise.resolve({ ok: true, - text: () => - Promise.resolve( - ``, - ), + text: () => Promise.resolve(mockSearchHtml), headers: { get: () => null, }, @@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ); const results = await fetchFacebookItems("iPhone", 1, "toronto", 25); - expect(results).toHaveLength(2); - expect(results[0].title).toBe("iPhone 13 Pro"); - expect(results[1].title).toBe("Samsung Galaxy"); + expect(results).toHaveLength(1); + expect(results[0].title).toBe("iPhone 13"); }); test("should filter out items without price", async () => {