From 63ca0066967dcb7870a52b09d44cbd9a31042edc Mon Sep 17 00:00:00 2001 From: Dmytro Stanchiev Date: Wed, 22 Apr 2026 02:44:17 -0400 Subject: [PATCH] refactor: rewrite facebook item parser for comet bootstrap --- packages/core/src/scrapers/facebook.ts | 212 ++++++++++------------- packages/core/test/facebook-core.test.ts | 99 +++++++---- 2 files changed, 156 insertions(+), 155 deletions(-) diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index 9e2bc8e..e119a13 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -496,6 +496,80 @@ function findSearchEdges( return bestMatch; } +interface FacebookMarketplaceItemMatch { + item: FacebookMarketplaceItem; + score: number; + path: string[]; +} + +function scoreMarketplaceItemPath(path: string[]): number { + let score = 0; + + if (path.includes("payload")) { + score += 2; + } + + if (path.includes("viewer")) { + score += 2; + } + + if (path.includes("marketplace_product_details_page")) { + score += 6; + } + + if (path.includes("target")) { + score += 8; + } + + if (path.includes("listing")) { + score += 6; + } + + if ( + path.some( + (segment) => + segment.includes("recommend") || segment.includes("related"), + ) + ) { + score -= 10; + } + + return score - path.length; +} + +function collectMarketplaceItemCandidates( + candidate: unknown, + path: string[] = [], +): FacebookMarketplaceItemMatch[] { + if (Array.isArray(candidate)) { + return candidate.flatMap((item) => collectMarketplaceItemCandidates(item, path)); + } + + if (!isRecord(candidate)) { + return []; + } + + const matches: FacebookMarketplaceItemMatch[] = []; + + if ( + typeof candidate.id === "string" && + candidate.__typename === "GroupCommerceProductItem" && + typeof candidate.marketplace_listing_title === "string" + ) { + matches.push({ + item: candidate as FacebookMarketplaceItem, + score: scoreMarketplaceItemPath(path), + path, + }); + } + + for (const [key, value] of Object.entries(candidate)) { + matches.push(...collectMarketplaceItemCandidates(value, [...path, key])); + } + + return matches; +} + /** Extract marketplace search data from Facebook page script tags */ @@ -531,139 +605,29 @@ export function extractFacebookMarketplaceData( /** Extract marketplace item details from Facebook item page HTML - Updated for 2026 Facebook Marketplace API structure with multiple extraction paths + Updated for 2026 Facebook Marketplace bootstrap candidates */ export function extractFacebookItemData( htmlString: HTMLString, ): FacebookMarketplaceItem | null { - const { document } = parseHTML(htmlString); - const scripts = document.querySelectorAll("script"); + const candidates = extractFacebookBootstrapCandidates(htmlString); + let bestMatch: FacebookMarketplaceItemMatch | null = null; - for (const script of scripts) { - const scriptText = script.textContent; - if (!scriptText) continue; + for (const candidate of candidates) { + const matches = collectMarketplaceItemCandidates(candidate); - try { - const parsed = JSON.parse(scriptText); - - // Check for the require structure with marketplace product details - if (parsed.require && Array.isArray(parsed.require)) { - // Try multiple extraction paths discovered from reverse engineering - const extractionPaths = [ - // Path 1: Primary path from current API structure - () => - parsed.require[0][3].__bbox.result.data.viewer - .marketplace_product_details_page.target, - // Path 2: Alternative path with nested require - () => - parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data - .viewer.marketplace_product_details_page.target, - // Path 3: Variation without the [0] index - () => - parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data - .viewer.marketplace_product_details_page.target, - // Path 4-5: Additional fallback paths for edge cases - () => - parsed.require[0][3][1]?.__bbox?.result?.data?.viewer - ?.marketplace_product_details_page?.target, - () => - parsed.require[0][3][2]?.__bbox?.result?.data?.viewer - ?.marketplace_product_details_page?.target, - ]; - - let pathIndex = 0; - for (const getPath of extractionPaths) { - try { - const targetData = getPath(); - if ( - targetData && - typeof targetData === "object" && - targetData.id && - targetData.marketplace_listing_title && - targetData.__typename === "GroupCommerceProductItem" - ) { - console.log( - `Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`, - ); - return targetData as FacebookMarketplaceItem; - } - } catch { - // Path not found or invalid, try next path - } - pathIndex++; - } - - // Fallback: Search recursively for marketplace data in the parsed structure - const findMarketplaceData = ( - obj: unknown, - depth = 0, - maxDepth = 10, - ): FacebookMarketplaceItem | null => { - if (depth > maxDepth) return null; // Prevent infinite recursion - if (isRecord(obj)) { - // Check if this object matches the expected marketplace item structure - const candidate = obj as Record; - if ( - candidate.marketplace_listing_title && - candidate.id && - candidate.__typename === "GroupCommerceProductItem" && - candidate.redacted_description - ) { - return candidate as unknown as FacebookMarketplaceItem; - } - // Recursively search nested objects and arrays - for (const key in obj) { - const value = obj[key]; - if (isRecord(value) || Array.isArray(value)) { - const result = findMarketplaceData(value, depth + 1, maxDepth); - if (result) return result; - } - } - } else if (Array.isArray(obj)) { - // Search through arrays - for (const item of obj) { - const result = findMarketplaceData(item, depth + 1, maxDepth); - if (result) return result; - } - } - return null; - }; - - // Search through the entire require structure - const recursiveResult = findMarketplaceData(parsed.require); - if (recursiveResult) { - console.log( - "Successfully extracted Facebook item data using recursive search", - ); - return recursiveResult; - } - - // Additional search in other potential locations - if ( - parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page - ?.target - ) { - const bboxData = - parsed.__bbox.result.data.viewer.marketplace_product_details_page - .target; - if ( - bboxData && - typeof bboxData === "object" && - bboxData.id && - bboxData.marketplace_listing_title && - bboxData.__typename === "GroupCommerceProductItem" - ) { - console.log( - "Successfully extracted Facebook item data from __bbox structure", - ); - return bboxData as FacebookMarketplaceItem; - } - } + for (const match of matches) { + if ( + !bestMatch || + match.score > bestMatch.score || + (match.score === bestMatch.score && match.path.length < bestMatch.path.length) + ) { + bestMatch = match; } - } catch {} + } } - return null; + return bestMatch?.item ?? null; } /** diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index 60684c1..f627f5a 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -369,43 +369,80 @@ describe("Facebook Marketplace Scraper Core Tests", () => { describe("Data Extraction", () => { describe("extractFacebookItemData", () => { - test("should extract item data from standard require structure", () => { - const mockItemData = { - id: "123456", - __typename: "GroupCommerceProductItem", - marketplace_listing_title: "Test Item", - formatted_price: { text: "$100.00" }, - listing_price: { amount: "100.00", currency: "CAD" }, - is_live: true, - }; - const mockData = { - require: [ - [ - null, - null, - null, - { - __bbox: { - result: { - data: { - viewer: { - marketplace_product_details_page: { - target: mockItemData, - }, - }, + test("extracts item details from Comet permalink bootstrap candidates", () => { + const html = ` + + + `; + })} + + + `; const result = extractFacebookItemData(html); expect(result).not.toBeNull(); - expect(result?.id).toBe("123456"); - expect(result?.marketplace_listing_title).toBe("Test Item"); + expect(result?.id).toBe("123"); + expect(result?.marketplace_listing_title).toBe("Vintage Chair"); + }); + + test("prefers the canonical permalink target over earlier decoy items", () => { + const html = ` + + + + + `; + + const result = extractFacebookItemData(html); + expect(result).not.toBeNull(); + expect(result?.id).toBe("real-123"); + expect(result?.marketplace_listing_title).toBe("Canonical Chair"); }); test("should handle missing item data", () => {