From cfd7619737374638ef8d55d7c12de590f5f287ea Mon Sep 17 00:00:00 2001
From: Dmytro Stanchiev
Date: Tue, 21 Apr 2026 23:46:00 -0400
Subject: [PATCH] refactor: add facebook bootstrap candidate extraction

---
Review notes (not part of the commit message):
- Restored the `Record<string, unknown>` type arguments; a bare `Record`
  does not compile.
- Added TSDoc and typed the JSON.parse result as `unknown` before narrowing.
- The test fixtures arrived without any markup; the <script> elements were
  rebuilt to satisfy the existing assertions -- confirm against the original
  commit.
- Context-line indentation is assumed to be two spaces, and the blob hashes
  on the `index` lines predate these edits -- regenerate the patch with
  `git format-patch` before applying.

 packages/core/src/scrapers/facebook.ts   | 33 ++++++++++++++++++++++
 packages/core/test/facebook-core.test.ts | 35 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts
index 6eca4cb..e472faf 100644
--- a/packages/core/src/scrapers/facebook.ts
+++ b/packages/core/src/scrapers/facebook.ts
@@ -408,6 +408,39 @@ export function classifyFacebookResponse(
   return { kind: "unknown" as const, authGated: false, unavailable: false };
 }
 
+/**
+ * Collect every `<script>` body in the given HTML that parses as a JSON object.
+ *
+ * Empty scripts, scripts whose text is not valid JSON, and parsed values that
+ * `isRecord` rejects are skipped. Candidates keep the order in which
+ * `querySelectorAll` returns the script elements.
+ *
+ * @param htmlString - HTML markup to scan for script tags.
+ * @returns The parsed JSON objects, one per qualifying script tag.
+ */
+export function extractFacebookBootstrapCandidates(
+  htmlString: HTMLString,
+): Record<string, unknown>[] {
+  const { document } = parseHTML(htmlString);
+  const scripts = document.querySelectorAll("script");
+  const candidates: Record<string, unknown>[] = [];
+
+  for (const script of Array.from(scripts) as HTMLScriptElement[]) {
+    const scriptText = script.textContent?.trim();
+    if (!scriptText) continue;
+
+    try {
+      // Keep the parse result as `unknown` so nothing is used before the guard.
+      const parsed: unknown = JSON.parse(scriptText);
+      if (isRecord(parsed)) {
+        candidates.push(parsed as Record<string, unknown>);
+      }
+    } catch {
+      // skip non-JSON script bodies
+    }
+  }
+
+  return candidates;
+}
+
 /**
  * Extract marketplace search data from Facebook page script tags
  */
diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts
index 82bdd20..f0a7bda 100644
--- a/packages/core/test/facebook-core.test.ts
+++ b/packages/core/test/facebook-core.test.ts
@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
 import {
   classifyFacebookResponse,
   ensureFacebookCookies,
+  extractFacebookBootstrapCandidates,
   extractFacebookItemData,
   extractFacebookMarketplaceData,
   fetchFacebookItem,
@@ -693,6 +694,40 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
         });
       });
     });
+
+    describe("extractFacebookBootstrapCandidates", () => {
+      test("extracts Comet bootstrap candidates from script tags", () => {
+        const html = `
+          <html><body>
+            <script type="application/json">{"require":[["CometBootstrap"]]}</script>
+            <script>window.__notJson = true;</script>
+            <script type="application/json">{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
+          </body></html>
+        `;
+
+        const candidates = extractFacebookBootstrapCandidates(html);
+        expect(candidates).toHaveLength(2);
+        expect(candidates[1]).toEqual({
+          data: {
+            marketplace_search_bootstrap: {
+              edges: [{ node: { listing: { id: "1" } } }],
+            },
+          },
+        });
+      });
+
+      test("keeps candidate order stable for later scoring", () => {
+        const html = `
+          <html><body>
+            <script type="application/json">{"marker":"first"}</script>
+            <script type="application/json">{"marker":"second"}</script>
+          </body></html>
+        `;
+
+        const candidates = extractFacebookBootstrapCandidates(html);
+        expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
+      });
+    });
   });
 
   describe("Data Parsing", () => {