refactor: add facebook bootstrap candidate extraction

This commit is contained in:
2026-04-21 23:46:00 -04:00
parent b072599bc6
commit cfd7619737
2 changed files with 59 additions and 0 deletions

View File

@@ -408,6 +408,30 @@ export function classifyFacebookResponse(
return { kind: "unknown" as const, authGated: false, unavailable: false }; return { kind: "unknown" as const, authGated: false, unavailable: false };
} }
/**
 * Gather every `<script>` body in the given HTML that parses as JSON and
 * passes the `isRecord` check.
 *
 * Script elements are visited in the order `querySelectorAll` returns them,
 * and results are emitted in that same order. Empty script bodies and bodies
 * that fail `JSON.parse` are silently skipped.
 *
 * @param htmlString - Raw HTML of the page to scan.
 * @returns The parsed script payloads accepted by `isRecord`, in visit order.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  return scriptElements.flatMap((element): Record<string, unknown>[] => {
    const body = element.textContent?.trim();
    if (!body) {
      return [];
    }

    try {
      const decoded = JSON.parse(body);
      // The record check stays inside the try so anything it throws is
      // swallowed exactly like a parse failure.
      return isRecord(decoded) ? [decoded as Record<string, unknown>] : [];
    } catch {
      // skip non-JSON script bodies
      return [];
    }
  });
}
/**
Extract marketplace search data from Facebook page script tags
*/

View File

@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import { import {
classifyFacebookResponse, classifyFacebookResponse,
ensureFacebookCookies, ensureFacebookCookies,
extractFacebookBootstrapCandidates,
extractFacebookItemData, extractFacebookItemData,
extractFacebookMarketplaceData, extractFacebookMarketplaceData,
fetchFacebookItem, fetchFacebookItem,
@@ -693,6 +694,40 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
}); });
}); });
}); });
// Tests for extractFacebookBootstrapCandidates: which script bodies are
// returned as candidates, and in what order.
describe("extractFacebookBootstrapCandidates", () => {
// Fixture has three script tags: two with JSON object bodies and one with
// plain text. Only the two JSON objects are expected back.
test("extracts Comet bootstrap candidates from script tags", () => {
const html = `
<html><body>
<script>{"routing_namespace":"fb_comet"}</script>
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
<script>not json</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
// The "not json" script must not produce a candidate.
expect(candidates).toHaveLength(2);
// The second candidate must be the fully parsed nested object, not raw text.
expect(candidates[1]).toEqual({
data: {
marketplace_search_bootstrap: {
edges: [{ node: { listing: { id: "1" } } }],
},
},
});
});
// Candidates must come back in the same order as their script tags appear
// in the HTML.
test("keeps candidate order stable for later scoring", () => {
const html = `
<html><body>
<script>{"marker":"first"}</script>
<script>{"marker":"second"}</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
});
});
}); });
describe("Data Parsing", () => { describe("Data Parsing", () => {