refactor: add facebook bootstrap candidate extraction

This commit is contained in:
2026-04-21 23:46:00 -04:00
parent b072599bc6
commit cfd7619737
2 changed files with 59 additions and 0 deletions

View File

@@ -408,6 +408,30 @@ export function classifyFacebookResponse(
return { kind: "unknown" as const, authGated: false, unavailable: false };
}
/**
 * Collect the JSON payloads embedded in a page's `<script>` tags.
 *
 * The HTML is parsed with `parseHTML`, every `<script>` element is visited in
 * the order `querySelectorAll` yields them, and each element's trimmed text is
 * run through `JSON.parse`. A parsed value is kept only when `isRecord`
 * accepts it; empty scripts and bodies that fail to parse (or fail the
 * `isRecord` check) contribute nothing.
 *
 * @param htmlString - Raw HTML of the page to scan.
 * @returns The accepted parsed values, in script order. Empty when no script
 *   body qualifies.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  // Each script maps to zero or one candidate; flatMap drops the empty results
  // while preserving the original script order.
  return scriptElements.flatMap((element): Record<string, unknown>[] => {
    const body = element.textContent?.trim();
    if (!body) return [];

    try {
      const decoded = JSON.parse(body);
      return isRecord(decoded) ? [decoded as Record<string, unknown>] : [];
    } catch {
      // skip non-JSON script bodies
      return [];
    }
  });
}
/**
Extract marketplace search data from Facebook page script tags
*/

View File

@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import {
classifyFacebookResponse,
ensureFacebookCookies,
extractFacebookBootstrapCandidates,
extractFacebookItemData,
extractFacebookMarketplaceData,
fetchFacebookItem,
@@ -693,6 +694,40 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
});
});
});
describe("extractFacebookBootstrapCandidates", () => {
  // Two JSON-object scripts plus one plain-text script: only the two JSON
  // objects should come back, and the second must match its parsed form.
  test("extracts Comet bootstrap candidates from script tags", () => {
    const pageHtml = `
<html><body>
<script>{"routing_namespace":"fb_comet"}</script>
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
<script>not json</script>
</body></html>
`;
    const expectedSearchBootstrap = {
      data: {
        marketplace_search_bootstrap: {
          edges: [{ node: { listing: { id: "1" } } }],
        },
      },
    };

    const extracted = extractFacebookBootstrapCandidates(pageHtml);

    expect(extracted).toHaveLength(2);
    expect(extracted[1]).toEqual(expectedSearchBootstrap);
  });

  // Results must come back in the same order as the script tags appear.
  test("keeps candidate order stable for later scoring", () => {
    const pageHtml = `
<html><body>
<script>{"marker":"first"}</script>
<script>{"marker":"second"}</script>
</body></html>
`;

    const extracted = extractFacebookBootstrapCandidates(pageHtml);
    const markersInOrder = extracted.map((candidate) => candidate.marker);

    expect(markersInOrder).toEqual(["first", "second"]);
  });
});
});
describe("Data Parsing", () => {