refactor: add facebook bootstrap candidate extraction
This commit is contained in:
@@ -408,6 +408,30 @@ export function classifyFacebookResponse(
|
||||
return { kind: "unknown" as const, authGated: false, unavailable: false };
|
||||
}
|
||||
|
||||
/**
 * Collects the JSON payloads embedded in a page's `<script>` tags.
 *
 * Each script's text content is trimmed and run through `JSON.parse`. A parsed
 * value is kept only when `isRecord` accepts it; empty scripts and bodies that
 * fail to parse are skipped without raising.
 *
 * @param htmlString - Raw HTML of the page to scan.
 * @returns The accepted records, in the order their script tags were visited.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  return scriptElements.reduce<Record<string, unknown>[]>((found, element) => {
    const body = element.textContent?.trim();

    if (body) {
      try {
        const value = JSON.parse(body);
        if (isRecord(value)) {
          found.push(value as Record<string, unknown>);
        }
      } catch {
        // Script body is not JSON (e.g. inline JavaScript) — ignore it.
      }
    }

    return found;
  }, []);
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
|
||||
@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||
import {
|
||||
classifyFacebookResponse,
|
||||
ensureFacebookCookies,
|
||||
extractFacebookBootstrapCandidates,
|
||||
extractFacebookItemData,
|
||||
extractFacebookMarketplaceData,
|
||||
fetchFacebookItem,
|
||||
@@ -693,6 +694,40 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// Exercises extractFacebookBootstrapCandidates against small inline HTML fixtures.
describe("extractFacebookBootstrapCandidates", () => {
  test("extracts Comet bootstrap candidates from script tags", () => {
    // Three script tags: two JSON objects and one plain-text body.
    const page = `
      <html><body>
        <script>{"routing_namespace":"fb_comet"}</script>
        <script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
        <script>not json</script>
      </body></html>
    `;
    const expectedBootstrap = {
      data: {
        marketplace_search_bootstrap: {
          edges: [{ node: { listing: { id: "1" } } }],
        },
      },
    };

    const extracted = extractFacebookBootstrapCandidates(page);

    // Only the two JSON bodies are expected to come back.
    expect(extracted).toHaveLength(2);
    expect(extracted[1]).toEqual(expectedBootstrap);
  });

  test("keeps candidate order stable for later scoring", () => {
    const page = `
      <html><body>
        <script>{"marker":"first"}</script>
        <script>{"marker":"second"}</script>
      </body></html>
    `;

    const extracted = extractFacebookBootstrapCandidates(page);
    const markers = extracted.map((candidate) => candidate.marker);

    // Candidates must appear in the same order as their script tags.
    expect(markers).toEqual(["first", "second"]);
  });
});
|
||||
});
|
||||
|
||||
describe("Data Parsing", () => {
|
||||
|
||||
Reference in New Issue
Block a user