refactor: add facebook bootstrap candidate extraction
This commit is contained in:
@@ -408,6 +408,30 @@ export function classifyFacebookResponse(
|
|||||||
return { kind: "unknown" as const, authGated: false, unavailable: false };
|
return { kind: "unknown" as const, authGated: false, unavailable: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collects the JSON payloads embedded in the `<script>` tags of a page.
 *
 * Each script's text content is trimmed and handed to `JSON.parse`. A script
 * contributes a candidate only when parsing succeeds and the parsed value is
 * accepted by `isRecord`; empty scripts and scripts whose body is not valid
 * JSON are skipped without raising.
 *
 * @param htmlString - Raw HTML of the page to scan.
 * @returns The accepted payloads, in the order their scripts are returned by
 *   `querySelectorAll("script")`. Empty when no script qualifies.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptNodes = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  return scriptNodes.flatMap((node): Record<string, unknown>[] => {
    const body = node.textContent?.trim();
    if (!body) return [];

    try {
      const decoded = JSON.parse(body);
      return isRecord(decoded) ? [decoded as Record<string, unknown>] : [];
    } catch {
      // Script body is not JSON: it contributes no candidate.
      return [];
    }
  });
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace search data from Facebook page script tags
|
Extract marketplace search data from Facebook page script tags
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
|||||||
import {
|
import {
|
||||||
classifyFacebookResponse,
|
classifyFacebookResponse,
|
||||||
ensureFacebookCookies,
|
ensureFacebookCookies,
|
||||||
|
extractFacebookBootstrapCandidates,
|
||||||
extractFacebookItemData,
|
extractFacebookItemData,
|
||||||
extractFacebookMarketplaceData,
|
extractFacebookMarketplaceData,
|
||||||
fetchFacebookItem,
|
fetchFacebookItem,
|
||||||
@@ -693,6 +694,40 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Suite for extractFacebookBootstrapCandidates: feeds small HTML fixtures with
// inline <script> bodies and checks which parsed objects come back, and in
// what order.
describe("extractFacebookBootstrapCandidates", () => {
|
||||||
|
// Fixture holds two scripts whose bodies are JSON objects and one whose body
// is plain text ("not json"); the assertions below expect exactly two results.
test("extracts Comet bootstrap candidates from script tags", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
|
||||||
|
<script>not json</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
// The non-JSON script must not produce a candidate.
expect(candidates).toHaveLength(2);
|
||||||
|
// The second candidate must be the fully parsed nested object, not a string.
expect(candidates[1]).toEqual({
|
||||||
|
data: {
|
||||||
|
marketplace_search_bootstrap: {
|
||||||
|
edges: [{ node: { listing: { id: "1" } } }],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Results must follow the order of the script tags in the fixture; per the
// test name, later scoring relies on that order being stable.
test("keeps candidate order stable for later scoring", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"marker":"first"}</script>
|
||||||
|
<script>{"marker":"second"}</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("Data Parsing", () => {
|
describe("Data Parsing", () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user