refactor: rewrite facebook search parser for comet bootstrap

This commit is contained in:
2026-04-22 02:32:55 -04:00
parent cfd7619737
commit c90ee54cc1
3 changed files with 256 additions and 137 deletions

View File

@@ -75,13 +75,6 @@ interface FacebookEdge {
[k: string]: unknown;
}
/**
 * Shape used for a `marketplace_search` value: an optional `feed_units`
 * object holding an optional list of edges. Any other keys are permitted and
 * left untyped.
 */
interface FacebookMarketplaceSearch {
feed_units?: {
edges?: FacebookEdge[];
};
[k: string]: unknown;
}
interface FacebookMarketplaceItem {
// Basic identification
id: string;
@@ -432,89 +425,108 @@ export function extractFacebookBootstrapCandidates(
return candidates;
}
/**
 * Type guard for a non-empty array in which every element is a record whose
 * `node` is a record that itself carries a record-valued `listing`.
 *
 * One element failing the check rejects the whole array; an empty array is
 * rejected too.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }
  const hasNonListingEntry = value.some(
    (entry) =>
      !isRecord(entry) || !isRecord(entry.node) || !isRecord(entry.node.listing),
  );
  return !hasNonListingEntry;
}
/**
 * Adds an edge-count bonus to a base score: one point per edge, capped at
 * three points.
 *
 * @param edges Edge set being scored; only its length is read.
 * @param score Base score to which the bonus is added.
 * @returns `score` plus the capped edge count.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const maxCountBonus = 3;
  const countBonus = edges.length < maxCountBonus ? edges.length : maxCountBonus;
  return score + countBonus;
}
/**
 * Recursively walks a parsed value and returns the highest-scoring listing
 * edge array found anywhere inside it, or `null` when there is none.
 *
 * Scoring (higher wins; on a tie the match encountered first is kept):
 * - `feed_units.edges` on a record: inherited score + 2
 * - `edges` on an entry of a record's `resultGroups` array: inherited score + 4
 * - each `payload` key on the way down adds 1 to the inherited score
 * - `scoreSearchEdges` then adds its edge-count bonus
 *
 * @param candidate Value to search; anything that is neither an array nor a
 *   record yields `null`.
 * @param score Score inherited from the path walked so far.
 * @returns The best `{ edges, score }` pair, or `null`.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  type EdgeMatch = { edges: FacebookEdge[]; score: number };

  // Strictly-greater comparison: an equal-scoring later match never displaces
  // the one already held.
  const better = (
    held: EdgeMatch | null,
    contender: EdgeMatch | null,
  ): EdgeMatch | null =>
    contender !== null && (held === null || contender.score > held.score)
      ? contender
      : held;

  // Arrays contribute no score of their own; each element is searched with
  // the score inherited so far.
  if (Array.isArray(candidate)) {
    return candidate.reduce<EdgeMatch | null>(
      (held, element) => better(held, findSearchEdges(element, score)),
      null,
    );
  }

  if (!isRecord(candidate)) {
    return null;
  }

  let best: EdgeMatch | null = null;

  // Direct hit: feed_units.edges on this record.
  const feed = candidate.feed_units;
  const feedEdges = isRecord(feed) ? feed.edges : undefined;
  if (isFacebookSearchEdgeArray(feedEdges)) {
    best = { edges: feedEdges, score: scoreSearchEdges(feedEdges, score + 2) };
  }

  // Direct hit: edges on any entry of resultGroups, weighted above feed_units.
  const groups = candidate.resultGroups;
  if (Array.isArray(groups)) {
    for (const group of groups) {
      if (!isRecord(group)) {
        continue;
      }
      const groupEdges = group.edges;
      if (!isFacebookSearchEdgeArray(groupEdges)) {
        continue;
      }
      best = better(best, {
        edges: groupEdges,
        score: scoreSearchEdges(groupEdges, score + 4),
      });
    }
  }

  // Descend into every property; passing through a "payload" key earns +1.
  for (const key of Object.keys(candidate)) {
    const inherited = key === "payload" ? score + 1 : score;
    best = better(best, findSearchEdges(candidate[key], inherited));
  }

  return best;
}
/**
 * Extract marketplace search listings from a Facebook search page.
 *
 * NOTE(review): this span is a diff rendering without +/- markers, so lines of
 * what looks like the removed implementation (JSON-parsing every script tag
 * and probing fixed `require[...]` paths for `marketplace_search`) appear
 * interleaved with what looks like its replacement (scoring `findSearchEdges`
 * results over the candidates from `extractFacebookBootstrapCandidates`).
 * Confirm against the committed file which lines are live.
 *
 * @param htmlString Raw HTML of the search page.
 * @returns One `{ node }` wrapper per edge of the selected edge set, or `null`
 *   (after a console warning) when no listing edges were found.
 */
export function extractFacebookMarketplaceData(
htmlString: HTMLString,
): FacebookAdNode[] | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
const candidates = extractFacebookBootstrapCandidates(htmlString);
let bestEdges: FacebookEdge[] | null = null;
let bestScore = -1;
let marketplaceData: FacebookMarketplaceSearch | null = null;
// Find the script containing the require data with marketplace_search
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
const scriptText = script.textContent;
if (!scriptText) continue;
try {
const parsed = JSON.parse(scriptText);
// First check if this is the direct data structure (like in examples)
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple navigation paths to find marketplace_search
const paths = [
// Original path from example
() =>
parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data
.marketplace_search,
// Alternative path structure
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
// Another variation
() => parsed.require[0][3][0].__bbox.result.data.marketplace_search,
// Direct access for some responses
() => {
for (const item of parsed.require) {
if (item && item.length >= 4 && item[3]) {
const bbox = item[3]?.__bbox?.result?.data?.marketplace_search;
if (bbox) return bbox;
}
}
return null;
},
];
for (const getData of paths) {
try {
const result = getData();
if (
result &&
isRecord(result) &&
(result as Record<string, unknown>).feed_units?.edges?.length > 0
) {
marketplaceData = result as FacebookMarketplaceSearch;
break;
}
} catch {}
for (const candidate of candidates) {
const result = findSearchEdges(candidate);
if (!result?.edges.length) {
continue;
}
if (marketplaceData) break;
// Strictly greater: on equal scores the earlier candidate's edges are kept.
if (result.score > bestScore) {
bestScore = result.score;
bestEdges = result.edges;
}
}
// Also check for direct marketplace_search in the parsed data
if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) {
const searchData =
parsed.marketplace_search as FacebookMarketplaceSearch;
const feedLength = searchData.feed_units?.edges?.length ?? 0;
if (feedLength > 0) {
marketplaceData = searchData;
break;
}
}
} catch {}
}
if (!marketplaceData?.feed_units?.edges?.length) {
if (!bestEdges?.length) {
console.warn("No marketplace data found in HTML response");
return null;
}
console.log(
`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`,
`Successfully parsed ${bestEdges.length} Facebook marketplace listings`,
);
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
// Only each edge's `node` is kept in the returned wrappers.
return bestEdges.map((edge) => ({ node: edge.node }));
}
/**

View File

@@ -727,6 +727,151 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
});
test("extracts search results from Comet bootstrap candidates", () => {
// Happy path: a single listing edge under payload.resultGroups[0].edges.
// The first script tag holds only the quoted controller name; presumably
// extractFacebookBootstrapCandidates keys off it - confirm against that helper.
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "Bike",
listing_price: {
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Toronto" },
},
},
is_live: true,
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
// Exactly one ad comes back and it carries the fixture's listing.
expect(ads).toHaveLength(1);
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
});
test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => {
// Two valid edge arrays compete inside one object. Assuming the whole parsed
// object reaches findSearchEdges as a single candidate with score 0:
//   incidental.feed_units.edges   -> 0 + 2 (feed_units) + 1 (one edge) = 3
//   payload.resultGroups[0].edges -> 1 (payload key) + 4 (resultGroups) + 1 (one edge) = 6
// so the resultGroups listing must win even though it appears later.
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
incidental: {
feed_units: {
edges: [
{
node: {
listing: {
id: "wrong-1",
marketplace_listing_title: "Wrong Listing",
listing_price: {
amount: "1.00",
formatted_amount: "CA$1",
currency: "CAD",
},
is_live: true,
},
},
},
],
},
},
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "right-1",
marketplace_listing_title: "Right Listing",
listing_price: {
amount: "250.00",
formatted_amount: "CA$250",
currency: "CAD",
},
is_live: true,
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
// Only the winning edge set is returned; the two sets are never merged.
expect(ads).toHaveLength(1);
expect(ads?.[0].node.listing.id).toBe("right-1");
});
test("rejects mixed edge arrays that contain non-listing entries", () => {
// isFacebookSearchEdgeArray requires every edge to carry node.listing, so the
// second edge (node.story) disqualifies the whole array. The fixture has no
// other edge array, so nothing is found.
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "Bike",
listing_price: {
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
},
is_live: true,
},
},
},
{
node: {
story: {
id: "not-a-listing",
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
// The valid first edge is not salvaged: the result is null, not a one-item list.
expect(ads).toBeNull();
});
});
});

View File

@@ -27,27 +27,19 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
describe("Main Search Function", () => {
test("should successfully fetch search results", async () => {
const mockSearchData = {
require: [
[
null,
null,
null,
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
payload: {
resultGroups: [
{
__bbox: {
result: {
data: {
marketplace_search: {
feed_units: {
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "iPhone 13 Pro",
marketplace_listing_title: "iPhone 13",
listing_price: {
amount: "800.00",
formatted_amount: "$800.00",
amount: "500.00",
formatted_amount: "CA$500",
currency: "CAD",
},
location: {
@@ -55,49 +47,20 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
city_page: { display_name: "Toronto" },
},
},
creation_time: 1640995200,
is_live: true,
},
},
},
{
node: {
listing: {
id: "2",
marketplace_listing_title: "Samsung Galaxy",
listing_price: {
amount: "600.00",
formatted_amount: "$600.00",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Mississauga" },
},
},
creation_time: 1640995300,
is_live: true,
},
},
},
],
},
},
},
},
},
},
],
],
};
},
})}</script></body></html>`;
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
text: () => Promise.resolve(mockSearchHtml),
headers: {
get: () => null,
},
@@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
);
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
expect(results).toHaveLength(2);
expect(results[0].title).toBe("iPhone 13 Pro");
expect(results[1].title).toBe("Samsung Galaxy");
expect(results).toHaveLength(1);
expect(results[0].title).toBe("iPhone 13");
});
test("should filter out items without price", async () => {