refactor: rewrite facebook item parser for comet bootstrap

This commit is contained in:
2026-04-22 02:44:17 -04:00
parent c90ee54cc1
commit 63ca006696
2 changed files with 156 additions and 155 deletions

View File

@@ -496,6 +496,80 @@ function findSearchEdges(
return bestMatch; return bestMatch;
} }
/**
 * A marketplace item found while walking a parsed payload, bundled with the
 * data used to rank it against other items found in the same page.
 */
interface FacebookMarketplaceItemMatch {
/** The matched object, treated as a marketplace item. */
item: FacebookMarketplaceItem;
/** Rank computed by scoreMarketplaceItemPath from `path`; a higher score wins. */
score: number;
/** Object keys followed from the payload root down to `item`; array indices are not recorded. */
path: string[];
}
/**
 * Rank a key path by how strongly it suggests the item found at its end is the
 * listing the page is about.
 *
 * Certain exact key names add a fixed bonus (each counted once, however many
 * times the key appears). Any key containing "recommend" or "related" applies a
 * single flat penalty. Finally the path length is subtracted, so shallower
 * paths outrank deeper ones that are otherwise equal.
 *
 * @param path - Object keys from the payload root to the candidate item.
 * @returns The rank; higher means a better candidate. May be negative.
 */
function scoreMarketplaceItemPath(path: string[]): number {
  // Exact-match key names and the bonus each one contributes.
  const segmentBonuses: ReadonlyArray<readonly [string, number]> = [
    ["payload", 2],
    ["viewer", 2],
    ["marketplace_product_details_page", 6],
    ["target", 8],
    ["listing", 6],
  ];

  const bonusTotal = segmentBonuses.reduce(
    (sum, [segment, bonus]) => (path.includes(segment) ? sum + bonus : sum),
    0,
  );

  // Substring match: keys such as "recommendation_units" also trigger the penalty.
  const suggestionMarkers = ["recommend", "related"];
  const passesThroughSuggestions = path.some((segment) =>
    suggestionMarkers.some((marker) => segment.includes(marker)),
  );
  const penalty = passesThroughSuggestions ? 10 : 0;

  return bonusTotal - penalty - path.length;
}
/**
 * Walk a parsed payload and collect every object that looks like a marketplace
 * item, together with the key path that led to it and that path's score.
 *
 * An object counts as an item when it has a string `id`, a string
 * `marketplace_listing_title`, and `__typename === "GroupCommerceProductItem"`.
 * Matching objects are still descended into, so nested items are found too.
 *
 * The walk is a pre-order, depth-first traversal: object keys are visited in
 * `Object.entries` order and array elements in index order. Arrays are
 * transparent to the path (their elements inherit the array's own path).
 * Matches are returned in visit order, which matters to callers that break
 * score ties by keeping the first match seen.
 *
 * The traversal uses an explicit stack rather than recursion so that a very
 * deeply nested or very large payload cannot overflow the call stack or exceed
 * the argument limit of a spread `push(...)`.
 *
 * @param candidate - Any parsed value; non-object values yield no matches.
 * @param path - Keys already traversed to reach `candidate` (defaults to the root).
 * @returns All matches found at or beneath `candidate`, in visit order.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const matches: FacebookMarketplaceItemMatch[] = [];
  const pending: Array<{ value: unknown; path: string[] }> = [
    { value: candidate, path },
  ];

  for (let frame = pending.pop(); frame !== undefined; frame = pending.pop()) {
    const { value, path: currentPath } = frame;

    if (Array.isArray(value)) {
      // Push in reverse so the first element is popped (visited) first.
      for (let index = value.length - 1; index >= 0; index--) {
        pending.push({ value: value[index], path: currentPath });
      }
      continue;
    }

    if (!isRecord(value)) {
      continue;
    }

    if (
      typeof value.id === "string" &&
      value.__typename === "GroupCommerceProductItem" &&
      typeof value.marketplace_listing_title === "string"
    ) {
      matches.push({
        item: value as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(currentPath),
        path: currentPath,
      });
    }

    // Object.entries returns a fresh array, so reversing it in place is safe;
    // reverse order on the stack yields key order on pop.
    for (const [key, child] of Object.entries(value).reverse()) {
      pending.push({ value: child, path: [...currentPath, key] });
    }
  }

  return matches;
}
/** /**
Extract marketplace search data from Facebook page script tags Extract marketplace search data from Facebook page script tags
*/ */
@@ -531,139 +605,29 @@ export function extractFacebookMarketplaceData(
/** /**
Extract marketplace item details from Facebook item page HTML Extract marketplace item details from Facebook item page HTML
Updated for 2026 Facebook Marketplace API structure with multiple extraction paths Updated for 2026 Facebook Marketplace bootstrap candidates
*/ */
export function extractFacebookItemData( export function extractFacebookItemData(
htmlString: HTMLString, htmlString: HTMLString,
): FacebookMarketplaceItem | null { ): FacebookMarketplaceItem | null {
const { document } = parseHTML(htmlString); const candidates = extractFacebookBootstrapCandidates(htmlString);
const scripts = document.querySelectorAll("script"); let bestMatch: FacebookMarketplaceItemMatch | null = null;
for (const script of scripts) { for (const candidate of candidates) {
const scriptText = script.textContent; const matches = collectMarketplaceItemCandidates(candidate);
if (!scriptText) continue;
try { for (const match of matches) {
const parsed = JSON.parse(scriptText); if (
!bestMatch ||
// Check for the require structure with marketplace product details match.score > bestMatch.score ||
if (parsed.require && Array.isArray(parsed.require)) { (match.score === bestMatch.score && match.path.length < bestMatch.path.length)
// Try multiple extraction paths discovered from reverse engineering ) {
const extractionPaths = [ bestMatch = match;
// Path 1: Primary path from current API structure
() =>
parsed.require[0][3].__bbox.result.data.viewer
.marketplace_product_details_page.target,
// Path 2: Alternative path with nested require
() =>
parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Path 3: Variation without the [0] index
() =>
parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Path 4-5: Additional fallback paths for edge cases
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
() =>
parsed.require[0][3][2]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
];
let pathIndex = 0;
for (const getPath of extractionPaths) {
try {
const targetData = getPath();
if (
targetData &&
typeof targetData === "object" &&
targetData.id &&
targetData.marketplace_listing_title &&
targetData.__typename === "GroupCommerceProductItem"
) {
console.log(
`Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`,
);
return targetData as FacebookMarketplaceItem;
}
} catch {
// Path not found or invalid, try next path
}
pathIndex++;
}
// Fallback: Search recursively for marketplace data in the parsed structure
const findMarketplaceData = (
obj: unknown,
depth = 0,
maxDepth = 10,
): FacebookMarketplaceItem | null => {
if (depth > maxDepth) return null; // Prevent infinite recursion
if (isRecord(obj)) {
// Check if this object matches the expected marketplace item structure
const candidate = obj as Record<string, unknown>;
if (
candidate.marketplace_listing_title &&
candidate.id &&
candidate.__typename === "GroupCommerceProductItem" &&
candidate.redacted_description
) {
return candidate as unknown as FacebookMarketplaceItem;
}
// Recursively search nested objects and arrays
for (const key in obj) {
const value = obj[key];
if (isRecord(value) || Array.isArray(value)) {
const result = findMarketplaceData(value, depth + 1, maxDepth);
if (result) return result;
}
}
} else if (Array.isArray(obj)) {
// Search through arrays
for (const item of obj) {
const result = findMarketplaceData(item, depth + 1, maxDepth);
if (result) return result;
}
}
return null;
};
// Search through the entire require structure
const recursiveResult = findMarketplaceData(parsed.require);
if (recursiveResult) {
console.log(
"Successfully extracted Facebook item data using recursive search",
);
return recursiveResult;
}
// Additional search in other potential locations
if (
parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page
?.target
) {
const bboxData =
parsed.__bbox.result.data.viewer.marketplace_product_details_page
.target;
if (
bboxData &&
typeof bboxData === "object" &&
bboxData.id &&
bboxData.marketplace_listing_title &&
bboxData.__typename === "GroupCommerceProductItem"
) {
console.log(
"Successfully extracted Facebook item data from __bbox structure",
);
return bboxData as FacebookMarketplaceItem;
}
}
} }
} catch {} }
} }
return null; return bestMatch?.item ?? null;
} }
/** /**

View File

@@ -369,43 +369,80 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
describe("Data Extraction", () => { describe("Data Extraction", () => {
describe("extractFacebookItemData", () => { describe("extractFacebookItemData", () => {
test("should extract item data from standard require structure", () => { test("extracts item details from Comet permalink bootstrap candidates", () => {
const mockItemData = { const html = `
id: "123456", <html><body>
__typename: "GroupCommerceProductItem", <script>"XCometMarketplacePermalinkController"</script>
marketplace_listing_title: "Test Item", <script>
formatted_price: { text: "$100.00" }, ${JSON.stringify({
listing_price: { amount: "100.00", currency: "CAD" }, payload: {
is_live: true, listing: {
}; id: "123",
const mockData = { __typename: "GroupCommerceProductItem",
require: [ marketplace_listing_title: "Vintage Chair",
[ formatted_price: { text: "CA$80" },
null, listing_price: {
null, amount: "80.00",
null, currency: "CAD",
{ amount_with_offset: "80.00",
__bbox: {
result: {
data: {
viewer: {
marketplace_product_details_page: {
target: mockItemData,
},
},
}, },
redacted_description: { text: "Solid wood chair" },
location_text: { text: "Toronto, ON" },
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
condition: "USED",
is_live: true,
}, },
}, },
}, })}
], </script>
], </body></html>
}; `;
const html = `<html><body><script>${JSON.stringify(mockData)}</script></body></html>`;
const result = extractFacebookItemData(html); const result = extractFacebookItemData(html);
expect(result).not.toBeNull(); expect(result).not.toBeNull();
expect(result?.id).toBe("123456"); expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Test Item"); expect(result?.marketplace_listing_title).toBe("Vintage Chair");
});
test("prefers the canonical permalink target over earlier decoy items", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
recommendation_units: [
{
listing: {
id: "decoy-1",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Recommended Chair",
is_live: true,
},
},
],
target: {
id: "real-123",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Canonical Chair",
formatted_price: { text: "CA$120" },
listing_price: {
amount: "120.00",
currency: "CAD",
amount_with_offset: "120.00",
},
is_live: true,
},
},
})}
</script>
</body></html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
expect(result?.id).toBe("real-123");
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
}); });
test("should handle missing item data", () => { test("should handle missing item data", () => {