refactor: rewrite facebook item parser for comet bootstrap

This commit is contained in:
2026-04-22 02:44:17 -04:00
parent c90ee54cc1
commit 63ca006696
2 changed files with 156 additions and 155 deletions

View File

@@ -496,6 +496,80 @@ function findSearchEdges(
return bestMatch;
}
/**
 * One marketplace item discovered while walking a parsed payload, bundled
 * with the data used to rank it against other discoveries.
 */
interface FacebookMarketplaceItemMatch {
/** The matched object, viewed as a marketplace item. */
item: FacebookMarketplaceItem;
/** Ranking value derived from `path`; a higher score is preferred. */
score: number;
/** Object keys followed from the walk's starting value down to `item` (array indices are not recorded). */
path: string[];
}
/**
 * Rank a discovered item by the chain of object keys that led to it.
 *
 * Each known key earns its bonus once if it occurs anywhere in the path as an
 * exact segment. A flat penalty is applied once if any segment contains
 * "recommend" or "related" as a substring. Finally the path length is
 * subtracted, so shallower locations outrank deeper ones.
 *
 * @param path Object keys from the walk's root down to the item.
 * @returns The ranking value; higher means a more likely primary listing.
 */
function scoreMarketplaceItemPath(path: string[]): number {
  // Exact-segment bonuses; order is irrelevant because each is applied at most once.
  const segmentBonuses: ReadonlyArray<readonly [string, number]> = [
    ["payload", 2],
    ["viewer", 2],
    ["marketplace_product_details_page", 6],
    ["target", 8],
    ["listing", 6],
  ];
  // Substrings that mark a branch as a suggestion rail rather than the listing itself.
  const suggestionMarkers: readonly string[] = ["recommend", "related"];

  const bonus = segmentBonuses.reduce(
    (total, [segment, points]) =>
      path.includes(segment) ? total + points : total,
    0,
  );

  const underSuggestionBranch = path.some((segment) =>
    suggestionMarkers.some((marker) => segment.includes(marker)),
  );
  const penalty = underSuggestionBranch ? 10 : 0;

  return bonus - penalty - path.length;
}
/**
 * Walk an arbitrary value depth-first and gather every object that looks like
 * a marketplace item.
 *
 * An object qualifies when `id` and `marketplace_listing_title` are strings
 * and `__typename` is exactly "GroupCommerceProductItem". A qualifying object
 * is recorded before its own children are explored, so results come back in
 * pre-order. Nested qualifying objects are recorded as well.
 *
 * @param candidate Value to search; anything that is neither an array nor a
 *   record yields no matches.
 * @param path Object keys already traversed to reach `candidate`.
 * @returns Every match found, each with its key path and path-based score.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const found: FacebookMarketplaceItemMatch[] = [];

  const visit = (node: unknown, trail: string[]): void => {
    if (Array.isArray(node)) {
      // Elements inherit the array's own trail: indices never become path segments.
      for (const element of node) {
        visit(element, trail);
      }
      return;
    }

    if (!isRecord(node)) {
      return;
    }

    const looksLikeItem =
      typeof node.id === "string" &&
      node.__typename === "GroupCommerceProductItem" &&
      typeof node.marketplace_listing_title === "string";

    if (looksLikeItem) {
      found.push({
        item: node as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(trail),
        path: trail,
      });
    }

    // Keep descending even after a hit so items nested inside it are also reported.
    Object.entries(node).forEach(([key, value]) => {
      visit(value, [...trail, key]);
    });
  };

  visit(candidate, path);
  return found;
}
/**
Extract marketplace search data from Facebook page script tags
*/
@@ -531,139 +605,29 @@ export function extractFacebookMarketplaceData(
/**
 * Extract marketplace item details from Facebook item page HTML.
 *
 * Updated for 2026 Facebook Marketplace bootstrap candidates: instead of
 * probing fixed positions inside a `require` array, every value returned by
 * `extractFacebookBootstrapCandidates` is searched for marketplace item
 * objects and the best-ranked one is returned.
 *
 * Ranking, in order of precedence:
 *   1. higher path score,
 *   2. on equal score, the shorter key path,
 *   3. on a full tie, the match encountered first.
 *
 * @param htmlString Raw HTML of the item page.
 * @returns The best-ranked marketplace item, or `null` when no candidate
 *   contains one.
 */
export function extractFacebookItemData(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  // NOTE(review): presumably yields the parsed bootstrap payloads embedded in
  // the page — confirm against extractFacebookBootstrapCandidates.
  const candidates = extractFacebookBootstrapCandidates(htmlString);
  let bestMatch: FacebookMarketplaceItemMatch | null = null;

  for (const candidate of candidates) {
    const matches = collectMarketplaceItemCandidates(candidate);

    for (const match of matches) {
      // Strict comparisons keep the earliest match when two are fully tied.
      if (
        !bestMatch ||
        match.score > bestMatch.score ||
        (match.score === bestMatch.score &&
          match.path.length < bestMatch.path.length)
      ) {
        bestMatch = match;
      }
    }
  }

  return bestMatch?.item ?? null;
}
/**

View File

@@ -369,43 +369,80 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
describe("Data Extraction", () => {
describe("extractFacebookItemData", () => {
// Happy path: a single item nested under `payload.listing` inside a script
// tag is found and returned with its id and title intact.
test("extracts item details from Comet permalink bootstrap candidates", () => {
  // The first script only carries the controller name; the second carries the
  // JSON payload holding the listing.
  const html = `
    <html><body>
    <script>"XCometMarketplacePermalinkController"</script>
    <script>
    ${JSON.stringify({
      payload: {
        listing: {
          id: "123",
          __typename: "GroupCommerceProductItem",
          marketplace_listing_title: "Vintage Chair",
          formatted_price: { text: "CA$80" },
          listing_price: {
            amount: "80.00",
            currency: "CAD",
            amount_with_offset: "80.00",
          },
          redacted_description: { text: "Solid wood chair" },
          location_text: { text: "Toronto, ON" },
          marketplace_listing_seller: { id: "seller-1", name: "Alex" },
          condition: "USED",
          is_live: true,
        },
      },
    })}
    </script>
    </body></html>
  `;
  const result = extractFacebookItemData(html);
  expect(result).not.toBeNull();
  expect(result?.id).toBe("123");
  expect(result?.marketplace_listing_title).toBe("Vintage Chair");
});
// Ranking regression test: when one payload contains several objects that
// qualify as marketplace items, the one under `target` must be returned even
// though a recommended listing appears earlier in key order.
test("prefers the canonical permalink target over earlier decoy items", () => {
// Fixture layout: `payload.recommendation_units[].listing` (decoy) is declared
// before `payload.target` (the real item), so a first-match-wins search would
// pick the decoy.
// NOTE(review): if the stringified object is walked from its root, the path
// weights in scoreMarketplaceItemPath give the target 2 + 8 - 2 = 8 and the
// decoy 2 + 6 - 10 - 3 = -5 — confirm the root against
// extractFacebookBootstrapCandidates.
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
recommendation_units: [
{
listing: {
id: "decoy-1",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Recommended Chair",
is_live: true,
},
},
],
target: {
id: "real-123",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Canonical Chair",
formatted_price: { text: "CA$120" },
listing_price: {
amount: "120.00",
currency: "CAD",
amount_with_offset: "120.00",
},
is_live: true,
},
},
})}
</script>
</body></html>
`;
const result = extractFacebookItemData(html);
// Both id and title are checked so a mix-up with the decoy cannot pass.
expect(result).not.toBeNull();
expect(result?.id).toBe("real-123");
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
});
test("should handle missing item data", () => {