refactor: rewrite facebook item parser for comet bootstrap
This commit is contained in:
@@ -496,6 +496,80 @@ function findSearchEdges(
|
||||
return bestMatch;
|
||||
}
|
||||
|
||||
/**
 * A marketplace item located inside a parsed bootstrap payload, bundled with
 * the information used to rank it against other candidates.
 */
interface FacebookMarketplaceItemMatch {
  /** The matched object, treated as a marketplace item. */
  item: FacebookMarketplaceItem;
  /** Ranking score computed from `path`; a higher score is preferred. */
  score: number;
  /**
   * Object keys traversed from the payload root to reach `item`.
   * Array positions are not recorded, only object keys.
   */
  path: string[];
}
|
||||
|
||||
/** Exact path segments that raise a candidate's score, with their bonuses. */
const MARKETPLACE_PATH_BONUSES: ReadonlyArray<readonly [string, number]> = [
  ["payload", 2],
  ["viewer", 2],
  ["marketplace_product_details_page", 6],
  ["target", 8],
  ["listing", 6],
];

/** Substrings that mark a path segment as a recommendation/related-items container. */
const MARKETPLACE_DECOY_MARKERS: readonly string[] = ["recommend", "related"];

/** Amount subtracted once when any path segment contains a decoy marker. */
const MARKETPLACE_DECOY_PENALTY = 10;

/**
 * Score the key path that leads to a marketplace item candidate.
 *
 * Each known segment present anywhere in the path adds its bonus once. If any
 * segment contains a decoy marker the penalty is subtracted once, regardless
 * of how many segments match. Finally the path length is subtracted so that
 * shallower candidates are preferred.
 *
 * @param path Object keys from the payload root to the candidate.
 * @returns The score; higher means more likely to be the page's own item.
 */
function scoreMarketplaceItemPath(path: readonly string[]): number {
  let score = 0;

  for (const [segment, bonus] of MARKETPLACE_PATH_BONUSES) {
    if (path.includes(segment)) {
      score += bonus;
    }
  }

  // Compare case-insensitively so camelCase keys such as `RelatedListings`
  // are penalized the same way as `related_listings`.
  const looksLikeDecoy = path.some((segment) => {
    const normalized = segment.toLowerCase();
    return MARKETPLACE_DECOY_MARKERS.some((marker) => normalized.includes(marker));
  });
  if (looksLikeDecoy) {
    score -= MARKETPLACE_DECOY_PENALTY;
  }

  return score - path.length;
}
|
||||
|
||||
/**
 * Walk an arbitrary parsed value and collect every object that looks like a
 * marketplace item: a string `id`, `__typename` equal to
 * `"GroupCommerceProductItem"`, and a string `marketplace_listing_title`.
 *
 * The walk is a depth-first, pre-order traversal driven by an explicit stack
 * rather than recursion, so deeply nested or very large payloads cannot
 * overflow the call stack or hit the argument-count limit of a spread call.
 * Matches are returned in the order they are encountered. A matching object is
 * still descended into, so items nested inside it are collected too.
 *
 * @param candidate Value to search (typically a parsed bootstrap payload).
 * @param path Object keys already traversed to reach `candidate`. Array
 *   elements inherit their parent's path; only object keys extend it.
 * @returns One entry per matching object, each with its score and key path.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const matches: FacebookMarketplaceItemMatch[] = [];
  const pending: Array<{ node: unknown; path: string[] }> = [
    { node: candidate, path },
  ];

  for (let next = pending.pop(); next !== undefined; next = pending.pop()) {
    const { node, path: nodePath } = next;

    if (Array.isArray(node)) {
      // Push in reverse so elements are popped (visited) in their original order.
      for (let index = node.length - 1; index >= 0; index--) {
        pending.push({ node: node[index], path: nodePath });
      }
      continue;
    }

    if (!isRecord(node)) {
      continue;
    }

    if (
      typeof node.id === "string" &&
      node.__typename === "GroupCommerceProductItem" &&
      typeof node.marketplace_listing_title === "string"
    ) {
      matches.push({
        item: node as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(nodePath),
        path: nodePath,
      });
    }

    // Reverse for the same reason as above: keep key order on the way out.
    for (const [key, value] of Object.entries(node).reverse()) {
      // Primitives can never match or contain a match, so skip allocating a
      // path for them (assumes isRecord rejects non-objects — confirm).
      if (typeof value !== "object" || value === null) {
        continue;
      }
      pending.push({ node: value, path: [...nodePath, key] });
    }
  }

  return matches;
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
@@ -531,139 +605,29 @@ export function extractFacebookMarketplaceData(
|
||||
|
||||
/**
 * Extract marketplace item details from Facebook item page HTML.
 *
 * Updated for 2026 Facebook Marketplace bootstrap candidates: every candidate
 * returned by `extractFacebookBootstrapCandidates` is searched for marketplace
 * item objects, and the best-ranked one is returned.
 *
 * Ranking: the highest score wins; on equal scores the shorter key path wins;
 * if both are equal the first match encountered is kept.
 *
 * @param htmlString Raw HTML of a Facebook Marketplace item page.
 * @returns The best-ranked marketplace item, or `null` when no candidate
 *   contains one.
 */
export function extractFacebookItemData(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const candidates = extractFacebookBootstrapCandidates(htmlString);
  let bestMatch: FacebookMarketplaceItemMatch | null = null;

  for (const candidate of candidates) {
    const matches = collectMarketplaceItemCandidates(candidate);

    for (const match of matches) {
      if (
        !bestMatch ||
        match.score > bestMatch.score ||
        (match.score === bestMatch.score &&
          match.path.length < bestMatch.path.length)
      ) {
        bestMatch = match;
      }
    }
  }

  return bestMatch?.item ?? null;
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -369,43 +369,80 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
|
||||
describe("Data Extraction", () => {
|
||||
describe("extractFacebookItemData", () => {
|
||||
// A single listing nested under `payload.listing` in a Comet permalink
// bootstrap script should be found and returned as-is.
test("extracts item details from Comet permalink bootstrap candidates", () => {
  const html = `
    <html><body>
      <script>"XCometMarketplacePermalinkController"</script>
      <script>
        ${JSON.stringify({
          payload: {
            listing: {
              id: "123",
              __typename: "GroupCommerceProductItem",
              marketplace_listing_title: "Vintage Chair",
              formatted_price: { text: "CA$80" },
              listing_price: {
                amount: "80.00",
                currency: "CAD",
                amount_with_offset: "80.00",
              },
              redacted_description: { text: "Solid wood chair" },
              location_text: { text: "Toronto, ON" },
              marketplace_listing_seller: { id: "seller-1", name: "Alex" },
              condition: "USED",
              is_live: true,
            },
          },
        })}
      </script>
    </body></html>
  `;

  const result = extractFacebookItemData(html);
  expect(result).not.toBeNull();
  expect(result?.id).toBe("123");
  expect(result?.marketplace_listing_title).toBe("Vintage Chair");
});
|
||||
|
||||
// The recommended listing appears first in the payload, but it sits under a
// `recommendation_units` key; the item under `target` must still be chosen.
test("prefers the canonical permalink target over earlier decoy items", () => {
  const html = `
    <html><body>
      <script>"XCometMarketplacePermalinkController"</script>
      <script>
        ${JSON.stringify({
          payload: {
            recommendation_units: [
              {
                listing: {
                  id: "decoy-1",
                  __typename: "GroupCommerceProductItem",
                  marketplace_listing_title: "Recommended Chair",
                  is_live: true,
                },
              },
            ],
            target: {
              id: "real-123",
              __typename: "GroupCommerceProductItem",
              marketplace_listing_title: "Canonical Chair",
              formatted_price: { text: "CA$120" },
              listing_price: {
                amount: "120.00",
                currency: "CAD",
                amount_with_offset: "120.00",
              },
              is_live: true,
            },
          },
        })}
      </script>
    </body></html>
  `;

  const result = extractFacebookItemData(html);
  expect(result).not.toBeNull();
  expect(result?.id).toBe("real-123");
  expect(result?.marketplace_listing_title).toBe("Canonical Chair");
});
|
||||
|
||||
test("should handle missing item data", () => {
|
||||
|
||||
Reference in New Issue
Block a user