refactor: rewrite facebook item parser for comet bootstrap
This commit is contained in:
@@ -496,6 +496,80 @@ function findSearchEdges(
|
|||||||
return bestMatch;
|
return bestMatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * One marketplace-item-shaped object found while walking bootstrap data,
 * together with the information used to rank it against other finds.
 */
interface FacebookMarketplaceItemMatch {
  /** The object that passed the item shape check, typed as a marketplace item. */
  item: FacebookMarketplaceItem;
  /** Ranking value computed from `path`; a higher score is preferred. */
  score: number;
  /** Object keys followed from the walk's root to reach `item` (array hops add no entry). */
  path: string[];
}
|
||||||
|
|
||||||
|
/**
 * Points awarded when a path contains the given key (exact, case-sensitive
 * match on a whole segment).
 */
const MARKETPLACE_PATH_BONUSES: ReadonlyArray<readonly [string, number]> = [
  ["payload", 2],
  ["viewer", 2],
  ["marketplace_product_details_page", 6],
  ["target", 8],
  ["listing", 6],
];

/** Lower-cased substrings that mark a segment as recommended/related content. */
const MARKETPLACE_SIDE_CONTENT_MARKERS: readonly string[] = ["recommend", "related"];

/** Points removed when any segment carries a side-content marker. */
const MARKETPLACE_SIDE_CONTENT_PENALTY = 10;

/**
 * Score the key path at which an item-shaped object was found.
 *
 * Each known key in `MARKETPLACE_PATH_BONUSES` adds its weight once, no matter
 * how many times it occurs. A path with any segment containing "recommend" or
 * "related" loses `MARKETPLACE_SIDE_CONTENT_PENALTY` points once. The marker
 * check is case-insensitive so keys such as `RelatedListings` or
 * `marketplaceRecommendations` are penalised as well. Finally the path length
 * is subtracted, so shallower paths score higher.
 *
 * @param path Object keys from the root of the walked structure to the item.
 * @returns The score; it may be negative.
 */
function scoreMarketplaceItemPath(path: readonly string[]): number {
  let score = 0;

  for (const [key, weight] of MARKETPLACE_PATH_BONUSES) {
    if (path.includes(key)) {
      score += weight;
    }
  }

  const touchesSideContent = path.some((segment) => {
    const lowered = segment.toLowerCase();
    return MARKETPLACE_SIDE_CONTENT_MARKERS.some((marker) => lowered.includes(marker));
  });
  if (touchesSideContent) {
    score -= MARKETPLACE_SIDE_CONTENT_PENALTY;
  }

  // Depth penalty: every extra hop away from the root costs one point.
  return score - path.length;
}
|
||||||
|
|
||||||
|
/**
 * Walk an arbitrary parsed value and collect every nested object that looks
 * like a marketplace item: string `id`, `__typename` equal to
 * "GroupCommerceProductItem", and string `marketplace_listing_title`.
 *
 * Matches are returned in pre-order: an object is reported before anything
 * nested inside it, and children follow `Object.entries` / array order.
 * Array elements inherit the path of the array itself; only object keys add
 * a path segment. The walk keeps descending into a matched object, so items
 * nested inside another item are reported too.
 *
 * All matches go into one accumulator and the current key path lives in one
 * shared stack that is copied only when a match is recorded. This avoids
 * spreading large result arrays into `push(...)` call arguments and avoids
 * allocating a new path array for every visited key.
 *
 * @param candidate Value to search; non-object, non-array values yield nothing.
 * @param path Keys already traversed to reach `candidate`; not mutated.
 * @returns Every match found, each with its own copy of the path and its score.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const matches: FacebookMarketplaceItemMatch[] = [];
  // Private copy so pushes and pops below never touch the caller's array.
  const keyStack = [...path];

  const visit = (node: unknown): void => {
    if (Array.isArray(node)) {
      for (const element of node) {
        visit(element);
      }
      return;
    }

    if (!isRecord(node)) {
      return;
    }

    if (
      typeof node.id === "string" &&
      node.__typename === "GroupCommerceProductItem" &&
      typeof node.marketplace_listing_title === "string"
    ) {
      // Snapshot the stack: it keeps changing as the walk continues.
      const matchPath = [...keyStack];
      matches.push({
        item: node as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(matchPath),
        path: matchPath,
      });
    }

    for (const [key, value] of Object.entries(node)) {
      keyStack.push(key);
      visit(value);
      keyStack.pop();
    }
  };

  visit(candidate);
  return matches;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace search data from Facebook page script tags
|
Extract marketplace search data from Facebook page script tags
|
||||||
*/
|
*/
|
||||||
@@ -531,139 +605,29 @@ export function extractFacebookMarketplaceData(
|
|||||||
|
|
||||||
/**
 * Extract marketplace item details from Facebook item page HTML.
 *
 * Every bootstrap candidate produced by `extractFacebookBootstrapCandidates`
 * is searched with `collectMarketplaceItemCandidates`, and the single best
 * match across all of them is returned. A match wins on a higher score; on
 * equal scores the one with the shorter path wins; if both are equal the
 * match encountered first is kept.
 *
 * @param htmlString Raw HTML of a marketplace item page.
 * @returns The best-ranked item, or `null` when no candidate contains one.
 */
export function extractFacebookItemData(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  let winner: FacebookMarketplaceItemMatch | null = null;

  for (const root of extractFacebookBootstrapCandidates(htmlString)) {
    for (const contender of collectMarketplaceItemCandidates(root)) {
      if (winner) {
        // Keep the current winner unless the contender strictly beats it.
        if (contender.score < winner.score) {
          continue;
        }
        if (
          contender.score === winner.score &&
          contender.path.length >= winner.path.length
        ) {
          continue;
        }
      }
      winner = contender;
    }
  }

  return winner?.item ?? null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -369,43 +369,80 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
|
|
||||||
describe("Data Extraction", () => {
|
describe("Data Extraction", () => {
|
||||||
describe("extractFacebookItemData", () => {
|
describe("extractFacebookItemData", () => {
|
||||||
test("should extract item data from standard require structure", () => {
|
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||||
const mockItemData = {
|
const html = `
|
||||||
id: "123456",
|
<html><body>
|
||||||
__typename: "GroupCommerceProductItem",
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
marketplace_listing_title: "Test Item",
|
<script>
|
||||||
formatted_price: { text: "$100.00" },
|
${JSON.stringify({
|
||||||
listing_price: { amount: "100.00", currency: "CAD" },
|
payload: {
|
||||||
is_live: true,
|
listing: {
|
||||||
};
|
id: "123",
|
||||||
const mockData = {
|
__typename: "GroupCommerceProductItem",
|
||||||
require: [
|
marketplace_listing_title: "Vintage Chair",
|
||||||
[
|
formatted_price: { text: "CA$80" },
|
||||||
null,
|
listing_price: {
|
||||||
null,
|
amount: "80.00",
|
||||||
null,
|
currency: "CAD",
|
||||||
{
|
amount_with_offset: "80.00",
|
||||||
__bbox: {
|
|
||||||
result: {
|
|
||||||
data: {
|
|
||||||
viewer: {
|
|
||||||
marketplace_product_details_page: {
|
|
||||||
target: mockItemData,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
|
redacted_description: { text: "Solid wood chair" },
|
||||||
|
location_text: { text: "Toronto, ON" },
|
||||||
|
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
|
||||||
|
condition: "USED",
|
||||||
|
is_live: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
})}
|
||||||
],
|
</script>
|
||||||
],
|
</body></html>
|
||||||
};
|
`;
|
||||||
const html = `<html><body><script>${JSON.stringify(mockData)}</script></body></html>`;
|
|
||||||
|
|
||||||
const result = extractFacebookItemData(html);
|
const result = extractFacebookItemData(html);
|
||||||
expect(result).not.toBeNull();
|
expect(result).not.toBeNull();
|
||||||
expect(result?.id).toBe("123456");
|
expect(result?.id).toBe("123");
|
||||||
expect(result?.marketplace_listing_title).toBe("Test Item");
|
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("prefers the canonical permalink target over earlier decoy items", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
recommendation_units: [
|
||||||
|
{
|
||||||
|
listing: {
|
||||||
|
id: "decoy-1",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Recommended Chair",
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
target: {
|
||||||
|
id: "real-123",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Canonical Chair",
|
||||||
|
formatted_price: { text: "CA$120" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "120.00",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = extractFacebookItemData(html);
|
||||||
|
expect(result).not.toBeNull();
|
||||||
|
expect(result?.id).toBe("real-123");
|
||||||
|
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
|
||||||
});
|
});
|
||||||
|
|
||||||
test("should handle missing item data", () => {
|
test("should handle missing item data", () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user