refactor: add facebook html fallbacks
This commit is contained in:
@@ -166,6 +166,10 @@ interface FacebookMarketplaceItem {
|
|||||||
[k: string]: unknown;
|
[k: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Captures the numeric listing id from a `/marketplace/item/<id>` URL or path. */
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;

/**
 * Matches an entire rendered price label: a `CA$` or `$` prefix, optional whitespace,
 * digits with optional comma separators and an optional two-digit fraction — or the
 * single word `FREE`. The match is case-insensitive.
 */
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;

/** Matches text that ends with a comma followed by two uppercase letters, e.g. `Toronto, ON`. */
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
|
||||||
|
|
||||||
export interface FacebookListingDetails {
|
export interface FacebookListingDetails {
|
||||||
url: string;
|
url: string;
|
||||||
title: string;
|
title: string;
|
||||||
@@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates(
|
|||||||
return matches;
|
return matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts a rendered Facebook price label (e.g. `CA$1,200`, `$45.50`, `FREE`) into a
 * price object.
 *
 * - Blank input and the word `FREE` (any casing) yield amount `"0.00"`; `formatted_amount`
 *   is the trimmed input, or `"FREE"` when the input was blank.
 * - Otherwise the first number in the text (digits, commas allowed as separators, optional
 *   two-digit fraction) is parsed and rendered with two decimals.
 * - `currency` is always `"CAD"`; the prefix in the text is not inspected.
 *
 * @param priceText Raw text of the price element.
 * @returns The parsed price, or `null` when the text contains no digits or the parsed
 *   number is not finite.
 */
function parseFacebookRenderedPrice(
  priceText: string,
): { amount: string; formatted_amount: string; currency: string } | null {
  const trimmed = priceText.trim();
  if (!trimmed || trimmed.toUpperCase() === "FREE") {
    return {
      amount: "0.00",
      formatted_amount: trimmed || "FREE",
      currency: "CAD",
    };
  }

  // The number must start with a digit so stray punctuation ahead of the amount (for
  // example the comma in "Now only, $50") cannot be picked up as the "amount".
  const amountMatch = /\d[\d,]*(?:\.\d{2})?/.exec(trimmed);
  if (!amountMatch) {
    return null;
  }

  // A global regex replace strips the separators without requiring ES2021 `replaceAll`.
  const amount = Number.parseFloat(amountMatch[0].replace(/,/g, ""));
  if (!Number.isFinite(amount)) {
    return null;
  }

  return {
    amount: amount.toFixed(2),
    formatted_amount: trimmed,
    currency: "CAD",
  };
}
|
||||||
|
|
||||||
|
/**
 * Gathers the text of every element under `node` that matches `selector`.
 *
 * Each element's `textContent` is trimmed; elements whose text is missing or empty after
 * trimming are left out. Results follow the order returned by `querySelectorAll`.
 *
 * @param node Root to search beneath.
 * @param selector CSS selector handed to `querySelectorAll`.
 * @returns The trimmed, non-empty texts.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];

  for (const match of Array.from(node.querySelectorAll(selector))) {
    const content = match.textContent?.trim();
    if (content) {
      collected.push(content);
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
/**
 * Reads the marketplace listing id out of an element's `href` attribute.
 *
 * @param element Element carrying the `href`, or `null`.
 * @returns The id captured by `FACEBOOK_ITEM_HREF_RE`, or `null` when the element is
 *   `null`, has no (or an empty) `href`, or the `href` does not match.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const hrefValue = element?.getAttribute("href");
  if (!hrefValue) {
    return null;
  }

  const idMatch = FACEBOOK_ITEM_HREF_RE.exec(hrefValue);
  return idMatch?.[1] ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Determines which listing id a rendered item page belongs to.
 *
 * Sources are tried in this order, and the first one that yields an id wins:
 * 1. `<link rel="canonical">` whose `href` contains `/marketplace/item/`.
 * 2. `<meta property="og:url">` content.
 * 3. Among anchors whose `href` contains `/marketplace/item/`: the first one whose text
 *    includes the page's `<h1>` text, otherwise the last such anchor. This step is
 *    skipped (returning `null`) when the page has no non-empty `<h1>`.
 *
 * @param document Parsed item page.
 * @returns The listing id, or `null` when none of the sources provides one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogUrlContent = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOgUrl = ogUrlContent?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOgUrl) {
    return fromOgUrl;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // Prefer the anchor that mentions the page title; otherwise use the last item link.
  const chosen =
    anchors.find((anchor) => anchor.textContent?.includes(heading)) ??
    anchors.at(-1) ??
    null;

  return extractMarketplaceItemIdFromElement(chosen);
}
|
||||||
|
|
||||||
|
/**
 * Finds the listing description on a rendered item page.
 *
 * Looks for a `div`, `span`, `h2`, `h3` or `p` whose trimmed text is exactly
 * `Description`, then walks its following element siblings and returns the first one with
 * non-empty trimmed text other than `Description`. If a label has no such sibling, the
 * search continues with the next label.
 *
 * @param document Parsed item page.
 * @returns The trimmed description text, or `undefined` when nothing qualifies.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const heading = "Description";

  for (const candidate of Array.from(document.querySelectorAll("div, span, h2, h3, p"))) {
    if (candidate.textContent?.trim() !== heading) {
      continue;
    }

    for (
      let following = candidate.nextElementSibling;
      following;
      following = following.nextElementSibling
    ) {
      const body = following.textContent?.trim();
      if (body && body !== heading) {
        return body;
      }
    }
  }

  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Rebuilds search results from the rendered HTML of a marketplace search page.
 *
 * Every anchor whose `href` contains `/marketplace/item/` is treated as a listing card.
 * Within a card, the price is the first text matching `FACEBOOK_PRICE_TEXT_RE`, the
 * location is the first text matching `FACEBOOK_LOCATION_TEXT_RE`, and the title is the
 * first remaining text that contains no `/`. Cards lacking a title or a parseable price
 * are skipped. Each listing id is emitted at most once; an id is only marked as seen once
 * a card for it has been emitted, so a later anchor for the same id can still supply it.
 *
 * Texts from leaf elements (elements with no child elements) are preferred: a wrapper
 * element's `textContent` is the concatenation of everything inside it, which would
 * otherwise satisfy the end-anchored location pattern or be mistaken for the title. If no
 * leaf text qualifies for a field, the full `span, div` text list is used instead.
 *
 * @param htmlString Raw HTML of the search page.
 * @returns The reconstructed listings, or `null` when none could be extracted.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const links = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];
  const seenIds = new Set<string>();
  const results: FacebookAdNode[] = [];

  const isPrice = (text: string): boolean => FACEBOOK_PRICE_TEXT_RE.test(text);
  const isLocation = (text: string): boolean => FACEBOOK_LOCATION_TEXT_RE.test(text);

  for (const link of links) {
    const href = link.getAttribute("href") ?? "";
    const id = href.match(FACEBOOK_ITEM_HREF_RE)?.[1];
    if (!id || seenIds.has(id)) {
      continue;
    }

    const texts = extractRenderedText(link, "span, div");
    // Assumes the parser's elements expose the standard `childElementCount`; if they do
    // not, this list is simply empty and every lookup falls back to `texts`.
    const leafTexts = Array.from(link.querySelectorAll("span, div"))
      .filter((element) => element.childElementCount === 0)
      .map((element) => element.textContent?.trim())
      .filter((text): text is string => Boolean(text));

    const priceText = leafTexts.find(isPrice) ?? texts.find(isPrice);
    const location = leafTexts.find(isLocation) ?? texts.find(isLocation);
    const isTitle = (text: string): boolean =>
      text !== priceText && text !== location && !text.includes("/");
    const title = leafTexts.find(isTitle) ?? texts.find(isTitle);
    if (!title || !priceText) {
      continue;
    }

    const parsedPrice = parseFacebookRenderedPrice(priceText);
    if (!parsedPrice) {
      continue;
    }

    results.push({
      node: {
        listing: {
          id,
          marketplace_listing_title: title,
          listing_price: parsedPrice,
          location: location
            ? {
                reverse_geocode: {
                  city_page: {
                    display_name: location,
                  },
                },
              }
            : undefined,
          is_live: true,
        },
      },
    });
    seenIds.add(id);
  }

  return results.length > 0 ? results : null;
}
|
||||||
|
|
||||||
|
/**
 * Rebuilds a single listing from the rendered HTML of a marketplace item page.
 *
 * The title is the page's `<h1>` text and the id comes from
 * `extractFacebookPermalinkItemId`; both are required. Price, location and description
 * are optional and are left `undefined` when not found.
 *
 * The location lookup prefers leaf elements (elements with no child elements): a wrapper's
 * `textContent` concatenates all nested text, so a container such as
 * "Listed 2 days ago in Toronto, ON" would otherwise satisfy the end-anchored location
 * pattern ahead of the element holding just the place name. If no leaf text qualifies, the
 * full text list is used instead.
 *
 * @param htmlString Raw HTML of the item page.
 * @returns The reconstructed item, or `null` when the id or the title is missing.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);
  const title = document.querySelector("h1")?.textContent?.trim();
  const id = extractFacebookPermalinkItemId(document);

  if (!id || !title) {
    return null;
  }

  const texts = extractRenderedText(document, "h1, span, div, p");
  const priceText = texts.find((text) => FACEBOOK_PRICE_TEXT_RE.test(text));
  const parsedPrice = priceText ? parseFacebookRenderedPrice(priceText) : null;

  // Assumes the parser's elements expose the standard `childElementCount`; if they do
  // not, this list is simply empty and the location lookup falls back to `texts`.
  const leafTexts = Array.from(document.querySelectorAll("h1, span, div, p"))
    .filter((element) => element.childElementCount === 0)
    .map((element) => element.textContent?.trim())
    .filter((text): text is string => Boolean(text));
  const isLocation = (text: string): boolean =>
    text !== title && text !== priceText && FACEBOOK_LOCATION_TEXT_RE.test(text);
  const location = leafTexts.find(isLocation) ?? texts.find(isLocation);

  const description = extractFacebookDescriptionText(document);

  return {
    id,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: title,
    formatted_price: priceText ? { text: priceText } : undefined,
    listing_price: parsedPrice
      ? {
          amount: parsedPrice.amount,
          currency: parsedPrice.currency,
          // NOTE(review): this reuses the plain decimal amount. If consumers expect
          // `amount_with_offset` to be scaled differently from `amount`, this needs
          // converting — confirm against how the field is read elsewhere.
          amount_with_offset: parsedPrice.amount,
        }
      : undefined,
    location_text: location ? { text: location } : undefined,
    redacted_description: description ? { text: description } : undefined,
    is_live: true,
  };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace search data from Facebook page script tags
|
Extract marketplace search data from Facebook page script tags
|
||||||
*/
|
*/
|
||||||
@@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!bestEdges?.length) {
|
if (!bestEdges?.length) {
|
||||||
|
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||||
|
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
|
||||||
|
if (htmlFallback?.length) {
|
||||||
|
console.log(
|
||||||
|
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
|
||||||
|
);
|
||||||
|
return htmlFallback;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
console.warn("No marketplace data found in HTML response");
|
console.warn("No marketplace data found in HTML response");
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -627,7 +827,15 @@ export function extractFacebookItemData(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return bestMatch?.item ?? null;
|
if (bestMatch) {
|
||||||
|
return bestMatch.item;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||||
|
return extractFacebookItemHtmlFallback(htmlString);
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -404,6 +404,60 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Item page whose bootstrap script is not decodable: id, title, price, location and
// description must all be recovered from the rendered markup.
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
  const renderedPage = `
    <html><body>
      <script>"XCometMarketplacePermalinkController"</script>
      <script>{invalid: json}</script>
      <h1>Vintage Chair</h1>
      <span>CA$80</span>
      <div>Toronto, ON</div>
      <div>Description</div>
      <div>Solid wood chair</div>
      <a href="/marketplace/item/123/">View listing</a>
    </body></html>
  `;

  const item = extractFacebookItemData(renderedPage);

  expect(item).not.toBeNull();
  expect(item?.id).toBe("123");
  expect(item?.marketplace_listing_title).toBe("Vintage Chair");
  expect(item?.formatted_price?.text).toBe("CA$80");
  expect(item?.location_text?.text).toBe("Toronto, ON");
  expect(item?.redacted_description?.text).toBe("Solid wood chair");
});
|
||||||
|
|
||||||
|
// A related-listing link (id 999) appears before the page's own content; the canonical
// link (id 123) must decide which listing the page describes.
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
  const renderedPage = `
    <html>
      <head>
        <link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
      </head>
      <body>
        <script>"XCometMarketplacePermalinkController"</script>
        <script>{invalid: json}</script>
        <a href="/marketplace/item/999/">
          <span>Related Chair</span>
        </a>
        <h1>Vintage Chair</h1>
        <span>CA$80</span>
        <div>Toronto, ON</div>
        <div>Message seller</div>
        <div>Seller details</div>
        <div>Description</div>
        <div>Solid wood chair</div>
        <a href="/marketplace/item/123/">View listing</a>
      </body>
    </html>
  `;

  const item = extractFacebookItemData(renderedPage);

  expect(item).not.toBeNull();
  expect(item?.id).toBe("123");
  expect(item?.marketplace_listing_title).toBe("Vintage Chair");
  expect(item?.redacted_description?.text).toBe("Solid wood chair");
});
|
||||||
|
|
||||||
test("prefers the canonical permalink target over earlier decoy items", () => {
|
test("prefers the canonical permalink target over earlier decoy items", () => {
|
||||||
const html = `
|
const html = `
|
||||||
<html><body>
|
<html><body>
|
||||||
@@ -584,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Search page whose bootstrap script is not decodable: the single listing card must be
// rebuilt from the anchor's rendered text.
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
  const renderedPage = `
    <html><body>
      <script>"XCometMarketplaceSearchController"</script>
      <script>{invalid: json}</script>
      <a href="/marketplace/item/987654321/">
        <span>Vintage Bike</span>
        <span>CA$120</span>
        <span>Toronto, ON</span>
      </a>
    </body></html>
  `;

  const listings = extractFacebookMarketplaceData(renderedPage);

  expect(listings).not.toBeNull();
  expect(listings).toHaveLength(1);
  expect(listings?.[0].node.listing.id).toBe("987654321");
  expect(listings?.[0].node.listing.marketplace_listing_title).toBe(
    "Vintage Bike",
  );
  expect(listings?.[0].node.listing.listing_price).toEqual({
    amount: "120.00",
    formatted_amount: "CA$120",
    currency: "CAD",
  });
});
|
||||||
|
|
||||||
test("should handle empty search results", () => {
|
test("should handle empty search results", () => {
|
||||||
const mockData = {
|
const mockData = {
|
||||||
require: [
|
require: [
|
||||||
|
|||||||
Reference in New Issue
Block a user