refactor: add facebook html fallbacks
This commit is contained in:
@@ -166,6 +166,10 @@ interface FacebookMarketplaceItem {
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
// Matches the "/marketplace/item/<digits>" segment of a URL or href and
// captures the digits (the item id) in group 1. Unanchored, so it works on
// both relative and absolute links.
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;
|
||||
// Whole-string, case-insensitive match for a rendered price label: either
// "CA$" or "$", optional whitespace, digits with optional commas and an
// optional two-digit fraction (e.g. "CA$1,200.50"), or the literal "FREE".
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;
|
||||
// Matches text that ENDS with a comma, optional whitespace and two uppercase
// letters (e.g. "Toronto, ON"). Only the end is anchored, so any longer string
// with such a suffix also matches.
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
|
||||
|
||||
export interface FacebookListingDetails {
|
||||
url: string;
|
||||
title: string;
|
||||
@@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates(
|
||||
return matches;
|
||||
}
|
||||
|
||||
/**
 * Converts a rendered price label (e.g. "CA$1,200", "$80.50", "FREE") into a
 * structured price.
 *
 * @param priceText - Raw text of the price element; surrounding whitespace is
 *   ignored.
 * @returns `amount` as a two-decimal string, the trimmed label as
 *   `formatted_amount` ("FREE" when the label is blank) and `currency`, which
 *   is always "CAD" regardless of the label's prefix. Returns `null` when the
 *   label is neither blank/"FREE" nor contains a readable number.
 */
function parseFacebookRenderedPrice(
  priceText: string,
): { amount: string; formatted_amount: string; currency: string } | null {
  const label = priceText.trim();

  // Blank labels and "FREE" (any casing) are both reported as a zero price.
  if (label === "" || label.toUpperCase() === "FREE") {
    return {
      amount: "0.00",
      formatted_amount: label || "FREE",
      currency: "CAD",
    };
  }

  // The number must start on a digit (a stray "," is not an amount) and may
  // carry a fraction of any length, so "$1.5" is read as 1.50 rather than
  // being cut off at the decimal point.
  const numeric = label.match(/\d[\d,]*(?:\.\d+)?/);
  if (!numeric) {
    return null;
  }

  // Thousands separators are dropped before parsing.
  const value = Number.parseFloat(numeric[0].replace(/,/g, ""));
  if (!Number.isFinite(value)) {
    return null;
  }

  return {
    amount: value.toFixed(2),
    formatted_amount: label,
    currency: "CAD",
  };
}
|
||||
|
||||
/**
 * Collects the trimmed, non-empty text of every element under `node` that
 * matches `selector`.
 *
 * Texts of elements that have no element children are listed first (in
 * document order), followed by texts of elements that wrap other elements.
 * A wrapper's `textContent` is the concatenation of all of its descendants'
 * text, so putting leaves first keeps a consumer that takes the first
 * matching entry from picking up a glued-together string such as
 * "CA$120Vintage BikeToronto, ON" instead of the individual field.
 * For markup where no matched element wraps another, the order is plain
 * document order.
 *
 * @param node - Root to search beneath.
 * @param selector - CSS selector for the candidate elements.
 * @returns Leaf texts followed by wrapper texts; empty when nothing matches.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const leafTexts: string[] = [];
  const wrapperTexts: string[] = [];

  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (!text) {
      continue;
    }

    if (element.childElementCount === 0) {
      leafTexts.push(text);
    } else {
      wrapperTexts.push(text);
    }
  }

  return [...leafTexts, ...wrapperTexts];
}
|
||||
|
||||
/**
 * Reads the marketplace item id out of an element's `href` attribute.
 *
 * @param element - Element expected to carry an `href`; may be `null`.
 * @returns The digits captured by `FACEBOOK_ITEM_HREF_RE`, or `null` when the
 *   element is missing, has no (or an empty) `href`, or the `href` does not
 *   contain an item path.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const rawHref = element?.getAttribute("href");
  if (!rawHref) {
    return null;
  }

  const itemMatch = FACEBOOK_ITEM_HREF_RE.exec(rawHref);
  return itemMatch?.[1] ?? null;
}
|
||||
|
||||
/**
 * Determines which marketplace item a rendered permalink page is about.
 *
 * Sources are tried in this order and the first id found wins:
 *   1. `<link rel="canonical">` whose href contains "/marketplace/item/".
 *   2. `<meta property="og:url">` content.
 *   3. An item anchor whose text contains the page's `<h1>` text.
 *   4. The last item anchor in the document.
 * Steps 3 and 4 are only attempted when the page has a non-empty `<h1>`.
 *
 * @param document - Parsed permalink page.
 * @returns The item id, or `null` when none of the sources yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const fromCanonical = extractMarketplaceItemIdFromElement(
    document.querySelector('link[rel="canonical"][href*="/marketplace/item/"]'),
  );
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogUrlContent = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOgUrl = ogUrlContent?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOgUrl) {
    return fromOgUrl;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // Prefer the anchor that repeats the heading; otherwise fall back to the
  // final item anchor on the page.
  const headingAnchor = anchors.find((anchor) =>
    anchor.textContent?.includes(heading),
  );
  const chosen = headingAnchor ?? anchors[anchors.length - 1] ?? null;

  return extractMarketplaceItemIdFromElement(chosen);
}
|
||||
|
||||
/**
 * Finds the listing description on a rendered item page.
 *
 * Looks for a div/span/h2/h3/p whose trimmed text is exactly "Description"
 * and returns the trimmed text of the first following sibling element that
 * is non-empty and is not itself "Description". If a label has no such
 * sibling, the search continues with the next label.
 *
 * @param document - Parsed item page.
 * @returns The description text, or `undefined` when no label/sibling pair
 *   is found.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const candidates = Array.from(document.querySelectorAll("div, span, h2, h3, p"));
  const headings = candidates.filter(
    (candidate) => candidate.textContent?.trim() === "Description",
  );

  for (const heading of headings) {
    for (
      let follower = heading.nextElementSibling;
      follower;
      follower = follower.nextElementSibling
    ) {
      const content = follower.textContent?.trim();
      if (content && content !== "Description") {
        return content;
      }
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Rebuilds search-result nodes from the rendered listing anchors of a
 * marketplace page.
 *
 * Every `<a>` whose `href` contains "/marketplace/item/" is inspected. The
 * first anchor per item id that yields both a title and a parsable price
 * becomes one result. Anchors lacking either are skipped without recording
 * the id, so a later anchor for the same item can still supply them.
 *
 * @param htmlString - HTML of the rendered search page.
 * @returns One node per distinct item id, in the order the ids were first
 *   accepted, or `null` when no anchor produced a usable listing.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // Insertion-ordered, so the values come back in acceptance order.
  const nodesById = new Map<string, FacebookAdNode>();

  for (const anchor of anchors) {
    const id = extractMarketplaceItemIdFromElement(anchor);
    if (id === null || nodesById.has(id)) {
      continue;
    }

    const candidates = extractRenderedText(anchor, "span, div");
    const priceText = candidates.find((candidate) =>
      FACEBOOK_PRICE_TEXT_RE.test(candidate),
    );
    const location = candidates.find((candidate) =>
      FACEBOOK_LOCATION_TEXT_RE.test(candidate),
    );
    // Title: first text that is neither the price nor the location and
    // contains no "/".
    const title = candidates.find(
      (candidate) =>
        !(
          candidate === priceText ||
          candidate === location ||
          candidate.includes("/")
        ),
    );
    if (!priceText || !title) {
      continue;
    }

    const listingPrice = parseFacebookRenderedPrice(priceText);
    if (!listingPrice) {
      continue;
    }

    nodesById.set(id, {
      node: {
        listing: {
          id,
          marketplace_listing_title: title,
          listing_price: listingPrice,
          location: location
            ? { reverse_geocode: { city_page: { display_name: location } } }
            : undefined,
          is_live: true,
        },
      },
    });
  }

  return nodesById.size > 0 ? Array.from(nodesById.values()) : null;
}
|
||||
|
||||
/**
 * Rebuilds a marketplace item from the rendered HTML of its permalink page.
 *
 * The `<h1>` supplies the title and `extractFacebookPermalinkItemId` the id;
 * both are required. Price and location are the first rendered texts (from
 * h1/span/div/p elements) matching the price and location patterns, and the
 * description comes from `extractFacebookDescriptionText`. Fields that cannot
 * be found are set to `undefined`.
 *
 * NOTE(review): `amount_with_offset` is filled with the same decimal string
 * as `amount` — confirm consumers do not expect a differently scaled value.
 *
 * @param htmlString - HTML of the rendered item page.
 * @returns The reconstructed item (always marked `is_live: true`), or `null`
 *   when the page has no title or no item id can be determined.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);

  const heading = document.querySelector("h1")?.textContent?.trim();
  const itemId = extractFacebookPermalinkItemId(document);
  if (!(itemId && heading)) {
    return null;
  }

  const renderedTexts = extractRenderedText(document, "h1, span, div, p");

  const priceLabel = renderedTexts.find((entry) =>
    FACEBOOK_PRICE_TEXT_RE.test(entry),
  );
  const price = priceLabel ? parseFacebookRenderedPrice(priceLabel) : null;

  // The title and price themselves are never accepted as the location.
  const locationLabel = renderedTexts.find(
    (entry) =>
      entry !== heading &&
      entry !== priceLabel &&
      FACEBOOK_LOCATION_TEXT_RE.test(entry),
  );

  const descriptionText = extractFacebookDescriptionText(document);

  // Wraps a found string as `{ text }`; missing values stay `undefined`.
  const asText = (value: string | undefined) =>
    value ? { text: value } : undefined;

  return {
    id: itemId,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading,
    formatted_price: asText(priceLabel),
    listing_price: price
      ? {
          amount: price.amount,
          currency: price.currency,
          amount_with_offset: price.amount,
        }
      : undefined,
    location_text: asText(locationLabel),
    redacted_description: asText(descriptionText),
    is_live: true,
  };
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
@@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData(
|
||||
}
|
||||
|
||||
if (!bestEdges?.length) {
|
||||
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
|
||||
if (htmlFallback?.length) {
|
||||
console.log(
|
||||
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
|
||||
);
|
||||
return htmlFallback;
|
||||
}
|
||||
}
|
||||
|
||||
console.warn("No marketplace data found in HTML response");
|
||||
return null;
|
||||
}
|
||||
@@ -627,7 +827,15 @@ export function extractFacebookItemData(
|
||||
}
|
||||
}
|
||||
|
||||
return bestMatch?.item ?? null;
|
||||
if (bestMatch) {
|
||||
return bestMatch.item;
|
||||
}
|
||||
|
||||
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||
return extractFacebookItemHtmlFallback(htmlString);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -404,6 +404,60 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
});
|
||||
|
||||
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<h1>Vintage Chair</h1>
|
||||
<span>CA$80</span>
|
||||
<div>Toronto, ON</div>
|
||||
<div>Description</div>
|
||||
<div>Solid wood chair</div>
|
||||
<a href="/marketplace/item/123/">View listing</a>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("123");
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
expect(result?.formatted_price?.text).toBe("CA$80");
|
||||
expect(result?.location_text?.text).toBe("Toronto, ON");
|
||||
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||
});
|
||||
|
||||
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head>
|
||||
<link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
|
||||
</head>
|
||||
<body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<a href="/marketplace/item/999/">
|
||||
<span>Related Chair</span>
|
||||
</a>
|
||||
<h1>Vintage Chair</h1>
|
||||
<span>CA$80</span>
|
||||
<div>Toronto, ON</div>
|
||||
<div>Message seller</div>
|
||||
<div>Seller details</div>
|
||||
<div>Description</div>
|
||||
<div>Solid wood chair</div>
|
||||
<a href="/marketplace/item/123/">View listing</a>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("123");
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||
});
|
||||
|
||||
test("prefers the canonical permalink target over earlier decoy items", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
@@ -584,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
);
|
||||
});
|
||||
|
||||
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<a href="/marketplace/item/987654321/">
|
||||
<span>Vintage Bike</span>
|
||||
<span>CA$120</span>
|
||||
<span>Toronto, ON</span>
|
||||
</a>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookMarketplaceData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result?.[0].node.listing.id).toBe("987654321");
|
||||
expect(result?.[0].node.listing.marketplace_listing_title).toBe(
|
||||
"Vintage Bike",
|
||||
);
|
||||
expect(result?.[0].node.listing.listing_price).toEqual({
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
});
|
||||
});
|
||||
|
||||
test("should handle empty search results", () => {
|
||||
const mockData = {
|
||||
require: [
|
||||
|
||||
Reference in New Issue
Block a user