refactor: add facebook html fallbacks

This commit is contained in:
2026-04-22 11:36:47 -04:00
parent 63ca006696
commit 7ddc96dfdf
2 changed files with 290 additions and 1 deletions

View File

@@ -166,6 +166,10 @@ interface FacebookMarketplaceItem {
[k: string]: unknown; [k: string]: unknown;
} }
// Captures the numeric listing id from a "/marketplace/item/<digits>" path
// segment. The pattern is unanchored, so it matches relative and absolute URLs.
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;
// Matches a whole string that is either a price prefixed with "$" or "CA$"
// (optional whitespace, optional thousands commas, optional two-digit cents)
// or the single word "FREE". The `i` flag makes the whole pattern
// case-insensitive.
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;
// Matches text that ends with a comma followed by exactly two uppercase
// letters, e.g. "Toronto, ON". Case-sensitive: lowercase suffixes do not match.
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
export interface FacebookListingDetails { export interface FacebookListingDetails {
url: string; url: string;
title: string; title: string;
@@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates(
return matches; return matches;
} }
/**
 * Turns a price label scraped from rendered HTML (for example "CA$1,200" or
 * "FREE") into a structured price.
 *
 * - Blank input and the word "FREE" (any letter case) yield an amount of
 *   "0.00"; the formatted amount is the trimmed label, or "FREE" when the
 *   label was blank.
 * - Otherwise the first run of digits/commas (with optional two-digit cents)
 *   is parsed after stripping commas and rendered with two decimals.
 *
 * The currency is always reported as "CAD".
 *
 * @param priceText Raw label text; surrounding whitespace is ignored.
 * @returns The structured price, or `null` when no finite number can be read.
 */
function parseFacebookRenderedPrice(priceText: string) {
  const label = priceText.trim();

  const isFree = label === "" || label.toUpperCase() === "FREE";
  if (isFree) {
    return {
      amount: "0.00",
      formatted_amount: label || "FREE",
      currency: "CAD",
    };
  }

  const numericPart = /[\d,]+(?:\.\d{2})?/.exec(label);
  if (numericPart === null) {
    return null;
  }

  // Drop thousands separators before parsing; a match made only of commas
  // becomes an empty string, parses to NaN, and is rejected below.
  const value = Number.parseFloat(numericPart[0].split(",").join(""));

  return Number.isFinite(value)
    ? {
        amount: value.toFixed(2),
        formatted_amount: label,
        currency: "CAD",
      }
    : null;
}
/**
 * Gathers the trimmed text content of every descendant of `node` that matches
 * `selector`, in the order `querySelectorAll` returns them.
 *
 * Elements whose text is missing, empty, or whitespace-only are left out.
 *
 * @param node Root to search beneath.
 * @param selector CSS selector passed to `querySelectorAll`.
 * @returns The non-empty trimmed texts; empty array when nothing qualifies.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];
  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }
  return collected;
}
/**
 * Reads the `href` attribute of `element` and pulls the marketplace item id
 * out of it using `FACEBOOK_ITEM_HREF_RE`.
 *
 * @param element Element carrying an `href`, or `null`.
 * @returns The captured id, or `null` when the element is null, has no
 *   (or an empty) `href`, or the `href` does not match the item pattern.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href");
  if (!href) {
    return null;
  }
  const match = FACEBOOK_ITEM_HREF_RE.exec(href);
  return match?.[1] ?? null;
}
/**
 * Works out which marketplace item a rendered permalink page is about.
 *
 * Sources are tried in this order, and the first one that yields an id wins:
 *  1. a `<link rel="canonical">` whose href contains "/marketplace/item/";
 *  2. the `content` of `<meta property="og:url">`;
 *  3. the first marketplace item link whose text contains the page's `<h1>`
 *     text (its id is returned even if that turns out to be `null`);
 *  4. the last marketplace item link in the document.
 *
 * Steps 3 and 4 are skipped (returning `null`) when the page has no non-empty
 * `<h1>` text.
 *
 * @param document Parsed document of the item page.
 * @returns The item id, or `null` when none of the sources provides one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogMeta = document.querySelector('meta[property="og:url"]');
  const fromOgUrl = ogMeta
    ?.getAttribute("content")
    ?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOgUrl) {
    return fromOgUrl;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  for (const anchor of anchors) {
    if (anchor.textContent?.includes(heading)) {
      // First link that mentions the heading is treated as the page's own link.
      return extractMarketplaceItemIdFromElement(anchor);
    }
  }

  // No link mentions the heading: fall back to the last item link, if any.
  return extractMarketplaceItemIdFromElement(anchors.at(-1) ?? null);
}
/**
 * Finds the listing description in a rendered item page.
 *
 * Scans `div, span, h2, h3, p` elements for one whose trimmed text is exactly
 * "Description", then walks that element's following siblings and returns the
 * first sibling text that is non-empty and is not itself "Description". If a
 * matching label has no usable sibling, the scan continues with the next
 * matching label.
 *
 * @param document Parsed document of the item page.
 * @returns The trimmed description text, or `undefined` when none is found.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const HEADING_TEXT = "Description";
  const candidates = Array.from(document.querySelectorAll("div, span, h2, h3, p"));

  for (const candidate of candidates) {
    if (candidate.textContent?.trim() === HEADING_TEXT) {
      for (
        let next = candidate.nextElementSibling;
        next;
        next = next.nextElementSibling
      ) {
        const body = next.textContent?.trim();
        if (body && body !== HEADING_TEXT) {
          return body;
        }
      }
    }
  }

  return undefined;
}
/**
 * Builds search-result nodes from the rendered HTML of a marketplace search
 * page, using only what is visible inside each item link.
 *
 * For every `<a>` whose href contains "/marketplace/item/":
 *  - the item id is taken from the href; links without an id, or whose id has
 *    already produced a result, are skipped;
 *  - the texts of nested `span`/`div` elements are classified: the first one
 *    matching the price pattern is the price, the first one matching the
 *    location pattern is the location, and the first remaining text that does
 *    not contain "/" is the title;
 *  - links lacking a title or a parseable price are skipped.
 *
 * An id is only marked as seen once a result was produced for it, so a later
 * link to the same item can still succeed after an earlier one was skipped.
 * Every produced listing is flagged `is_live: true`.
 *
 * @param htmlString Full HTML of the search page.
 * @returns The recovered nodes, or `null` when none could be recovered.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];

  const emittedIds = new Set<string>();
  const listings: FacebookAdNode[] = [];

  for (const anchor of anchors) {
    const listingId = (anchor.getAttribute("href") || "").match(
      FACEBOOK_ITEM_HREF_RE,
    )?.[1];
    if (!listingId || emittedIds.has(listingId)) {
      continue;
    }

    const fragments = extractRenderedText(anchor, "span, div");
    const priceLabel = fragments.find((fragment) =>
      FACEBOOK_PRICE_TEXT_RE.test(fragment),
    );
    const placeLabel = fragments.find((fragment) =>
      FACEBOOK_LOCATION_TEXT_RE.test(fragment),
    );
    const titleLabel = fragments.find(
      (fragment) =>
        fragment !== priceLabel &&
        fragment !== placeLabel &&
        !fragment.includes("/"),
    );
    if (!titleLabel || !priceLabel) {
      continue;
    }

    const price = parseFacebookRenderedPrice(priceLabel);
    if (!price) {
      continue;
    }

    const locationField = placeLabel
      ? {
          reverse_geocode: {
            city_page: {
              display_name: placeLabel,
            },
          },
        }
      : undefined;

    listings.push({
      node: {
        listing: {
          id: listingId,
          marketplace_listing_title: titleLabel,
          listing_price: price,
          location: locationField,
          is_live: true,
        },
      },
    });
    emittedIds.add(listingId);
  }

  return listings.length === 0 ? null : listings;
}
/**
 * Reconstructs a single marketplace item from the rendered HTML of its
 * permalink page.
 *
 * The title is the trimmed text of the first `<h1>`; the id comes from
 * `extractFacebookPermalinkItemId`. Both are required. Price, location and
 * description are optional and are left `undefined` when not found:
 *  - price: first `h1/span/div/p` text matching the price pattern;
 *  - location: first such text matching the location pattern that is neither
 *    the title nor the price text;
 *  - description: whatever `extractFacebookDescriptionText` returns.
 *
 * The result is always typed "GroupCommerceProductItem" and flagged
 * `is_live: true`.
 *
 * @param htmlString Full HTML of the item page.
 * @returns The reconstructed item, or `null` when id or title is missing.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);

  const heading = document.querySelector("h1")?.textContent?.trim();
  const itemId = extractFacebookPermalinkItemId(document);
  if (!heading || !itemId) {
    return null;
  }

  const fragments = extractRenderedText(document, "h1, span, div, p");
  const priceLabel = fragments.find((fragment) =>
    FACEBOOK_PRICE_TEXT_RE.test(fragment),
  );
  const price = priceLabel ? parseFacebookRenderedPrice(priceLabel) : null;
  const placeLabel = fragments.find(
    (fragment) =>
      fragment !== heading &&
      fragment !== priceLabel &&
      FACEBOOK_LOCATION_TEXT_RE.test(fragment),
  );
  const descriptionText = extractFacebookDescriptionText(document);

  // NOTE(review): amount_with_offset receives the same decimal string as
  // amount — confirm downstream consumers do not expect a different scale.
  const listingPrice = price
    ? {
        amount: price.amount,
        currency: price.currency,
        amount_with_offset: price.amount,
      }
    : undefined;

  const item: FacebookMarketplaceItem = {
    id: itemId,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading,
    formatted_price: priceLabel ? { text: priceLabel } : undefined,
    listing_price: listingPrice,
    location_text: placeLabel ? { text: placeLabel } : undefined,
    redacted_description: descriptionText ? { text: descriptionText } : undefined,
    is_live: true,
  };
  return item;
}
/** /**
Extract marketplace search data from Facebook page script tags Extract marketplace search data from Facebook page script tags
*/ */
@@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData(
} }
if (!bestEdges?.length) { if (!bestEdges?.length) {
if (htmlString.includes("XCometMarketplaceSearchController")) {
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
if (htmlFallback?.length) {
console.log(
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
);
return htmlFallback;
}
}
console.warn("No marketplace data found in HTML response"); console.warn("No marketplace data found in HTML response");
return null; return null;
} }
@@ -627,7 +827,15 @@ export function extractFacebookItemData(
} }
} }
return bestMatch?.item ?? null; if (bestMatch) {
return bestMatch.item;
}
if (htmlString.includes("XCometMarketplacePermalinkController")) {
return extractFacebookItemHtmlFallback(htmlString);
}
return null;
} }
/** /**

View File

@@ -404,6 +404,60 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(result?.marketplace_listing_title).toBe("Vintage Chair"); expect(result?.marketplace_listing_title).toBe("Vintage Chair");
}); });
// Item-page fallback: the fixture carries the permalink controller marker and
// a script that is not valid JSON, plus rendered title/price/location/
// description elements. It has no canonical link or og:url meta, and its only
// marketplace link ("View listing") does not contain the title text, so the
// expected id "123" must come from that link.
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{invalid: json}</script>
<h1>Vintage Chair</h1>
<span>CA$80</span>
<div>Toronto, ON</div>
<div>Description</div>
<div>Solid wood chair</div>
<a href="/marketplace/item/123/">View listing</a>
</body></html>
`;
const result = extractFacebookItemData(html);
// Every field the fallback can recover from rendered markup is checked.
expect(result).not.toBeNull();
expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
expect(result?.formatted_price?.text).toBe("CA$80");
expect(result?.location_text?.text).toBe("Toronto, ON");
expect(result?.redacted_description?.text).toBe("Solid wood chair");
});
// Item-page fallback with a decoy: a related-item link (id 999) appears before
// the page's own content, while the canonical <link> points at id 123. The
// expected id is the canonical one, and the description must be the text that
// follows the "Description" label rather than the unrelated divs before it.
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
const html = `
<html>
<head>
<link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
</head>
<body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{invalid: json}</script>
<a href="/marketplace/item/999/">
<span>Related Chair</span>
</a>
<h1>Vintage Chair</h1>
<span>CA$80</span>
<div>Toronto, ON</div>
<div>Message seller</div>
<div>Seller details</div>
<div>Description</div>
<div>Solid wood chair</div>
<a href="/marketplace/item/123/">View listing</a>
</body>
</html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
// The id comes from the canonical link, not the earlier related-item link.
expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
expect(result?.redacted_description?.text).toBe("Solid wood chair");
});
test("prefers the canonical permalink target over earlier decoy items", () => { test("prefers the canonical permalink target over earlier decoy items", () => {
const html = ` const html = `
<html><body> <html><body>
@@ -584,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
); );
}); });
// Search-page fallback: the fixture carries the search controller marker and a
// script that is not valid JSON, plus one rendered item link whose spans hold
// the title, price and location. Exactly one listing is expected, with the
// price label parsed into a two-decimal amount.
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>{invalid: json}</script>
<a href="/marketplace/item/987654321/">
<span>Vintage Bike</span>
<span>CA$120</span>
<span>Toronto, ON</span>
</a>
</body></html>
`;
const result = extractFacebookMarketplaceData(html);
expect(result).not.toBeNull();
expect(result).toHaveLength(1);
// The id is taken from the link href; the title is the first non-price,
// non-location span text.
expect(result?.[0].node.listing.id).toBe("987654321");
expect(result?.[0].node.listing.marketplace_listing_title).toBe(
"Vintage Bike",
);
// "CA$120" is kept verbatim as the formatted amount and normalised to "120.00".
expect(result?.[0].node.listing.listing_price).toEqual({
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
});
});
test("should handle empty search results", () => { test("should handle empty search results", () => {
const mockData = { const mockData = {
require: [ require: [