From 7ddc96dfdf3f09398d6f8784404283e766d6b78e Mon Sep 17 00:00:00 2001 From: Dmytro Stanchiev Date: Wed, 22 Apr 2026 11:36:47 -0400 Subject: [PATCH] refactor: add facebook html fallbacks --- packages/core/src/scrapers/facebook.ts | 210 ++++++++++++++++++++++- packages/core/test/facebook-core.test.ts | 81 +++++++++ 2 files changed, 290 insertions(+), 1 deletion(-) diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index e119a13..6ea2132 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -166,6 +166,10 @@ interface FacebookMarketplaceItem { [k: string]: unknown; } +const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/; +const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i; +const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/; + export interface FacebookListingDetails { url: string; title: string; @@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates( return matches; } +function parseFacebookRenderedPrice(priceText: string) { + const trimmed = priceText.trim(); + if (!trimmed || trimmed.toUpperCase() === "FREE") { + return { + amount: "0.00", + formatted_amount: trimmed || "FREE", + currency: "CAD", + }; + } + + const amountMatch = trimmed.match(/[\d,]+(?:\.\d{2})?/); + if (!amountMatch) { + return null; + } + + const amount = Number.parseFloat(amountMatch[0].replaceAll(",", "")); + if (!Number.isFinite(amount)) { + return null; + } + + return { + amount: amount.toFixed(2), + formatted_amount: trimmed, + currency: "CAD", + }; +} + +function extractRenderedText(node: ParentNode, selector: string): string[] { + return Array.from(node.querySelectorAll(selector)) + .map((element) => element.textContent?.trim()) + .filter((text): text is string => Boolean(text)); +} + +function extractMarketplaceItemIdFromElement(element: Element | null): string | null { + const href = element?.getAttribute("href") || ""; + return href.match(FACEBOOK_ITEM_HREF_RE)?.[1] ?? null; +} + +function extractFacebookPermalinkItemId(document: Document): string | null { + const canonicalId = extractMarketplaceItemIdFromElement( + document.querySelector('link[rel="canonical"][href*="/marketplace/item/"]'), + ); + if (canonicalId) { + return canonicalId; + } + + const ogUrl = document + .querySelector('meta[property="og:url"]') + ?.getAttribute("content"); + const ogId = ogUrl?.match(FACEBOOK_ITEM_HREF_RE)?.[1]; + if (ogId) { + return ogId; + } + + const title = document.querySelector("h1")?.textContent?.trim(); + if (!title) { + return null; + } + + const itemLinks = Array.from( + document.querySelectorAll('a[href*="/marketplace/item/"]'), + ); + const selfLink = itemLinks.find((link) => link.textContent?.includes(title)); + + if (selfLink) { + return extractMarketplaceItemIdFromElement(selfLink); + } + + return extractMarketplaceItemIdFromElement(itemLinks.at(-1) ?? null); +} + +function extractFacebookDescriptionText(document: Document): string | undefined { + const labels = Array.from(document.querySelectorAll("div, span, h2, h3, p")); + + for (const label of labels) { + if (label.textContent?.trim() !== "Description") { + continue; + } + + let sibling = label.nextElementSibling; + while (sibling) { + const text = sibling.textContent?.trim(); + if (text && text !== "Description") { + return text; + } + sibling = sibling.nextElementSibling; + } + } + + return undefined; +} + +function extractFacebookMarketplaceHtmlFallback( + htmlString: HTMLString, +): FacebookAdNode[] | null { + const { document } = parseHTML(htmlString); + const links = Array.from( + document.querySelectorAll('a[href*="/marketplace/item/"]'), + ) as HTMLAnchorElement[]; + const seenIds = new Set(); + const results: FacebookAdNode[] = []; + + for (const link of links) { + const href = link.getAttribute("href") || ""; + const id = href.match(FACEBOOK_ITEM_HREF_RE)?.[1]; + if (!id || seenIds.has(id)) { + continue; + } + + const texts = extractRenderedText(link, "span, div"); + const priceText = texts.find((text) => FACEBOOK_PRICE_TEXT_RE.test(text)); + const location = texts.find((text) => FACEBOOK_LOCATION_TEXT_RE.test(text)); + const title = texts.find( + (text) => text !== priceText && text !== location && !text.includes("/"), + ); + if (!title || !priceText) { + continue; + } + + const parsedPrice = parseFacebookRenderedPrice(priceText); + if (!parsedPrice) { + continue; + } + + results.push({ + node: { + listing: { + id, + marketplace_listing_title: title, + listing_price: parsedPrice, + location: location + ? { + reverse_geocode: { + city_page: { + display_name: location, + }, + }, + } + : undefined, + is_live: true, + }, + }, + }); + seenIds.add(id); + } + + return results.length > 0 ? results : null; +} + +function extractFacebookItemHtmlFallback( + htmlString: HTMLString, +): FacebookMarketplaceItem | null { + const { document } = parseHTML(htmlString); + const title = document.querySelector("h1")?.textContent?.trim(); + const id = extractFacebookPermalinkItemId(document); + + if (!id || !title) { + return null; + } + + const texts = extractRenderedText(document, "h1, span, div, p"); + const priceText = texts.find((text) => FACEBOOK_PRICE_TEXT_RE.test(text)); + const parsedPrice = priceText ? parseFacebookRenderedPrice(priceText) : null; + const location = texts.find( + (text) => text !== title && text !== priceText && FACEBOOK_LOCATION_TEXT_RE.test(text), + ); + const description = extractFacebookDescriptionText(document); + + return { + id, + __typename: "GroupCommerceProductItem", + marketplace_listing_title: title, + formatted_price: priceText ? { text: priceText } : undefined, + listing_price: parsedPrice + ? { + amount: parsedPrice.amount, + currency: parsedPrice.currency, + amount_with_offset: parsedPrice.amount, + } + : undefined, + location_text: location ? { text: location } : undefined, + redacted_description: description ? { text: description } : undefined, + is_live: true, + }; +} + /** Extract marketplace search data from Facebook page script tags */ @@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData( } if (!bestEdges?.length) { + if (htmlString.includes("XCometMarketplaceSearchController")) { + const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString); + if (htmlFallback?.length) { + console.log( + `Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`, + ); + return htmlFallback; + } + } + console.warn("No marketplace data found in HTML response"); return null; } @@ -627,7 +827,15 @@ export function extractFacebookItemData( } } - return bestMatch?.item ?? null; + if (bestMatch) { + return bestMatch.item; + } + + if (htmlString.includes("XCometMarketplacePermalinkController")) { + return extractFacebookItemHtmlFallback(htmlString); + } + + return null; } /** diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index f627f5a..11d0f2e 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -404,6 +404,60 @@ describe("Facebook Marketplace Scraper Core Tests", () => { expect(result?.marketplace_listing_title).toBe("Vintage Chair"); }); + test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => { + const html = ` + + + +

Vintage Chair

+ CA$80 +
Toronto, ON
+
Description
+
Solid wood chair
+ View listing + + `; + + const result = extractFacebookItemData(html); + expect(result).not.toBeNull(); + expect(result?.id).toBe("123"); + expect(result?.marketplace_listing_title).toBe("Vintage Chair"); + expect(result?.formatted_price?.text).toBe("CA$80"); + expect(result?.location_text?.text).toBe("Toronto, ON"); + expect(result?.redacted_description?.text).toBe("Solid wood chair"); + }); + + test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => { + const html = ` + + + + + + + + + Related Chair + +

Vintage Chair

+ CA$80 +
Toronto, ON
+
Message seller
+
Seller details
+
Description
+
Solid wood chair
+ View listing + + + `; + + const result = extractFacebookItemData(html); + expect(result).not.toBeNull(); + expect(result?.id).toBe("123"); + expect(result?.marketplace_listing_title).toBe("Vintage Chair"); + expect(result?.redacted_description?.text).toBe("Solid wood chair"); + }); + test("prefers the canonical permalink target over earlier decoy items", () => { const html = ` @@ -584,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => { ); }); + test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => { + const html = ` + + + + + Vintage Bike + CA$120 + Toronto, ON + + + `; + + const result = extractFacebookMarketplaceData(html); + expect(result).not.toBeNull(); + expect(result).toHaveLength(1); + expect(result?.[0].node.listing.id).toBe("987654321"); + expect(result?.[0].node.listing.marketplace_listing_title).toBe( + "Vintage Bike", + ); + expect(result?.[0].node.listing.listing_price).toEqual({ + amount: "120.00", + formatted_amount: "CA$120", + currency: "CAD", + }); + }); + test("should handle empty search results", () => { const mockData = { require: [