refactor: add facebook html fallbacks

This commit is contained in:
2026-04-22 11:36:47 -04:00
parent 63ca006696
commit 7ddc96dfdf
2 changed files with 290 additions and 1 deletions

View File

@@ -166,6 +166,10 @@ interface FacebookMarketplaceItem {
[k: string]: unknown;
}
// Matches a "/marketplace/item/<digits>" path segment anywhere in a string;
// capture group 1 holds the numeric item id.
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;
// Matches a whole string that is either a "CA$" / "$" prefixed amount
// (optional whitespace, digits with optional commas, optional two-digit
// fraction) or the word "FREE". The `i` flag makes the match case-insensitive.
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;
// Matches text ending in a comma, optional whitespace and exactly two
// uppercase ASCII letters, e.g. "Toronto, ON".
// NOTE(review): presumably the suffix is a province/state code — confirm.
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
export interface FacebookListingDetails {
url: string;
title: string;
@@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates(
return matches;
}
/**
 * Turns a price label taken from rendered HTML into a price object.
 *
 * - A blank label or the word "FREE" (any casing) yields amount "0.00"; the
 *   formatted amount is the trimmed label, or "FREE" when the label was blank.
 * - Otherwise the first run of digits/commas (with an optional two-digit
 *   fraction) is parsed, commas are dropped, and the value is formatted with
 *   two decimals.
 *
 * The currency is always reported as "CAD".
 *
 * @param priceText Price label as it appears in the page.
 * @returns The price object, or `null` when no finite number can be read.
 */
function parseFacebookRenderedPrice(priceText: string) {
  const label = priceText.trim();

  const isFree = label === "" || label.toUpperCase() === "FREE";
  if (isFree) {
    return { amount: "0.00", formatted_amount: label || "FREE", currency: "CAD" };
  }

  const numericPart = label.match(/[\d,]+(?:\.\d{2})?/)?.[0];
  if (numericPart === undefined) {
    return null;
  }

  // Strip thousands separators before parsing; a match made only of commas
  // becomes "" here and is rejected by the finiteness check below.
  const value = Number.parseFloat(numericPart.split(",").join(""));
  return Number.isFinite(value)
    ? { amount: value.toFixed(2), formatted_amount: label, currency: "CAD" }
    : null;
}
/**
 * Collects the trimmed text content of every element under `node` that
 * matches `selector`, skipping elements whose text is missing or blank.
 *
 * @param node Root to search within.
 * @param selector CSS selector passed to `querySelectorAll`.
 * @returns Non-empty trimmed texts, in the order the elements were returned.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];
  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }
  return collected;
}
/**
 * Reads the `href` attribute of `element` and returns the marketplace item
 * id captured by FACEBOOK_ITEM_HREF_RE.
 *
 * @param element Element to inspect; `null` is accepted.
 * @returns The captured id, or `null` when the element is `null`, has no
 *   (or an empty) `href`, or the `href` does not match.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href");
  if (!href) {
    return null;
  }
  const itemId = href.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  return itemId ?? null;
}
/**
 * Determines the marketplace item id for an item page by trying several
 * sources in order and returning the first id found:
 *
 * 1. the `href` of a canonical `<link>` pointing at "/marketplace/item/";
 * 2. the `content` of the `og:url` `<meta>` tag;
 * 3. the first item anchor whose text contains the page's `<h1>` text;
 * 4. the last item anchor in the document.
 *
 * Steps 3 and 4 are only attempted when the document has a non-empty `<h1>`.
 *
 * @param document Parsed item page.
 * @returns The item id, or `null` when none of the sources yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const openGraphMeta = document.querySelector('meta[property="og:url"]');
  const fromOpenGraph = openGraphMeta
    ?.getAttribute("content")
    ?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOpenGraph) {
    return fromOpenGraph;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // Prefer an anchor that repeats the page heading; otherwise fall back to
  // the last item anchor in the document.
  const headingAnchor = anchors.find((anchor) => anchor.textContent?.includes(heading));
  const chosen = headingAnchor ?? anchors.at(-1) ?? null;
  return extractMarketplaceItemIdFromElement(chosen);
}
/**
 * Finds the text that follows a "Description" label in the document.
 *
 * Every `div`, `span`, `h2`, `h3` and `p` whose trimmed text is exactly
 * "Description" is treated as a label. For each label, in document order,
 * its following element siblings are scanned and the first one with
 * non-empty text other than "Description" is returned (trimmed).
 *
 * @param document Parsed page to search.
 * @returns The description text, or `undefined` when no label has a
 *   qualifying sibling.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const candidates = Array.from(document.querySelectorAll("div, span, h2, h3, p"));
  const headingLabels = candidates.filter(
    (candidate) => candidate.textContent?.trim() === "Description",
  );

  for (const heading of headingLabels) {
    for (
      let next = heading.nextElementSibling;
      next;
      next = next.nextElementSibling
    ) {
      const content = next.textContent?.trim();
      // Skip blank siblings and siblings that merely repeat the label.
      if (content && content !== "Description") {
        return content;
      }
    }
  }
  return undefined;
}
/**
 * Builds search-result nodes from the rendered HTML of a marketplace page.
 *
 * Each anchor whose `href` contains "/marketplace/item/" is a candidate
 * listing. From the text of the `span`/`div` elements inside the anchor:
 * - the price is the first text matching FACEBOOK_PRICE_TEXT_RE;
 * - the location is the first text matching FACEBOOK_LOCATION_TEXT_RE;
 * - the title is the first text that is neither of those and contains no "/".
 *
 * A candidate is skipped when it has no item id, no title, no price, or a
 * price that cannot be parsed. An id is recorded as seen only once a listing
 * has been emitted for it, so a later anchor for the same id can still
 * produce the listing if an earlier one was skipped.
 *
 * @param htmlString Raw HTML of the page.
 * @returns The listings found, or `null` when there are none.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];

  const emittedIds = new Set<string>();
  const listings: FacebookAdNode[] = [];

  for (const anchor of anchors) {
    const target = anchor.getAttribute("href") || "";
    const itemId = target.match(FACEBOOK_ITEM_HREF_RE)?.[1];
    if (!itemId || emittedIds.has(itemId)) {
      continue;
    }

    const cardTexts = extractRenderedText(anchor, "span, div");
    const priceLabel = cardTexts.find((entry) => FACEBOOK_PRICE_TEXT_RE.test(entry));
    const locationLabel = cardTexts.find((entry) => FACEBOOK_LOCATION_TEXT_RE.test(entry));
    const titleText = cardTexts.find(
      (entry) => entry !== priceLabel && entry !== locationLabel && !entry.includes("/"),
    );
    if (!(titleText && priceLabel)) {
      continue;
    }

    const price = parseFacebookRenderedPrice(priceLabel);
    if (!price) {
      continue;
    }

    // The `location` key is always present; it is `undefined` when no
    // location-like text was found in the card.
    const locationNode = locationLabel
      ? { reverse_geocode: { city_page: { display_name: locationLabel } } }
      : undefined;

    listings.push({
      node: {
        listing: {
          id: itemId,
          marketplace_listing_title: titleText,
          listing_price: price,
          location: locationNode,
          is_live: true,
        },
      },
    });
    emittedIds.add(itemId);
  }

  return listings.length > 0 ? listings : null;
}
/**
 * Builds a marketplace item from the rendered HTML of an item page.
 *
 * The title is the trimmed text of the first `<h1>` and the id comes from
 * extractFacebookPermalinkItemId; without both, nothing is returned.
 * Price and location are the first texts among `h1, span, div, p` elements
 * that match the price and location patterns (the location must differ from
 * the title and the price text). Optional fields are set to `undefined` when
 * their source text is missing.
 *
 * @param htmlString Raw HTML of the item page.
 * @returns The item, or `null` when the id or title cannot be determined.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);

  const heading = document.querySelector("h1")?.textContent?.trim();
  const itemId = extractFacebookPermalinkItemId(document);
  if (!(itemId && heading)) {
    return null;
  }

  const pageTexts = extractRenderedText(document, "h1, span, div, p");
  const priceLabel = pageTexts.find((entry) => FACEBOOK_PRICE_TEXT_RE.test(entry));
  const locationLabel = pageTexts.find(
    (entry) =>
      entry !== heading && entry !== priceLabel && FACEBOOK_LOCATION_TEXT_RE.test(entry),
  );
  const price = priceLabel ? parseFacebookRenderedPrice(priceLabel) : null;
  const descriptionText = extractFacebookDescriptionText(document);

  // NOTE(review): amount_with_offset is filled with the same decimal string
  // as amount — confirm consumers do not expect a different unit here.
  const listingPrice = price
    ? {
        amount: price.amount,
        currency: price.currency,
        amount_with_offset: price.amount,
      }
    : undefined;

  return {
    id: itemId,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading,
    formatted_price: priceLabel ? { text: priceLabel } : undefined,
    listing_price: listingPrice,
    location_text: locationLabel ? { text: locationLabel } : undefined,
    redacted_description: descriptionText ? { text: descriptionText } : undefined,
    is_live: true,
  };
}
/**
Extract marketplace search data from Facebook page script tags
*/
@@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData(
}
if (!bestEdges?.length) {
if (htmlString.includes("XCometMarketplaceSearchController")) {
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
if (htmlFallback?.length) {
console.log(
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
);
return htmlFallback;
}
}
console.warn("No marketplace data found in HTML response");
return null;
}
@@ -627,7 +827,15 @@ export function extractFacebookItemData(
}
}
return bestMatch?.item ?? null;
if (bestMatch) {
return bestMatch.item;
}
if (htmlString.includes("XCometMarketplacePermalinkController")) {
return extractFacebookItemHtmlFallback(htmlString);
}
return null;
}
/**