refactor: add facebook html fallbacks
This commit is contained in:
@@ -166,6 +166,10 @@ interface FacebookMarketplaceItem {
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
// Matches a "/marketplace/item/<digits>" path segment anywhere in a string;
// capture group 1 holds the run of digits (used in this file as the listing id).
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;
|
||||
// Matches a whole string (case-insensitively) that is either the word "FREE"
// or a "$" / "CA$" prefix, optional whitespace, a digit followed by digits and
// commas, and an optional two-digit decimal part (e.g. "$20", "CA$1,234.50").
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;
|
||||
// Matches text that ends with a comma, optional whitespace and exactly two
// uppercase ASCII letters (e.g. "Toronto, ON").
// NOTE(review): presumably the trailing letters are a province/state code — confirm.
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
|
||||
|
||||
export interface FacebookListingDetails {
|
||||
url: string;
|
||||
title: string;
|
||||
@@ -570,6 +574,192 @@ function collectMarketplaceItemCandidates(
|
||||
return matches;
|
||||
}
|
||||
|
||||
/**
 * Turns a rendered price label into a structured price record.
 *
 * @param priceText - Label text as it appears in the page; surrounding
 *   whitespace is ignored.
 * @returns A record with a two-decimal `amount`, the trimmed label as
 *   `formatted_amount` and the currency `"CAD"`; or `null` when no numeric
 *   portion can be read from a non-free label.
 */
function parseFacebookRenderedPrice(
  priceText: string,
): { amount: string; formatted_amount: string; currency: string } | null {
  const label = priceText.trim();

  // A blank label and the word "FREE" (any casing) are both reported as a
  // zero price; a blank label is displayed as "FREE".
  const isFree = label === "" || label.toUpperCase() === "FREE";
  if (isFree) {
    return {
      amount: "0.00",
      formatted_amount: label === "" ? "FREE" : label,
      currency: "CAD",
    };
  }

  // First run of digits/commas, optionally followed by a two-digit decimal part.
  const numericPart = /[\d,]+(?:\.\d{2})?/.exec(label)?.[0];
  if (numericPart === undefined) {
    return null;
  }

  // Drop thousands separators before parsing; a run made only of commas
  // parses to NaN and is rejected below.
  const value = Number.parseFloat(numericPart.split(",").join(""));

  return Number.isFinite(value)
    ? { amount: value.toFixed(2), formatted_amount: label, currency: "CAD" }
    : null;
}
|
||||
|
||||
/**
 * Collects the trimmed text of every element under `node` that matches
 * `selector`, in the order `querySelectorAll` returns them.
 *
 * @param node - Root to search within.
 * @param selector - CSS selector passed to `querySelectorAll`.
 * @returns Non-empty trimmed text values; elements whose text is missing or
 *   whitespace-only are left out.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];

  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Reads the listing id out of an element's `href` attribute.
 *
 * @param element - Element to inspect; `null` is accepted.
 * @returns The digits captured by `FACEBOOK_ITEM_HREF_RE`, or `null` when the
 *   element is missing, has no/empty `href`, or the `href` does not match.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href");
  if (!href) {
    return null;
  }

  const match = FACEBOOK_ITEM_HREF_RE.exec(href);
  return match?.[1] ?? null;
}
|
||||
|
||||
/**
 * Works out the listing id of an item page by trying several sources in order:
 *
 * 1. a `<link rel="canonical">` whose href contains "/marketplace/item/";
 * 2. the `content` of `<meta property="og:url">`;
 * 3. an item anchor whose text contains the page's `<h1>` text;
 * 4. the last item anchor in the document.
 *
 * Steps 3 and 4 are only attempted when the page has a non-empty `<h1>`.
 *
 * @param document - Parsed document to inspect.
 * @returns The listing id, or `null` when none of the sources yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogContent = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOpenGraph = ogContent?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOpenGraph) {
    return fromOpenGraph;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );

  // Prefer the anchor that mentions the page title; if there is none, fall
  // back to the final item anchor. A title-matching anchor is used even when
  // its href yields no id (no further fallback in that case).
  const chosen =
    anchors.find((anchor) => anchor.textContent?.includes(heading)) ??
    anchors[anchors.length - 1] ??
    null;

  return extractMarketplaceItemIdFromElement(chosen);
}
|
||||
|
||||
/**
 * Finds the text that follows a "Description" label in the document.
 *
 * Every `div`, `span`, `h2`, `h3` and `p` whose trimmed text is exactly
 * "Description" is treated as a label. For each label, its following sibling
 * elements are walked in order and the first one with non-empty text other
 * than "Description" wins.
 *
 * @param document - Parsed document to inspect.
 * @returns The trimmed description text, or `undefined` when no label has a
 *   qualifying sibling.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const HEADING = "Description";
  const candidates = Array.from(document.querySelectorAll("div, span, h2, h3, p"));

  for (const candidate of candidates) {
    if (candidate.textContent?.trim() !== HEADING) {
      continue;
    }

    for (
      let next = candidate.nextElementSibling;
      next;
      next = next.nextElementSibling
    ) {
      const body = next.textContent?.trim();
      if (body && body !== HEADING) {
        return body;
      }
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Rebuilds marketplace search results from rendered anchor markup.
 *
 * Each anchor whose href contains "/marketplace/item/" is inspected: the id
 * comes from the href, and the price, location and title are picked from the
 * text of the anchor's `span`/`div` descendants. An anchor is skipped when it
 * has no id, no price text, an unparsable price or no title. An id is only
 * considered taken once a listing has been produced for it, so a later anchor
 * with the same id is still tried if an earlier one was skipped.
 *
 * NOTE(review): `span, div` also matches container elements, whose text is the
 * concatenation of their children's text; confirm against real markup that
 * such combined text cannot be picked as the title or location.
 *
 * @param htmlString - Page HTML to parse.
 * @returns One node per distinct listing id, in the order ids were first
 *   accepted, or `null` when nothing could be extracted.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];

  // Keyed by listing id. A Map iterates in insertion order, so the returned
  // array follows the order in which listings were accepted.
  const listingsById = new Map<string, FacebookAdNode>();

  for (const anchor of anchors) {
    const id = FACEBOOK_ITEM_HREF_RE.exec(anchor.getAttribute("href") || "")?.[1];
    if (!id || listingsById.has(id)) {
      continue;
    }

    const fragments = extractRenderedText(anchor, "span, div");

    const priceText = fragments.find((fragment) =>
      FACEBOOK_PRICE_TEXT_RE.test(fragment),
    );
    if (!priceText) {
      continue;
    }

    const listingPrice = parseFacebookRenderedPrice(priceText);
    if (!listingPrice) {
      continue;
    }

    const location = fragments.find((fragment) =>
      FACEBOOK_LOCATION_TEXT_RE.test(fragment),
    );

    // The title is the first fragment that is neither the price nor the
    // location and contains no "/".
    const title = fragments.find(
      (fragment) =>
        fragment !== priceText && fragment !== location && !fragment.includes("/"),
    );
    if (!title) {
      continue;
    }

    const locationField = location
      ? {
          reverse_geocode: {
            city_page: {
              display_name: location,
            },
          },
        }
      : undefined;

    listingsById.set(id, {
      node: {
        listing: {
          id,
          marketplace_listing_title: title,
          listing_price: listingPrice,
          location: locationField,
          is_live: true,
        },
      },
    });
  }

  return listingsById.size > 0 ? Array.from(listingsById.values()) : null;
}
|
||||
|
||||
/**
 * Rebuilds a single marketplace item from the rendered HTML of an item page.
 *
 * The title is the trimmed text of the first `<h1>`; the id comes from
 * `extractFacebookPermalinkItemId`. Price and location are picked from the
 * text of `h1`, `span`, `div` and `p` elements, and the description from
 * `extractFacebookDescriptionText`. Fields that cannot be found are set to
 * `undefined`.
 *
 * NOTE(review): `amount_with_offset` is filled with the same value as
 * `amount`; confirm consumers do not expect a differently scaled value.
 * NOTE(review): container elements match `span`/`div` too and carry the
 * concatenated text of their children; confirm this cannot be picked as the
 * location on real pages.
 *
 * @param htmlString - Page HTML to parse.
 * @returns The reconstructed item, or `null` when the page has no title or no
 *   id can be determined.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);

  const title = document.querySelector("h1")?.textContent?.trim();
  if (!title) {
    return null;
  }

  const id = extractFacebookPermalinkItemId(document);
  if (!id) {
    return null;
  }

  const fragments = extractRenderedText(document, "h1, span, div, p");

  const priceText = fragments.find((fragment) =>
    FACEBOOK_PRICE_TEXT_RE.test(fragment),
  );
  const price = priceText ? parseFacebookRenderedPrice(priceText) : null;

  // First location-looking fragment that is not the title or the price label.
  const location = fragments.find(
    (fragment) =>
      FACEBOOK_LOCATION_TEXT_RE.test(fragment) &&
      fragment !== title &&
      fragment !== priceText,
  );

  const description = extractFacebookDescriptionText(document);

  const item: FacebookMarketplaceItem = {
    id,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: title,
    formatted_price: priceText ? { text: priceText } : undefined,
    listing_price: price
      ? {
          amount: price.amount,
          currency: price.currency,
          amount_with_offset: price.amount,
        }
      : undefined,
    location_text: location ? { text: location } : undefined,
    redacted_description: description ? { text: description } : undefined,
    is_live: true,
  };

  return item;
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
@@ -593,6 +783,16 @@ export function extractFacebookMarketplaceData(
|
||||
}
|
||||
|
||||
if (!bestEdges?.length) {
|
||||
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
|
||||
if (htmlFallback?.length) {
|
||||
console.log(
|
||||
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
|
||||
);
|
||||
return htmlFallback;
|
||||
}
|
||||
}
|
||||
|
||||
console.warn("No marketplace data found in HTML response");
|
||||
return null;
|
||||
}
|
||||
@@ -627,7 +827,15 @@ export function extractFacebookItemData(
|
||||
}
|
||||
}
|
||||
|
||||
return bestMatch?.item ?? null;
|
||||
if (bestMatch) {
|
||||
return bestMatch.item;
|
||||
}
|
||||
|
||||
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||
return extractFacebookItemHtmlFallback(htmlString);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user