feat(facebook): add session warming and challenge detection

The Facebook Marketplace scraper no longer requires authentication
cookies; user-supplied cookies are now optional. Session warming sends
proper browser headers. Checkpoint and login-wall challenges are
detected and handled gracefully. Adds a
marketplace_product_details_page.target extraction path for the
current item page structure.
This commit is contained in:
2026-05-02 18:58:53 -04:00
parent 7ab33d0b02
commit 0a246a29bf

View File

@@ -10,8 +10,14 @@ import {
type CookieConfig, type CookieConfig,
ensureCookies, ensureCookies,
formatCookiesForHeader, formatCookiesForHeader,
loadCookiesOptional,
parseCookieString, parseCookieString,
} from "../utils/cookies"; } from "../utils/cookies";
import {
buildFacebookHeaders,
detectFacebookChallenge,
warmFacebookSession,
} from "../utils/facebook-challenge";
import { formatCentsToCurrency } from "../utils/format"; import { formatCentsToCurrency } from "../utils/format";
import { fetchHtml, HttpError, isRecord, RateLimitError } from "../utils/http"; import { fetchHtml, HttpError, isRecord, RateLimitError } from "../utils/http";
import { logger } from "../utils/logger"; import { logger } from "../utils/logger";
@@ -20,9 +26,10 @@ import { classifyUnstableListings } from "../utils/unstable";
/** /**
* Facebook Marketplace Scraper * Facebook Marketplace Scraper
* *
* Note: Facebook Marketplace requires authentication cookies for full access. * Facebook Marketplace returns search results without authentication when
* This implementation will return limited or no results without proper authentication. * proper browser headers are sent. Prices and seller details are hidden on
* This is by design to respect Facebook's authentication requirements. * search results but are available on individual item pages even without
* auth cookies. For full-price search results, provide FACEBOOK_COOKIE.
*/ */
// Facebook cookie configuration // Facebook cookie configuration
@@ -263,20 +270,14 @@ function logExtractionMetrics(success: boolean, itemId?: string) {
// ----------------------------- HTTP Client ----------------------------- // ----------------------------- HTTP Client -----------------------------
function createFacebookHeaders(cookies: string): Record<string, string> { function createFacebookHeaders(cookies: string): Record<string, string> {
return { const jar: Record<string, string> = {};
accept: if (cookies) {
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", for (const pair of cookies.split(";")) {
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8", const [name, ...rest] = pair.trim().split("=");
"cache-control": "no-cache", if (name && rest.length > 0) jar[name.trim()] = rest.join("=").trim();
"upgrade-insecure-requests": "1", }
"sec-fetch-dest": "document", }
"sec-fetch-mode": "navigate", return buildFacebookHeaders(jar);
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"user-agent":
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
cookie: cookies,
};
} }
// ----------------------------- Parsing ----------------------------- // ----------------------------- Parsing -----------------------------
@@ -286,13 +287,29 @@ export type FacebookResponseKind =
| "item" | "item"
| "auth_gated" | "auth_gated"
| "unavailable" | "unavailable"
| "checkpoint"
| "unknown"; | "unknown";
export function classifyFacebookResponse( export function classifyFacebookResponse(
htmlString: HTMLString, htmlString: HTMLString,
responseUrl: string, responseUrl: string,
status = 200,
) { ) {
const challengeType = detectFacebookChallenge(
status,
htmlString,
responseUrl,
);
if (challengeType === "checkpoint") {
return {
kind: "checkpoint" as const,
authGated: false,
unavailable: false,
};
}
const authGated = const authGated =
challengeType === "login_wall" ||
responseUrl.includes("/login/") || responseUrl.includes("/login/") ||
htmlString.includes("You must log in") || htmlString.includes("You must log in") ||
htmlString.includes("log in to continue"); htmlString.includes("log in to continue");
@@ -764,6 +781,22 @@ export function extractFacebookItemData(
return bestMatch.item; return bestMatch.item;
} }
// Try marketplace_product_details_page.target path (current item page structure)
for (const candidate of candidates) {
const detailsPage = findKeyInObject(
candidate,
"marketplace_product_details_page",
) as Record<string, unknown> | undefined;
const target = detailsPage?.target as Record<string, unknown> | undefined;
if (
target &&
typeof target.id === "string" &&
typeof target.marketplace_listing_title === "string"
) {
return target as unknown as FacebookMarketplaceItem;
}
}
if (htmlString.includes("XCometMarketplacePermalinkController")) { if (htmlString.includes("XCometMarketplacePermalinkController")) {
return extractFacebookItemHtmlFallback(htmlString); return extractFacebookItemHtmlFallback(htmlString);
} }
@@ -771,6 +804,25 @@ export function extractFacebookItemData(
return null; return null;
} }
/**
 * Depth-first search for the first occurrence of `targetKey` anywhere inside
 * a nested structure of plain objects and arrays.
 *
 * At each object the key is checked on the object itself before descending
 * into its values, so a shallower match on a given branch wins over a deeper
 * one. Arrays are scanned in index order and objects in `Object.values`
 * order.
 *
 * Only own properties are considered, so prototype members such as
 * `toString` or `constructor` never count as a match. Objects that have
 * already been visited are skipped, so cyclic structures terminate instead
 * of overflowing the stack.
 *
 * @param obj - Value to search; non-object values (including `null` and
 *   `undefined`) yield `undefined`.
 * @param targetKey - Property name to look for.
 * @returns The value stored under the first matching key, or `undefined`
 *   when no object in the structure has that key.
 */
function findKeyInObject(obj: unknown, targetKey: string): unknown {
  // Tracks visited objects so self-referencing structures cannot recurse forever.
  const seen = new WeakSet<object>();

  const search = (node: unknown): unknown => {
    if (typeof node !== "object" || node === null) return undefined;
    if (seen.has(node)) return undefined;
    seen.add(node);

    if (Array.isArray(node)) {
      for (const item of node) {
        const found = search(item);
        if (found !== undefined) return found;
      }
      return undefined;
    }

    const record = node as Record<string, unknown>;
    // Own-property check: the `in` operator would also match inherited
    // prototype members and return e.g. Object.prototype.toString.
    if (Object.prototype.hasOwnProperty.call(record, targetKey)) {
      return record[targetKey];
    }
    for (const value of Object.values(record)) {
      const found = search(value);
      if (found !== undefined) return found;
    }
    return undefined;
  };

  return search(obj);
}
/** /**
Parse Facebook marketplace search results into ListingDetails[] Parse Facebook marketplace search results into ListingDetails[]
*/ */
@@ -1027,16 +1079,18 @@ export default async function fetchFacebookItems(
}; };
}; };
const cookies = await ensureFacebookCookies(); const warmupCookies = await warmFacebookSession();
const warmupHeader = Object.entries(warmupCookies)
.map(([k, v]) => `${k}=${v}`)
.join("; ");
const userCookies = await loadCookiesOptional(FACEBOOK_COOKIE_CONFIG);
// Format cookies for HTTP header
const domain = "www.facebook.com"; const domain = "www.facebook.com";
const cookiesHeader = formatCookiesForHeader(cookies, domain); const userCookiesHeader = formatCookiesForHeader(userCookies, domain);
if (!cookiesHeader) { const cookiesHeader = [warmupHeader, userCookiesHeader]
throw new Error( .filter(Boolean)
"No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.", .join("; ");
);
}
const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond)); const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
@@ -1047,7 +1101,9 @@ export default async function fetchFacebookItems(
const searchUrl = `https://www.facebook.com/marketplace/${LOCATION}/search?query=${encodedQuery}&sortBy=creation_time_descend&exact=false`; const searchUrl = `https://www.facebook.com/marketplace/${LOCATION}/search?query=${encodedQuery}&sortBy=creation_time_descend&exact=false`;
logger.log(`Fetching Facebook marketplace: ${searchUrl}`); logger.log(`Fetching Facebook marketplace: ${searchUrl}`);
logger.log(`Using ${cookies.length} cookies for authentication`); if (userCookies.length > 0) {
logger.log(`Using ${userCookies.length} cookies for authentication`);
}
let searchHtml: string; let searchHtml: string;
let searchResponseUrl = searchUrl; let searchResponseUrl = searchUrl;
@@ -1100,6 +1156,13 @@ export default async function fetchFacebookItems(
return finalizeResults([]); return finalizeResults([]);
} }
if (classification.kind === "checkpoint") {
logger.warn(
"Facebook marketplace returned a checkpoint challenge. This may require manual verification.",
);
return finalizeResults([]);
}
if (classification.unavailable) { if (classification.unavailable) {
logger.warn("Facebook marketplace search returned an unavailable route."); logger.warn("Facebook marketplace search returned an unavailable route.");
return finalizeResults([]); return finalizeResults([]);
@@ -1149,15 +1212,8 @@ export default async function fetchFacebookItems(
export async function fetchFacebookItem( export async function fetchFacebookItem(
itemId: string, itemId: string,
): Promise<FacebookListingDetails | null> { ): Promise<FacebookListingDetails | null> {
const cookies = await ensureFacebookCookies(); const userCookies = await loadCookiesOptional(FACEBOOK_COOKIE_CONFIG);
const cookiesHeader = formatCookiesForHeader(userCookies, "www.facebook.com");
// Format cookies for HTTP header
const cookiesHeader = formatCookiesForHeader(cookies, "www.facebook.com");
if (!cookiesHeader) {
throw new Error(
"No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
);
}
const itemUrl = `https://www.facebook.com/marketplace/item/${itemId}/`; const itemUrl = `https://www.facebook.com/marketplace/item/${itemId}/`;
@@ -1230,6 +1286,14 @@ export async function fetchFacebookItem(
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl); const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
if (classification.kind === "checkpoint") {
logExtractionMetrics(false, itemId);
logger.warn(
`Checkpoint challenge detected for item ${itemId}. Facebook may be limiting access.`,
);
return null;
}
if (classification.authGated) { if (classification.authGated) {
logExtractionMetrics(false, itemId); logExtractionMetrics(false, itemId);
logger.warn( logger.warn(