feat: facebook scraping

Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
2025-10-02 11:50:35 -04:00
parent ee09162faa
commit dcd0da29a2
4 changed files with 651 additions and 0 deletions

562
src/facebook.ts Normal file
View File

@@ -0,0 +1,562 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import cliProgress from "cli-progress";
/**
* Facebook Marketplace Scraper
*
* Note: Facebook Marketplace requires authentication cookies for full access.
* This implementation will return limited or no results without proper authentication.
* This is by design to respect Facebook's authentication requirements.
*/
// ----------------------------- Types -----------------------------
type HTMLString = string;
interface Cookie {
name: string;
value: string;
domain: string;
path: string;
secure?: boolean;
httpOnly?: boolean;
sameSite?: "strict" | "lax" | "none" | "unspecified";
session?: boolean;
expirationDate?: number;
partitionKey?: any;
storeId?: string;
}
interface FacebookAdNode {
node: {
listing: {
id: string;
marketplace_listing_title?: string;
listing_price?: {
amount?: string | number;
currency?: string;
};
location?: {
reverse_geocode?: {
city_page?: {
display_name?: string;
};
};
};
creation_time?: number;
[k: string]: unknown;
};
[k: string]: unknown;
};
}
interface FacebookEdge {
node: FacebookAdNode["node"];
[k: string]: unknown;
}
interface FacebookMarketplaceSearch {
feed_units?: {
edges?: FacebookEdge[];
};
[k: string]: unknown;
}
interface FacebookRequireData {
require?: [number, number, number, FacebookMarketplaceSearch, number][];
[k: string]: unknown;
}
type ListingDetails = {
url: string;
title: string;
description?: string;
listingPrice?: {
amountFormatted: string;
cents?: number;
currency?: string;
};
listingType?: string;
listingStatus?: string;
creationDate?: string;
endDate?: string;
numberOfViews?: number;
address?: string | null;
};
// ----------------------------- Utilities -----------------------------
function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === "object" && value !== null;
}
async function delay(ms: number): Promise<void> {
await new Promise((resolve) => setTimeout(resolve, ms));
}
/**
* Load Facebook cookies from file or string
*/
async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
// First try to load from provided string parameter
if (cookiesSource) {
try {
const cookies = JSON.parse(cookiesSource);
if (Array.isArray(cookies)) {
return cookies.filter(
(cookie): cookie is Cookie =>
cookie &&
typeof cookie.name === "string" &&
typeof cookie.value === "string",
);
}
} catch (e) {
throw new Error(`Invalid cookies JSON provided: ${e}`);
}
}
// Try to load from ./cookies/facebook.json
try {
const cookiesPath = "./cookies/facebook.json";
const file = Bun.file(cookiesPath);
if (await file.exists()) {
const content = await file.text();
const cookies = JSON.parse(content);
if (Array.isArray(cookies)) {
return cookies.filter(
(cookie): cookie is Cookie =>
cookie &&
typeof cookie.name === "string" &&
typeof cookie.value === "string",
);
}
}
} catch (e) {
console.warn(`Could not load cookies from ./cookies/facebook.json: ${e}`);
}
return [];
}
/**
* Format cookies array into Cookie header string
*/
function formatCookiesForHeader(cookies: Cookie[], domain: string): string {
const validCookies = cookies
.filter((cookie) => {
// Check if cookie applies to this domain
if (cookie.domain.startsWith(".")) {
// Domain cookie (applies to subdomains)
return (
domain.endsWith(cookie.domain.slice(1)) ||
domain === cookie.domain.slice(1)
);
} else {
// Host-only cookie
return cookie.domain === domain;
}
})
.filter((cookie) => {
// Check expiration
if (cookie.expirationDate && cookie.expirationDate < Date.now() / 1000) {
return false; // Expired
}
return true;
});
return validCookies
.map((cookie) => `${cookie.name}=${cookie.value}`)
.join("; ");
}
class HttpError extends Error {
constructor(
message: string,
public readonly status: number,
public readonly url: string,
) {
super(message);
this.name = "HttpError";
}
}
// ----------------------------- HTTP Client -----------------------------
/**
Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
- Retries on 429 and 5xx
- Respects X-RateLimit-Reset when present (seconds)
- Supports custom cookies for Facebook authentication
*/
async function fetchHtml(
url: string,
DELAY_MS: number,
opts?: {
maxRetries?: number;
retryBaseMs?: number;
onRateInfo?: (remaining: string | null, reset: string | null) => void;
cookies?: string;
},
): Promise<HTMLString> {
const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 500;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
const headers: Record<string, string> = {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"accept-encoding": "gzip, deflate, br",
"cache-control": "no-cache",
"upgrade-insecure-requests": "1",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"user-agent":
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
};
// Add cookies if provided
if (opts?.cookies) {
headers["cookie"] = opts.cookies;
}
const res = await fetch(url, {
method: "GET",
headers,
});
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
if (!res.ok) {
// Respect 429 reset if provided
if (res.status === 429) {
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
const waitMs = Number.isFinite(resetSeconds)
? Math.max(0, resetSeconds * 1000)
: (attempt + 1) * retryBaseMs;
await delay(waitMs);
continue;
}
// For Facebook, 400 often means authentication required
// Don't retry 4xx client errors except 429
if (res.status >= 400 && res.status < 500 && res.status !== 429) {
throw new HttpError(
`Request failed with status ${res.status} (Facebook may require authentication cookies for access)`,
res.status,
url,
);
}
// Retry on 5xx
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
await delay((attempt + 1) * retryBaseMs);
continue;
}
throw new HttpError(
`Request failed with status ${res.status}`,
res.status,
url,
);
}
const html = await res.text();
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
await delay(DELAY_MS);
return html;
} catch (err) {
if (attempt >= maxRetries) throw err;
await delay((attempt + 1) * retryBaseMs);
}
}
throw new Error("Exhausted retries without response");
}
// ----------------------------- Parsing -----------------------------
/**
Extract marketplace search data from Facebook page script tags
*/
function extractFacebookMarketplaceData(
htmlString: HTMLString,
): FacebookAdNode[] | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
let marketplaceData: FacebookMarketplaceSearch | null = null;
// Find the script containing the require data with marketplace_search
for (const script of scripts as unknown as HTMLScriptElement[]) {
const scriptText = script.textContent;
if (!scriptText) continue;
try {
const parsed = JSON.parse(scriptText);
// First check if this is the direct data structure (like in examples)
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple navigation paths to find marketplace_search
const paths = [
// Original path from example
() => parsed.require[0][3][0]['__bbox']['require'][0][3][1]['__bbox']['result']['data']['marketplace_search'],
// Alternative path structure
() => parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
// Another variation
() => parsed.require[0][3][0]['__bbox']['result']['data']['marketplace_search'],
// Direct access for some responses
() => {
for (const item of parsed.require) {
if (item && item.length >= 4 && item[3]) {
const bbox = item[3]?.['__bbox']?.result?.data?.marketplace_search;
if (bbox) return bbox;
}
}
return null;
}
];
for (const getData of paths) {
try {
const result = getData();
if (result && isRecord(result) && result.feed_units?.edges) {
marketplaceData = result as FacebookMarketplaceSearch;
break;
}
} catch {
continue;
}
}
if (marketplaceData) break;
}
// Also check for direct marketplace_search in the parsed data
if (parsed.marketplace_search && isRecord(parsed.marketplace_search) && parsed.marketplace_search.feed_units?.edges) {
marketplaceData = parsed.marketplace_search as FacebookMarketplaceSearch;
break;
}
} catch {
// Ignore parsing errors for other scripts
continue;
}
}
if (!marketplaceData?.feed_units?.edges) {
console.warn("No marketplace data found in HTML response");
return null;
}
console.log(`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`);
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
}
/**
* Turns cents to localized currency string.
*/
function formatCentsToCurrency(
num: number | string | undefined,
locale = "en-US",
): string {
if (num == null) return "";
const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
if (Number.isNaN(cents)) return "";
const dollars = cents / 100;
const formatter = new Intl.NumberFormat(locale, {
minimumFractionDigits: 2,
maximumFractionDigits: 2,
useGrouping: true,
});
return formatter.format(dollars);
}
/**
Parse Facebook marketplace search results into ListingDetails[]
*/
function parseFacebookAds(ads: FacebookAdNode[]): ListingDetails[] {
const results: ListingDetails[] = [];
for (const adJson of ads) {
try {
const listing = adJson.node.listing;
const title = listing.marketplace_listing_title;
const priceObj = listing.listing_price;
if (!title || !priceObj) continue;
const id = listing.id;
const url = `https://www.facebook.com/marketplace/item/${id}`;
// Facebook stores price in different fields:
// - amount_with_offset_in_currency: Facebook's internal price encoding (not cents)
// - amount: dollars (like "1.00")
// - formatted_amount: human-readable price (like "CA$1")
let cents: number;
if (priceObj.amount != null) {
const dollars = typeof priceObj.amount === 'string'
? Number.parseFloat(priceObj.amount)
: priceObj.amount;
cents = Math.round(dollars * 100);
} else if (priceObj.amount_with_offset_in_currency != null) {
// Fallback: try to extract cents from amount_with_offset_in_currency
// This appears to use some exchange rate/multiplier format
const encodedAmount = Number(priceObj.amount_with_offset_in_currency);
if (!Number.isNaN(encodedAmount) && encodedAmount > 0) {
// Estimate roughly - this field doesn't contain real cents
// Use formatted_amount to get the actual dollar amount
if (priceObj.formatted_amount) {
const match = priceObj.formatted_amount.match(/[\d,]+\.?\d*/);
if (match) {
const dollars = Number.parseFloat(match[0].replace(',', ''));
if (!Number.isNaN(dollars)) {
cents = Math.round(dollars * 100);
} else {
cents = encodedAmount; // fallback
}
} else {
cents = encodedAmount; // fallback
}
} else {
cents = encodedAmount; // fallback
}
} else {
continue; // Invalid price
}
} else {
continue; // No price available
}
if (!Number.isFinite(cents) || cents <= 0) continue;
// Extract address from location data if available
const cityName =
listing.location?.reverse_geocode?.city_page?.display_name;
const address = cityName || null;
// Format creation date if available
const creationDate = listing.creation_time
? new Date(listing.creation_time * 1000).toISOString()
: undefined;
const listingDetails: ListingDetails = {
url,
title,
listingPrice: {
amountFormatted: priceObj.formatted_amount || formatCentsToCurrency(cents),
cents,
currency: priceObj.currency || "CAD", // Facebook marketplace often uses CAD
},
address,
creationDate,
listingType: "item", // Default type for marketplace listings
};
results.push(listingDetails);
} catch {
// Skip malformed ads
continue;
}
}
return results;
}
// ----------------------------- Main -----------------------------
export default async function fetchFacebookItems(
SEARCH_QUERY: string,
REQUESTS_PER_SECOND = 1,
LOCATION = "toronto",
MAX_ITEMS = 25,
cookiesSource?: string,
) {
// Load Facebook cookies - required for Facebook Marketplace access
const cookies = await loadFacebookCookies(cookiesSource);
if (cookies.length === 0) {
throw new Error(
"Facebook cookies are required for marketplace access. " +
"Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies.",
);
}
// Format cookies for HTTP header
const domain = "www.facebook.com";
const cookiesHeader = formatCookiesForHeader(cookies, domain);
if (!cookiesHeader) {
throw new Error(
"No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
);
}
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
// Encode search query for URL
const encodedQuery = encodeURIComponent(SEARCH_QUERY);
// Facebook marketplace URL structure
const searchUrl = `https://www.facebook.com/marketplace/${LOCATION}/search?query=${encodedQuery}&sortBy=creation_time_descend&exact=false`;
console.log(`Fetching Facebook marketplace: ${searchUrl}`);
console.log(`Using ${cookies.length} cookies for authentication`);
let searchHtml: string;
try {
searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
"\n" +
`Facebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
);
}
},
cookies: cookiesHeader,
});
} catch (err) {
if (err instanceof HttpError) {
console.warn(
`\nFacebook marketplace access failed (${err.status}): ${err.message}`,
);
if (err.status === 400 || err.status === 401 || err.status === 403) {
console.warn(
"This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies.",
);
}
return [];
}
throw err;
}
const ads = extractFacebookMarketplaceData(searchHtml);
if (!ads || ads.length === 0) {
console.warn("No ads parsed from Facebook marketplace page.");
return [];
}
console.log(`\nFound ${ads.length} raw ads. Processing...`);
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
const totalProgress = ads.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
const items = parseFacebookAds(ads);
// Filter to only priced items (already done in parseFacebookAds)
const pricedItems = items.filter(
(item) => item.listingPrice?.cents && item.listingPrice.cents > 0,
);
progressBar.update(totalProgress);
progressBar.stop();
console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
return pricedItems.slice(0, MAX_ITEMS); // Limit results
}

View File

@@ -1,4 +1,5 @@
import fetchKijijiItems from "@/kijiji";
import fetchFacebookItems from "@/facebook";
const PORT = process.env.PORT || 4005;
@@ -33,6 +34,41 @@ const server = Bun.serve({
return Response.json(items, { status: 200 });
},
"/api/facebook": async (req: Request) => {
const reqUrl = new URL(req.url);
const SEARCH_QUERY =
req.headers.get("query") || reqUrl.searchParams.get("q") || null;
if (!SEARCH_QUERY)
return Response.json(
{
message:
"Request didn't have 'query' header or 'q' search parameter!",
},
{ status: 400 },
);
const LOCATION = reqUrl.searchParams.get("location") || "toronto";
const COOKIES_SOURCE = reqUrl.searchParams.get("cookies") || undefined;
try {
const items = await fetchFacebookItems(SEARCH_QUERY, 5, LOCATION, 25, COOKIES_SOURCE);
if (!items || items.length === 0)
return Response.json(
{ message: "Search didn't return any results!" },
{ status: 404 },
);
return Response.json(items, { status: 200 });
} catch (error) {
console.error("Facebook scraping error:", error);
const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
return Response.json(
{ message: errorMessage },
{ status: 400 },
);
}
},
// Wildcard route for all routes that start with "/api/" and aren't otherwise matched
"/api/*": Response.json({ message: "Not found" }, { status: 404 }),