feat: facebook scraping
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
562
src/facebook.ts
Normal file
562
src/facebook.ts
Normal file
@@ -0,0 +1,562 @@
|
||||
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||||
import { parseHTML } from "linkedom";
|
||||
import cliProgress from "cli-progress";
|
||||
|
||||
/**
 * Facebook Marketplace Scraper
 *
 * Note: Facebook Marketplace requires authentication cookies for full access.
 * Without usable session cookies this implementation throws instead of scraping,
 * and when Facebook rejects the request (HTTP error) it returns no results.
 * This is by design to respect Facebook's authentication requirements.
 */
|
||||
|
||||
// ----------------------------- Types -----------------------------
|
||||
|
||||
/** Raw HTML markup of a fetched page, carried around as a plain string. */
type HTMLString = string;
|
||||
|
||||
/**
 * One browser cookie in the JSON shape accepted by `loadFacebookCookies`
 * (an element of the array passed in or stored in ./cookies/facebook.json).
 *
 * NOTE(review): the field set looks like a browser-extension cookie export —
 * confirm against the tool used to produce the file.
 */
interface Cookie {
  /** Cookie name; emitted as the left-hand side of `name=value`. */
  name: string;
  /** Cookie value; emitted verbatim as the right-hand side of `name=value`. */
  value: string;
  /** Host the cookie belongs to. A leading "." marks a domain cookie that also covers subdomains. */
  domain: string;
  path: string;
  secure?: boolean;
  httpOnly?: boolean;
  sameSite?: "strict" | "lax" | "none" | "unspecified";
  session?: boolean;
  /** Expiry in seconds since the Unix epoch (it is compared against `Date.now() / 1000`). */
  expirationDate?: number;
  /** Never read by this module, so it is typed `unknown` rather than `any`. */
  partitionKey?: unknown;
  storeId?: string;
}
|
||||
|
||||
/**
 * One marketplace search result as it appears inside the page's embedded JSON.
 * Only the fields this module reads are typed; everything else passes through
 * the index signatures as `unknown`.
 */
interface FacebookAdNode {
  node: {
    listing: {
      /** Listing identifier; used to build the item URL. */
      id: string;
      marketplace_listing_title?: string;
      listing_price?: {
        /** Price in whole currency units, e.g. "1.00" (may arrive as string or number). */
        amount?: string | number;
        currency?: string;
        /** Human-readable price such as "CA$1". */
        formatted_amount?: string;
        /** Facebook's internally encoded amount; only used as a fallback by the parser. */
        amount_with_offset_in_currency?: string | number;
      };
      location?: {
        reverse_geocode?: {
          city_page?: {
            display_name?: string;
          };
        };
      };
      /** Creation time in seconds since the Unix epoch (the parser multiplies it by 1000). */
      creation_time?: number;
      [k: string]: unknown;
    };
    [k: string]: unknown;
  };
}
|
||||
|
||||
/**
 * One entry of `feed_units.edges`. Only `node` is read by this module; any
 * other keys on the edge are tolerated and left untyped.
 */
type FacebookEdge = {
  node: FacebookAdNode["node"];
  [k: string]: unknown;
};
|
||||
|
||||
/**
 * The `marketplace_search` object found in the page's embedded JSON.
 * Listings live under `feed_units.edges`; every other key is left untyped.
 */
type FacebookMarketplaceSearch = {
  feed_units?: { edges?: FacebookEdge[] };
  [k: string]: unknown;
};
|
||||
|
||||
/**
 * Top-level shape of a script payload that carries a `require` array.
 * Each entry is a five-element tuple whose fourth slot holds the search data.
 */
type FacebookRequireData = {
  require?: Array<[number, number, number, FacebookMarketplaceSearch, number]>;
  [k: string]: unknown;
};
|
||||
|
||||
/**
 * Normalised listing record produced by `parseFacebookAds`.
 * `url` and `title` are always present; every other field is optional.
 */
type ListingDetails = {
  /** Link to the listing page. */
  url: string;
  title: string;
} & Partial<{
  description: string;
  listingPrice: {
    /** Display string for the price. */
    amountFormatted: string;
    /** Price in hundredths of the currency unit. */
    cents?: number;
    currency?: string;
  };
  listingType: string;
  listingStatus: string;
  /** ISO-8601 timestamp string. */
  creationDate: string;
  endDate: string;
  numberOfViews: number;
  address: string | null;
}>;
|
||||
|
||||
// ----------------------------- Utilities -----------------------------
|
||||
|
||||
/**
 * Type guard for "some non-null object".
 * Arrays also pass, because their `typeof` is "object".
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null) return false;
  return typeof value === "object";
}
|
||||
|
||||
/**
 * Resolve after roughly `ms` milliseconds (timer-based pause).
 */
async function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Load Facebook cookies from a JSON string or, failing that, from
 * ./cookies/facebook.json.
 *
 * @param cookiesSource Optional JSON text of a cookie array. If it parses to an
 *   array, that array is used and the file is never read. If it parses to
 *   anything else, the file fallback is tried.
 * @returns The entries that look like usable cookies (string `name`, `value`
 *   and `domain`). Empty when neither source yields an array.
 * @throws Error when `cookiesSource` is provided but is not valid JSON.
 */
async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
  // Only the fields this module actually reads are verified. `domain` is
  // included because the header formatter dereferences it; letting an entry
  // without one through would crash later with an unhelpful TypeError.
  const isUsableCookie = (entry: unknown): entry is Cookie => {
    if (typeof entry !== "object" || entry === null) return false;
    const fields = entry as Record<string, unknown>;
    return (
      typeof fields.name === "string" &&
      typeof fields.value === "string" &&
      typeof fields.domain === "string"
    );
  };

  // Shared by both sources: null means "not a cookie array, keep looking".
  const pickCookies = (parsed: unknown): Cookie[] | null =>
    Array.isArray(parsed) ? parsed.filter(isUsableCookie) : null;

  // First try to load from provided string parameter
  if (cookiesSource) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(cookiesSource);
    } catch (e) {
      throw new Error(`Invalid cookies JSON provided: ${e}`);
    }
    const provided = pickCookies(parsed);
    if (provided) return provided;
  }

  // Try to load from ./cookies/facebook.json
  try {
    const cookiesPath = "./cookies/facebook.json";
    const file = Bun.file(cookiesPath);
    if (await file.exists()) {
      const fromFile = pickCookies(JSON.parse(await file.text()));
      if (fromFile) return fromFile;
    }
  } catch (e) {
    // Best effort: an unreadable or malformed file is reported, not fatal.
    console.warn(`Could not load cookies from ./cookies/facebook.json: ${e}`);
  }

  return [];
}
|
||||
|
||||
/**
 * Format cookies array into Cookie header string
 *
 * Keeps only cookies that apply to `domain` and have not expired, then joins
 * them as `name=value` pairs separated by "; ".
 *
 * @param cookies Candidate cookies; entries without a string `domain` are skipped.
 * @param domain Host name the request is sent to (compared case-insensitively).
 * @returns The header value, or an empty string when no cookie qualifies.
 */
function formatCookiesForHeader(cookies: Cookie[], domain: string): string {
  const host = domain.toLowerCase();
  const nowSeconds = Date.now() / 1000;

  const appliesToHost = (cookieDomain: unknown): boolean => {
    if (typeof cookieDomain !== "string" || cookieDomain === "") return false;
    const normalized = cookieDomain.toLowerCase();

    // Host-only cookie: must match the host exactly.
    if (!normalized.startsWith(".")) return normalized === host;

    // Domain cookie: matches the bare domain itself or any subdomain. The
    // suffix test keeps the leading dot so ".facebook.com" cannot match
    // "evilfacebook.com".
    return host === normalized.slice(1) || host.endsWith(normalized);
  };

  return cookies
    .filter((cookie) => appliesToHost(cookie.domain))
    .filter(
      // A missing (or zero) expirationDate is treated as "never expires".
      (cookie) => !(cookie.expirationDate && cookie.expirationDate < nowSeconds),
    )
    .map((cookie) => `${cookie.name}=${cookie.value}`)
    .join("; ");
}
|
||||
|
||||
/**
 * Error raised for a non-OK HTTP response. Carries the response status code
 * and the URL that was requested so callers can branch on them.
 */
class HttpError extends Error {
  public readonly status: number;
  public readonly url: string;

  constructor(message: string, status: number, url: string) {
    super(message);
    this.status = status;
    this.url = url;
    this.name = "HttpError";
  }
}
|
||||
|
||||
// ----------------------------- HTTP Client -----------------------------
|
||||
|
||||
/**
 * Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
 *
 * - Retries on 429, on 5xx, and on errors thrown while fetching or reading the
 *   body, for up to `maxRetries` extra attempts
 * - Any other 4xx fails immediately with an `HttpError` (no retry)
 * - Respects X-RateLimit-Reset when present (seconds)
 * - Supports custom cookies for Facebook authentication
 *
 * @param url Absolute URL to GET.
 * @param DELAY_MS Pause applied after a successful response, before returning.
 * @param opts.maxRetries Extra attempts after the first one (default 3).
 * @param opts.retryBaseMs Base for the linear back-off `(attempt + 1) * retryBaseMs` (default 500).
 * @param opts.onRateInfo Called after every response with the raw
 *   X-RateLimit-Remaining / X-RateLimit-Reset header values.
 * @param opts.cookies Ready-made value for the `cookie` request header.
 * @returns The response body as text.
 * @throws HttpError for a non-retryable status, or a retryable one on the last attempt.
 * @throws The underlying error when fetching keeps failing until the last attempt.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
    cookies?: string;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;

  // The request headers never change between attempts, so build them once.
  const headers: Record<string, string> = {
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "cache-control": "no-cache",
    "upgrade-insecure-requests": "1",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "user-agent":
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  };

  // Add cookies if provided
  if (opts?.cookies) {
    headers["cookie"] = opts.cookies;
  }

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries;
    const backoffMs = (attempt + 1) * retryBaseMs;

    try {
      const res = await fetch(url, {
        method: "GET",
        headers,
      });

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (!res.ok) {
        const isRateLimited = res.status === 429;
        const isServerError = res.status >= 500 && res.status < 600;

        // For Facebook, 400 often means authentication required
        // Don't retry 4xx client errors except 429
        if (res.status >= 400 && res.status < 500 && !isRateLimited) {
          throw new HttpError(
            `Request failed with status ${res.status} (Facebook may require authentication cookies for access)`,
            res.status,
            url,
          );
        }

        if ((isRateLimited || isServerError) && !isLastAttempt) {
          let waitMs = backoffMs;
          if (isRateLimited) {
            // Respect 429 reset if provided.
            // NOTE(review): the header is read as a relative number of seconds;
            // a server sending an epoch timestamp would cause a very long wait.
            const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
            if (Number.isFinite(resetSeconds)) {
              waitMs = Math.max(0, resetSeconds * 1000);
            }
          }
          await delay(waitMs);
          continue;
        }

        // Retries exhausted (or an unexpected non-OK status): report the real
        // status so callers can still branch on `HttpError.status`.
        throw new HttpError(
          `Request failed with status ${res.status}`,
          res.status,
          url,
        );
      }

      const html = await res.text();
      // Respect per-request delay to keep at or under REQUESTS_PER_SECOND
      await delay(DELAY_MS);
      return html;
    } catch (err) {
      // An HttpError is a final verdict decided above; retrying it here would
      // silently re-send requests the status logic chose not to repeat.
      if (err instanceof HttpError || isLastAttempt) throw err;
      await delay(backoffMs);
    }
  }

  throw new Error("Exhausted retries without response");
}
|
||||
|
||||
// ----------------------------- Parsing -----------------------------
|
||||
|
||||
/** A `marketplace_search` object whose `feed_units.edges` is known to be an array. */
type MarketplaceSearchWithEdges = FacebookMarketplaceSearch & {
  feed_units: { edges: FacebookEdge[] };
};

/** Type guard: does `value` carry a `feed_units.edges` array? */
function hasFeedEdges(value: unknown): value is MarketplaceSearchWithEdges {
  if (!isRecord(value)) return false;
  const feedUnits = value["feed_units"];
  return isRecord(feedUnits) && Array.isArray(feedUnits["edges"]);
}

/**
 * Walk a parsed JSON value depth-first, in document order, and return the
 * first `marketplace_search` object that has listing edges. Iterative, so
 * deeply nested payloads cannot overflow the call stack.
 */
function findMarketplaceSearch(root: unknown): MarketplaceSearchWithEdges | null {
  const pending: unknown[] = [root];

  while (pending.length > 0) {
    const current = pending.pop();
    if (!isRecord(current)) continue;

    const candidate = current["marketplace_search"];
    if (hasFeedEdges(candidate)) return candidate;

    // Push children in reverse so the first child is visited first.
    const children = Object.values(current);
    for (let i = children.length - 1; i >= 0; i--) {
      const child = children[i];
      if (isRecord(child)) pending.push(child);
    }
  }

  return null;
}

/**
 * Extract marketplace search data from Facebook page script tags
 *
 * Every `<script>` whose text is valid JSON is searched for a
 * `marketplace_search` object with `feed_units.edges`, wherever it is nested.
 * This covers the fixed `require[...]`/`__bbox` paths and the top-level
 * `marketplace_search` key without depending on exact array positions.
 *
 * @param htmlString Full HTML of a marketplace search page.
 * @returns One `{ node }` wrapper per usable edge, or null when no script
 *   contains marketplace data.
 */
function extractFacebookMarketplaceData(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const scripts = document.querySelectorAll("script");

  let marketplaceData: MarketplaceSearchWithEdges | null = null;

  for (const script of scripts as unknown as HTMLScriptElement[]) {
    const scriptText = script.textContent;
    // Cheap pre-filter: skip the JSON parse for scripts that cannot hold the key.
    if (!scriptText || !scriptText.includes("marketplace_search")) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(scriptText);
    } catch {
      // Ignore parsing errors for other scripts
      continue;
    }

    marketplaceData = findMarketplaceSearch(parsed);
    if (marketplaceData) break;
  }

  if (!marketplaceData) {
    console.warn("No marketplace data found in HTML response");
    return null;
  }

  // Drop edges that are not objects or carry no node, so a stray null in the
  // payload cannot throw here.
  const edges = marketplaceData.feed_units.edges.filter(
    (edge) => isRecord(edge) && isRecord(edge.node),
  );

  console.log(`Successfully parsed ${edges.length} Facebook marketplace listings`);
  return edges.map((edge) => ({ node: edge.node }));
}
|
||||
|
||||
/**
 * Turns cents into a localized decimal amount string, e.g. 12345 -> "123.45".
 *
 * The result has grouping separators and exactly two fraction digits, but no
 * currency symbol or code.
 *
 * @param num Amount in cents. Strings are parsed as base-10 integers.
 * @param locale BCP 47 locale used for digit grouping and separators.
 * @returns The formatted amount, or "" when `num` is missing or not a finite number.
 */
function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
): string {
  if (num == null) return "";

  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  // isFinite (not isNaN) so that Infinity is rejected too instead of being
  // rendered as "∞".
  if (!Number.isFinite(cents)) return "";

  return new Intl.NumberFormat(locale, {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
    useGrouping: true,
  }).format(cents / 100);
}
|
||||
|
||||
/**
 * Price object as read by the parser: the declared `listing_price` fields plus
 * the two extra fields Facebook also sends.
 */
type FacebookPriceFields = NonNullable<
  FacebookAdNode["node"]["listing"]["listing_price"]
> & {
  amount_with_offset_in_currency?: string | number;
  formatted_amount?: string;
};

/**
 * Work out a listing price in cents, or null when no price can be derived.
 *
 * Facebook stores price in different fields:
 * - amount: dollars (like "1.00") — preferred
 * - formatted_amount: human-readable price (like "CA$1")
 * - amount_with_offset_in_currency: Facebook's internal price encoding; only
 *   used as-is when `formatted_amount` cannot be parsed
 */
function resolvePriceCents(price: FacebookPriceFields): number | null {
  if (price.amount != null) {
    const dollars =
      typeof price.amount === "string"
        ? Number.parseFloat(price.amount)
        : price.amount;
    return Math.round(dollars * 100);
  }

  if (price.amount_with_offset_in_currency == null) return null; // No price available

  const encodedAmount = Number(price.amount_with_offset_in_currency);
  if (Number.isNaN(encodedAmount) || encodedAmount <= 0) return null; // Invalid price

  const match = price.formatted_amount
    ? price.formatted_amount.match(/[\d,]+\.?\d*/)
    : null;
  if (match) {
    // Strip every thousands separator; a plain-string pattern would remove
    // only the first comma and turn "1,234,567" into 1234.
    const dollars = Number.parseFloat(match[0].replace(/,/g, ""));
    if (!Number.isNaN(dollars)) return Math.round(dollars * 100);
  }

  return encodedAmount; // fallback
}

/**
 * Parse Facebook marketplace search results into ListingDetails[]
 *
 * Ads without an id, a title, or a positive finite price are skipped, and so
 * are ads whose structure is malformed enough to throw while being read.
 *
 * @param ads Raw `{ node }` wrappers taken from the search payload.
 * @returns One ListingDetails per usable ad, in input order.
 */
function parseFacebookAds(ads: FacebookAdNode[]): ListingDetails[] {
  const results: ListingDetails[] = [];

  for (const adJson of ads) {
    try {
      const listing = adJson.node.listing;
      const title = listing.marketplace_listing_title;
      const priceObj: FacebookPriceFields | undefined = listing.listing_price;

      if (!title || !priceObj) continue;

      const id = listing.id;
      // Without an id the link would point at ".../item/undefined".
      if (!id) continue;
      const url = `https://www.facebook.com/marketplace/item/${id}`;

      const cents = resolvePriceCents(priceObj);
      if (cents === null || !Number.isFinite(cents) || cents <= 0) continue;

      // Extract address from location data if available
      const cityName =
        listing.location?.reverse_geocode?.city_page?.display_name;

      // Format creation date if available (creation_time is in seconds)
      const creationDate = listing.creation_time
        ? new Date(listing.creation_time * 1000).toISOString()
        : undefined;

      results.push({
        url,
        title,
        listingPrice: {
          amountFormatted:
            priceObj.formatted_amount || formatCentsToCurrency(cents),
          cents,
          currency: priceObj.currency || "CAD", // Facebook marketplace often uses CAD
        },
        address: cityName || null,
        creationDate,
        listingType: "item", // Default type for marketplace listings
      });
    } catch {
      // Skip malformed ads (deliberate best effort: one bad entry must not
      // discard the rest of the page).
      continue;
    }
  }

  return results;
}
|
||||
|
||||
// ----------------------------- Main -----------------------------
|
||||
|
||||
/**
 * Search Facebook Marketplace and return priced listings.
 *
 * @param SEARCH_QUERY Free-text search term.
 * @param REQUESTS_PER_SECOND Request rate used to derive the post-request
 *   delay; non-positive or non-finite values fall back to 1.
 * @param LOCATION Marketplace location slug placed in the URL path (URL-encoded here).
 * @param MAX_ITEMS Upper bound on the number of listings returned.
 * @param cookiesSource Optional JSON text of a cookie array; when absent,
 *   ./cookies/facebook.json is used.
 * @returns Up to `MAX_ITEMS` listings with a positive price. Empty when the
 *   page could not be fetched (HTTP error) or contained no parsable ads.
 * @throws Error when no cookies are available or none apply to www.facebook.com,
 *   and rethrows any non-HTTP error raised while fetching.
 */
export default async function fetchFacebookItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  LOCATION = "toronto",
  MAX_ITEMS = 25,
  cookiesSource?: string,
): Promise<ListingDetails[]> {
  // Load Facebook cookies - required for Facebook Marketplace access
  const cookies = await loadFacebookCookies(cookiesSource);
  if (cookies.length === 0) {
    throw new Error(
      "Facebook cookies are required for marketplace access. " +
        "Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies.",
    );
  }

  // Format cookies for HTTP header
  const domain = "www.facebook.com";
  const cookiesHeader = formatCookiesForHeader(cookies, domain);
  if (!cookiesHeader) {
    throw new Error(
      "No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
    );
  }

  // A zero, negative or non-finite rate would produce an infinite or
  // meaningless delay, so fall back to one request per second.
  const requestsPerSecond =
    Number.isFinite(REQUESTS_PER_SECOND) && REQUESTS_PER_SECOND > 0
      ? REQUESTS_PER_SECOND
      : 1;
  const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));

  // Encode both user-supplied parts. LOCATION is a path segment, so encoding
  // it prevents "/", "?" or "#" from rewriting the requested URL.
  const encodedQuery = encodeURIComponent(SEARCH_QUERY);
  const encodedLocation = encodeURIComponent(LOCATION);

  // Facebook marketplace URL structure
  const searchUrl = `https://www.facebook.com/marketplace/${encodedLocation}/search?query=${encodedQuery}&sortBy=creation_time_descend&exact=false`;

  console.log(`Fetching Facebook marketplace: ${searchUrl}`);
  console.log(`Using ${cookies.length} cookies for authentication`);

  let searchHtml: string;
  try {
    searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
      onRateInfo: (remaining, reset) => {
        if (remaining && reset) {
          console.log(
            "\n" +
              `Facebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
          );
        }
      },
      cookies: cookiesHeader,
    });
  } catch (err) {
    if (err instanceof HttpError) {
      console.warn(
        `\nFacebook marketplace access failed (${err.status}): ${err.message}`,
      );
      if (err.status === 400 || err.status === 401 || err.status === 403) {
        console.warn(
          "This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies.",
        );
      }
      // HTTP failures are reported as "no results" rather than as an exception.
      return [];
    }
    throw err;
  }

  const ads = extractFacebookMarketplaceData(searchHtml);
  if (!ads || ads.length === 0) {
    console.warn("No ads parsed from Facebook marketplace page.");
    return [];
  }

  console.log(`\nFound ${ads.length} raw ads. Processing...`);

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  progressBar.start(ads.length, 0);

  let pricedItems: ListingDetails[];
  try {
    // Filter to only priced items (parseFacebookAds already does this; the
    // second check is kept as a cheap safety net).
    pricedItems = parseFacebookAds(ads).filter(
      (item) => (item.listingPrice?.cents ?? 0) > 0,
    );
    progressBar.update(ads.length);
  } finally {
    // Always release the terminal, even if parsing throws.
    progressBar.stop();
  }

  console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
  // Limit results; clamp so a negative limit cannot slice from the end.
  return pricedItems.slice(0, Math.max(0, MAX_ITEMS));
}
|
||||
36
src/index.ts
36
src/index.ts
@@ -1,4 +1,5 @@
|
||||
import fetchKijijiItems from "@/kijiji";
|
||||
import fetchFacebookItems from "@/facebook";
|
||||
|
||||
const PORT = process.env.PORT || 4005;
|
||||
|
||||
@@ -33,6 +34,41 @@ const server = Bun.serve({
|
||||
return Response.json(items, { status: 200 });
|
||||
},
|
||||
|
||||
// GET /api/facebook — search Facebook Marketplace.
// Inputs: search term via the `query` header or the `q` URL parameter (header
// wins); optional `location` and `cookies` URL parameters.
// Responds 200 with the listings, 404 when there are none, and 400 when the
// search term is missing or the scraper throws.
"/api/facebook": async (req: Request) => {
  const { searchParams } = new URL(req.url);

  const SEARCH_QUERY =
    req.headers.get("query") || searchParams.get("q") || null;
  if (!SEARCH_QUERY) {
    const body = {
      message: "Request didn't have 'query' header or 'q' search parameter!",
    };
    return Response.json(body, { status: 400 });
  }

  const LOCATION = searchParams.get("location") || "toronto";
  // NOTE(review): session cookies arrive in the query string here, which
  // commonly ends up in access logs — consider a header or request body.
  const COOKIES_SOURCE = searchParams.get("cookies") || undefined;

  let items;
  try {
    items = await fetchFacebookItems(SEARCH_QUERY, 5, LOCATION, 25, COOKIES_SOURCE);
  } catch (error) {
    console.error("Facebook scraping error:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Unknown error occurred";
    return Response.json({ message: errorMessage }, { status: 400 });
  }

  if (!items || items.length === 0) {
    return Response.json(
      { message: "Search didn't return any results!" },
      { status: 404 },
    );
  }
  return Response.json(items, { status: 200 });
},
|
||||
|
||||
// Wildcard route for all routes that start with "/api/" and aren't otherwise matched
|
||||
"/api/*": Response.json({ message: "Not found" }, { status: 404 }),
|
||||
|
||||
|
||||
Reference in New Issue
Block a user