diff --git a/.gitignore b/.gitignore index c2715d3..3c6bfcf 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json .DS_Store examples/* +cookies/*.json diff --git a/cookies/README.md b/cookies/README.md new file mode 100644 index 0000000..05dd2c0 --- /dev/null +++ b/cookies/README.md @@ -0,0 +1,52 @@ +# Facebook Marketplace Cookies Setup + +To use the Facebook Marketplace scraper, you need to provide valid Facebook session cookies. + +## Option 1: Cookies File (`facebook.json`) + +1. Log into Facebook in your browser +2. Open Developer Tools → Network tab +3. Visit facebook.com/marketplace (ensure you're logged in) +4. Look for any marketplace-related requests in the Network tab +5. Export cookies from the browser's Application/Storage → Cookies section +6. Save the cookies as a JSON array to `cookies/facebook.json` (the scraper reads `./cookies/facebook.json` relative to its working directory) + +The `facebook.json` file should contain Facebook session cookies, particularly: +- `c_user`: Your Facebook user ID +- `xs`: Facebook session token +- `fr`: Facebook request token +- `datr`: Data attribution token +- `sb`: Session browser token + +Example structure: +```json +[ + { + "name": "c_user", + "value": "123456789", + "domain": ".facebook.com", + "path": "/", + "secure": true + } +] +``` +Add one object per cookie (each needs at least `name`, `value` and `domain`); the file must be strict JSON, with no comments or trailing commas, because it is read with `JSON.parse`. + +## Option 2: URL Parameter + +You can pass cookies directly via the `cookies` URL parameter, as a URL-encoded JSON array in the same format as the file (shown unencoded below for readability): + +``` +GET /api/facebook?q=laptop&cookies=[{"name":"c_user","value":"123","domain":".facebook.com",...}] +``` + +## Important Notes + +- Cookies must be from an active Facebook session +- Cookies expire, so you may need to refresh them periodically +- Never share real cookies or commit them to version control; request URLs are often logged, so prefer the cookies file over the URL parameter +- Facebook may block automated scraping even with valid cookies + +## Security + +The cookies file is intentionally left out of version control (`cookies/*.json` is listed in `.gitignore`) for security reasons. 
// Patch header (new file): diff --git a/src/facebook.ts b/src/facebook.ts
// new file mode 100644, index 0000000..0bce67f
/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import cliProgress from "cli-progress";

/**
 * Facebook Marketplace Scraper
 *
 * Note: Facebook Marketplace requires authentication cookies for full access.
 * This implementation will return limited or no results without proper authentication.
 * This is by design to respect Facebook's authentication requirements.
 */

// ----------------------------- Types -----------------------------

/** Raw HTML document text as returned by an HTTP response body. */
type HTMLString = string;

/**
 * One cookie entry of the JSON array supplied by the user, either in
 * `./cookies/facebook.json` or through the `cookies` request parameter.
 * Only `name`, `value`, `domain` and `expirationDate` are read by this module;
 * the remaining fields are carried along as exported.
 */
interface Cookie {
  name: string;
  value: string;
  /** A leading "." marks a cookie that also applies to subdomains. */
  domain: string;
  path: string;
  secure?: boolean;
  httpOnly?: boolean;
  sameSite?: "strict" | "lax" | "none" | "unspecified";
  session?: boolean;
  /** Expiry compared against `Date.now() / 1000`, i.e. seconds since the epoch. */
  expirationDate?: number;
  /** Opaque value that is never inspected here, hence `unknown` rather than `any`. */
  partitionKey?: unknown;
  storeId?: string;
}

/** A marketplace search edge reduced to the `node.listing` data the parser reads. */
interface FacebookAdNode {
  node: {
    listing: {
      id: string;
      marketplace_listing_title?: string;
      listing_price?: {
        /** Price in major currency units, e.g. "1.00". */
        amount?: string | number;
        currency?: string;
        /** Human-readable price, e.g. "CA$1". */
        formatted_amount?: string;
        /** Facebook-internal encoded amount; not plain cents. */
        amount_with_offset_in_currency?: string | number;
      };
      location?: {
        reverse_geocode?: {
          city_page?: {
            display_name?: string;
          };
        };
      };
      /** Creation timestamp; the parser multiplies it by 1000 to build a `Date`. */
      creation_time?: number;
      [k: string]: unknown;
    };
    [k: string]: unknown;
  };
}

/** An entry of `feed_units.edges` as found in the page payload. */
interface FacebookEdge {
  node: FacebookAdNode["node"];
  [k: string]: unknown;
}

/** The `marketplace_search` object located inside the page's embedded JSON. */
interface FacebookMarketplaceSearch {
  feed_units?: {
    edges?: FacebookEdge[];
  };
  [k: string]: unknown;
}

/**
 * Top-level shape of an embedded JSON script that carries a `require` list.
 * The entries are navigated dynamically (several nesting layouts are probed),
 * so they are typed loosely instead of as a fixed tuple.
 */
interface FacebookRequireData {
  require?: unknown[][];
  [k: string]: unknown;
}

/** Normalised listing returned to API callers. */
type ListingDetails = {
  url: string;
  title: string;
  description?: string;
  listingPrice?: {
    amountFormatted: string;
    cents?: number;
    currency?: string;
  };
  listingType?: string;
  listingStatus?: string;
  creationDate?: string;
  endDate?: string;
  numberOfViews?: number;
  address?: string | null;
};

//
// ----------------------------- Utilities -----------------------------

/** Narrows `value` to a non-null object whose properties can be probed safely. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/** Resolves after roughly `ms` milliseconds. */
async function delay(ms: number): Promise<void> {
  await new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Runtime check for a usable cookie entry.
 * `domain` is required as well as `name`/`value` because the header formatter
 * reads it to decide whether the cookie applies to the target host.
 */
function isCookie(value: unknown): value is Cookie {
  return (
    isRecord(value) &&
    typeof value.name === "string" &&
    typeof value.value === "string" &&
    typeof value.domain === "string"
  );
}

/**
 * Parses a JSON document and keeps only the well-formed cookie entries.
 *
 * @returns the valid cookies, or `null` when the document is not a JSON array
 * @throws SyntaxError when `raw` is not valid JSON
 */
function parseCookieArray(raw: string): Cookie[] | null {
  const parsed: unknown = JSON.parse(raw);
  return Array.isArray(parsed) ? parsed.filter(isCookie) : null;
}

/**
 * Load Facebook cookies from the provided JSON string or, failing that, from
 * `./cookies/facebook.json`.
 *
 * @param cookiesSource optional JSON array of cookie objects
 * @returns the valid cookies found; an empty array when none are available
 * @throws Error when `cookiesSource` is given but is not valid JSON
 */
async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
  // The explicit parameter takes precedence over the file.
  if (cookiesSource) {
    let fromParam: Cookie[] | null;
    try {
      fromParam = parseCookieArray(cookiesSource);
    } catch (e) {
      throw new Error(`Invalid cookies JSON provided: ${e}`);
    }
    if (fromParam) return fromParam;
    console.warn(
      "Provided cookies JSON is not an array; falling back to ./cookies/facebook.json",
    );
  }

  // File access is best-effort: any failure is reported and treated as "no cookies".
  try {
    const cookiesPath = "./cookies/facebook.json";
    const file = Bun.file(cookiesPath);
    if (await file.exists()) {
      const fromFile = parseCookieArray(await file.text());
      if (fromFile) return fromFile;
    }
  } catch (e) {
    console.warn(`Could not load cookies from ./cookies/facebook.json: ${e}`);
  }

  return [];
}

/**
 * Decides whether a cookie's `domain` attribute covers `host`.
 * A leading "." means the bare domain and any of its subdomains; the match is
 * made on a label boundary so ".book.com" does not cover "www.facebook.com".
 * Without a leading "." the cookie is host-only and must match exactly.
 */
function cookieAppliesToHost(cookieDomain: string, host: string): boolean {
  const normalized = cookieDomain.toLowerCase();
  if (!normalized.startsWith(".")) return normalized === host;
  const bare = normalized.slice(1);
  if (bare === "") return false;
  return host === bare || host.endsWith(`.${bare}`);
}

/**
 * Format cookies array into a Cookie header string (`name=value; name=value`).
 * Cookies that do not apply to `domain` or whose `expirationDate` (seconds
 * since the epoch) lies in the past are left out.
 */
function formatCookiesForHeader(cookies: Cookie[], domain: string): string {
  const host = domain.toLowerCase();
  const nowSeconds = Date.now() / 1000;

  return cookies
    .filter((cookie) => cookieAppliesToHost(cookie.domain, host))
    .filter(
      (cookie) => !(cookie.expirationDate && cookie.expirationDate < nowSeconds),
    )
    .map((cookie) => `${cookie.name}=${cookie.value}`)
    .join("; ");
}

/** Error raised for a non-successful HTTP response that will not be retried. */
class HttpError extends Error {
  constructor(
    message: string,
    public readonly status: number,
    public readonly url: string,
  ) {
    super(message);
    this.name = "HttpError";
  }
}

// ----------------------------- HTTP Client -----------------------------

/**
  Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
  - Retries on 429, 5xx and network/body-read failures, up to `maxRetries` times
  - Respects X-RateLimit-Reset when present (interpreted as seconds to wait)
  - Never retries other 4xx responses
  - Supports custom cookies for Facebook authentication

  @param url       absolute URL to request
  @param DELAY_MS  pause applied after a successful response
  @param opts      retry tuning, rate-limit observer and Cookie header value
  @returns the response body text
  @throws HttpError for a 4xx response, or for 429/5xx once retries are exhausted
*/
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
    cookies?: string;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;

  // The headers do not change between attempts, so build them once.
  const headers: Record<string, string> = {
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "cache-control": "no-cache",
    "upgrade-insecure-requests": "1",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "user-agent":
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  };
  if (opts?.cookies) {
    headers["cookie"] = opts.cookies;
  }

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const canRetry = attempt < maxRetries;
    const backoffMs = (attempt + 1) * retryBaseMs;

    try {
      const res = await fetch(url, { method: "GET", headers });

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (res.ok) {
        const html = await res.text();
        // Respect per-request delay to keep at or under REQUESTS_PER_SECOND
        await delay(DELAY_MS);
        return html;
      }

      // Rate limited: wait for the advertised reset, else back off linearly.
      if (res.status === 429 && canRetry) {
        const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
        const waitMs = Number.isFinite(resetSeconds)
          ? Math.max(0, resetSeconds * 1000)
          : backoffMs;
        await delay(waitMs);
        continue;
      }

      // For Facebook, 400 often means authentication required.
      // Client errors other than 429 are final.
      if (res.status >= 400 && res.status < 500 && res.status !== 429) {
        throw new HttpError(
          `Request failed with status ${res.status} (Facebook may require authentication cookies for access)`,
          res.status,
          url,
        );
      }

      if (res.status >= 500 && res.status < 600 && canRetry) {
        await delay(backoffMs);
        continue;
      }

      throw new HttpError(
        `Request failed with status ${res.status}`,
        res.status,
        url,
      );
    } catch (err) {
      // Every HttpError thrown above is a final verdict; only transport-level
      // failures (fetch rejection, body read error) are retried here.
      if (err instanceof HttpError || !canRetry) throw err;
      await delay(backoffMs);
    }
  }

  throw new Error("Exhausted retries without response");
}

// ----------------------------- Parsing -----------------------------

/** Sequence of object keys / array indices to follow through parsed JSON. */
type JsonPath = readonly (string | number)[];

/** Keys leading from a payload wrapper down to the marketplace search object. */
const MARKETPLACE_TAIL = ["__bbox", "result", "data", "marketplace_search"] as const;

/** Known fixed locations of `marketplace_search`, probed in this order. */
const MARKETPLACE_PATHS: JsonPath[] = [
  // Original path from example
  ["require", 0, 3, 0, "__bbox", "require", 0, 3, 1, ...MARKETPLACE_TAIL],
  // Alternative path structure
  ["require", 0, 3, 1, ...MARKETPLACE_TAIL],
  // Another variation
  ["require", 0, 3, 0, ...MARKETPLACE_TAIL],
];

/** Follows `path` through `root`; yields `undefined` as soon as a step is missing. */
function dig(root: unknown, path: JsonPath): unknown {
  let current: unknown = root;
  for (const key of path) {
    if (Array.isArray(current)) {
      current = typeof key === "number" ? current[key] : undefined;
    } else if (isRecord(current)) {
      current = current[String(key)];
    } else {
      return undefined;
    }
  }
  return current;
}

/** Returns `search.feed_units.edges` when it exists and is an array, else `null`. */
function readFeedEdges(search: unknown): unknown[] | null {
  if (!isRecord(search)) return null;
  const feedUnits = search.feed_units;
  if (!isRecord(feedUnits)) return null;
  return Array.isArray(feedUnits.edges) ? feedUnits.edges : null;
}

/** Searches one parsed script payload for marketplace feed edges. */
function findMarketplaceEdges(parsed: unknown): unknown[] | null {
  const requireList = dig(parsed, ["require"]);
  if (Array.isArray(requireList)) {
    for (const path of MARKETPLACE_PATHS) {
      const edges = readFeedEdges(dig(parsed, path));
      if (edges) return edges;
    }
    // Direct access for some responses: scan the 4th slot of every entry.
    for (const item of requireList) {
      if (!Array.isArray(item) || item.length < 4) continue;
      const payload: unknown = item[3];
      const candidates: unknown[] = Array.isArray(payload) ? payload : [payload];
      for (const candidate of candidates) {
        const edges = readFeedEdges(dig(candidate, MARKETPLACE_TAIL));
        if (edges) return edges;
      }
    }
  }
  // Also check for direct marketplace_search in the parsed data
  return readFeedEdges(dig(parsed, ["marketplace_search"]));
}

/**
  Extract marketplace search data from Facebook page script tags.

  Each script element whose text is valid JSON is probed for a
  `marketplace_search.feed_units.edges` array; the first hit wins.

  @returns one entry per edge that carries an object `node`, or `null` when no
           script contains marketplace data
*/
function extractFacebookMarketplaceData(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);

  for (const script of Array.from(document.querySelectorAll("script"))) {
    const scriptText = script.textContent;
    if (!scriptText) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(scriptText);
    } catch {
      // Not a JSON script (plain JavaScript etc.)
      continue;
    }

    const edges = findMarketplaceEdges(parsed);
    if (!edges) continue;

    console.log(`Successfully parsed ${edges.length} Facebook marketplace listings`);
    const ads: FacebookAdNode[] = [];
    for (const edge of edges) {
      if (isRecord(edge) && isRecord(edge.node)) {
        // Shape checked only as far as "node is an object"; parseFacebookAds
        // validates the listing fields it needs.
        ads.push({ node: edge.node as FacebookAdNode["node"] });
      }
    }
    return ads;
  }

  console.warn("No marketplace data found in HTML response");
  return null;
}

/**
 * Turns cents into a localized decimal string with two fraction digits
 * (no currency symbol).
 *
 * @returns the formatted amount, or "" when `num` is missing or not a finite number
 */
function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
): string {
  if (num == null) return "";
  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  if (!Number.isFinite(cents)) return "";
  const formatter = new Intl.NumberFormat(locale, {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
    useGrouping: true,
  });
  return formatter.format(cents / 100);
}

/** Price fields read from a listing; every one of them may be absent. */
interface RawListingPrice {
  /** Price in major currency units (like "1.00"). */
  amount?: string | number;
  currency?: string;
  /** Human-readable price (like "CA$1"). */
  formatted_amount?: string;
  /** Facebook's internal price encoding (not cents). */
  amount_with_offset_in_currency?: string | number;
}

/**
 * Pulls the numeric amount out of a display price such as "CA$1,234.50".
 * Every grouping comma is removed before parsing.
 */
function parseFormattedAmount(formatted: unknown): number | null {
  if (typeof formatted !== "string") return null;
  const match = formatted.match(/[\d,]+\.?\d*/);
  if (!match) return null;
  const dollars = Number.parseFloat(match[0].replace(/,/g, ""));
  return Number.isFinite(dollars) ? dollars : null;
}

/**
 * Works out the price in cents.
 * Order of preference: `amount`; otherwise, when the encoded amount is a
 * positive number, the display price; otherwise the encoded amount itself.
 *
 * @returns cents, or `null` when no price can be derived
 */
function resolvePriceCents(price: RawListingPrice): number | null {
  if (price.amount != null) {
    const dollars =
      typeof price.amount === "string"
        ? Number.parseFloat(price.amount)
        : price.amount;
    if (Number.isFinite(dollars)) return Math.round(dollars * 100);
  }

  if (price.amount_with_offset_in_currency == null) return null;
  const encodedAmount = Number(price.amount_with_offset_in_currency);
  if (Number.isNaN(encodedAmount) || encodedAmount <= 0) return null;

  // The encoded field does not hold real cents, so prefer the display price.
  const dollars = parseFormattedAmount(price.formatted_amount);
  return dollars != null ? Math.round(dollars * 100) : encodedAmount;
}

/** Converts one raw ad; returns `null` when it lacks an id, a title or a positive price. */
function toListingDetails(ad: FacebookAdNode): ListingDetails | null {
  const listing = ad.node.listing;
  if (!isRecord(listing) || !listing.id) return null;

  const title = listing.marketplace_listing_title;
  const priceObj: RawListingPrice | undefined = listing.listing_price;
  if (typeof title !== "string" || !title || !isRecord(priceObj)) return null;

  const cents = resolvePriceCents(priceObj);
  if (cents == null || !Number.isFinite(cents) || cents <= 0) return null;

  // Extract address from location data if available
  const cityName = listing.location?.reverse_geocode?.city_page?.display_name;

  // A missing or unusable timestamp only omits the date, not the listing.
  const createdSeconds = Number(listing.creation_time);
  const createdAt =
    Number.isFinite(createdSeconds) && createdSeconds !== 0
      ? new Date(createdSeconds * 1000)
      : null;
  const creationDate =
    createdAt && !Number.isNaN(createdAt.getTime())
      ? createdAt.toISOString()
      : undefined;

  return {
    url: `https://www.facebook.com/marketplace/item/${listing.id}`,
    title,
    listingPrice: {
      amountFormatted: priceObj.formatted_amount || formatCentsToCurrency(cents),
      cents,
      currency: priceObj.currency || "CAD", // Facebook marketplace often uses CAD
    },
    address: cityName || null,
    creationDate,
    listingType: "item", // Default type for marketplace listings
  };
}

/**
  Parse Facebook marketplace search results into ListingDetails[].
  Ads without an id, a title or a positive price are skipped, as are ads whose
  structure is malformed.
*/
function parseFacebookAds(ads: FacebookAdNode[]): ListingDetails[] {
  const results: ListingDetails[] = [];

  for (const adJson of ads) {
    try {
      const details = toListingDetails(adJson);
      if (details) results.push(details);
    } catch {
      // Skip malformed ads
      continue;
    }
  }

  return results;
}

// ----------------------------- Main -----------------------------

/**
 * Searches Facebook Marketplace and returns priced listings.
 *
 * @param SEARCH_QUERY        free-text search query
 * @param REQUESTS_PER_SECOND request pacing; non-positive or non-finite values fall back to 1
 * @param LOCATION            marketplace location slug used in the URL path
 * @param MAX_ITEMS           maximum number of listings to return
 * @param cookiesSource       optional JSON array of cookies; otherwise ./cookies/facebook.json is read
 * @returns up to `MAX_ITEMS` listings; an empty array when the request is
 *          rejected with an HTTP error or nothing could be parsed
 * @throws Error when no usable cookies are available, or on non-HTTP failures
 */
export default async function fetchFacebookItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  LOCATION = "toronto",
  MAX_ITEMS = 25,
  cookiesSource?: string,
): Promise<ListingDetails[]> {
  // Load Facebook cookies - required for Facebook Marketplace access
  const cookies = await loadFacebookCookies(cookiesSource);
  if (cookies.length === 0) {
    throw new Error(
      "Facebook cookies are required for marketplace access. " +
        "Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies.",
    );
  }

  // Format cookies for HTTP header
  const domain = "www.facebook.com";
  const cookiesHeader = formatCookiesForHeader(cookies, domain);
  if (!cookiesHeader) {
    throw new Error(
      "No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
    );
  }

  // Guard against 0 / negative / NaN rates, which would yield an unusable delay.
  const requestsPerSecond =
    Number.isFinite(REQUESTS_PER_SECOND) && REQUESTS_PER_SECOND > 0
      ? REQUESTS_PER_SECOND
      : 1;
  const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));

  // Both values come from the request, so each is encoded as a single URL
  // component; the location cannot add path segments or query parameters.
  const encodedQuery = encodeURIComponent(SEARCH_QUERY);
  const encodedLocation = encodeURIComponent(LOCATION);

  // Facebook marketplace URL structure
  const searchUrl = `https://www.facebook.com/marketplace/${encodedLocation}/search?query=${encodedQuery}&sortBy=creation_time_descend&exact=false`;

  console.log(`Fetching Facebook marketplace: ${searchUrl}`);
  console.log(`Using ${cookies.length} cookies for authentication`);

  let searchHtml: string;
  try {
    searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
      onRateInfo: (remaining, reset) => {
        if (remaining && reset) {
          console.log(
            "\n" +
              `Facebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
          );
        }
      },
      cookies: cookiesHeader,
    });
  } catch (err) {
    if (err instanceof HttpError) {
      console.warn(
        `\nFacebook marketplace access failed (${err.status}): ${err.message}`,
      );
      if (err.status === 400 || err.status === 401 || err.status === 403) {
        console.warn(
          "This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies.",
        );
      }
      return [];
    }
    throw err;
  }

  const ads = extractFacebookMarketplaceData(searchHtml);
  if (!ads || ads.length === 0) {
    console.warn("No ads parsed from Facebook marketplace page.");
    return [];
  }

  console.log(`\nFound ${ads.length} raw ads. Processing...`);

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  progressBar.start(ads.length, 0);

  let pricedItems: ListingDetails[];
  try {
    // parseFacebookAds already keeps only listings with a positive price.
    pricedItems = parseFacebookAds(ads);
    progressBar.update(ads.length);
  } finally {
    progressBar.stop();
  }

  console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
  return pricedItems.slice(0, Math.max(0, MAX_ITEMS)); // Limit results
}

/*
Companion change from the same patch (src/index.ts). Only part of that file is
visible in the patch, so the hunk is preserved as a unified diff rather than
rewritten.

diff --git a/src/index.ts b/src/index.ts
index d6a36d8..63a7a54 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,4 +1,5 @@
 import fetchKijijiItems from "@/kijiji";
+import fetchFacebookItems from "@/facebook";

 const PORT = process.env.PORT || 4005;

@@ -33,6 +34,41 @@ const server = Bun.serve({
       return Response.json(items, { status: 200 });
     },

+    "/api/facebook": async (req: Request) => {
+      const reqUrl = new URL(req.url);
+
+      const SEARCH_QUERY =
+        req.headers.get("query") || reqUrl.searchParams.get("q") || null;
+      if (!SEARCH_QUERY)
+        return Response.json(
+          {
+            message:
+              "Request didn't have 'query' header or 'q' search parameter!",
+          },
+          { status: 400 },
+        );
+
+      const LOCATION = reqUrl.searchParams.get("location") || "toronto";
+      const COOKIES_SOURCE = reqUrl.searchParams.get("cookies") || undefined;
+
+      try {
+        const items = await fetchFacebookItems(SEARCH_QUERY, 5, LOCATION, 25, COOKIES_SOURCE);
+        if (!items || items.length === 0)
+          return Response.json(
+            { message: "Search didn't return any results!" },
+            { status: 404 },
+          );
+        return Response.json(items, { status: 200 });
+      } catch (error) {
+        console.error("Facebook scraping error:", error);
+        const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
+        return Response.json(
+          { message: errorMessage },
+          { status: 400 },
+        );
+      }
+    },
+
     // Wildcard route for all routes that start with "/api/" and aren't otherwise matched
     "/api/*": Response.json({ message: "Not found" }, { status: 404 }),
*/