migrate to monorepo?

This commit is contained in:
2025-12-13 20:20:48 -05:00
parent 7da6408d7a
commit a66b5b2362
29 changed files with 849 additions and 817 deletions

View File

@@ -0,0 +1,17 @@
// Export all scrapers
export { default as fetchKijijiItems, slugify } from "./scrapers/kijiji";
export type { KijijiListingDetails } from "./scrapers/kijiji";
export { default as fetchFacebookItems } from "./scrapers/facebook";
export type { FacebookListingDetails } from "./scrapers/facebook";
export { default as fetchEbayItems } from "./scrapers/ebay";
export type { EbayListingDetails } from "./scrapers/ebay";
// Export shared utilities
export * from "./utils/http";
export * from "./utils/delay";
export * from "./utils/format";
// Export shared types
export * from "./types/common";

View File

@@ -0,0 +1,346 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import { isRecord } from "../utils/http";
import { delay } from "../utils/delay";
import { formatCentsToCurrency } from "../utils/format";
import type { HTMLString } from "../types/common";
// ----------------------------- Types -----------------------------
/**
 * Normalized shape of one listing scraped from an eBay search results page.
 * Only `url` and `title` are required; every other field is optional.
 */
export interface EbayListingDetails {
/** Listing URL; the search-page parser in this file makes it absolute. */
url: string;
/** Listing title text. */
title: string;
/** Free-text description (not populated by the search-page parser in this file). */
description?: string;
/** Price information for the listing. */
listingPrice?: {
/** Price text as displayed (the search-page parser stores the scraped text verbatim). */
amountFormatted: string;
/** Price in cents. */
cents?: number;
/** Currency code, e.g. "CAD" or "USD". */
currency?: string;
};
/** Listing type; the search-page parser in this file always sets "OFFER". */
listingType?: string;
/** Listing status; the search-page parser in this file always sets "ACTIVE". */
listingStatus?: string;
/** Creation date (not populated by the search-page parser in this file). */
creationDate?: string;
/** End date (not populated by the search-page parser in this file). */
endDate?: string;
/** View count (not populated by the search-page parser in this file). */
numberOfViews?: number;
/** Address; the search-page parser in this file always sets `null`. */
address?: string | null;
}
// ----------------------------- Utilities -----------------------------
/**
 * Parse an eBay price string such as "$1.50 CAD", "C $1.50" or "CA $1.50"
 * into integer cents plus a currency code.
 *
 * @param priceText - Raw price text scraped from the page.
 * @returns `{ cents, currency }`, or `null` when the text is empty or holds
 *   no numeric amount. `currency` is "CAD" when a Canadian marker is present
 *   ("CAD", "C$", "C $", "CA$", "CA $", "$CA"); otherwise it defaults to "USD".
 */
function parseEbayPrice(priceText: string): { cents: number; currency: string } | null {
  if (!priceText || typeof priceText !== 'string') return null;

  const cleaned = priceText.trim();

  // First numeric run. It must start with a digit so stray punctuation
  // (e.g. the comma in "Now, $5") is never mistaken for the amount.
  const amountMatch = cleaned.match(/\d[\d,]*(?:\.\d+)?/);
  if (!amountMatch) return null;

  // Strip thousands separators before converting.
  const dollars = Number.parseFloat(amountMatch[0].replace(/,/g, ''));
  if (Number.isNaN(dollars)) return null;

  // Canadian markers, with or without a space between the prefix and "$".
  const isCanadian = /CAD|\bCA?\s?\$|\$\s?CA\b/i.test(cleaned);

  return {
    cents: Math.round(dollars * 100),
    currency: isCanadian ? 'CAD' : 'USD',
  };
}
/**
 * Error raised when an HTTP request comes back with a non-success status.
 * Carries the response status code and the URL that was requested.
 */
class HttpError extends Error {
  public readonly status: number;
  public readonly url: string;

  constructor(message: string, status: number, url: string) {
    super(message);
    this.status = status;
    this.url = url;
    this.name = "HttpError";
  }
}
// ----------------------------- Parsing -----------------------------
/**
 * Parse eBay search page HTML and extract listings using DOM selectors.
 *
 * For every anchor whose href contains "itm/", the function walks up to a
 * nearby container, heuristically picks a title and a price inside it, and
 * emits one listing. Anchors that yield no usable title or price are skipped.
 *
 * @param htmlString - HTML of the search results page.
 * @param keywords - In strict mode, a title must contain at least one of these (case-insensitive).
 * @param exclusions - Titles containing any of these (case-insensitive) are dropped.
 * @param strictMode - Enables the keyword filter.
 * @returns Listings with `listingType` "OFFER", `listingStatus` "ACTIVE" and `address` null.
 */
function parseEbayListings(
htmlString: HTMLString,
keywords: string[],
exclusions: string[],
strictMode: boolean
): EbayListingDetails[] {
const { document } = parseHTML(htmlString);
const results: EbayListingDetails[] = [];
// Find all listing links by looking for eBay item URLs (/itm/)
// NOTE(review): every matching anchor produces its own result; if one item
// card contains several /itm/ anchors this may emit duplicates — confirm.
const linkElements = document.querySelectorAll('a[href*="itm/"]');
for (const linkElement of linkElements) {
try {
// Get href attribute
let href = linkElement.getAttribute('href');
if (!href) continue;
// Make href absolute
// NOTE(review): relative links are resolved against www.ebay.com, while the
// search request in this file targets www.ebay.ca — confirm which is intended.
if (!href.startsWith('http')) {
href = href.startsWith('//') ? `https:${href}` : `https://www.ebay.com${href}`;
}
// Find the container - go up several levels to find the item container
// Modern eBay uses complex nested structures
let container = linkElement.parentElement?.parentElement?.parentElement;
if (!container) {
// Try a different level
container = linkElement.parentElement?.parentElement;
}
if (!container) continue;
// Extract title - look for heading or title-related elements near the link
// Modern eBay often uses h3, span, or div with text content near the link
let titleElement = container.querySelector('h3, [role="heading"], .s-item__title span');
// If no direct title element, try finding text content around the link
if (!titleElement) {
// Look for spans or divs with text near this link
const nearbySpans = container.querySelectorAll('span, div');
for (const span of nearbySpans) {
const text = span.textContent?.trim();
// Accept the first 11-199 character text that has no "$" and no "item" substring.
if (text && text.length > 10 && text.length < 200 && !text.includes('$') && !text.includes('item')) {
titleElement = span;
break;
}
}
}
let title = titleElement?.textContent?.trim();
// Clean up eBay UI strings that get included in titles
if (title) {
// Remove common eBay UI strings that appear at the end of titles
const uiStrings = [
'Opens in a new window',
'Opens in a new tab',
'Opens in a new window or tab',
'opens in a new window',
'opens in a new tab',
'opens in a new window or tab'
];
for (const uiString of uiStrings) {
const uiIndex = title.indexOf(uiString);
if (uiIndex !== -1) {
// Everything from the UI string onward is discarded.
title = title.substring(0, uiIndex).trim();
break; // Only remove one UI string per title
}
}
// If the title became empty or too short after cleaning, skip this item
// (this applies to every non-empty title, cleaned or not)
if (title.length < 10) {
continue;
}
}
if (!title) continue;
// Skip irrelevant eBay ads
// (the length < 3 test cannot trigger here: titles under 10 characters were skipped above)
if (title === "Shop on eBay" || title.length < 3) continue;
// Extract price - look for eBay's price classes, preferring sale/discount prices
let priceElement = container.querySelector('[class*="s-item__price"], .s-item__price, [class*="price"]');
// If no direct price class, look for spans containing $ (but not titles)
if (!priceElement) {
const spansAndElements = container.querySelectorAll('span, div, b, em, strong');
for (const el of spansAndElements) {
const text = el.textContent?.trim();
// Must contain $, be reasonably short (price shouldn't be paragraph), and not contain product words
if (text && text.includes('$') && text.length < 100 &&
!text.includes('laptop') && !text.includes('computer') && !text.includes('intel') &&
!text.includes('core') && !text.includes('ram') && !text.includes('ssd') &&
! /\d{4}/.test(text) && // Avoid years like "2024"
!text.includes('"') // Avoid measurements
) {
priceElement = el;
break;
}
}
}
// For discounted items, eBay shows both original and sale price
// Prefer sale/current price over original/strikethrough price
if (priceElement) {
// Check if this element or its parent contains multiple price elements
const priceContainer = priceElement.closest('[class*="s-item__price"]') || priceElement.parentElement;
if (priceContainer) {
// Look for all price elements within this container, including strikethrough prices
const allPriceElements = priceContainer.querySelectorAll('[class*="s-item__price"], span, b, em, strong, s, del, strike');
// Filter to only elements that actually contain prices (not labels)
const actualPrices: HTMLElement[] = [];
for (const el of allPriceElements) {
const text = el.textContent?.trim();
// NOTE(review): the text must START with $, £ or ¥, so prices rendered with a
// prefix such as "C $12.34" are not collected here — confirm against live markup.
if (text && /^\s*[\$£¥]/u.test(text) && text.length < 50 && !/\d{4}/.test(text)) {
actualPrices.push(el);
}
}
// Prefer non-strikethrough prices (sale prices) over strikethrough ones (original prices)
// The originally selected priceElement is kept when fewer than two candidates were found.
if (actualPrices.length > 1) {
// First, look for prices that are NOT struck through
const nonStrikethroughPrices = actualPrices.filter(el => {
const tagName = el.tagName.toLowerCase();
// Truthy when the element carries a strikethrough class or sits inside <s>/<del>/<strike>.
const styles = el.classList.contains('s-strikethrough') || el.classList.contains('u-flStrike') ||
el.closest('s, del, strike');
return tagName !== 's' && tagName !== 'del' && tagName !== 'strike' && !styles;
});
if (nonStrikethroughPrices.length > 0) {
// Use the first non-strikethrough price (sale price)
priceElement = nonStrikethroughPrices[0];
} else {
// Fallback: use the last price (likely the most current)
const lastPrice = actualPrices[actualPrices.length - 1];
priceElement = lastPrice;
}
}
}
}
let priceText = priceElement?.textContent?.trim();
if (!priceText) continue;
// Parse price into cents and currency
const priceInfo = parseEbayPrice(priceText);
if (!priceInfo) continue;
// Apply exclusion filters
if (exclusions.some(exclusion => title.toLowerCase().includes(exclusion.toLowerCase()))) {
continue;
}
// Apply strict mode filter (title must contain at least one keyword)
if (strictMode && !keywords.some(keyword => title!.toLowerCase().includes(keyword.toLowerCase()))) {
continue;
}
const listing: EbayListingDetails = {
url: href,
title,
listingPrice: {
// The scraped price text is stored as-is; cents/currency come from parseEbayPrice.
amountFormatted: priceText,
cents: priceInfo.cents,
currency: priceInfo.currency,
},
listingType: "OFFER", // eBay listings are typically offers
listingStatus: "ACTIVE",
address: null, // eBay doesn't typically show detailed addresses in search results
};
results.push(listing);
} catch (err) {
// A failure on one anchor is logged and does not abort the whole page.
console.warn(`Error parsing eBay listing: ${err}`);
continue;
}
}
return results;
}
// ----------------------------- Main -----------------------------
/**
 * Search eBay (ebay.ca) for a query and return the parsed, filtered listings.
 *
 * @param SEARCH_QUERY - Free-text search terms.
 * @param REQUESTS_PER_SECOND - Used to derive the delay applied after the request.
 * @param opts - Optional filters:
 *   - `minPrice` / `maxPrice`: inclusive bounds compared against each listing's `cents`.
 *   - `strictMode`: when true, a title must contain at least one keyword.
 *   - `exclusions`: titles containing any of these are dropped.
 *   - `keywords`: keywords for strict mode; defaults to `[SEARCH_QUERY]`.
 * @returns The matching listings, or an empty array when the request fails
 *   with an HTTP error status. Any other error is rethrown.
 */
export default async function fetchEbayItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  opts: {
    minPrice?: number;
    maxPrice?: number;
    strictMode?: boolean;
    exclusions?: string[];
    keywords?: string[];
  } = {},
): Promise<EbayListingDetails[]> {
  const {
    minPrice = 0,
    maxPrice = Number.MAX_SAFE_INTEGER,
    strictMode = false,
    exclusions = [],
    keywords = [SEARCH_QUERY], // Default to search query if no keywords provided
  } = opts;

  // Build eBay search URL - use Canadian site and tracking parameters like a real browser.
  // The parameters are joined with plain "&"; the previous "^&" sequences were
  // shell-escape residue that ended up inside the parameter values.
  const searchUrl = `https://www.ebay.ca/sch/i.html?_nkw=${encodeURIComponent(SEARCH_QUERY)}&_sacat=0&_from=R40&_trksid=p4432023.m570.l1313`;
  const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));

  console.log(`Fetching eBay search: ${searchUrl}`);

  try {
    // Use custom headers modeled after real browser requests to bypass bot detection
    const headers: Record<string, string> = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate, br',
      'Referer': 'https://www.ebay.ca/',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'same-origin',
      'Sec-Fetch-User': '?1',
      'Priority': 'u=0, i'
    };

    const res = await fetch(searchUrl, {
      method: "GET",
      headers,
    });

    if (!res.ok) {
      throw new HttpError(
        `Request failed with status ${res.status}`,
        res.status,
        searchUrl,
      );
    }

    const searchHtml = await res.text();

    // Respect per-request delay to keep at or under REQUESTS_PER_SECOND
    await delay(DELAY_MS);

    console.log(`\nParsing eBay listings...`);
    const listings = parseEbayListings(searchHtml, keywords, exclusions, strictMode);

    // Filter by price range (additional safety check).
    // Listings without a price, or priced at 0, are dropped as before.
    const filteredListings = listings.filter((listing) => {
      const cents = listing.listingPrice?.cents;
      return cents != null && cents !== 0 && cents >= minPrice && cents <= maxPrice;
    });

    console.log(`Parsed ${filteredListings.length} eBay listings.`);
    return filteredListings;
  } catch (err) {
    // HTTP failures are reported and yield an empty result; anything else propagates.
    if (err instanceof HttpError) {
      console.error(
        `Failed to fetch eBay search (${err.status}): ${err.message}`,
      );
      return [];
    }
    throw err;
  }
}

View File

@@ -0,0 +1,570 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import cliProgress from "cli-progress";
import { isRecord } from "../utils/http";
import { delay } from "../utils/delay";
import { formatCentsToCurrency } from "../utils/format";
import type { HTMLString } from "../types/common";
/**
* Facebook Marketplace Scraper
*
* Note: Facebook Marketplace requires authentication cookies for full access.
* This implementation will return limited or no results without proper authentication.
* This is by design to respect Facebook's authentication requirements.
*/
// ----------------------------- Types -----------------------------
/**
 * One cookie entry as read from the cookies JSON (string parameter or
 * ./cookies/facebook.json). Only `name` and `value` are validated on load.
 */
interface Cookie {
/** Cookie name, emitted as `name=value` in the Cookie header. */
name: string;
/** Cookie value. */
value: string;
/** Cookie domain; a leading "." is treated as "also applies to subdomains". */
domain: string;
path: string;
secure?: boolean;
httpOnly?: boolean;
sameSite?: "strict" | "lax" | "none" | "unspecified";
session?: boolean;
/** Expiry compared against `Date.now() / 1000`, i.e. seconds since the Unix epoch. */
expirationDate?: number;
// NOTE(review): remaining fields look like a browser cookie-export format — confirm.
partitionKey?: any;
storeId?: string;
}
/**
 * One raw search result node from Facebook's marketplace search payload.
 *
 * Declares every field the parser in this file reads, so those accesses
 * type-check. The index signatures still allow any additional fields
 * Facebook sends.
 */
interface FacebookAdNode {
  node: {
    listing: {
      /** Listing id, used to build the item URL. */
      id: string;
      marketplace_listing_title?: string;
      listing_price?: {
        /** Price in dollars, e.g. "1.00". */
        amount?: string | number;
        /** Facebook's internal encoded amount; used only as a fallback. */
        amount_with_offset_in_currency?: string | number;
        /** Human-readable price, e.g. "CA$1". */
        formatted_amount?: string;
        currency?: string;
      };
      location?: {
        reverse_geocode?: {
          city_page?: {
            display_name?: string;
          };
        };
      };
      /** Creation time in seconds since the Unix epoch (multiplied by 1000 for `Date`). */
      creation_time?: number;
      // Status flags, checked in the order sold → pending → live → hidden.
      is_sold?: boolean;
      is_pending?: boolean;
      is_live?: boolean;
      is_hidden?: boolean;
      primary_listing_photo?: {
        image?: {
          uri?: string;
        };
      };
      listing_video?: {
        id?: string;
      };
      marketplace_listing_seller?: {
        name?: string;
        id?: string;
      };
      marketplace_listing_category_id?: string;
      delivery_types?: string[];
      [k: string]: unknown;
    };
    [k: string]: unknown;
  };
}
/**
 * One entry of `feed_units.edges`; only its `node` is used by this file,
 * any other fields are carried through the index signature.
 */
interface FacebookEdge {
node: FacebookAdNode["node"];
[k: string]: unknown;
}
/**
 * The `marketplace_search` object located inside the page's embedded JSON.
 * The listings are read from `feed_units.edges`.
 */
interface FacebookMarketplaceSearch {
feed_units?: {
edges?: FacebookEdge[];
};
[k: string]: unknown;
}
/**
 * Normalized shape of one Facebook Marketplace listing.
 * Only `url` and `title` are required; every other field is optional.
 */
export interface FacebookListingDetails {
/** Item URL, built from the listing id. */
url: string;
/** Listing title. */
title: string;
/** Free-text description (not populated by the parser in this file). */
description?: string;
/** Price information for the listing. */
listingPrice?: {
/** Human-readable price. */
amountFormatted: string;
/** Price in cents. */
cents?: number;
/** Currency code; the parser in this file falls back to "CAD". */
currency?: string;
};
/** Listing type; the parser in this file always sets "item". */
listingType?: string;
/** "SOLD", "PENDING", "ACTIVE" or "HIDDEN", or undefined when no status flag is set. */
listingStatus?: string;
/** ISO-8601 timestamp derived from the listing's creation time. */
creationDate?: string;
/** End date (not populated by the parser in this file). */
endDate?: string;
/** View count (not populated by the parser in this file). */
numberOfViews?: number;
/** City display name when available, otherwise null. */
address?: string | null;
// Facebook-specific fields
/** URI of the primary listing photo. */
imageUrl?: string;
/** URL built from the listing video's id. */
videoUrl?: string;
/** Seller name and id, when the listing carries seller data. */
seller?: {
name?: string;
id?: string;
};
/** Marketplace category id. */
categoryId?: string;
/** Delivery types as reported by Facebook. */
deliveryTypes?: string[];
}
// ----------------------------- Utilities -----------------------------
/**
 * Type guard: true when `candidate` is an object with string `name` and `value`.
 */
function isCookieLike(candidate: unknown): candidate is Cookie {
  if (typeof candidate !== "object" || candidate === null) return false;
  const { name, value } = candidate as { name?: unknown; value?: unknown };
  return typeof name === "string" && typeof value === "string";
}

/**
 * Load Facebook cookies from a JSON string or, failing that, from
 * ./cookies/facebook.json.
 *
 * @param cookiesSource - Optional JSON text holding an array of cookie objects.
 * @returns The entries that have string `name` and `value`. Returns an empty
 *   array when neither source yields an array.
 * @throws Error when `cookiesSource` is provided but is not valid JSON.
 */
async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
  // First try to load from the provided string parameter
  if (cookiesSource) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(cookiesSource);
    } catch (e) {
      throw new Error(`Invalid cookies JSON provided: ${e}`);
    }
    if (Array.isArray(parsed)) {
      return parsed.filter(isCookieLike);
    }
    // Valid JSON but not an array: say so instead of silently ignoring it,
    // then fall through to the cookie file as before.
    console.warn(
      "Provided cookies JSON is not an array; falling back to ./cookies/facebook.json",
    );
  }

  // Try to load from ./cookies/facebook.json
  try {
    const cookiesPath = "./cookies/facebook.json";
    const file = Bun.file(cookiesPath);
    if (await file.exists()) {
      const parsed: unknown = JSON.parse(await file.text());
      if (Array.isArray(parsed)) {
        return parsed.filter(isCookieLike);
      }
    }
  } catch (e) {
    // Best effort: an unreadable or malformed file is reported, not fatal.
    console.warn(`Could not load cookies from ./cookies/facebook.json: ${e}`);
  }

  return [];
}
/**
 * Format a cookies array into a `Cookie` request-header value for one host.
 *
 * A cookie is included when it is not expired and its domain matches `domain`:
 * - a domain with a leading "." matches the bare domain itself and any
 *   subdomain of it (the match is anchored on a "." boundary, so
 *   ".facebook.com" does not match "evilfacebook.com");
 * - any other domain must equal `domain` exactly (host-only cookie).
 * Domain comparison is case-insensitive. Cookies without a usable `domain`
 * are skipped.
 *
 * @param cookies - Cookies to consider.
 * @param domain - Host the request is sent to, e.g. "www.facebook.com".
 * @returns `name=value` pairs joined by "; ", or "" when nothing applies.
 */
function formatCookiesForHeader(cookies: Cookie[], domain: string): string {
  const host = domain.toLowerCase();
  const nowSeconds = Date.now() / 1000;

  return cookies
    .filter((cookie) => {
      // Entries loaded from JSON are only validated for name/value, so guard here.
      if (typeof cookie.domain !== "string" || cookie.domain === "") return false;

      const cookieDomain = cookie.domain.toLowerCase();
      if (cookieDomain.startsWith(".")) {
        // Domain cookie (applies to the bare domain and its subdomains)
        const bareDomain = cookieDomain.slice(1);
        return host === bareDomain || host.endsWith(`.${bareDomain}`);
      }
      // Host-only cookie
      return cookieDomain === host;
    })
    // Drop cookies whose expiry (epoch seconds) is already in the past.
    .filter((cookie) => !(cookie.expirationDate && cookie.expirationDate < nowSeconds))
    .map((cookie) => `${cookie.name}=${cookie.value}`)
    .join("; ");
}
/**
 * Error raised when an HTTP request comes back with a non-success status.
 * Carries the response status code and the URL that was requested.
 */
class HttpError extends Error {
  public readonly status: number;
  public readonly url: string;

  constructor(message: string, status: number, url: string) {
    super(message);
    this.status = status;
    this.url = url;
    this.name = "HttpError";
  }
}
// ----------------------------- HTTP Client -----------------------------
/**
 * Fetch HTML with a basic retry strategy and a fixed delay after each success.
 *
 * - Retries on 429, on 5xx, and on errors thrown by `fetch` / body reading,
 *   up to `maxRetries` additional attempts.
 * - On 429, waits `X-RateLimit-Reset` seconds when that header is numeric,
 *   otherwise uses a linear backoff of `(attempt + 1) * retryBaseMs`.
 * - Any other non-success status fails immediately without retrying.
 * - Sends `opts.cookies` as the Cookie header when provided.
 *
 * @param url - URL to fetch.
 * @param DELAY_MS - Delay applied after a successful response, for rate limiting.
 * @param opts - `maxRetries` (default 3), `retryBaseMs` (default 500),
 *   `onRateInfo` callback receiving the X-RateLimit-Remaining / -Reset header
 *   values of every response, and `cookies` header string.
 * @returns The response body text.
 * @throws HttpError for a non-retryable status, or for a retryable status once
 *   retries are exhausted. Rethrows the underlying error when `fetch` itself
 *   keeps failing.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
    cookies?: string;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries;
    try {
      const headers: Record<string, string> = {
        accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "cache-control": "no-cache",
        "upgrade-insecure-requests": "1",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "user-agent":
          "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      };
      // Add cookies if provided
      if (opts?.cookies) {
        headers["cookie"] = opts.cookies;
      }

      const res = await fetch(url, {
        method: "GET",
        headers,
      });

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (res.ok) {
        const html = await res.text();
        // Respect per-request delay to keep at or under REQUESTS_PER_SECOND
        await delay(DELAY_MS);
        return html;
      }

      const isRateLimited = res.status === 429;
      const isServerError = res.status >= 500 && res.status < 600;

      if ((!isRateLimited && !isServerError) || isLastAttempt) {
        // For Facebook, a 4xx (other than 429) often means authentication is required.
        const isClientError =
          res.status >= 400 && res.status < 500 && !isRateLimited;
        const hint = isClientError
          ? " (Facebook may require authentication cookies for access)"
          : "";
        throw new HttpError(
          `Request failed with status ${res.status}${hint}`,
          res.status,
          url,
        );
      }

      // Retryable status: honour the 429 reset header when it is numeric.
      const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
      const waitMs =
        isRateLimited && Number.isFinite(resetSeconds)
          ? Math.max(0, resetSeconds * 1000)
          : (attempt + 1) * retryBaseMs;
      await delay(waitMs);
    } catch (err) {
      // An HttpError is a final verdict on the response; retrying it here would
      // re-send requests that were deliberately classified as non-retryable.
      if (err instanceof HttpError || isLastAttempt) throw err;
      await delay((attempt + 1) * retryBaseMs);
    }
  }

  throw new Error("Exhausted retries without response");
}
// ----------------------------- Parsing -----------------------------
/**
 * Extract marketplace search data from Facebook page script tags.
 *
 * Each <script> whose text parses as JSON is probed for a `marketplace_search`
 * object that has `feed_units.edges`: first through several known paths under
 * `require`, then as a top-level `marketplace_search` key. The first match wins.
 *
 * @param htmlString - HTML of the marketplace search page.
 * @returns One `{ node }` wrapper per edge, or `null` (after a warning) when
 *   no script yields marketplace data.
 */
function extractFacebookMarketplaceData(
htmlString: HTMLString,
): FacebookAdNode[] | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
let marketplaceData: FacebookMarketplaceSearch | null = null;
// Find the script containing the require data with marketplace_search
for (const script of scripts as unknown as HTMLScriptElement[]) {
const scriptText = script.textContent;
if (!scriptText) continue;
try {
// Scripts whose text is not pure JSON throw here and are skipped by the outer catch.
const parsed = JSON.parse(scriptText);
// First check if this is the direct data structure (like in examples)
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple navigation paths to find marketplace_search
// Each path is a thunk so that a missing intermediate key only fails that one path.
const paths = [
// Original path from example
() => parsed.require[0][3][0]['__bbox']['require'][0][3][1]['__bbox']['result']['data']['marketplace_search'],
// Alternative path structure
() => parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
// Another variation
() => parsed.require[0][3][0]['__bbox']['result']['data']['marketplace_search'],
// Direct access for some responses
() => {
for (const item of parsed.require) {
if (item && item.length >= 4 && item[3]) {
const bbox = item[3]?.['__bbox']?.result?.data?.marketplace_search;
if (bbox) return bbox;
}
}
return null;
}
];
for (const getData of paths) {
try {
const result = getData();
// Accept only an object that actually carries feed_units.edges.
if (result && isRecord(result) && result.feed_units?.edges) {
marketplaceData = result as FacebookMarketplaceSearch;
break;
}
} catch {
// This path does not exist in the payload; try the next one.
continue;
}
}
if (marketplaceData) break;
}
// Also check for direct marketplace_search in the parsed data
if (parsed.marketplace_search && isRecord(parsed.marketplace_search) && parsed.marketplace_search.feed_units?.edges) {
marketplaceData = parsed.marketplace_search as FacebookMarketplaceSearch;
break;
}
} catch {
// Ignore parsing errors for other scripts
continue;
}
}
if (!marketplaceData?.feed_units?.edges) {
console.warn("No marketplace data found in HTML response");
return null;
}
console.log(`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`);
// Keep only each edge's `node`; other edge fields are dropped.
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
}
/**
 * Parse Facebook marketplace search results into FacebookListingDetails[].
 *
 * Ads without a title, without a price object, or without a positive finite
 * price are skipped. An ad that throws while being read is also skipped.
 *
 * Price resolution, in order:
 * 1. `listing_price.amount` (dollars, e.g. "1.00") converted to cents.
 * 2. Otherwise, when `amount_with_offset_in_currency` is a positive number:
 *    the numeric part of `formatted_amount` (e.g. "CA$1,234") converted to
 *    cents, falling back to the encoded amount itself when no number can be
 *    extracted.
 *
 * @param ads - Raw ad nodes extracted from the page.
 * @returns The normalized listings.
 */
function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
  const results: FacebookListingDetails[] = [];

  for (const adJson of ads) {
    try {
      const listing = adJson.node.listing;
      const title = listing.marketplace_listing_title;
      const priceObj = listing.listing_price;
      if (!title || !priceObj) continue;

      const url = `https://www.facebook.com/marketplace/item/${listing.id}`;

      // Facebook stores price in different fields:
      // - amount_with_offset_in_currency: Facebook's internal price encoding (not cents)
      // - amount: dollars (like "1.00")
      // - formatted_amount: human-readable price (like "CA$1")
      let cents: number;
      if (priceObj.amount != null) {
        const dollars =
          typeof priceObj.amount === 'string'
            ? Number.parseFloat(priceObj.amount)
            : priceObj.amount;
        cents = Math.round(dollars * 100);
      } else if (priceObj.amount_with_offset_in_currency != null) {
        const encodedAmount = Number(priceObj.amount_with_offset_in_currency);
        if (Number.isNaN(encodedAmount) || encodedAmount <= 0) continue; // Invalid price

        // The encoded field doesn't contain real cents, so prefer the dollar
        // figure shown in formatted_amount.
        const match = priceObj.formatted_amount
          ? priceObj.formatted_amount.match(/\d[\d,]*(?:\.\d+)?/)
          : null;
        // Strip ALL thousands separators ("1,234,567" → "1234567"), not just the first.
        const dollars = match
          ? Number.parseFloat(match[0].replace(/,/g, ''))
          : Number.NaN;
        cents = Number.isNaN(dollars) ? encodedAmount : Math.round(dollars * 100);
      } else {
        continue; // No price available
      }
      if (!Number.isFinite(cents) || cents <= 0) continue;

      // Extract address from location data if available
      const cityName =
        listing.location?.reverse_geocode?.city_page?.display_name;
      const address = cityName || null;

      // Determine listing status from Facebook flags (first matching flag wins)
      let listingStatus: string | undefined = undefined;
      if (listing.is_sold) {
        listingStatus = "SOLD";
      } else if (listing.is_pending) {
        listingStatus = "PENDING";
      } else if (listing.is_live) {
        listingStatus = "ACTIVE";
      } else if (listing.is_hidden) {
        listingStatus = "HIDDEN";
      }

      // creation_time is multiplied by 1000 to get the millisecond value Date expects
      const creationDate = listing.creation_time
        ? new Date(listing.creation_time * 1000).toISOString()
        : undefined;

      // Extract image and video URLs
      const imageUrl = listing.primary_listing_photo?.image?.uri;
      const videoUrl = listing.listing_video
        ? `https://www.facebook.com/${listing.listing_video.id}/`
        : undefined;

      // Extract seller information
      const seller = listing.marketplace_listing_seller
        ? {
            name: listing.marketplace_listing_seller.name,
            id: listing.marketplace_listing_seller.id,
          }
        : undefined;

      const listingDetails: FacebookListingDetails = {
        url,
        title,
        listingPrice: {
          // NOTE(review): formatCentsToCurrency is called with cents / 100 here — confirm the unit it expects.
          amountFormatted: priceObj.formatted_amount || formatCentsToCurrency(cents / 100, "en-CA"),
          cents,
          currency: priceObj.currency || "CAD", // Facebook marketplace often uses CAD
        },
        address,
        creationDate,
        listingType: "item", // Default type for marketplace listings
        listingStatus,
        categoryId: listing.marketplace_listing_category_id,
        imageUrl,
        videoUrl,
        seller,
        deliveryTypes: listing.delivery_types,
      };
      results.push(listingDetails);
    } catch {
      // Skip malformed ads
      continue;
    }
  }

  return results;
}
// ----------------------------- Main -----------------------------
/**
 * Search Facebook Marketplace and return the priced listings found on the
 * first results page.
 *
 * @param SEARCH_QUERY - Free-text search terms (URL-encoded before use).
 * @param REQUESTS_PER_SECOND - Used to derive the post-request delay.
 * @param LOCATION - Marketplace location segment of the URL.
 * @param MAX_ITEMS - Maximum number of listings returned.
 * @param cookiesSource - Optional cookies JSON passed to loadFacebookCookies.
 * @returns Up to MAX_ITEMS listings; an empty array when the request fails
 *   with an HTTP error or when no ads can be extracted.
 * @throws Error when no cookies are available or none apply to www.facebook.com.
 */
export default async function fetchFacebookItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  LOCATION = "toronto",
  MAX_ITEMS = 25,
  cookiesSource?: string,
) {
  // Cookies are mandatory: bail out early when none can be loaded.
  const cookies = await loadFacebookCookies(cookiesSource);
  if (cookies.length === 0) {
    throw new Error(
      "Facebook cookies are required for marketplace access. " +
        "Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies.",
    );
  }

  // Build the Cookie header for the host we are about to call.
  const cookiesHeader = formatCookiesForHeader(cookies, "www.facebook.com");
  if (!cookiesHeader) {
    throw new Error(
      "No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
    );
  }

  const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
  const searchUrl = `https://www.facebook.com/marketplace/${LOCATION}/search?query=${encodeURIComponent(SEARCH_QUERY)}&sortBy=creation_time_descend&exact=false`;

  console.log(`Fetching Facebook marketplace: ${searchUrl}`);
  console.log(`Using ${cookies.length} cookies for authentication`);

  // Logs rate-limit headers only when both are present.
  const reportRateLimit = (remaining: string | null, reset: string | null) => {
    if (!remaining || !reset) return;
    console.log(
      `\nFacebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
    );
  };

  let searchHtml: string;
  try {
    searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
      maxRetries: 3,
      onRateInfo: reportRateLimit,
      cookies: cookiesHeader,
    });
  } catch (err) {
    // Only HTTP status failures are downgraded to "no results".
    if (!(err instanceof HttpError)) throw err;

    console.warn(
      `\nFacebook marketplace access failed (${err.status}): ${err.message}`,
    );
    const looksLikeAuthFailure = [400, 401, 403].includes(err.status);
    if (looksLikeAuthFailure) {
      console.warn(
        "This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies.",
      );
    }
    return [];
  }

  const ads = extractFacebookMarketplaceData(searchHtml);
  if (!ads || ads.length === 0) {
    console.warn("No ads parsed from Facebook marketplace page.");
    return [];
  }

  console.log(`\nFound ${ads.length} raw ads. Processing...`);

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  const adCount = ads.length;
  progressBar.start(adCount, 0);

  // parseFacebookAds already drops unpriced ads; this filter is a second guard.
  const pricedItems = parseFacebookAds(ads).filter((item) => {
    const cents = item.listingPrice?.cents;
    return cents !== undefined && cents > 0;
  });

  // Parsing is synchronous, so the bar jumps straight to completion.
  progressBar.update(adCount);
  progressBar.stop();

  console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
  return pricedItems.slice(0, MAX_ITEMS);
}

View File

@@ -0,0 +1,290 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import cliProgress from "cli-progress";
import { fetchHtml, isRecord, HttpError } from "../utils/http";
import { delay } from "../utils/delay";
import { formatCentsToCurrency } from "../utils/format";
import type { HTMLString } from "../types/common";
// ----------------------------- Types -----------------------------
/** One search-page hit: the listing's title and its absolute detail-page URL. */
type SearchListing = {
name: string;
listingLink: string;
};
/** The `__APOLLO_STATE__` object from a page's `__NEXT_DATA__` JSON, keyed by entity key. */
type ApolloRecord = Record<string, unknown>;
/**
 * An Apollo state entry as seen on a search page. An entry is treated as a
 * listing only when both `url` and `title` are strings.
 */
interface ApolloSearchItem {
url?: string;
title?: string;
[k: string]: unknown;
}
/**
 * The listing entity found in the Apollo state of a listing detail page.
 * All fields are optional because the payload is not validated beyond being an object.
 */
interface ApolloListingRoot {
/** Listing URL; may be relative, in which case the base URL is prepended. */
url?: string;
title?: string;
description?: string;
/** `amount` is converted with `Number()` and stored as the listing's `cents`. */
price?: { amount?: number | string; currency?: string };
type?: string;
status?: string;
/** Surfaced as the listing's `creationDate`. */
activationDate?: string;
endDate?: string;
/** `views` is converted with `Number()`. */
metrics?: { views?: number | string };
location?: { address?: string | null };
[k: string]: unknown;
}
/**
 * Normalized shape of one Kijiji listing parsed from a listing detail page.
 * Only `url` and `title` are required; every other field is optional.
 */
export interface KijijiListingDetails {
/** Absolute listing URL. */
url: string;
/** Listing title. */
title: string;
/** Listing description as provided by the page data. */
description?: string;
/** Present only when the page data carries a price amount. */
listingPrice?: {
/** Price formatted for display. */
amountFormatted: string;
/** Numeric price amount; omitted when it is not a finite number. */
cents?: number;
/** Currency code as provided by the page data. */
currency?: string;
};
/** Listing type as provided by the page data. */
listingType?: string;
/** Listing status as provided by the page data. */
listingStatus?: string;
/** The page data's activation date. */
creationDate?: string;
/** The page data's end date. */
endDate?: string;
/** View count; omitted when it is not a finite number. */
numberOfViews?: number;
/** Location address, or null when the page data has none. */
address?: string | null;
}
// ----------------------------- Utilities -----------------------------
const SEPS = new Set([" ", "", "—", "/", ":", ";", ",", ".", "-"]);
/**
 * Turn free text into the slug used in Kijiji search URLs.
 *
 * The input is transliterated with `unidecode` and lower-cased. Letters a-z
 * and digits 0-9 are kept, each run of separator characters (see SEPS)
 * collapses into a single "-", and every other character is dropped.
 */
export function slugify(input: string): string {
  const ascii = unidecode(input).toLowerCase();
  let slug = "";
  let previousWasHyphen = false;

  for (const char of ascii) {
    const isLetter = char >= "a" && char <= "z";
    const isDigit = char >= "0" && char <= "9";

    if (isLetter || isDigit) {
      slug += char;
      previousWasHyphen = false;
      continue;
    }

    // Separators emit at most one hyphen per run; anything else is discarded.
    if (SEPS.has(char) && !previousWasHyphen) {
      slug += "-";
      previousWasHyphen = true;
    }
  }

  return slug;
}
// ----------------------------- Parsing -----------------------------
/**
 * Safely pull `props.pageProps.__APOLLO_STATE__` out of the JSON embedded in
 * a Kijiji page's `__NEXT_DATA__` element.
 *
 * @returns The Apollo state object, or `null` when the element is missing or
 *   empty, the JSON is invalid, or the state is not a record.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const { document } = parseHTML(htmlString);
  const payload = document.getElementById("__NEXT_DATA__")?.textContent;
  if (!payload) return null;

  try {
    const state = JSON.parse(payload)?.props?.pageProps?.__APOLLO_STATE__;
    if (isRecord(state)) return state;
    return null;
  } catch {
    // Malformed JSON is treated the same as "no state".
    return null;
  }
}
/**
 * Turn a search page's Apollo state into SearchListing[].
 *
 * Only entries whose key contains "Listing" and whose value is a record with
 * string `url` and `title` are kept. Relative URLs are prefixed with BASE_URL.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return [];

  return Object.entries(apolloState).flatMap(([key, value]): SearchListing[] => {
    // Heuristic: Kijiji listing keys usually contain "Listing"
    if (!key.includes("Listing") || !isRecord(value)) return [];

    const { url, title } = value as ApolloSearchItem;
    if (typeof url !== "string" || typeof title !== "string") return [];

    const listingLink = url.startsWith("http") ? url : `${BASE_URL}${url}`;
    return [{ listingLink, name: title }];
  });
}
/**
 * Parse a listing detail page into a KijijiListingDetails object.
 *
 * Uses the first Apollo state entry whose key contains "Listing".
 *
 * @returns The listing, or `null` when the Apollo state or listing entry is
 *   missing, or when the entry has no URL or title.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): KijijiListingDetails | null {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key
  const listingKey = Object.keys(apolloState).find((k) => k.includes("Listing"));
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;
  const listing = root as ApolloListingRoot;

  // Price: only computed when the payload carries an amount.
  const rawAmount = listing.price?.amount;
  const cents = rawAmount != null ? Number(rawAmount) : undefined;
  let amountFormatted: string | undefined;
  if (cents != null) {
    amountFormatted = formatCentsToCurrency(cents / 100, "en-CA");
  }

  const rawViews = listing.metrics?.views;
  const numberOfViews = rawViews != null ? Number(rawViews) : undefined;

  // Relative URLs are resolved against BASE_URL; a non-string URL becomes "".
  let listingUrl = "";
  if (typeof listing.url === "string") {
    listingUrl = listing.url.startsWith("http")
      ? listing.url
      : `${BASE_URL}${listing.url}`;
  }
  if (!listingUrl || !listing.title) return null;

  const finiteOrUndefined = (n: number | undefined): number | undefined =>
    n !== undefined && Number.isFinite(n) ? n : undefined;

  return {
    url: listingUrl,
    title: listing.title,
    description: listing.description,
    listingPrice: amountFormatted
      ? {
          amountFormatted,
          cents: finiteOrUndefined(cents),
          currency: listing.price?.currency,
        }
      : undefined,
    listingType: listing.type,
    listingStatus: listing.status,
    creationDate: listing.activationDate,
    endDate: listing.endDate,
    numberOfViews: finiteOrUndefined(numberOfViews),
    address: listing.location?.address ?? null,
  };
}
// ----------------------------- Main -----------------------------
/**
 * Search Kijiji (GTA region) and fetch the details of every listing found on
 * the first results page.
 *
 * @param SEARCH_QUERY - Free-text search terms; slugified into the URL path.
 * @param REQUESTS_PER_SECOND - Used to derive the delay passed to fetchHtml.
 * @param BASE_URL - Site origin, also used to resolve relative listing URLs.
 * @returns The listings that have a non-zero price, or `undefined` when the
 *   search page yields no results. A listing page that fails to load is
 *   logged and skipped.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
) {
  const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
  const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;

  console.log(`Fetching search: ${searchUrl}`);
  const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
    maxRetries: 3,
    onRateInfo: (remaining, reset) => {
      if (remaining && reset) {
        console.log(
          "\n" +
            `Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
        );
      }
    },
  });

  const searchResults = parseSearch(searchHtml, BASE_URL);
  if (searchResults.length === 0) {
    console.warn("No search results parsed from page.");
    // Kept as a bare return so existing callers still receive `undefined` here.
    return;
  }

  // Deduplicate links
  const listingLinks = Array.from(
    new Set(searchResults.map((r) => r.listingLink)),
  );
  console.log(
    "\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
  );

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  const totalProgress = listingLinks.length;
  let currentProgress = 0;
  progressBar.start(totalProgress, currentProgress);

  const items: KijijiListingDetails[] = [];
  try {
    for (const link of listingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          maxRetries: 3,
          onRateInfo: (remaining, reset) => {
            if (remaining && reset) {
              console.log(
                "\n" +
                  `Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
              );
            }
          },
        });
        const parsed = parseListing(html, BASE_URL);
        if (parsed) {
          // Only listings with a non-zero price are kept.
          if (parsed.listingPrice?.cents) items.push(parsed);
        }
      } catch (err) {
        // A single failing listing must not abort the whole run.
        if (err instanceof HttpError) {
          console.error(
            "\n" + `Failed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
          );
        } else {
          console.error(
            "\n" +
              `Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
          );
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }
  } finally {
    // Always stop the bar once it has been started, as the Facebook scraper does.
    progressBar.stop();
  }

  console.log("\n" + `Parsed ${items.length} listings.`);
  return items;
}

View File

@@ -0,0 +1,20 @@
/**
 * HTML string alias for better type clarity.
 *
 * This is a plain alias of `string`, so it documents intent only: any string
 * is assignable to it and the compiler does not distinguish HTML from other
 * text.
 */
export type HTMLString = string;
/**
 * Price of a listing: a display string together with a numeric amount and a
 * currency identifier. All three fields are required.
 */
export interface Price {
  /** Human-readable price text intended for display. */
  amountFormatted: string;
  /** Numeric amount expressed in cents. */
  cents: number;
  /** Currency identifier; presumably an ISO 4217 code such as "CAD" (confirm against the scrapers that populate it). */
  currency: string;
}
/**
 * Base listing details common across all marketplaces.
 *
 * `url`, `title`, `listingPrice`, `listingType` and `listingStatus` are
 * required; `address` and `creationDate` may be omitted.
 */
export interface ListingDetails {
  /** URL of the listing page. */
  url: string;
  /** Listing title. */
  title: string;
  /** Price of the listing (formatted text, cents and currency). */
  listingPrice: Price;
  /** Marketplace-provided listing type; the set of values is not constrained here. */
  listingType: string;
  /** Marketplace-provided listing status; the set of values is not constrained here. */
  listingStatus: string;
  /** Listing address; may be absent or explicitly `null`. */
  address?: string | null;
  /** Creation date as a string; the format is not specified by this type. */
  creationDate?: string;
}

View File

@@ -0,0 +1,8 @@
/**
 * Pause for the given number of milliseconds.
 *
 * Wraps `setTimeout` in a promise so callers can simply `await delay(ms)`.
 *
 * @param ms - How long to wait, in milliseconds
 * @returns A promise that settles (with no value) once the timer fires
 */
export function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

View File

@@ -0,0 +1,21 @@
/**
 * Format an amount given in cents as a human-readable currency string.
 *
 * The value is divided by 100 before formatting, so callers must pass cents
 * (e.g. `150` for one dollar fifty), not dollars.
 *
 * @param cents - Amount in cents
 * @param locale - Locale tag used for formatting (e.g., 'en-CA', 'en-US')
 * @param currency - Currency code used for formatting; defaults to "CAD",
 *   which was previously the only supported currency
 * @returns The amount formatted with exactly two fraction digits. If `Intl`
 *   rejects the locale or currency, a plain fallback is returned instead:
 *   `$1.50` for CAD, `<currency> 1.50` for anything else.
 */
export function formatCentsToCurrency(
  cents: number,
  locale: string = "en-CA",
  currency: string = "CAD",
): string {
  const dollars = cents / 100;
  try {
    return new Intl.NumberFormat(locale, {
      style: "currency",
      currency,
      minimumFractionDigits: 2,
      maximumFractionDigits: 2,
    }).format(dollars);
  } catch {
    // Intl throws on a malformed locale tag or currency code; degrade to a
    // fixed two-decimal rendering rather than failing the caller.
    const fixed = dollars.toFixed(2);
    return currency === "CAD" ? `$${fixed}` : `${currency} ${fixed}`;
  }
}

View File

@@ -0,0 +1,87 @@
/**
 * Error raised for HTTP-related failures.
 *
 * Carries the numeric status code next to the usual `message`, and reports
 * its `name` as "HttpError".
 */
export class HttpError extends Error {
  /** HTTP status code associated with the failure. */
  public statusCode: number;

  constructor(statusCode: number, message: string) {
    super(message);
    this.statusCode = statusCode;
    this.name = "HttpError";
  }
}
/**
 * Type guard for plain key/value access.
 *
 * Accepts any non-null value whose `typeof` is "object" and that is not an
 * array; primitives, functions, `null`, `undefined` and arrays are rejected.
 */
export function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
/**
 * Convert a `Retry-After` header value into a wait time in milliseconds.
 * Accepts either a whole number of seconds or an HTTP-date; returns `null`
 * when the header is missing or cannot be parsed.
 */
function retryAfterToMs(headerValue: string | null): number | null {
  if (headerValue === null) return null;
  const trimmed = headerValue.trim();
  if (/^\d+$/.test(trimmed)) return Number(trimmed) * 1000;
  const retryAt = Date.parse(trimmed);
  // A date in the past means "retry now", never a negative wait.
  return Number.isNaN(retryAt) ? null : Math.max(0, retryAt - Date.now());
}

/**
 * Fetch HTML content from a URL with automatic retries.
 *
 * Up to three attempts are made. Network failures, HTTP 429 and HTTP 5xx
 * responses are retried with a linearly growing delay
 * (`delayMs * attemptNumber`); a 429 honours the `Retry-After` header when it
 * can be parsed. Any other non-2xx response is treated as permanent and is
 * thrown immediately without retrying.
 *
 * @param url - The URL to fetch
 * @param delayMs - Base delay in milliseconds between retries
 * @param opts - Optional fetch options, passed to `fetch` unchanged
 * @returns The response body as a string
 * @throws HttpError for a non-retryable status, or for a 429/5xx status that
 *   persists through the final attempt (carrying that status code)
 * @throws Error the underlying failure when every attempt fails at the
 *   network level
 */
export async function fetchHtml(
  url: string,
  delayMs: number,
  opts?: RequestInit
): Promise<string> {
  const maxAttempts = 3;
  const sleep = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt === maxAttempts - 1;
    const backoffMs = delayMs * (attempt + 1);

    try {
      const response = await fetch(url, opts);

      // Check for rate limiting
      if (response.status === 429) {
        const rateLimited = new HttpError(
          response.status,
          `Rate limited: ${response.status}`
        );
        // No point sleeping when there is no attempt left to make.
        if (isLastAttempt) throw rateLimited;
        lastError = rateLimited;
        const waitTime =
          retryAfterToMs(response.headers.get("Retry-After")) ?? backoffMs;
        console.warn(
          `Rate limited. Retrying after ${waitTime}ms...`
        );
        await sleep(waitTime);
        continue;
      }

      // Check for server errors
      if (response.status >= 500) {
        const serverError = new HttpError(
          response.status,
          `Server error: ${response.status}`
        );
        if (isLastAttempt) throw serverError;
        lastError = serverError;
        await sleep(backoffMs);
        continue;
      }

      // Any other unsuccessful status is not retried.
      if (!response.ok) {
        throw new HttpError(
          response.status,
          `HTTP ${response.status}: ${response.statusText}`
        );
      }

      return await response.text();
    } catch (error) {
      // Every HttpError thrown above is a final verdict; only network-level
      // failures (fetch or body-read errors) fall through to be retried.
      if (error instanceof HttpError) throw error;
      lastError =
        error instanceof Error
          ? error
          : new Error("Unknown error during fetch");
      if (!isLastAttempt) {
        await sleep(backoffMs);
      }
    }
  }

  throw lastError ?? new HttpError(0, "Failed to fetch after retries");
}