feat: port upstream scraper improvements to monorepo

Kijiji improvements:
- Add error classes: NetworkError, ParseError, RateLimitError, ValidationError
- Add exponential backoff with jitter for retries
- Add request timeout (30s abort)
- Add pagination support (SearchOptions.maxPages)
- Add location/category mappings and resolution functions
- Add enhanced DetailedListing interface with images, seller info, attributes
- Add GraphQL client for seller details

Facebook improvements:
- Add parseFacebookCookieString() for parsing cookie strings
- Add ensureFacebookCookies() with env var fallback
- Add extractFacebookItemData() with multiple extraction paths
- Add fetchFacebookItem() for individual item fetching
- Add extraction metrics and API stability monitoring
- Add vehicle-specific field extraction
- Improve error handling with specific guidance for auth errors

Shared utilities:
- Update http.ts with new error classes and improved fetchHtml

Documentation:
- Port KIJIJI.md, FMARKETPLACE.md, AGENTS.md from upstream

Tests:
- Port kijiji-core, kijiji-integration, kijiji-utils tests
- Port facebook-core, facebook-integration tests
- Add test setup file

Scripts:
- Port parse-facebook-cookies.ts script

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-23 00:34:50 -05:00
parent 497c7995a2
commit 50d56201af
14 changed files with 4687 additions and 179 deletions

View File

@@ -1,8 +1,36 @@
// Export all scrapers
export { default as fetchKijijiItems, slugify } from "./scrapers/kijiji";
export type { KijijiListingDetails } from "./scrapers/kijiji";
export {
default as fetchKijijiItems,
slugify,
resolveLocationId,
resolveCategoryId,
buildSearchUrl,
extractApolloState,
parseSearch,
parseDetailedListing,
HttpError,
NetworkError,
ParseError,
RateLimitError,
ValidationError,
} from "./scrapers/kijiji";
export type {
KijijiListingDetails,
DetailedListing,
SearchOptions,
ListingFetchOptions,
} from "./scrapers/kijiji";
export { default as fetchFacebookItems } from "./scrapers/facebook";
export {
default as fetchFacebookItems,
fetchFacebookItem,
parseFacebookCookieString,
ensureFacebookCookies,
extractFacebookMarketplaceData,
extractFacebookItemData,
parseFacebookAds,
parseFacebookItem,
} from "./scrapers/facebook";
export type { FacebookListingDetails } from "./scrapers/facebook";
export { default as fetchEbayItems } from "./scrapers/ebay";

View File

@@ -26,7 +26,7 @@ interface Cookie {
sameSite?: "strict" | "lax" | "none" | "unspecified";
session?: boolean;
expirationDate?: number;
partitionKey?: any;
partitionKey?: Record<string, unknown>;
storeId?: string;
}
@@ -38,6 +38,8 @@ interface FacebookAdNode {
listing_price?: {
amount?: string | number;
currency?: string;
amount_with_offset_in_currency?: string | number;
formatted_amount?: string;
};
location?: {
reverse_geocode?: {
@@ -47,6 +49,24 @@ interface FacebookAdNode {
};
};
creation_time?: number;
is_sold?: boolean;
is_pending?: boolean;
is_live?: boolean;
is_hidden?: boolean;
primary_listing_photo?: {
image?: {
uri?: string;
};
};
listing_video?: {
id?: string;
};
marketplace_listing_seller?: {
name?: string;
id?: string;
};
marketplace_listing_category_id?: string;
delivery_types?: string[];
[k: string]: unknown;
};
[k: string]: unknown;
@@ -65,6 +85,97 @@ interface FacebookMarketplaceSearch {
[k: string]: unknown;
}
/**
 * Shape of a single Marketplace item payload as returned by
 * `extractFacebookItemData` and consumed by `parseFacebookItem`.
 *
 * Only `id`, `__typename` and `marketplace_listing_title` are required; every
 * other field is optional, and the trailing index signature admits any extra
 * keys present in the payload.
 */
interface FacebookMarketplaceItem {
// Basic identification
// `id` is interpolated into the item URL by parseFacebookItem.
id: string;
// The extractor only accepts objects whose __typename is exactly this literal.
__typename: "GroupCommerceProductItem";
// Listing content
marketplace_listing_title: string;
// Used as the listing description by parseFacebookItem.
redacted_description?: {
text: string;
};
// Fallback title when marketplace_listing_title is empty.
custom_title?: string;
// Pricing
// Display string for the price; preferred over a locally formatted amount.
formatted_price?: {
text: string;
};
listing_price?: {
// Parsed with Number.parseFloat and multiplied by 100 to obtain cents,
// i.e. treated as a decimal amount in major currency units.
amount: string;
currency: string;
// NOTE(review): not read by any code in this file; exact meaning unconfirmed.
amount_with_offset: string;
};
// Location
// Human-readable location; becomes `address` in the parsed listing.
location_text?: {
text: string;
};
location?: {
latitude: number;
longitude: number;
reverse_geocode_detailed?: {
country_alpha_two: string;
postal_code_trimmed: string;
};
};
// Status flags
// parseFacebookItem maps these to a status with precedence
// sold > pending > live > hidden.
is_live?: boolean;
is_sold?: boolean;
is_pending?: boolean;
is_hidden?: boolean;
is_draft?: boolean;
// Timing
// Multiplied by 1000 before being passed to `new Date(...)`, i.e. treated as
// seconds since the Unix epoch.
creation_time?: number;
// Seller information
marketplace_listing_seller?: {
__typename: "User";
id: string;
name: string;
profile_picture?: {
uri: string;
};
// NOTE(review): presumably an epoch timestamp like creation_time; confirm.
join_time?: number;
};
// Vehicle-specific fields (for automotive listings)
// Presence of vehicle_make_display_name or vehicle_odometer_data makes
// parseFacebookItem classify the listing as "vehicle".
vehicle_make_display_name?: string;
vehicle_model_display_name?: string;
vehicle_odometer_data?: {
unit: "KILOMETERS" | "MILES";
value: number;
};
vehicle_transmission_type?: "AUTOMATIC" | "MANUAL";
vehicle_exterior_color?: string;
vehicle_interior_color?: string;
vehicle_condition?: "EXCELLENT" | "GOOD" | "FAIR" | "POOR";
vehicle_fuel_type?: string;
vehicle_trim_display_name?: string;
// Category and commerce
marketplace_listing_category_id?: string;
condition?: string;
// Commerce features
delivery_types?: string[];
is_shipping_offered?: boolean;
is_buy_now_enabled?: boolean;
can_buyer_make_checkout_offer?: boolean;
// Communication
messaging_enabled?: boolean;
first_message_suggested_value?: string;
// Metadata
logging_id?: string;
reportable_ent_id?: string;
// Any other keys present in the payload are allowed but untyped.
[k: string]: unknown;
}
export interface FacebookListingDetails {
url: string;
title: string;
@@ -96,7 +207,10 @@ export interface FacebookListingDetails {
/**
* Load Facebook cookies from file or string
*/
async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
async function loadFacebookCookies(
cookiesSource?: string,
cookiePath = "./cookies/facebook.json"
): Promise<Cookie[]> {
// First try to load from provided string parameter
if (cookiesSource) {
try {
@@ -106,7 +220,7 @@ async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
(cookie): cookie is Cookie =>
cookie &&
typeof cookie.name === "string" &&
typeof cookie.value === "string",
typeof cookie.value === "string"
);
}
} catch (e) {
@@ -114,9 +228,9 @@ async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
}
}
// Try to load from ./cookies/facebook.json
// Try to load from specified path
try {
const cookiesPath = "./cookies/facebook.json";
const cookiesPath = cookiePath;
const file = Bun.file(cookiesPath);
if (await file.exists()) {
const content = await file.text();
@@ -126,17 +240,100 @@ async function loadFacebookCookies(cookiesSource?: string): Promise<Cookie[]> {
(cookie): cookie is Cookie =>
cookie &&
typeof cookie.name === "string" &&
typeof cookie.value === "string",
typeof cookie.value === "string"
);
}
}
} catch (e) {
console.warn(`Could not load cookies from ./cookies/facebook.json: ${e}`);
console.warn(`Could not load cookies from ${cookiePath}: ${e}`);
}
return [];
}
/**
 * Parse a raw cookie header string (`"name1=value1; name2=value2"`) into the
 * Cookie array format used by this scraper.
 *
 * Each `;`-separated pair is split on its FIRST `=` only, so values that
 * themselves contain `=` (for example base64 padding) are kept intact.
 * Values are percent-decoded; when a value is not valid percent-encoding the
 * raw text is kept instead of throwing, so one odd cookie cannot abort the
 * whole parse. Pairs without `=`, or with an empty name or value, are skipped.
 *
 * @param cookieString - Cookie header text; empty or whitespace-only input yields `[]`.
 * @returns One Cookie per valid pair, scoped to `.facebook.com` with path `/`,
 *          `secure: true`, `httpOnly: false`, `sameSite: "lax"` and no expiry.
 */
export function parseFacebookCookieString(cookieString: string): Cookie[] {
  if (!cookieString || !cookieString.trim()) {
    return [];
  }

  const cookies: Cookie[] = [];

  for (const rawPair of cookieString.split(";")) {
    const pair = rawPair.trim();

    // Split on the first "=" only; String.prototype.split with a limit would
    // silently discard everything after a second "=".
    const separatorIndex = pair.indexOf("=");
    if (separatorIndex === -1) continue;

    const name = pair.slice(0, separatorIndex).trim();
    const rawValue = pair.slice(separatorIndex + 1).trim();

    // Skip empty names or values
    if (!name || !rawValue) continue;

    let value: string;
    try {
      value = decodeURIComponent(rawValue);
    } catch {
      // Malformed percent-encoding (e.g. a lone "%"): keep the raw value.
      value = rawValue;
    }

    cookies.push({
      name,
      value,
      domain: ".facebook.com",
      path: "/",
      secure: true,
      httpOnly: false,
      sameSite: "lax",
      expirationDate: undefined, // Session cookies
    });
  }

  return cookies;
}
/**
 * Ensure Facebook cookies are available, parsing them from the
 * `FACEBOOK_COOKIE` environment variable when none can be loaded from disk.
 *
 * Resolution order:
 *  1. Cookies loaded via `loadFacebookCookies` from `cookiePath`.
 *  2. Cookies parsed from `process.env.FACEBOOK_COOKIE`; on success they are
 *     also written to `cookiePath` (best effort) for future runs.
 *
 * @param cookiePath - JSON cookie file to read from and, when cookies come
 *                     from the environment, to write to.
 * @returns A non-empty array of cookies.
 * @throws Error when no cookie file is usable and `FACEBOOK_COOKIE` is unset,
 *         blank, or contains no valid `name=value` pairs.
 */
export async function ensureFacebookCookies(
  cookiePath = "./cookies/facebook.json"
): Promise<Cookie[]> {
  // First try to load existing cookies
  try {
    const existing = await loadFacebookCookies(undefined, cookiePath);
    if (existing.length > 0) {
      return existing;
    }
  } catch {
    // File doesn't exist or is invalid, continue to check env var
  }

  // Try to parse from environment variable
  const cookieString = process.env.FACEBOOK_COOKIE;
  if (!cookieString || !cookieString.trim()) {
    // Report the path that was actually checked, not the default one, so the
    // guidance stays correct when the caller supplies a custom cookiePath.
    throw new Error(
      "No valid Facebook cookies found. Either:\n" +
        " 1. Set FACEBOOK_COOKIE environment variable with cookie string, or\n" +
        ` 2. Create ${cookiePath} manually with cookie array`
    );
  }

  // Parse the cookie string
  const cookies = parseFacebookCookieString(cookieString);
  if (cookies.length === 0) {
    throw new Error(
      "FACEBOOK_COOKIE environment variable contains no valid cookies. " +
        'Expected format: "name1=value1; name2=value2;"'
    );
  }

  // Save to file for future use
  try {
    await Bun.write(cookiePath, JSON.stringify(cookies, null, 2));
    console.log(`Saved ${cookies.length} Facebook cookies to ${cookiePath}`);
  } catch (error) {
    console.warn(`Could not save cookies to ${cookiePath}: ${error}`);
    // Continue anyway, we have the cookies in memory
  }

  return cookies;
}
/**
* Format cookies array into Cookie header string
*/
@@ -150,10 +347,9 @@ function formatCookiesForHeader(cookies: Cookie[], domain: string): string {
domain.endsWith(cookie.domain.slice(1)) ||
domain === cookie.domain.slice(1)
);
} else {
// Host-only cookie
return cookie.domain === domain;
}
// Host-only cookie
return cookie.domain === domain;
})
.filter((cookie) => {
// Check expiration
@@ -172,13 +368,55 @@ class HttpError extends Error {
constructor(
message: string,
public readonly status: number,
public readonly url: string,
public readonly url: string
) {
super(message);
this.name = "HttpError";
}
}
// ----------------------------- Extraction Metrics -----------------------------
/**
 * Module-level counters tracking how often item extraction succeeds, used to
 * spot a likely change in Facebook's page/API structure.
 */
const extractionStats = {
  totalExtractions: 0,
  successfulExtractions: 0,
  failedExtractions: 0,
  lastApiChangeDetected: null as Date | null,
};

/**
 * Record the outcome of one extraction attempt.
 *
 * Emits a one-time warning once more than 10 attempts have been recorded and
 * the overall success rate is below 80%; the time of that warning is stored
 * in `extractionStats.lastApiChangeDetected`, which suppresses repeats.
 * A failed attempt with a known item id is additionally logged on its own.
 *
 * @param success - Whether the extraction produced item data.
 * @param itemId - Optional item id, used only in the per-item failure warning.
 */
function logExtractionMetrics(success: boolean, itemId?: string) {
  extractionStats.totalExtractions += 1;
  if (success) {
    extractionStats.successfulExtractions += 1;
  } else {
    extractionStats.failedExtractions += 1;
  }

  const { totalExtractions, successfulExtractions } = extractionStats;
  const hasEnoughSamples = totalExtractions > 10;
  const isDegraded = successfulExtractions / totalExtractions < 0.8;
  const alreadyFlagged = Boolean(extractionStats.lastApiChangeDetected);

  // Warn only the first time the rate falls under the threshold.
  if (hasEnoughSamples && isDegraded && !alreadyFlagged) {
    console.warn(
      "Facebook Marketplace API extraction success rate dropped below 80%. This may indicate API changes."
    );
    extractionStats.lastApiChangeDetected = new Date();
  }

  if (success || !itemId) {
    return;
  }
  console.warn(`Facebook API extraction failed for item ${itemId}`);
}
// ----------------------------- HTTP Client -----------------------------
/**
@@ -195,7 +433,7 @@ async function fetchHtml(
retryBaseMs?: number;
onRateInfo?: (remaining: string | null, reset: string | null) => void;
cookies?: string;
},
}
): Promise<HTMLString> {
const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 500;
@@ -219,7 +457,7 @@ async function fetchHtml(
// Add cookies if provided
if (opts?.cookies) {
headers["cookie"] = opts.cookies;
headers.cookie = opts.cookies;
}
const res = await fetch(url, {
@@ -234,7 +472,9 @@ async function fetchHtml(
if (!res.ok) {
// Respect 429 reset if provided
if (res.status === 429) {
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
const resetSeconds = rateLimitReset
? Number(rateLimitReset)
: Number.NaN;
const waitMs = Number.isFinite(resetSeconds)
? Math.max(0, resetSeconds * 1000)
: (attempt + 1) * retryBaseMs;
@@ -247,7 +487,7 @@ async function fetchHtml(
throw new HttpError(
`Request failed with status ${res.status} (Facebook may require authentication cookies for access)`,
res.status,
url,
url
);
}
// Retry on 5xx
@@ -258,7 +498,7 @@ async function fetchHtml(
throw new HttpError(
`Request failed with status ${res.status}`,
res.status,
url,
url
);
}
@@ -280,8 +520,8 @@ async function fetchHtml(
/**
Extract marketplace search data from Facebook page script tags
*/
function extractFacebookMarketplaceData(
htmlString: HTMLString,
export function extractFacebookMarketplaceData(
htmlString: HTMLString
): FacebookAdNode[] | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
@@ -289,7 +529,7 @@ function extractFacebookMarketplaceData(
let marketplaceData: FacebookMarketplaceSearch | null = null;
// Find the script containing the require data with marketplace_search
for (const script of scripts as unknown as HTMLScriptElement[]) {
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
const scriptText = script.textContent;
if (!scriptText) continue;
@@ -301,27 +541,34 @@ function extractFacebookMarketplaceData(
// Try multiple navigation paths to find marketplace_search
const paths = [
// Original path from example
() => parsed.require[0][3][0]['__bbox']['require'][0][3][1]['__bbox']['result']['data']['marketplace_search'],
() =>
parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data
.marketplace_search,
// Alternative path structure
() => parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
// Another variation
() => parsed.require[0][3][0]['__bbox']['result']['data']['marketplace_search'],
() => parsed.require[0][3][0].__bbox.result.data.marketplace_search,
// Direct access for some responses
() => {
for (const item of parsed.require) {
if (item && item.length >= 4 && item[3]) {
const bbox = item[3]?.['__bbox']?.result?.data?.marketplace_search;
const bbox = item[3]?.__bbox?.result?.data?.marketplace_search;
if (bbox) return bbox;
}
}
return null;
}
},
];
for (const getData of paths) {
try {
const result = getData();
if (result && isRecord(result) && result.feed_units?.edges) {
if (
result &&
isRecord(result) &&
(result as any).feed_units?.edges?.length > 0
) {
marketplaceData = result as FacebookMarketplaceSearch;
break;
}
@@ -334,9 +581,13 @@ function extractFacebookMarketplaceData(
}
// Also check for direct marketplace_search in the parsed data
if (parsed.marketplace_search && isRecord(parsed.marketplace_search) && parsed.marketplace_search.feed_units?.edges) {
marketplaceData = parsed.marketplace_search as FacebookMarketplaceSearch;
break;
if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) {
const searchData =
parsed.marketplace_search as FacebookMarketplaceSearch;
if (searchData.feed_units?.edges?.length ?? 0 > 0) {
marketplaceData = searchData;
break;
}
}
} catch {
// Ignore parsing errors for other scripts
@@ -344,19 +595,160 @@ function extractFacebookMarketplaceData(
}
}
if (!marketplaceData?.feed_units?.edges) {
if (!marketplaceData?.feed_units?.edges?.length) {
console.warn("No marketplace data found in HTML response");
return null;
}
console.log(`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`);
console.log(
`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`
);
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
}
/**
 * Extract the marketplace item payload from the HTML of a Facebook item page.
 *
 * Every <script> element is visited in document order. A script is considered
 * only if its text parses as JSON and the result has an array-valued `require`
 * property. For such a script the following are tried, in order, and the first
 * match is returned:
 *   1. a fixed list of property paths ending in
 *      `viewer.marketplace_product_details_page.target`;
 *   2. a depth-limited recursive search of `parsed.require`;
 *   3. a top-level `__bbox.result.data.viewer...target` lookup.
 *
 * Original author's note: updated for the 2026 Facebook Marketplace API
 * structure with multiple extraction paths.
 *
 * @param htmlString - Raw HTML of the item page.
 * @returns The matching item object, or `null` when no script yields one.
 */
export function extractFacebookItemData(
htmlString: HTMLString
): FacebookMarketplaceItem | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
for (const script of scripts) {
const scriptText = script.textContent;
if (!scriptText) continue;
try {
// Non-JSON script bodies throw here and are skipped by the outer catch.
const parsed = JSON.parse(scriptText);
// Check for the require structure with marketplace product details
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple extraction paths discovered from reverse engineering.
// Order matters: the first path that yields a valid item wins.
const extractionPaths = [
// Path 1: Primary path from current API structure
() =>
parsed.require[0][3].__bbox.result.data.viewer
.marketplace_product_details_page.target,
// Path 2: Alternative path with nested require
() =>
parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Path 3: Variation without the [0] index
() =>
parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Paths 4 and 5: fallbacks reading index [1] and [2] of require[0][3],
// using optional chaining so a missing level yields undefined
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
() =>
parsed.require[0][3][2]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
];
// Zero-based index of the path being tried; used only for the log message.
let pathIndex = 0;
for (const getPath of extractionPaths) {
try {
const targetData = getPath();
// Accept only objects that carry an id, a title and the expected typename.
if (
targetData &&
typeof targetData === "object" &&
targetData.id &&
targetData.marketplace_listing_title &&
targetData.__typename === "GroupCommerceProductItem"
) {
console.log(
`Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`
);
return targetData as FacebookMarketplaceItem;
}
} catch {
// Path not found or invalid, try next path
}
pathIndex++;
}
// Fallback: Search recursively for marketplace data in the parsed structure
const findMarketplaceData = (
obj: unknown,
depth = 0,
maxDepth = 10
): FacebookMarketplaceItem | null => {
// Depth cap: stop descending once nesting exceeds maxDepth.
if (depth > maxDepth) return null;
if (isRecord(obj)) {
// Check if this object matches the expected marketplace item structure.
// Unlike the fixed paths above, this check also requires
// redacted_description to be present.
if (
(obj as any).marketplace_listing_title &&
(obj as any).id &&
(obj as any).__typename === "GroupCommerceProductItem" &&
(obj as any).redacted_description
) {
return obj as unknown as FacebookMarketplaceItem;
}
// Recursively search nested objects and arrays
for (const key in obj) {
const value = obj[key];
if (isRecord(value) || Array.isArray(value)) {
const result = findMarketplaceData(value, depth + 1, maxDepth);
if (result) return result;
}
}
} else if (Array.isArray(obj)) {
// Search through arrays
// NOTE(review): reached only if isRecord() returns false for arrays —
// isRecord is defined elsewhere; confirm.
for (const item of obj) {
const result = findMarketplaceData(item, depth + 1, maxDepth);
if (result) return result;
}
}
return null;
};
// Search through the entire require structure
const recursiveResult = findMarketplaceData(parsed.require);
if (recursiveResult) {
console.log(
"Successfully extracted Facebook item data using recursive search"
);
return recursiveResult;
}
// Additional search in other potential locations.
// This sits inside the `parsed.require` branch, so it is only reached for
// scripts whose JSON also has an array-valued `require`.
if (
parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page
?.target
) {
const bboxData =
parsed.__bbox.result.data.viewer.marketplace_product_details_page
.target;
if (
bboxData &&
typeof bboxData === "object" &&
bboxData.id &&
bboxData.marketplace_listing_title &&
bboxData.__typename === "GroupCommerceProductItem"
) {
console.log(
"Successfully extracted Facebook item data from __bbox structure"
);
return bboxData as FacebookMarketplaceItem;
}
}
}
} catch {
// Parsing/lookup errors are swallowed (nothing is logged); move on to the
// next script.
continue;
}
}
// No script contained a recognisable item payload.
return null;
}
/**
Parse Facebook marketplace search results into ListingDetails[]
*/
function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
export function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
const results: FacebookListingDetails[] = [];
for (const adJson of ads) {
@@ -376,9 +768,10 @@ function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
// - formatted_amount: human-readable price (like "CA$1")
let cents: number;
if (priceObj.amount != null) {
const dollars = typeof priceObj.amount === 'string'
? Number.parseFloat(priceObj.amount)
: priceObj.amount;
const dollars =
typeof priceObj.amount === "string"
? Number.parseFloat(priceObj.amount)
: priceObj.amount;
cents = Math.round(dollars * 100);
} else if (priceObj.amount_with_offset_in_currency != null) {
// Fallback: try to extract cents from amount_with_offset_in_currency
@@ -390,7 +783,7 @@ function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
if (priceObj.formatted_amount) {
const match = priceObj.formatted_amount.match(/[\d,]+\.?\d*/);
if (match) {
const dollars = Number.parseFloat(match[0].replace(',', ''));
const dollars = Number.parseFloat(match[0].replace(",", ""));
if (!Number.isNaN(dollars)) {
cents = Math.round(dollars * 100);
} else {
@@ -435,19 +828,24 @@ function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
// Extract image and video URLs
const imageUrl = listing.primary_listing_photo?.image?.uri;
const videoUrl = listing.listing_video ? `https://www.facebook.com/${listing.listing_video.id}/` : undefined;
const videoUrl = listing.listing_video
? `https://www.facebook.com/${listing.listing_video.id}/`
: undefined;
// Extract seller information
const seller = listing.marketplace_listing_seller ? {
name: listing.marketplace_listing_seller.name,
id: listing.marketplace_listing_seller.id
} : undefined;
const seller = listing.marketplace_listing_seller
? {
name: listing.marketplace_listing_seller.name,
id: listing.marketplace_listing_seller.id,
}
: undefined;
const listingDetails: FacebookListingDetails = {
url,
title,
listingPrice: {
amountFormatted: priceObj.formatted_amount || formatCentsToCurrency(cents / 100, "en-CA"),
amountFormatted:
priceObj.formatted_amount || formatCentsToCurrency(cents / 100, "en-CA"),
cents,
currency: priceObj.currency || "CAD", // Facebook marketplace often uses CAD
},
@@ -472,6 +870,98 @@ function parseFacebookAds(ads: FacebookAdNode[]): FacebookListingDetails[] {
return results;
}
/**
 * Convert a raw Marketplace item payload into the scraper's
 * FacebookListingDetails shape.
 *
 * - Title falls back from `marketplace_listing_title` to `custom_title`;
 *   without either the item is rejected.
 * - Price defaults to 0 cents / "CAD" / "FREE" and is overridden from
 *   `listing_price` when it holds a parseable, non-"0.00" amount.
 * - Status precedence is sold > pending > live > hidden; otherwise undefined.
 * - An item with a vehicle make or odometer reading is typed "vehicle".
 *
 * @param item - Item payload (GroupCommerceProductItem structure).
 * @returns The normalised listing, or `null` when there is no title or an
 *          error is thrown while building the listing.
 */
export function parseFacebookItem(
  item: FacebookMarketplaceItem
): FacebookListingDetails | null {
  try {
    const title = item.marketplace_listing_title || item.custom_title;
    if (!title) {
      return null;
    }

    // --- Price -----------------------------------------------------------
    const displayPrice = item.formatted_price?.text;
    const rawPrice = item.listing_price;
    const currency = rawPrice?.currency || "CAD";
    let cents = 0;
    let amountFormatted = displayPrice || "FREE";

    if (rawPrice && rawPrice.amount && rawPrice.amount !== "0.00") {
      const parsedAmount = Number.parseFloat(rawPrice.amount);
      if (!Number.isNaN(parsedAmount)) {
        cents = Math.round(parsedAmount * 100);
        amountFormatted =
          displayPrice || formatCentsToCurrency(cents / 100, "en-CA");
      }
    }

    // --- Seller ----------------------------------------------------------
    const sellerNode = item.marketplace_listing_seller;
    const seller = sellerNode
      ? { name: sellerNode.name, id: sellerNode.id }
      : undefined;

    // --- Status: first truthy flag in priority order wins ----------------
    const statusByPriority: Array<[boolean | undefined, string]> = [
      [item.is_sold, "SOLD"],
      [item.is_pending, "PENDING"],
      [item.is_live, "ACTIVE"],
      [item.is_hidden, "HIDDEN"],
    ];
    const listingStatus: string | undefined = statusByPriority.find(
      ([flag]) => flag
    )?.[1];

    // --- Creation time (seconds since epoch -> ISO string) ---------------
    let creationDate: string | undefined;
    if (item.creation_time) {
      creationDate = new Date(item.creation_time * 1000).toISOString();
    }

    // --- Listing type ----------------------------------------------------
    const looksLikeVehicle =
      item.vehicle_make_display_name || item.vehicle_odometer_data;
    const listingType = looksLikeVehicle ? "vehicle" : "item";

    const listingDetails: FacebookListingDetails = {
      url: `https://www.facebook.com/marketplace/item/${item.id}`,
      title,
      description: item.redacted_description?.text,
      listingPrice: {
        amountFormatted,
        cents,
        currency,
      },
      address: item.location_text?.text || null,
      creationDate,
      listingType,
      listingStatus,
      categoryId: item.marketplace_listing_category_id,
      seller,
      deliveryTypes: item.delivery_types,
    };
    return listingDetails;
  } catch (error) {
    console.warn(`Failed to parse Facebook item ${item.id}:`, error);
    return null;
  }
}
// ----------------------------- Main -----------------------------
export default async function fetchFacebookItems(
@@ -480,13 +970,22 @@ export default async function fetchFacebookItems(
LOCATION = "toronto",
MAX_ITEMS = 25,
cookiesSource?: string,
cookiePath?: string
) {
// Load Facebook cookies - required for Facebook Marketplace access
const cookies = await loadFacebookCookies(cookiesSource);
let cookies: Cookie[];
if (cookiesSource) {
// Use provided cookie source (backward compatibility)
cookies = await loadFacebookCookies(cookiesSource);
} else {
// Auto-load from file or parse from env var
cookies = await ensureFacebookCookies(cookiePath);
}
if (cookies.length === 0) {
throw new Error(
"Facebook cookies are required for marketplace access. " +
"Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies.",
"Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies."
);
}
@@ -495,7 +994,7 @@ export default async function fetchFacebookItems(
const cookiesHeader = formatCookiesForHeader(cookies, domain);
if (!cookiesHeader) {
throw new Error(
"No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain.",
"No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain."
);
}
@@ -517,8 +1016,7 @@ export default async function fetchFacebookItems(
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
"\n" +
`Facebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
`\nFacebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`
);
}
},
@@ -527,11 +1025,11 @@ export default async function fetchFacebookItems(
} catch (err) {
if (err instanceof HttpError) {
console.warn(
`\nFacebook marketplace access failed (${err.status}): ${err.message}`,
`\nFacebook marketplace access failed (${err.status}): ${err.message}`
);
if (err.status === 400 || err.status === 401 || err.status === 403) {
console.warn(
"This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies.",
"This might indicate invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies."
);
}
return [];
@@ -549,17 +1047,17 @@ export default async function fetchFacebookItems(
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
cliProgress.Presets.shades_classic
);
const totalProgress = ads.length;
let currentProgress = 0;
const currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
const items = parseFacebookAds(ads);
// Filter to only priced items (already done in parseFacebookAds)
const pricedItems = items.filter(
(item) => item.listingPrice?.cents && item.listingPrice.cents > 0,
(item) => item.listingPrice?.cents && item.listingPrice.cents > 0
);
progressBar.update(totalProgress);
@@ -568,3 +1066,158 @@ export default async function fetchFacebookItems(
console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
return pricedItems.slice(0, MAX_ITEMS); // Limit results
}
/**
 * Fetch one Facebook Marketplace item page and convert it to listing details.
 *
 * Steps: obtain cookies (from `cookiesSource` if given, otherwise via
 * `ensureFacebookCookies(cookiePath)`), request the item page, extract the
 * item payload from the HTML, and parse it. HTTP failures and extraction or
 * parse failures are logged with guidance and reported as `null`.
 *
 * @param itemId - Marketplace item id, interpolated into the item URL.
 * @param cookiesSource - Optional cookie source handed to `loadFacebookCookies`.
 * @param cookiePath - Optional cookie file path handed to `ensureFacebookCookies`.
 * @returns Parsed listing details, or `null` when the item could not be
 *          fetched, extracted or parsed.
 * @throws Error when no cookies are available or none apply to
 *         www.facebook.com; rethrows any fetch error that is not an HttpError.
 */
export async function fetchFacebookItem(
  itemId: string,
  cookiesSource?: string,
  cookiePath?: string
): Promise<FacebookListingDetails | null> {
  // An explicit cookie source wins (backward compatibility); otherwise
  // auto-load from file or parse from the environment variable.
  const cookies: Cookie[] = cookiesSource
    ? await loadFacebookCookies(cookiesSource)
    : await ensureFacebookCookies(cookiePath);

  if (cookies.length === 0) {
    throw new Error(
      "Facebook cookies are required for marketplace access. " +
        "Please provide cookies via 'cookies' parameter or create ./cookies/facebook.json file with valid Facebook session cookies."
    );
  }

  // Build the Cookie header for the Facebook host.
  const cookiesHeader = formatCookiesForHeader(cookies, "www.facebook.com");
  if (!cookiesHeader) {
    throw new Error(
      "No valid Facebook cookies found. Please check that cookies are not expired and apply to facebook.com domain."
    );
  }

  const itemUrl = `https://www.facebook.com/marketplace/item/${itemId}/`;
  console.log(`Fetching Facebook marketplace item: ${itemUrl}`);

  let itemHtml: string;
  try {
    itemHtml = await fetchHtml(itemUrl, 1000, {
      onRateInfo: (remaining, reset) => {
        if (!remaining || !reset) return;
        console.log(
          `\nFacebook - Rate limit remaining: ${remaining}, reset in: ${reset}s`
        );
      },
      cookies: cookiesHeader,
    });
  } catch (err) {
    // Anything that is not an HTTP status failure is unexpected: propagate.
    if (!(err instanceof HttpError)) {
      throw err;
    }

    console.warn(
      `\nFacebook marketplace item access failed (${err.status}): ${err.message}`
    );

    // Status-specific guidance.
    const status = err.status;
    if (status === 400 || status === 401 || status === 403) {
      console.warn(
        "Authentication error: Invalid or expired cookies. Please update ./cookies/facebook.json with fresh session cookies."
      );
      console.warn(
        "Try logging out and back into Facebook, then export fresh cookies."
      );
    } else if (status === 404) {
      console.warn(
        "Listing not found: The marketplace item may have been removed, sold, or the URL is invalid."
      );
    } else if (status === 429) {
      console.warn(
        "Rate limited: Too many requests. Facebook is blocking access temporarily."
      );
    } else if (status === 500 || status === 502 || status === 503) {
      console.warn(
        "Facebook server error: Marketplace may be temporarily unavailable."
      );
    } else {
      console.warn(`Unexpected error status: ${err.status}`);
    }
    return null;
  }

  const itemData = extractFacebookItemData(itemHtml);

  if (!itemData) {
    logExtractionMetrics(false, itemId);

    // Look for page text that explains why no payload was found.
    const removedMarkers = [
      "This listing is no longer available",
      "listing has been removed",
      "This item has been sold",
    ];
    if (removedMarkers.some((marker) => itemHtml.includes(marker))) {
      console.warn(
        `Item ${itemId} appears to be sold or removed from marketplace.`
      );
      return null;
    }

    const loginMarkers = [
      "log in to Facebook",
      "You must log in",
      "authentication required",
    ];
    if (loginMarkers.some((marker) => itemHtml.includes(marker))) {
      console.warn(
        `Authentication failed for item ${itemId}. Cookies may be expired.`
      );
      return null;
    }

    console.warn(
      `No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`
    );
    console.warn(" - The listing was removed or sold");
    console.warn(" - Authentication issues");
    console.warn(" - Facebook changed their API structure");
    console.warn(" - Network or parsing issues");
    return null;
  }

  logExtractionMetrics(true, itemId);
  console.log(`Successfully extracted data for item ${itemId}`);

  const parsedItem = parseFacebookItem(itemData);
  if (!parsedItem) {
    console.warn(`Failed to parse item ${itemId}: Invalid data structure`);
    return null;
  }

  // Final status override: sold takes precedence; otherwise a non-live item
  // is reported as hidden, pending or inactive (in that order).
  if (itemData.is_sold) {
    console.warn(`Item ${itemId} is marked as sold in the marketplace.`);
    // Still return the data but mark it as sold
    parsedItem.listingStatus = "SOLD";
  } else if (!itemData.is_live) {
    console.warn(`Item ${itemId} is not live/active in the marketplace.`);
    if (itemData.is_hidden) {
      parsedItem.listingStatus = "HIDDEN";
    } else if (itemData.is_pending) {
      parsedItem.listingStatus = "PENDING";
    } else {
      parsedItem.listingStatus = "INACTIVE";
    }
  }

  return parsedItem;
}

View File

@@ -2,7 +2,15 @@
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import cliProgress from "cli-progress";
import { fetchHtml, isRecord, HttpError } from "../utils/http";
import {
fetchHtml,
isRecord,
HttpError,
NetworkError,
ParseError,
RateLimitError,
ValidationError,
} from "../utils/http";
import { delay } from "../utils/delay";
import { formatCentsToCurrency } from "../utils/format";
import type { HTMLString } from "../types/common";
@@ -26,16 +34,29 @@ interface ApolloListingRoot {
url?: string;
title?: string;
description?: string;
price?: { amount?: number | string; currency?: string };
price?: { amount?: number | string; currency?: string; type?: string };
type?: string;
status?: string;
activationDate?: string;
endDate?: string;
metrics?: { views?: number | string };
location?: { address?: string | null };
location?: {
address?: string | null;
id?: number;
name?: string;
coordinates?: { latitude: number; longitude: number };
};
imageUrls?: string[];
imageCount?: number;
categoryId?: number;
adSource?: string;
flags?: { topAd?: boolean; priceDrop?: boolean };
posterInfo?: { posterId?: string; rating?: number };
attributes?: Array<{ canonicalName?: string; canonicalValues?: string[] }>;
[k: string]: unknown;
}
// Keep existing interface for backward compatibility
export interface KijijiListingDetails {
url: string;
title: string;
@@ -53,10 +74,173 @@ export interface KijijiListingDetails {
address?: string | null;
}
/**
 * Comprehensive listing record: everything in KijijiListingDetails plus the
 * extra fields produced by parseDetailedListing.
 */
export interface DetailedListing extends KijijiListingDetails {
  /** Image URLs of the listing (empty when images are not requested). */
  images: string[];
  categoryId: number;
  adSource: string;
  flags: { topAd: boolean; priceDrop: boolean };
  /** Attribute canonical name mapped to its canonical values. */
  attributes: Record<string, string[]>;
  location: {
    id: number;
    name: string;
    coordinates?: { latitude: number; longitude: number };
  };
  /** Present only when the listing exposes a poster ID. */
  sellerInfo?: {
    posterId: string;
    rating?: number;
    accountType?: string;
    memberSince?: string;
    reviewCount?: number;
    reviewScore?: number;
  };
}
// Configuration interfaces
/** Search parameters accepted by the Kijiji scraper. */
export interface SearchOptions {
  /** Location ID or name. */
  location?: number | string;
  /** Category ID or name. */
  category?: number | string;
  keywords?: string;
  sortBy?: "relevancy" | "date" | "price" | "distance";
  sortOrder?: "desc" | "asc";
  /** Default: 5 */
  maxPages?: number;
  priceMin?: number;
  priceMax?: number;
}
/** Controls how much data is gathered for each individual listing. */
export interface ListingFetchOptions {
  /** Default: true */
  includeImages?: boolean;
  /** Default: 'detailed' */
  sellerDataDepth?: "basic" | "detailed" | "full";
  /** Default: false */
  includeClientSideData?: boolean;
}
// ----------------------------- Constants & Mappings -----------------------------
/** Lower-case location name mapped to its numeric Kijiji location ID. */
const LOCATION_MAPPINGS: Record<string, number> = {
  "canada": 0,
  "ontario": 9004,
  "toronto": 1700273,
  "gta": 1700272,
  "oshawa": 1700275,
  "quebec": 9001,
  "nova scotia": 9002,
  "alberta": 9003,
  "new brunswick": 9005,
  "manitoba": 9006,
  "british columbia": 9007,
  "newfoundland": 9008,
  "saskatchewan": 9009,
  "territories": 9010,
  // Both spellings of Prince Edward Island share one ID.
  "pei": 9011,
  "prince edward island": 9011,
};
/** Hyphenated category name (Buy & Sell main categories) mapped to its numeric Kijiji category ID. */
const CATEGORY_MAPPINGS: Record<string, number> = {
  "all": 0,
  "buy-sell": 10,
  "arts-collectibles": 12,
  "audio": 767,
  "baby-items": 253,
  "bags-luggage": 931,
  "bikes": 644,
  "books": 109,
  "cameras": 103,
  "cds": 104,
  "clothing": 274,
  "computers": 16,
  "computer-accessories": 128,
  "electronics": 29659001,
  "free-stuff": 17220001,
  "furniture": 235,
  "garage-sales": 638,
  "health-special-needs": 140,
  "hobbies-crafts": 139,
  "home-appliances": 107,
  "home-indoor": 717,
  "home-outdoor": 727,
  "jewellery": 133,
  "musical-instruments": 17,
  "phones": 132,
  "sporting-goods": 111,
  "tools": 110,
  "toys-games": 108,
  "tvs-video": 15093001,
  "video-games": 141,
  "other": 26,
};
/** SearchOptions.sortBy value mapped to the token placed in the `sort` query parameter. */
const SORT_MAPPINGS: Record<string, string> = {
  "relevancy": "MATCH",
  "date": "DATE",
  "price": "PRICE",
  "distance": "DISTANCE",
};
// ----------------------------- Utilities -----------------------------
// Characters that slugify collapses into a single "-".
// NOTE(review): the second entry is an empty string, which cannot equal any
// single character; it may be a dash character lost in transit — confirm
// against upstream.
const SEPS = new Set([" ", "", "—", "/", ":", ";", ",", ".", "-"]);
/**
 * Resolve a location name or numeric ID to a Kijiji location ID.
 *
 * Names are matched case-insensitively. Surrounding whitespace is ignored and
 * runs of whitespace or hyphens are treated as one separator, so
 * "Nova Scotia", "nova-scotia" and "  NOVA   SCOTIA " all resolve alike.
 *
 * @param location - Numeric ID (returned unchanged) or a location name.
 * @returns The matching location ID, or 0 (Canada) when the name is unknown
 *          or no location is given.
 */
export function resolveLocationId(location?: number | string): number {
  if (typeof location === "number") return location;
  if (typeof location !== "string") return 0; // Default to Canada

  // The table's multi-word keys use spaces ("nova scotia"), so look up the
  // space-separated form first. The hyphenated form is kept as a fallback for
  // any key written with hyphens.
  const spaced = location.trim().toLowerCase().replace(/[\s-]+/g, " ");
  const hyphenated = spaced.replace(/ /g, "-");

  return LOCATION_MAPPINGS[spaced] ?? LOCATION_MAPPINGS[hyphenated] ?? 0;
}
/**
 * Resolve a category name or numeric ID to a Kijiji category ID.
 *
 * @param category - Numeric ID (returned unchanged) or a category name; names
 *                   are lower-cased and whitespace runs become "-" before lookup.
 * @returns The matching category ID, or 0 (all categories) when the name is
 *          unknown or no category is given.
 */
export function resolveCategoryId(category?: number | string): number {
  switch (typeof category) {
    case "number":
      return category;
    case "string": {
      const key = category.toLowerCase().replace(/\s+/g, "-");
      const id = CATEGORY_MAPPINGS[key];
      return id ?? 0; // Default to all categories
    }
    default:
      return 0; // Default to all categories
  }
}
/**
 * Build a Kijiji search URL for the given keywords.
 *
 * @param keywords - Search terms; slugified into the URL path.
 * @param options - Location/category (name or ID), sort settings and an
 *                  optional page number (only emitted when greater than 1).
 * @param BASE_URL - Site origin the path is appended to.
 * @returns The assembled search URL.
 */
export function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca"
): string {
  const locationId = resolveLocationId(options.location);
  const categoryId = resolveCategoryId(options.category);

  // NOTE(review): the path slugs are always "buy-sell" and "canada" whatever
  // the resolved IDs are; only the trailing k0c…l… segment carries the IDs.
  const path = [
    "b-buy-sell",
    "canada",
    slugify(keywords),
    `k0c${categoryId}l${locationId}`,
  ].join("/");

  // NOTE(review): when sortBy is set the query ends up with two `sort`
  // parameters (the fixed relevancyDesc plus the mapped value) — confirm which
  // one the site honours. priceMin, priceMax and options.keywords are not used.
  const query: string[] = ["sort=relevancyDesc", "view=list"];
  if (options.sortBy) {
    query.push(`sort=${SORT_MAPPINGS[options.sortBy]}`);
  }
  query.push(`order=${options.sortOrder === "asc" ? "ASC" : "DESC"}`);
  if (options.page && options.page > 1) {
    query.push(`page=${options.page}`);
  }

  return `${BASE_URL}/${path}?${query.join("&")}`;
}
/**
* Slugifies a string for Kijiji search URLs
*/
@@ -67,13 +251,14 @@ export function slugify(input: string): string {
for (let i = 0; i < s.length; i++) {
const ch = s[i];
const code = ch!.charCodeAt(0);
if (!ch) continue;
const code = ch.charCodeAt(0);
// a-z or 0-9
if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) {
out.push(ch!);
out.push(ch);
lastHyphen = false;
} else if (SEPS.has(ch!)) {
} else if (SEPS.has(ch)) {
if (!lastHyphen) {
out.push("-");
lastHyphen = true;
@@ -84,12 +269,154 @@ export function slugify(input: string): string {
return out.join("");
}
// ----------------------------- GraphQL Client -----------------------------
// GraphQL response interfaces
/** Shape assumed for the `data` payload of the GetReviewSummary query. */
interface GraphQLReviewResponse {
  user?: {
    reviewSummary?: { count?: number; score?: number };
  };
}
/** Shape assumed for the `data` payload of the GetProfileMetrics query. */
interface GraphQLProfileResponse {
  user?: { memberSince?: string; accountType?: string };
}
// GraphQL query documents used to fetch seller data.
// - getReviewSummary: takes $userId and selects the user's review count and score.
// - getProfileMetrics: takes $profileId and selects memberSince and accountType.
// The template text is sent verbatim as the request's `query` field.
const GRAPHQL_QUERIES = {
getReviewSummary: `
query GetReviewSummary($userId: String!) {
user(id: $userId) {
reviewSummary {
count
score
__typename
}
__typename
}
}
`,
getProfileMetrics: `
query GetProfileMetrics($profileId: String!) {
user(id: $profileId) {
memberSince
accountType
__typename
}
}
`,
} as const;
/**
 * POST a GraphQL query to the site's `/anvil/api` endpoint and return the
 * `data` member of the JSON response.
 *
 * @param query - GraphQL query text.
 * @param variables - Variables sent alongside the query.
 * @param BASE_URL - Site origin the endpoint path is appended to.
 * @returns The `data` payload of the response (untyped).
 * @throws HttpError when the response status is not OK.
 * @throws ParseError when the body is not valid JSON, is not a JSON object,
 *         or carries an `errors` member.
 * @throws NetworkError for any other failure while performing the request.
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca"
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;

  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "apollo-require-preflight": "true",
      },
      body: JSON.stringify({ query, variables }),
    });

    if (!response.ok) {
      throw new HttpError(
        `GraphQL request failed with status ${response.status}`,
        response.status,
        endpoint
      );
    }

    let result: unknown;
    try {
      result = await response.json();
    } catch (err) {
      // A body that is not JSON is a parsing problem, not a network one.
      if (err instanceof SyntaxError) {
        throw new ParseError(
          `GraphQL response was not valid JSON: ${err.message}`
        );
      }
      throw err;
    }

    // Guard before property access: `null` or a primitive would otherwise
    // surface as a TypeError and be mislabelled as a network failure.
    if (!isRecord(result)) {
      throw new ParseError("GraphQL response was not a JSON object", result);
    }

    const { errors, data } = result;
    if (errors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(errors)}`, errors);
    }

    return data;
  } catch (err) {
    // Typed errors raised above pass through unchanged.
    if (err instanceof HttpError || err instanceof ParseError) {
      throw err;
    }
    throw new NetworkError(
      `Failed to fetch GraphQL data: ${err instanceof Error ? err.message : String(err)}`,
      endpoint,
      err instanceof Error ? err : undefined
    );
  }
}
/**
 * Look up extra seller information (review summary and profile metrics) via
 * two GraphQL requests issued concurrently.
 *
 * This is best-effort: if either request fails, a warning is logged and an
 * empty object is returned instead of throwing.
 *
 * @param posterId - Seller ID passed to both queries.
 * @param BASE_URL - Site origin forwarded to the GraphQL client.
 * @returns Whatever seller fields the responses contained.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca"
): Promise<{
  reviewCount?: number;
  reviewScore?: number;
  memberSince?: string;
  accountType?: string;
}> {
  try {
    const pending = [
      fetchGraphQLData(
        GRAPHQL_QUERIES.getReviewSummary,
        { userId: posterId },
        BASE_URL
      ),
      fetchGraphQLData(
        GRAPHQL_QUERIES.getProfileMetrics,
        { profileId: posterId },
        BASE_URL
      ),
    ] as const;
    const [reviewData, profileData] = await Promise.all(pending);

    const summary = (reviewData as GraphQLReviewResponse)?.user?.reviewSummary;
    const profile = (profileData as GraphQLProfileResponse)?.user;

    return {
      reviewCount: summary?.count,
      reviewScore: summary?.score,
      memberSince: profile?.memberSince,
      accountType: profile?.accountType,
    };
  } catch (failure) {
    // Seller details are optional, so report the problem and carry on.
    const reason = failure instanceof Error ? failure.message : String(failure);
    console.warn(`Failed to fetch seller details for ${posterId}:`, reason);
    return {};
  }
}
// ----------------------------- Parsing -----------------------------
/**
Extracts json.props.pageProps.__APOLLO_STATE__ safely from a Kijiji page HTML.
*/
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
export function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
const { document } = parseHTML(htmlString);
const nextData = document.getElementById("__NEXT_DATA__");
if (!nextData || !nextData.textContent) return null;
@@ -107,9 +434,9 @@ function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
Parse search page apollo state into SearchListing[].
Filters keys likely to be listing entities and ensures url/title exist.
*/
function parseSearch(
export function parseSearch(
htmlString: HTMLString,
BASE_URL: string,
BASE_URL: string
): SearchListing[] {
const apolloState = extractApolloState(htmlString);
if (!apolloState) return [];
@@ -134,18 +461,18 @@ function parseSearch(
}
/**
Parse a listing page into a typed object.
Parse a listing page into a typed object (backward compatible).
*/
function parseListing(
htmlString: HTMLString,
BASE_URL: string,
BASE_URL: string
): KijijiListingDetails | null {
const apolloState = extractApolloState(htmlString);
if (!apolloState) return null;
// Find the listing root key
const listingKey = Object.keys(apolloState).find((k) =>
k.includes("Listing"),
k.includes("Listing")
);
if (!listingKey) return null;
@@ -167,9 +494,7 @@ function parseListing(
const cents = price?.amount != null ? Number(price.amount) : undefined;
const amountFormatted =
cents != null
? formatCentsToCurrency(cents / 100, "en-CA")
: undefined;
cents != null ? formatCentsToCurrency(cents / 100, "en-CA") : undefined;
const numberOfViews =
metrics?.views != null ? Number(metrics.views) : undefined;
@@ -203,88 +528,291 @@ function parseListing(
};
}
/**
 * Parse a listing page into a detailed object with all available fields.
 *
 * @param htmlString - HTML of a single listing page.
 * @param BASE_URL - Site origin; prefixed to relative listing URLs and
 *                   forwarded to the seller-details lookup.
 * @param options - `includeImages` (anything but `false` keeps image URLs),
 *                  `sellerDataDepth` (defaults to "detailed") and
 *                  `includeClientSideData` (must be truthy for the extra
 *                  seller lookup to run).
 * @returns The parsed listing, or `null` when the page has no Apollo state,
 *          no listing entry, no URL/title, or no usable numeric price.
 */
export async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {}
): Promise<DetailedListing | null> {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key
  const listingKey = Object.keys(apolloState).find((k) =>
    k.includes("Listing")
  );
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
    imageUrls,
    categoryId,
    adSource,
    flags,
    posterInfo,
    attributes,
  } = root as ApolloListingRoot;

  const listingUrl =
    typeof url === "string"
      ? url.startsWith("http")
        ? url
        : `${BASE_URL}${url}`
      : "";
  if (!listingUrl || !title) return null;

  // Only include fixed-price listings. A missing amount or one that does not
  // convert to a finite number (e.g. a non-numeric string giving NaN) is
  // treated as "no price", so NaN never reaches the formatter or the caller.
  const cents = price?.amount != null ? Number(price.amount) : undefined;
  if (cents === undefined || !Number.isFinite(cents)) return null;

  const amountFormatted = formatCentsToCurrency(cents / 100, "en-CA");
  if (!amountFormatted) return null;

  const numberOfViews =
    metrics?.views != null ? Number(metrics.views) : undefined;

  // Extract images if requested
  const images =
    options.includeImages !== false && Array.isArray(imageUrls)
      ? imageUrls.filter((img): img is string => typeof img === "string")
      : [];

  // Extract attributes as key-value pairs
  const attributeMap: Record<string, string[]> = {};
  if (Array.isArray(attributes)) {
    for (const attr of attributes) {
      if (attr?.canonicalName && Array.isArray(attr.canonicalValues)) {
        attributeMap[attr.canonicalName] = attr.canonicalValues;
      }
    }
  }

  // Extract seller info based on depth setting
  let sellerInfo: DetailedListing["sellerInfo"];
  const depth = options.sellerDataDepth ?? "detailed";

  if (posterInfo?.posterId) {
    sellerInfo = {
      posterId: posterInfo.posterId,
      rating:
        typeof posterInfo.rating === "number" ? posterInfo.rating : undefined,
    };

    // Add more detailed info if requested and client-side data is enabled
    if (
      (depth === "detailed" || depth === "full") &&
      options.includeClientSideData
    ) {
      try {
        const additionalData = await fetchSellerDetails(
          posterInfo.posterId,
          BASE_URL
        );
        sellerInfo = {
          ...sellerInfo,
          ...additionalData,
        };
      } catch {
        // GraphQL data is optional, so warn and keep the basic seller info
        console.warn(
          `Failed to fetch additional seller data for ${posterInfo.posterId}`
        );
      }
    }
  }

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: {
      amountFormatted,
      cents,
      currency: price?.currency,
    },
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews:
      numberOfViews !== undefined && Number.isFinite(numberOfViews)
        ? numberOfViews
        : undefined,
    address: location?.address ?? null,
    images,
    categoryId: typeof categoryId === "number" ? categoryId : 0,
    adSource: typeof adSource === "string" ? adSource : "UNKNOWN",
    flags: {
      topAd: flags?.topAd === true,
      priceDrop: flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof location?.id === "number" ? location.id : 0,
      name: typeof location?.name === "string" ? location.name : "Unknown",
      coordinates: location?.coordinates
        ? {
            latitude: location.coordinates.latitude,
            longitude: location.coordinates.longitude,
          }
        : undefined,
    },
    sellerInfo,
  };
}
// ----------------------------- Main -----------------------------
export default async function fetchKijijiItems(
SEARCH_QUERY: string,
REQUESTS_PER_SECOND = 1,
BASE_URL = "https://www.kijiji.ca",
searchOptions: SearchOptions = {},
listingOptions: ListingFetchOptions = {}
) {
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;
// Set defaults for configuration
const finalSearchOptions: Required<SearchOptions> = {
location: searchOptions.location ?? 1700272, // Default to GTA
category: searchOptions.category ?? 0, // Default to all categories
keywords: searchOptions.keywords ?? SEARCH_QUERY,
sortBy: searchOptions.sortBy ?? "relevancy",
sortOrder: searchOptions.sortOrder ?? "desc",
maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
priceMin: searchOptions.priceMin as number,
priceMax: searchOptions.priceMax as number,
};
console.log(`Fetching search: ${searchUrl}`);
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
maxRetries: 3,
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
"\n" +
`Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
const finalListingOptions: Required<ListingFetchOptions> = {
includeImages: listingOptions.includeImages ?? true,
sellerDataDepth: listingOptions.sellerDataDepth ?? "detailed",
includeClientSideData: listingOptions.includeClientSideData ?? false,
};
const allListings: DetailedListing[] = [];
const seenUrls = new Set<string>();
// Fetch multiple pages
for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
const searchUrl = buildSearchUrl(
finalSearchOptions.keywords,
{
...finalSearchOptions,
// Add page parameter for pagination
...(page > 1 && { page }),
},
BASE_URL
);
console.log(`Fetching search page ${page}: ${searchUrl}`);
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`
);
}
},
});
const searchResults = parseSearch(searchHtml, BASE_URL);
if (searchResults.length === 0) {
console.log(
`No more results found on page ${page}. Stopping pagination.`
);
break;
}
// Deduplicate links across pages
const newListingLinks = searchResults
.map((r) => r.listingLink)
.filter((link) => !seenUrls.has(link));
for (const link of newListingLinks) {
seenUrls.add(link);
}
console.log(
`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`
);
// Fetch details for this page's listings
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic
);
const totalProgress = newListingLinks.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
for (const link of newListingLinks) {
try {
const html = await fetchHtml(link, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`
);
}
},
});
const parsed = await parseDetailedListing(
html,
BASE_URL,
finalListingOptions
);
if (parsed) {
allListings.push(parsed);
}
} catch (err) {
if (err instanceof HttpError) {
console.error(
`\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`
);
} else {
console.error(
`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`
);
}
} finally {
currentProgress++;
progressBar.update(currentProgress);
}
},
});
}
const searchResults = parseSearch(searchHtml, BASE_URL);
if (searchResults.length === 0) {
console.warn("No search results parsed from page.");
return;
}
progressBar.stop();
// Deduplicate links
const listingLinks = Array.from(
new Set(searchResults.map((r) => r.listingLink)),
);
console.log(
"\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
);
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
const totalProgress = listingLinks.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
const items: KijijiListingDetails[] = [];
for (const link of listingLinks) {
try {
const html = await fetchHtml(link, DELAY_MS, {
maxRetries: 3,
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
"\n" +
`Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
);
}
},
});
const parsed = parseListing(html, BASE_URL);
if (parsed) {
if (parsed.listingPrice?.cents) items.push(parsed);
}
} catch (err) {
if (err instanceof HttpError) {
console.error(
"\n" + `Failed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
);
} else {
console.error(
"\n" +
`Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
);
}
} finally {
currentProgress++;
progressBar.update(currentProgress);
// If we got fewer results than expected (40 per page), we've reached the end
if (searchResults.length < 40) {
break;
}
}
console.log("\n" + `Parsed ${items.length} listings.`);
return items;
console.log(`\nParsed ${allListings.length} detailed listings.`);
return allListings;
}
// Re-export error classes for convenience
export {
HttpError,
NetworkError,
ParseError,
RateLimitError,
ValidationError,
};

View File

@@ -1,87 +1,200 @@
/** Custom error class for HTTP-related failures */
export class HttpError extends Error {
constructor(
public statusCode: number,
message: string
message: string,
public readonly statusCode: number,
public readonly url?: string
) {
super(message);
this.name = "HttpError";
}
}
/**
 * Error class for network failures (timeouts, connection issues).
 *
 * Carries the URL that was being requested and, when available, the
 * underlying error that triggered the failure.
 */
export class NetworkError extends Error {
constructor(
message: string,
public readonly url: string,
public readonly cause?: Error
) {
super(message);
// Override the default "Error" name so logs identify the error type.
this.name = "NetworkError";
}
}
/**
 * Raised when a response cannot be interpreted.
 *
 * `data` optionally holds the offending payload for diagnostics.
 */
export class ParseError extends Error {
  public readonly data?: unknown;

  constructor(message: string, data?: unknown) {
    super(message);
    this.data = data;
    this.name = "ParseError";
  }
}
/**
 * Raised when a request is rejected because of rate limiting.
 *
 * `url` is the rate-limited URL; `resetTime` is an optional reset value
 * supplied by the thrower.
 */
export class RateLimitError extends Error {
  public readonly url: string;
  public readonly resetTime?: number;

  constructor(message: string, url: string, resetTime?: number) {
    super(message);
    this.url = url;
    this.resetTime = resetTime;
    this.name = "RateLimitError";
  }
}
/**
 * Raised when input fails validation.
 *
 * Carries no extra fields; only the message and a distinct `name`.
 */
export class ValidationError extends Error {
  constructor(message: string) {
    super(message);
    // Replace the inherited "Error" name so the type shows up in logs.
    this.name = "ValidationError";
  }
}
/**
 * Type guard for plain record-like values.
 *
 * @param value - Anything.
 * @returns `true` for non-null objects that are not arrays.
 */
export function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
/**
* Fetch HTML content from a URL with automatic retries
* Calculate exponential backoff delay with jitter
*/
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  // Upper bound on any single wait: 30 seconds.
  const ceilingMs = 30000;

  // Double the base delay for every attempt already made.
  const grown = baseMs * Math.pow(2, attempt);

  // Add up to 10% random jitter on top of the exponential delay.
  const withJitter = grown + Math.random() * 0.1 * grown;

  return withJitter > ceilingMs ? ceilingMs : withJitter;
}
/** Tunables accepted by fetchHtml; every field is optional. */
export interface FetchHtmlOptions {
  /** Number of retries after the first attempt. */
  maxRetries?: number;
  /** Base delay, in milliseconds, for the exponential backoff between retries. */
  retryBaseMs?: number;
  /** Per-request timeout in milliseconds. */
  timeoutMs?: number;
  /** Callback receiving rate-limit information (remaining, reset) for a response. */
  onRateInfo?: (remaining: string | null, reset: string | null) => void;
  /** Extra request headers. */
  headers?: Record<string, string>;
}
/**
* Fetch HTML content from a URL with automatic retries, timeout, and exponential backoff
* @param url - The URL to fetch
* @param delayMs - Delay in milliseconds between retries
* @param delayMs - Delay in milliseconds between requests (rate limiting)
* @param opts - Optional fetch options
* @returns The HTML content as a string
* @throws HttpError if all retries are exhausted
* @throws HttpError, NetworkError, or RateLimitError on failure
*/
export async function fetchHtml(
url: string,
delayMs: number,
opts?: RequestInit
opts?: FetchHtmlOptions
): Promise<string> {
const maxAttempts = 3;
let lastError: Error | null = null;
const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 1000;
const timeoutMs = opts?.timeoutMs ?? 30000;
for (let attempt = 0; attempt < maxAttempts; attempt++) {
const defaultHeaders: Record<string, string> = {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"cache-control": "no-cache",
"upgrade-insecure-requests": "1",
"user-agent":
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
};
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
const response = await fetch(url, opts);
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
// Check for rate limiting
if (response.status === 429) {
const retryAfter = response.headers.get("Retry-After");
const waitTime = retryAfter ? parseInt(retryAfter) * 1000 : delayMs * (attempt + 1);
console.warn(
`Rate limited. Retrying after ${waitTime}ms...`
);
await new Promise((resolve) => setTimeout(resolve, waitTime));
continue;
}
const res = await fetch(url, {
method: "GET",
headers: { ...defaultHeaders, ...opts?.headers },
signal: controller.signal,
});
// Check for server errors
if (response.status >= 500) {
lastError = new HttpError(
response.status,
`Server error: ${response.status}`
);
if (attempt < maxAttempts - 1) {
clearTimeout(timeoutId);
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
if (!res.ok) {
// Handle rate limiting
if (res.status === 429) {
const resetSeconds = rateLimitReset
? Number(rateLimitReset)
: Number.NaN;
const waitMs = Number.isFinite(resetSeconds)
? Math.max(0, resetSeconds * 1000)
: calculateBackoffDelay(attempt, retryBaseMs);
if (attempt < maxRetries) {
await new Promise((resolve) => setTimeout(resolve, waitMs));
continue;
}
throw new RateLimitError(
`Rate limit exceeded for ${url}`,
url,
resetSeconds
);
}
// Retry on server errors
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
await new Promise((resolve) =>
setTimeout(resolve, delayMs * (attempt + 1))
setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs))
);
continue;
}
throw lastError;
}
// Check for successful response
if (!response.ok) {
throw new HttpError(
response.status,
`HTTP ${response.status}: ${response.statusText}`
`Request failed with status ${res.status}`,
res.status,
url
);
}
return await response.text();
} catch (error) {
lastError =
error instanceof Error
? error
: new Error("Unknown error during fetch");
const html = await res.text();
if (attempt < maxAttempts - 1) {
// Respect per-request delay to maintain rate limiting
await new Promise((resolve) => setTimeout(resolve, delayMs));
return html;
} catch (err) {
// Re-throw known errors
if (
err instanceof RateLimitError ||
err instanceof HttpError ||
err instanceof NetworkError
) {
throw err;
}
if (err instanceof Error && err.name === "AbortError") {
if (attempt < maxRetries) {
await new Promise((resolve) =>
setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs))
);
continue;
}
throw new NetworkError(`Request timeout for ${url}`, url, err);
}
// Network or other errors
if (attempt < maxRetries) {
await new Promise((resolve) =>
setTimeout(resolve, delayMs * (attempt + 1))
setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs))
);
continue;
}
throw new NetworkError(
`Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
url,
err instanceof Error ? err : undefined
);
}
}
throw lastError || new HttpError(0, "Failed to fetch after retries");
throw new NetworkError(`Exhausted retries without response for ${url}`, url);
}