/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import cliProgress from "cli-progress";
// const unidecode = require("unidecode");

// ----------------------------- Types -----------------------------

/** Raw HTML document source. */
type HTMLString = string;

/** One hit extracted from a search results page. */
type SearchListing = {
  name: string;
  /** Absolute URL of the listing page. */
  listingLink: string;
};

/**
 * The `__APOLLO_STATE__` cache embedded in a page: cache key -> entity.
 * Values come straight from JSON.parse, so they are `unknown` until narrowed.
 */
type ApolloRecord = Record<string, unknown>;

/** Fields read from a search-result entity in the Apollo cache. */
interface ApolloSearchItem {
  url?: string;
  title?: string;
  [k: string]: unknown;
}

/** Fields read from the root listing entity in a listing page's Apollo cache. */
interface ApolloListingRoot {
  url?: string;
  title?: string;
  description?: string;
  price?: { amount?: number | string; currency?: string; type?: string };
  type?: string;
  status?: string;
  activationDate?: string;
  endDate?: string;
  metrics?: { views?: number | string };
  location?: {
    address?: string | null;
    id?: number;
    name?: string;
    coordinates?: { latitude: number; longitude: number };
  };
  imageUrls?: string[];
  imageCount?: number;
  categoryId?: number;
  adSource?: string;
  flags?: { topAd?: boolean; priceDrop?: boolean };
  posterInfo?: { posterId?: string; rating?: number };
  attributes?: Array<{ canonicalName?: string; canonicalValues?: string[] }>;
  [k: string]: unknown;
}

// Keep existing interface for backward compatibility
type ListingDetails = {
  url: string;
  title: string;
  description?: string;
  listingPrice?: {
    /** Price rendered as a currency string. */
    amountFormatted: string;
    /** Price in cents, when it parsed to a number. */
    cents?: number;
    currency?: string;
  };
  listingType?: string;
  listingStatus?: string;
  creationDate?: string;
  endDate?: string;
  numberOfViews?: number;
  address?: string | null;
};

// New comprehensive interface for detailed listings
interface DetailedListing extends ListingDetails {
  images: string[];
  categoryId: number;
  adSource: string;
  flags: {
    topAd: boolean;
    priceDrop: boolean;
  };
  /** Attribute canonical name -> its canonical values. */
  attributes: Record<string, string[]>;
  location: {
    id: number;
    name: string;
    coordinates?: {
      latitude: number;
      longitude: number;
    };
  };
  sellerInfo?: {
    posterId: string;
    rating?: number;
    accountType?: string;
    memberSince?: string;
    reviewCount?: number;
    reviewScore?: number;
  };
}

// Configuration interfaces
interface SearchOptions {
  /** Location ID, or a name listed in LOCATION_MAPPINGS. */
  location?: number | string;
  /** Category ID, or a name listed in CATEGORY_MAPPINGS. */
  category?: number | string;
  keywords?: string;
  sortBy?: 'relevancy' | 'date' | 'price' | 'distance';
  sortOrder?: 'desc' | 'asc';
  /** Maximum number of search pages to fetch. Default: 5 */
  maxPages?: number;
  // NOTE(review): priceMin/priceMax are accepted but buildSearchUrl does not
  // apply them to the URL yet.
  priceMin?: number;
  priceMax?: number;
}

interface ListingFetchOptions {
  /** Default: true */
  includeImages?: boolean;
  /** Default: 'detailed' */
  sellerDataDepth?: 'basic' | 'detailed' | 'full';
  /** Enables the extra GraphQL seller lookups. Default: false */
  includeClientSideData?: boolean;
}

/** SearchOptions after defaults are applied; the price bounds stay optional. */
type ResolvedSearchOptions = Required<Omit<SearchOptions, 'priceMin' | 'priceMax'>> &
  Pick<SearchOptions, 'priceMin' | 'priceMax'>;

// ----------------------------- Constants & Mappings -----------------------------

// Location mappings from KIJIJI.md
const LOCATION_MAPPINGS: Record<string, number> = {
  'canada': 0,
  'ontario': 9004,
  'toronto': 1700273,
  'gta': 1700272,
  'oshawa': 1700275,
  'quebec': 9001,
  'nova scotia': 9002,
  'alberta': 9003,
  'new brunswick': 9005,
  'manitoba': 9006,
  'british columbia': 9007,
  'newfoundland': 9008,
  'saskatchewan': 9009,
  'territories': 9010,
  'pei': 9011,
  'prince edward island': 9011,
};

// Category mappings from KIJIJI.md (Buy & Sell main categories)
const CATEGORY_MAPPINGS: Record<string, number> = {
  'all': 0,
  'buy-sell': 10,
  'arts-collectibles': 12,
  'audio': 767,
  'baby-items': 253,
  'bags-luggage': 931,
  'bikes': 644,
  'books': 109,
  'cameras': 103,
  'cds': 104,
  'clothing': 274,
  'computers': 16,
  'computer-accessories': 128,
  'electronics': 29659001,
  'free-stuff': 17220001,
  'furniture': 235,
  'garage-sales': 638,
  'health-special-needs': 140,
  'hobbies-crafts': 139,
  'home-appliances': 107,
  'home-indoor': 717,
  'home-outdoor': 727,
  'jewellery': 133,
  'musical-instruments': 17,
  'phones': 132,
  'sporting-goods': 111,
  'tools': 110,
  'toys-games': 108,
  'tvs-video': 15093001,
  'video-games': 141,
  'other': 26,
};

// Sort parameter mappings
const SORT_MAPPINGS: Record<NonNullable<SearchOptions['sortBy']>, string> = {
  'relevancy': 'MATCH',
  'date': 'DATE',
  'price': 'PRICE',
  'distance': 'DISTANCE',
};

/** Default per-request timeout, covering both headers and body. */
const DEFAULT_TIMEOUT_MS = 30_000;
/** Upper bound for a single exponential-backoff sleep. */
const MAX_BACKOFF_MS = 30_000;
/** Upper bound for a sleep derived from the X-RateLimit-Reset header. */
const MAX_RATE_LIMIT_WAIT_MS = 5 * 60_000;
/**
 * A reset value above this cannot sensibly be a number of seconds to wait
 * (it would be 30+ years), so it is read as a Unix timestamp in seconds.
 * NOTE(review): the header's exact semantics are not visible here - confirm.
 */
const EPOCH_SECONDS_THRESHOLD = 1_000_000_000;
/** Largest delay setTimeout honours; larger values fire almost immediately. */
const MAX_TIMER_MS = 2_147_483_647;
/** A search page with fewer hits than this is taken to be the last page. */
const RESULTS_PER_PAGE = 40;

// ----------------------------- Exports for Testing -----------------------------
// Note: These are exported for testing purposes only
export { resolveLocationId, resolveCategoryId, buildSearchUrl };
export { extractApolloState, parseSearch };
export { parseDetailedListing };
export { HttpError, NetworkError, ParseError, RateLimitError, ValidationError };

// ----------------------------- Utilities -----------------------------

/** Characters that slugify() collapses into a single hyphen. */
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);

/**
 * Look up `key` in an ID table, ignoring inherited properties so that names
 * such as "constructor" do not resolve to Object.prototype members.
 */
function lookupId(table: Record<string, number>, key: string): number | undefined {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Resolve a location to its numeric ID.
 *
 * @param location Numeric ID (returned as-is), an all-digit string (parsed as
 *   an ID), or a name from LOCATION_MAPPINGS. Names are matched
 *   case-insensitively; spaces, hyphens and underscores are interchangeable.
 * @returns The location ID, or 0 (Canada) when missing or unknown.
 */
function resolveLocationId(location?: number | string): number {
  if (typeof location === 'number') return location;
  if (typeof location !== 'string') return 0; // Default to Canada

  const trimmed = location.trim();
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  // Table keys use spaces ("nova scotia"), so normalise towards spaces.
  const normalized = trimmed.toLowerCase().replace(/[\s_-]+/g, ' ');
  return lookupId(LOCATION_MAPPINGS, normalized) ?? 0;
}

/**
 * Resolve a category to its numeric ID.
 *
 * @param category Numeric ID (returned as-is), an all-digit string (parsed as
 *   an ID), or a name from CATEGORY_MAPPINGS. Names are matched
 *   case-insensitively; whitespace and underscores are treated as hyphens.
 * @returns The category ID, or 0 (all categories) when missing or unknown.
 */
function resolveCategoryId(category?: number | string): number {
  if (typeof category === 'number') return category;
  if (typeof category !== 'string') return 0; // Default to all categories

  const trimmed = category.trim();
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  // Table keys use hyphens ("video-games"), so normalise towards hyphens.
  const normalized = trimmed.toLowerCase().replace(/[\s_]+/g, '-');
  return lookupId(CATEGORY_MAPPINGS, normalized) ?? 0;
}

/**
 * Build the search URL for one results page.
 *
 * @param keywords Free-text query; slugified into the URL path.
 * @param options Search options; `page` adds `&page=N` when greater than 1.
 * @param BASE_URL Site origin, without a trailing slash.
 */
function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca"
): string {
  const locationId = resolveLocationId(options.location);
  const categoryId = resolveCategoryId(options.category);

  // The path slugs are the same for every category/location for now; only the
  // numeric IDs in the trailing "k0c<category>l<location>" segment vary.
  const categorySlug = 'buy-sell'; // Could be enhanced
  const locationSlug = 'canada'; // Could be enhanced
  const path = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;

  const sortParam = options.sortBy ? `&sort=${SORT_MAPPINGS[options.sortBy]}` : '';
  const sortOrder = options.sortOrder === 'asc' ? 'ASC' : 'DESC';
  const pageParam = options.page && options.page > 1 ? `&page=${options.page}` : '';

  // NOTE(review): when sortBy is set the query carries two `sort` parameters.
  // Kept as-is because which one the site honours is not visible here.
  return `${path}?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`;
}

/**
 * Slugifies a string for search.
 *
 * The input is transliterated with unidecode and lower-cased. a-z and 0-9 are
 * kept, each run of separator characters becomes one hyphen, and everything
 * else is dropped without breaking a run of separators.
 */
export function slugify(input: string): string {
  const transliterated = unidecode(input).toLowerCase();
  let slug = "";
  let previousWasHyphen = false;

  for (const ch of transliterated) {
    const isAlphanumeric = (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");
    if (isAlphanumeric) {
      slug += ch;
      previousWasHyphen = false;
    } else if (SEPS.has(ch) && !previousWasHyphen) {
      slug += "-";
      previousWasHyphen = true;
    }
    // Anything else is dropped.
  }
  return slug;
}

/**
 * Turns cents to localized currency string.
 *
 * @param num Amount in cents; strings are parsed as base-10 integers.
 * @param locale BCP 47 locale used for formatting.
 * @param currency ISO 4217 code; Intl.NumberFormat throws a RangeError for a
 *   malformed code.
 * @returns The formatted amount, or "" when `num` is missing or not finite.
 */
export function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
  currency = "USD",
): string {
  if (num == null) return "";
  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  // Rejects NaN as well as +/-Infinity, which would otherwise format as a price.
  if (!Number.isFinite(cents)) return "";

  const formatter = new Intl.NumberFormat(locale, {
    style: 'currency',
    currency,
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
  });
  return formatter.format(cents / 100);
}

/** True for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Convert a JSON scalar to a finite number, or undefined when it is not one. */
function toFiniteNumber(value: unknown): number | undefined {
  if (value == null) return undefined;
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : undefined;
}

/**
 * Sleep for `ms` milliseconds. The value is clamped to what setTimeout can
 * represent: out-of-range delays would otherwise fire almost immediately.
 */
async function delay(ms: number): Promise<void> {
  const safeMs = Number.isFinite(ms) ? Math.min(Math.max(0, ms), MAX_TIMER_MS) : 0;
  await new Promise<void>((resolve) => setTimeout(resolve, safeMs));
}

// ----------------------------- Error Classes -----------------------------

/** The server answered with a non-success status that will not be retried. */
class HttpError extends Error {
  constructor(
    message: string,
    public readonly status: number,
    public readonly url: string,
  ) {
    super(message);
    this.name = "HttpError";
  }
}

/** The request never produced a response (connection failure, timeout, ...). */
class NetworkError extends Error {
  constructor(
    message: string,
    public readonly url: string,
    public readonly cause?: Error,
  ) {
    super(message);
    this.name = "NetworkError";
  }
}

/** A response arrived but its payload could not be interpreted. */
class ParseError extends Error {
  constructor(
    message: string,
    public readonly data?: unknown,
  ) {
    super(message);
    this.name = "ParseError";
  }
}

/** Still rate limited (HTTP 429) after all retries were used. */
class RateLimitError extends Error {
  constructor(
    message: string,
    public readonly url: string,
    /** Numeric X-RateLimit-Reset value from the last response, if it had one. */
    public readonly resetTime?: number,
  ) {
    super(message);
    this.name = "RateLimitError";
  }
}

/** A caller-supplied argument is invalid. */
class ValidationError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "ValidationError";
  }
}

// ----------------------------- HTTP Client -----------------------------

/** What fetchHtml needs to know about a single HTTP attempt. */
interface PageResponse {
  ok: boolean;
  status: number;
  rateLimitRemaining: string | null;
  rateLimitReset: string | null;
  /** Response body; only read for successful responses. */
  html: string | null;
}

/**
 * Perform one GET. The timeout covers both the headers and the body read,
 * and the abort timer is always cleared, even when the request fails.
 */
async function requestPage(url: string, timeoutMs: number): Promise<PageResponse> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      method: "GET",
      headers: {
        accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "cache-control": "no-cache",
        "upgrade-insecure-requests": "1",
        "user-agent":
          "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
      },
      signal: controller.signal,
    });
    return {
      ok: res.ok,
      status: res.status,
      rateLimitRemaining: res.headers.get("X-RateLimit-Remaining"),
      rateLimitReset: res.headers.get("X-RateLimit-Reset"),
      html: res.ok ? await res.text() : null,
    };
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Decide how long to sleep after a 429.
 *
 * A usable X-RateLimit-Reset value is read as seconds to wait, or as a Unix
 * timestamp when it is too large to be a duration. The result is capped at
 * MAX_RATE_LIMIT_WAIT_MS. Missing, malformed or negative values fall back to
 * exponential backoff.
 */
function computeRateLimitWaitMs(
  resetHeader: string | null,
  attempt: number,
  retryBaseMs: number,
  nowMs: number = Date.now(),
): number {
  const resetValue =
    resetHeader !== null && resetHeader.trim() !== '' ? Number(resetHeader) : Number.NaN;
  if (!Number.isFinite(resetValue) || resetValue < 0) {
    return calculateBackoffDelay(attempt, retryBaseMs);
  }
  const waitMs =
    resetValue > EPOCH_SECONDS_THRESHOLD ? resetValue * 1000 - nowMs : resetValue * 1000;
  return Math.min(Math.max(0, waitMs), MAX_RATE_LIMIT_WAIT_MS);
}

/**
 * Fetch HTML with enhanced retry strategy and exponential backoff.
 * - Retries on 429, 5xx, and network errors / timeouts
 * - Respects X-RateLimit-Reset when present (seconds)
 * - Exponential backoff with jitter
 *
 * @param url Page to fetch.
 * @param DELAY_MS Pause after a successful response, to pace requests.
 * @param opts.maxRetries Retries after the first attempt (default 3).
 * @param opts.retryBaseMs Base of the exponential backoff (default 1000).
 * @param opts.timeoutMs Per-attempt timeout (default 30000).
 * @param opts.onRateInfo Called with the rate-limit headers of every response.
 * @returns The response body.
 * @throws RateLimitError when still receiving 429 after the last retry.
 * @throws HttpError for other non-success statuses (5xx only after retries).
 * @throws NetworkError when no response could be obtained.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    timeoutMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
  },
): Promise<string> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 1000;
  const timeoutMs = opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const canRetry = attempt < maxRetries;

    let response: PageResponse;
    try {
      response = await requestPage(url, timeoutMs);
    } catch (err) {
      // Timeouts and transport failures are both retried with backoff.
      if (canRetry) {
        await delay(calculateBackoffDelay(attempt, retryBaseMs));
        continue;
      }
      const cause = err instanceof Error ? err : undefined;
      if (cause?.name === 'AbortError') {
        throw new NetworkError(`Request timeout for ${url}`, url, cause);
      }
      throw new NetworkError(
        `Network error fetching ${url}: ${cause ? cause.message : String(err)}`,
        url,
        cause,
      );
    }

    opts?.onRateInfo?.(response.rateLimitRemaining, response.rateLimitReset);

    if (response.ok && response.html !== null) {
      // Respect per-request delay to maintain rate limiting
      await delay(DELAY_MS);
      return response.html;
    }

    // Handle rate limiting
    if (response.status === 429) {
      if (canRetry) {
        await delay(computeRateLimitWaitMs(response.rateLimitReset, attempt, retryBaseMs));
        continue;
      }
      const resetValue = response.rateLimitReset ? Number(response.rateLimitReset) : Number.NaN;
      throw new RateLimitError(
        `Rate limit exceeded for ${url}`,
        url,
        Number.isFinite(resetValue) ? resetValue : undefined,
      );
    }

    // Retry on server errors
    if (response.status >= 500 && response.status < 600 && canRetry) {
      await delay(calculateBackoffDelay(attempt, retryBaseMs));
      continue;
    }

    throw new HttpError(
      `Request failed with status ${response.status}`,
      response.status,
      url,
    );
  }

  // Only reachable when maxRetries is negative, i.e. no attempt was made.
  throw new NetworkError(`Exhausted retries without response for ${url}`, url);
}

/**
 * Calculate exponential backoff delay with jitter:
 * baseMs * 2^attempt plus up to 10% jitter, capped at MAX_BACKOFF_MS.
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const exponentialDelay = baseMs * (2 ** attempt);
  const jitter = Math.random() * 0.1 * exponentialDelay; // 10% jitter
  return Math.min(exponentialDelay + jitter, MAX_BACKOFF_MS);
}

// ----------------------------- GraphQL Client -----------------------------

/**
 * Fetch additional data via GraphQL API.
 *
 * @returns The `data` member of the GraphQL response.
 * @throws HttpError on a non-success HTTP status.
 * @throws ParseError when the payload is not an object or reports errors.
 * @throws NetworkError for any other failure (transport, invalid JSON).
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca"
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;

  try {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'apollo-require-preflight': 'true',
      },
      body: JSON.stringify({ query, variables }),
    });

    if (!response.ok) {
      throw new HttpError(
        `GraphQL request failed with status ${response.status}`,
        response.status,
        endpoint
      );
    }

    const result: unknown = await response.json();
    if (!isRecord(result)) {
      throw new ParseError('GraphQL response was not a JSON object', result);
    }

    // An empty `errors` array is not a failure.
    const { errors, data } = result;
    const hasErrors = Array.isArray(errors) ? errors.length > 0 : Boolean(errors);
    if (hasErrors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(errors)}`, errors);
    }

    return data;
  } catch (err) {
    if (err instanceof HttpError || err instanceof ParseError) {
      throw err;
    }
    throw new NetworkError(
      `Failed to fetch GraphQL data: ${err instanceof Error ? err.message : String(err)}`,
      endpoint,
      err instanceof Error ? err : undefined
    );
  }
}

// GraphQL response interfaces
interface GraphQLReviewResponse {
  user?: {
    reviewSummary?: {
      count?: number;
      score?: number;
    };
  };
}

interface GraphQLProfileResponse {
  user?: {
    memberSince?: string;
    accountType?: string;
  };
}

// GraphQL queries from KIJIJI.md
const GRAPHQL_QUERIES = {
  getReviewSummary: `
    query GetReviewSummary($userId: String!) {
      user(id: $userId) {
        reviewSummary {
          count
          score
          __typename
        }
        __typename
      }
    }
  `,
  getProfileMetrics: `
    query GetProfileMetrics($profileId: String!) {
      user(id: $profileId) {
        memberSince
        accountType
        __typename
      }
    }
  `,
} as const;

/**
 * Fetch additional seller data via GraphQL.
 *
 * Best effort: the two queries run in parallel and are handled independently,
 * so one failing does not discard the other's data. Failures are logged and
 * never thrown; only the fields actually obtained are present in the result.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca"
): Promise<{ reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string }> {
  const [reviewResult, profileResult] = await Promise.allSettled([
    fetchGraphQLData(GRAPHQL_QUERIES.getReviewSummary, { userId: posterId }, BASE_URL),
    fetchGraphQLData(GRAPHQL_QUERIES.getProfileMetrics, { profileId: posterId }, BASE_URL),
  ]);

  const details: { reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string } = {};

  // GraphQL data is not critical for basic functionality, so failures only warn.
  const warn = (reason: unknown): void => {
    console.warn(
      `Failed to fetch seller details for ${posterId}:`,
      reason instanceof Error ? reason.message : String(reason),
    );
  };

  if (reviewResult.status === 'fulfilled') {
    const summary = (reviewResult.value as GraphQLReviewResponse | null | undefined)?.user?.reviewSummary;
    if (summary?.count !== undefined) details.reviewCount = summary.count;
    if (summary?.score !== undefined) details.reviewScore = summary.score;
  } else {
    warn(reviewResult.reason);
  }

  if (profileResult.status === 'fulfilled') {
    const user = (profileResult.value as GraphQLProfileResponse | null | undefined)?.user;
    if (user?.memberSince !== undefined) details.memberSince = user.memberSince;
    if (user?.accountType !== undefined) details.accountType = user.accountType;
  } else {
    warn(profileResult.reason);
  }

  return details;
}

// ----------------------------- Parsing -----------------------------

/**
 * Extracts json.props.pageProps.__APOLLO_STATE__ safely from a Kijiji page HTML.
 *
 * @returns The Apollo cache object, or null when the `__NEXT_DATA__` element
 *   is missing or empty, its JSON is invalid, or the path does not lead to an
 *   object.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const { document } = parseHTML(htmlString);
  const payload = document.getElementById("__NEXT_DATA__")?.textContent;
  if (!payload) return null;

  try {
    const parsed: unknown = JSON.parse(payload);
    const props = isRecord(parsed) ? parsed.props : undefined;
    const pageProps = isRecord(props) ? props.pageProps : undefined;
    const apollo = isRecord(pageProps) ? pageProps.__APOLLO_STATE__ : undefined;
    return isRecord(apollo) ? apollo : null;
  } catch {
    return null;
  }
}

/**
 * Parse search page apollo state into SearchListing[].
 * Filters keys likely to be listing entities and ensures url/title exist.
 * Relative URLs are prefixed with BASE_URL.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return [];

  const results: SearchListing[] = [];
  for (const [key, value] of Object.entries(apolloState)) {
    // Heuristic: Kijiji listing keys usually contain "Listing"
    if (!key.includes("Listing") || !isRecord(value)) continue;

    const { url, title } = value as ApolloSearchItem;
    if (typeof url !== "string" || typeof title !== "string") continue;

    results.push({
      listingLink: url.startsWith("http") ? url : `${BASE_URL}${url}`,
      name: title,
    });
  }
  return results;
}

/**
 * Find the listing entity in an Apollo cache: the first entry whose key
 * contains "Listing" and whose value is an object.
 */
function findListingRoot(apolloState: ApolloRecord): ApolloListingRoot | null {
  for (const [key, value] of Object.entries(apolloState)) {
    if (key.includes("Listing") && isRecord(value)) {
      return value as ApolloListingRoot;
    }
  }
  return null;
}

/**
 * Map a listing entity to ListingDetails. Shared by parseListing and
 * parseDetailedListing.
 *
 * @returns null when the entity lacks a non-empty string url or title.
 *   `listingPrice` is set only when the amount is a finite number.
 */
function buildListingDetails(
  root: ApolloListingRoot,
  BASE_URL: string,
): ListingDetails | null {
  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
  } = root;

  // The entity comes from untyped JSON, so check before trusting the types.
  if (typeof url !== "string" || url === "") return null;
  if (typeof title !== "string" || title === "") return null;

  const cents = toFiniteNumber(price?.amount);
  const amountFormatted = formatCentsToCurrency(cents);

  return {
    url: url.startsWith("http") ? url : `${BASE_URL}${url}`,
    title,
    description,
    listingPrice: amountFormatted
      ? { amountFormatted, cents, currency: price?.currency }
      : undefined,
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews: toFiniteNumber(metrics?.views),
    address: location?.address ?? null,
  };
}

/**
 * Parse a listing page into a typed object.
 *
 * @returns null when the page has no Apollo state, no listing entity, or the
 *   entity lacks a url or title.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): ListingDetails | null {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  const root = findListingRoot(apolloState);
  return root ? buildListingDetails(root, BASE_URL) : null;
}

/**
 * Parse a listing page into a detailed object with all available fields.
 *
 * @param options.includeImages Set to false to return an empty `images` array.
 * @param options.sellerDataDepth / options.includeClientSideData Extra seller
 *   data is fetched over GraphQL only when depth is 'detailed' or 'full' and
 *   includeClientSideData is true.
 * @returns null when the page cannot be parsed or the listing has no numeric
 *   price (only fixed-price listings are included).
 */
async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {}
): Promise<DetailedListing | null> {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  const root = findListingRoot(apolloState);
  if (!root) return null;

  const base = buildListingDetails(root, BASE_URL);
  if (!base) return null;

  // Only include fixed-price listings
  const price = base.listingPrice;
  if (!price || price.cents === undefined) return null;

  const { location, imageUrls, categoryId, adSource, flags, posterInfo, attributes } = root;

  // Extract images if requested
  const images =
    options.includeImages !== false && Array.isArray(imageUrls)
      ? imageUrls.filter((imageUrl): imageUrl is string => typeof imageUrl === 'string')
      : [];

  // Extract attributes as key-value pairs
  const attributeMap: Record<string, string[]> = {};
  if (Array.isArray(attributes)) {
    for (const attr of attributes) {
      if (attr?.canonicalName && Array.isArray(attr.canonicalValues)) {
        attributeMap[attr.canonicalName] = attr.canonicalValues;
      }
    }
  }

  // Extract seller info based on depth setting
  let sellerInfo: DetailedListing['sellerInfo'];
  if (posterInfo?.posterId) {
    sellerInfo = {
      posterId: posterInfo.posterId,
      rating: typeof posterInfo.rating === 'number' ? posterInfo.rating : undefined,
    };

    // Add more detailed info if requested and client-side data is enabled.
    // fetchSellerDetails logs its own failures and never rejects.
    const depth = options.sellerDataDepth ?? 'detailed';
    if ((depth === 'detailed' || depth === 'full') && options.includeClientSideData) {
      const additionalData = await fetchSellerDetails(posterInfo.posterId, BASE_URL);
      sellerInfo = { ...sellerInfo, ...additionalData };
    }
  }

  return {
    ...base,
    listingPrice: {
      amountFormatted: price.amountFormatted,
      cents: price.cents,
      currency: price.currency,
    },
    images,
    categoryId: typeof categoryId === 'number' ? categoryId : 0,
    adSource: typeof adSource === 'string' ? adSource : 'UNKNOWN',
    flags: {
      topAd: flags?.topAd === true,
      priceDrop: flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof location?.id === 'number' ? location.id : 0,
      name: typeof location?.name === 'string' ? location.name : 'Unknown',
      coordinates: location?.coordinates
        ? {
            latitude: location.coordinates.latitude,
            longitude: location.coordinates.longitude,
          }
        : undefined,
    },
    sellerInfo,
  };
}

// ----------------------------- Main -----------------------------

/**
 * Return the links in `results` that are not in `seen` yet, adding each to
 * `seen` as it is accepted so that repeats within the same page are dropped.
 */
function collectNewLinks(results: SearchListing[], seen: Set<string>): string[] {
  const fresh: string[] = [];
  for (const { listingLink } of results) {
    if (seen.has(listingLink)) continue;
    seen.add(listingLink);
    fresh.push(listingLink);
  }
  return fresh;
}

/**
 * Search Kijiji and fetch the details of every listing found.
 *
 * Walks search pages 1..maxPages, stopping early on an empty or short page,
 * and fetches each new listing sequentially. A listing that fails to download
 * is logged and skipped; a failing search page rejects the whole call.
 *
 * @param SEARCH_QUERY Keywords, used unless searchOptions.keywords is set.
 * @param REQUESTS_PER_SECOND Request pacing; must be a positive finite number.
 * @param BASE_URL Site origin, without a trailing slash.
 * @param searchOptions Search filters; defaults: GTA, all categories,
 *   relevancy / desc, 5 pages.
 * @param listingOptions Controls how much detail is collected per listing.
 * @returns Every listing that parsed and has a numeric price.
 * @throws ValidationError when REQUESTS_PER_SECOND is not positive and finite.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
  searchOptions: SearchOptions = {},
  listingOptions: ListingFetchOptions = {},
): Promise<DetailedListing[]> {
  if (!Number.isFinite(REQUESTS_PER_SECOND) || REQUESTS_PER_SECOND <= 0) {
    throw new ValidationError(
      `REQUESTS_PER_SECOND must be a positive finite number, got ${REQUESTS_PER_SECOND}`,
    );
  }
  const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));

  // Set defaults for configuration
  const finalSearchOptions: ResolvedSearchOptions = {
    location: searchOptions.location ?? 1700272, // Default to GTA
    category: searchOptions.category ?? 0, // Default to all categories
    keywords: searchOptions.keywords ?? SEARCH_QUERY,
    sortBy: searchOptions.sortBy ?? 'relevancy',
    sortOrder: searchOptions.sortOrder ?? 'desc',
    maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
    priceMin: searchOptions.priceMin,
    priceMax: searchOptions.priceMax,
  };

  const finalListingOptions: Required<ListingFetchOptions> = {
    includeImages: listingOptions.includeImages ?? true,
    sellerDataDepth: listingOptions.sellerDataDepth ?? 'detailed',
    includeClientSideData: listingOptions.includeClientSideData ?? false,
  };

  const allListings: DetailedListing[] = [];
  const seenUrls = new Set<string>();

  // Fetch multiple pages
  for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
    // buildSearchUrl only emits the page parameter for pages after the first.
    const searchUrl = buildSearchUrl(
      finalSearchOptions.keywords,
      { ...finalSearchOptions, page },
      BASE_URL,
    );

    console.log(`Fetching search page ${page}: ${searchUrl}`);
    const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
      onRateInfo: (remaining, reset) => {
        if (remaining && reset) {
          console.log(`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
        }
      },
    });

    const searchResults = parseSearch(searchHtml, BASE_URL);
    if (searchResults.length === 0) {
      console.log(`No more results found on page ${page}. Stopping pagination.`);
      break;
    }

    // Deduplicate links across pages and within this page
    const newListingLinks = collectNewLinks(searchResults, seenUrls);
    console.log(`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`);

    // Fetch details for this page's listings
    const progressBar = new cliProgress.SingleBar(
      {},
      cliProgress.Presets.shades_classic,
    );
    let currentProgress = 0;
    progressBar.start(newListingLinks.length, currentProgress);

    for (const link of newListingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          onRateInfo: (remaining, reset) => {
            if (remaining && reset) {
              console.log(`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
            }
          },
        });
        const parsed = await parseDetailedListing(html, BASE_URL, finalListingOptions);
        if (parsed) {
          allListings.push(parsed);
        }
      } catch (err) {
        // One bad listing must not abort the whole run.
        if (err instanceof HttpError) {
          console.error(`\nFailed to fetch ${link}\n - ${err.status} ${err.message}`);
        } else {
          console.error(`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`);
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }
    progressBar.stop();

    // If we got fewer results than expected (40 per page), we've reached the end
    if (searchResults.length < RESULTS_PER_PAGE) {
      break;
    }
  }

  console.log(`\nParsed ${allListings.length} detailed listings.`);
  return allListings;
}