/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import cliProgress from "cli-progress";
// const unidecode = require("unidecode");

// ----------------------------- Types -----------------------------

/** Raw HTML document text. */
type HTMLString = string;

/** One entry extracted from a search results page. */
type SearchListing = {
  name: string;
  listingLink: string;
};

/** The `__APOLLO_STATE__` object: a map from cache key to an arbitrary entity. */
type ApolloRecord = Record<string, unknown>;

/** Fields read from a search-page entity. */
interface ApolloSearchItem {
  url?: string;
  title?: string;
  [k: string]: unknown;
}

/** Fields read from a listing-page root entity. */
interface ApolloListingRoot {
  url?: string;
  title?: string;
  description?: string;
  price?: { amount?: number | string; currency?: string };
  type?: string;
  status?: string;
  activationDate?: string;
  endDate?: string;
  metrics?: { views?: number | string };
  location?: { address?: string | null };
  [k: string]: unknown;
}

/** Normalized listing data returned to callers. */
type ListingDetails = {
  url: string;
  title: string;
  description?: string;
  listingPrice?: {
    amountFormatted: string;
    cents?: number;
    currency?: string;
  };
  listingType?: string;
  listingStatus?: string;
  creationDate?: string;
  endDate?: string;
  numberOfViews?: number;
  address?: string | null;
};

// ----------------------------- Utilities -----------------------------

/** Characters that `slugify` turns into a (collapsed) hyphen. */
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);

/**
 * Slugifies a string for use in a search URL.
 *
 * The input is passed through `unidecode` and lower-cased; `a-z` and `0-9`
 * are kept, runs of separator characters collapse into a single `-`, and
 * every other character is dropped. Leading or trailing separators are not
 * trimmed, so they still yield a leading or trailing hyphen.
 *
 * @param input - Free-form text to slugify.
 * @returns The slug.
 */
export function slugify(input: string): string {
  const normalized = unidecode(input).toLowerCase();
  const out: string[] = [];
  let lastHyphen = false;

  for (const ch of normalized) {
    const code = ch.charCodeAt(0);
    const isLowerAlpha = code >= 97 && code <= 122; // a-z
    const isDigit = code >= 48 && code <= 57; // 0-9

    if (isLowerAlpha || isDigit) {
      out.push(ch);
      lastHyphen = false;
    } else if (SEPS.has(ch) && !lastHyphen) {
      out.push("-");
      lastHyphen = true;
    }
    // Anything else is dropped without resetting `lastHyphen`.
  }

  return out.join("");
}

/**
 * Formats an amount given in cents as a grouped decimal string with exactly
 * two fraction digits (no currency symbol is added).
 *
 * @param num - Amount in cents; strings are parsed as base-10 integers.
 * @param locale - Locale passed to `Intl.NumberFormat`.
 * @returns The formatted amount, or `""` when `num` is nullish or does not
 *   resolve to a finite number.
 */
function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
): string {
  if (num == null) return "";

  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  if (!Number.isFinite(cents)) return "";

  const formatter = new Intl.NumberFormat(locale, {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
    useGrouping: true,
  });
  return formatter.format(cents / 100);
}

/** Type guard: `value` is a non-null object (arrays included). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/** Returns `url` unchanged when it starts with "http", else prefixes `baseUrl`. */
function toAbsoluteUrl(url: string, baseUrl: string): string {
  return url.startsWith("http") ? url : `${baseUrl}${url}`;
}

/** Error thrown by `fetchHtml` when the server answers with a non-OK status. */
class HttpError extends Error {
  constructor(
    message: string,
    public readonly status: number,
    public readonly url: string,
  ) {
    super(message);
    this.name = "HttpError";
  }
}

// ----------------------------- HTTP Client -----------------------------

/**
 * Fetches a page as text with a basic retry strategy and a fixed delay after
 * each successful request.
 *
 * - Retries on 429, on 5xx, and on errors thrown by `fetch` / `res.text()`.
 * - On 429, waits `X-RateLimit-Reset` seconds when the header is a finite
 *   number (assumed to be a relative duration — TODO confirm against the
 *   server); otherwise uses a linear backoff of `(attempt + 1) * retryBaseMs`.
 * - Any other non-OK status is not retried.
 *
 * @param url - Absolute URL to request.
 * @param DELAY_MS - Milliseconds to wait after a successful response.
 * @param opts - `maxRetries` (default 3), `retryBaseMs` (default 500) and an
 *   optional `onRateInfo` callback that receives the raw rate-limit headers
 *   of every response.
 * @returns The response body.
 * @throws HttpError when the final response has a non-OK status.
 * @throws The underlying error when the last attempt fails before a response
 *   is obtained.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
  },
): Promise<string> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries;
    const backoffMs = (attempt + 1) * retryBaseMs;

    try {
      // console.log(`Fetching: `, url);
      const res = await fetch(url, {
        method: "GET",
        headers: {
          accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
          "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
          "cache-control": "no-cache",
          "upgrade-insecure-requests": "1",
          "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        },
      });

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (!res.ok) {
        const isRateLimited = res.status === 429;
        const isServerError = res.status >= 500 && res.status < 600;

        // Surface the real status instead of sleeping once more and ending
        // with a generic "exhausted retries" error.
        if (isLastAttempt || !(isRateLimited || isServerError)) {
          throw new HttpError(
            `Request failed with status ${res.status}`,
            res.status,
            url,
          );
        }

        let waitMs = backoffMs;
        if (isRateLimited) {
          // Respect the 429 reset when provided.
          const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
          if (Number.isFinite(resetSeconds)) {
            waitMs = Math.max(0, resetSeconds * 1000);
          }
        }
        await delay(waitMs);
        continue;
      }

      const html = await res.text();
      // Respect per-request delay to keep at or under REQUESTS_PER_SECOND.
      await delay(DELAY_MS);
      return html;
    } catch (err) {
      // An HttpError is a final verdict from the branch above; retrying it
      // here would re-request non-retryable statuses such as 404.
      if (err instanceof HttpError || isLastAttempt) throw err;
      await delay(backoffMs);
    }
  }

  throw new Error("Exhausted retries without response");
}

// ----------------------------- Parsing -----------------------------

/**
 * Extracts `props.pageProps.__APOLLO_STATE__` from the JSON stored in the
 * `__NEXT_DATA__` element of a Kijiji page.
 *
 * @returns The apollo state object, or `null` when the element is missing or
 *   empty, the JSON is invalid, or the path does not lead to an object.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const { document } = parseHTML(htmlString);
  const nextData = document.getElementById("__NEXT_DATA__");
  if (!nextData || !nextData.textContent) return null;

  try {
    const jsonData: unknown = JSON.parse(nextData.textContent);
    // Narrow level by level rather than trusting the shape of parsed JSON.
    const props = isRecord(jsonData) ? jsonData.props : undefined;
    const pageProps = isRecord(props) ? props.pageProps : undefined;
    const apollo = isRecord(pageProps)
      ? pageProps.__APOLLO_STATE__
      : undefined;
    return isRecord(apollo) ? apollo : null;
  } catch {
    return null;
  }
}

/**
 * Parses a search page's apollo state into `SearchListing[]`.
 *
 * Only entries whose key contains "Listing" and whose value carries a string
 * `url` and a string `title` are kept; relative URLs are prefixed with
 * `BASE_URL`.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return [];

  const results: SearchListing[] = [];
  for (const [key, value] of Object.entries(apolloState)) {
    // Heuristic: Kijiji listing keys usually contain "Listing".
    if (!key.includes("Listing") || !isRecord(value)) continue;

    const item: ApolloSearchItem = value;
    if (typeof item.url === "string" && typeof item.title === "string") {
      results.push({
        listingLink: toAbsoluteUrl(item.url, BASE_URL),
        name: item.title,
      });
    }
  }
  return results;
}

/**
 * Parses a listing page into a typed object.
 *
 * The first apollo-state key containing "Listing" is used as the listing
 * root.
 *
 * @returns The listing details, or `null` when the apollo state or the root
 *   is missing, or the root lacks a string `url` or a non-empty string
 *   `title`.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): ListingDetails | null {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key.
  const listingKey = Object.keys(apolloState).find((k) =>
    k.includes("Listing"),
  );
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
  } = root as ApolloListingRoot;

  // The cast above is unchecked, so validate the two required fields.
  if (typeof url !== "string" || typeof title !== "string") return null;
  const listingUrl = toAbsoluteUrl(url, BASE_URL);
  if (!listingUrl || !title) return null;

  const cents = price?.amount != null ? Number(price.amount) : undefined;
  const amountFormatted = formatCentsToCurrency(cents);
  const views = metrics?.views != null ? Number(metrics.views) : undefined;

  return {
    url: listingUrl,
    title,
    description,
    // An empty formatted amount means there was no usable price.
    listingPrice: amountFormatted
      ? {
          amountFormatted,
          cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
          currency: price?.currency,
        }
      : undefined,
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews:
      views !== undefined && Number.isFinite(views) ? views : undefined,
    address: location?.address ?? null,
  };
}

// ----------------------------- Main -----------------------------

/** Builds an `onRateInfo` callback that logs both headers when both are present. */
function makeRateInfoLogger(
  label: string,
): (remaining: string | null, reset: string | null) => void {
  return (remaining, reset) => {
    if (remaining && reset) {
      console.log(
        "\n" +
          `${label} - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
      );
    }
  };
}

/**
 * Runs a search, then fetches and parses every distinct listing it links to.
 *
 * Listings are fetched sequentially, so the per-request delay bounds the
 * request rate. A listing that fails to download is logged and skipped. Only
 * listings with a truthy `listingPrice.cents` are returned, so listings with
 * a price of 0 or no price are excluded.
 *
 * @param SEARCH_QUERY - Free-text query; slugified into the search URL.
 * @param REQUESTS_PER_SECOND - Target request rate. Values that are not
 *   finite and positive fall back to 1.
 * @param BASE_URL - Site origin used for the search URL and relative links.
 * @returns The parsed listings, or `undefined` when the search page yields
 *   no results.
 * @throws When the search page itself cannot be fetched.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
): Promise<ListingDetails[] | undefined> {
  // A rate of 0, a negative, or NaN would produce an Infinity/NaN delay.
  const requestsPerSecond =
    Number.isFinite(REQUESTS_PER_SECOND) && REQUESTS_PER_SECOND > 0
      ? REQUESTS_PER_SECOND
      : 1;
  const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));

  const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;

  console.log(`Fetching search: ${searchUrl}`);
  const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
    onRateInfo: makeRateInfoLogger("Search"),
  });

  const searchResults = parseSearch(searchHtml, BASE_URL);
  if (searchResults.length === 0) {
    console.warn("No search results parsed from page.");
    return;
  }

  // Deduplicate links.
  const listingLinks = Array.from(
    new Set(searchResults.map((r) => r.listingLink)),
  );

  console.log(
    "\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
  );

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  const totalProgress = listingLinks.length;
  let currentProgress = 0;
  progressBar.start(totalProgress, currentProgress);

  const items: ListingDetails[] = [];
  const onItemRateInfo = makeRateInfoLogger("Item");

  try {
    for (const link of listingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          onRateInfo: onItemRateInfo,
        });
        const parsed = parseListing(html, BASE_URL);
        if (parsed?.listingPrice?.cents) items.push(parsed);
      } catch (err) {
        if (err instanceof HttpError) {
          console.error(
            "\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
          );
        } else {
          const detail =
            err instanceof Error && err.message ? err.message : String(err);
          console.error("\n" + `Failed to fetch ${link}\n - ${detail}`);
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }
  } finally {
    // The bar was started above and must be stopped on every exit path.
    progressBar.stop();
  }

  console.log("\n" + `Parsed ${items.length} listings.`);
  return items;
}