Files
ca-marketplace-scraper/src/kijiji.ts

398 lines
9.8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import cliProgress from "cli-progress";
// const unidecode = require("unidecode");
// ----------------------------- Types -----------------------------
/** Raw HTML markup held as a plain string (what fetchHtml resolves to and the parsers consume). */
type HTMLString = string;
/** One hit extracted from a search-results page by parseSearch. */
type SearchListing = {
// Listing title, copied from the Apollo entry's `title`.
name: string;
// Link to the listing page; parseSearch prefixes the base URL when the raw value does not start with "http".
listingLink: string;
};
/** The `__APOLLO_STATE__` object pulled out of a page's `__NEXT_DATA__` JSON: arbitrary keys mapped to unvalidated values. */
type ApolloRecord = Record<string, unknown>;
/**
 * Shape parseSearch assumes for an Apollo state entry on a search page.
 * Only `url` and `title` are read (and type-checked at runtime); every other
 * field is tolerated through the index signature.
 */
interface ApolloSearchItem {
url?: string;
title?: string;
[k: string]: unknown;
}
/**
 * Shape parseListing assumes for the listing entry of an Apollo state.
 * The entry is cast to this interface without per-field runtime validation,
 * so these types describe the expected payload rather than a guarantee.
 */
interface ApolloListingRoot {
url?: string;
title?: string;
description?: string;
// parseListing treats `amount` as a value in cents (it is passed to formatCentsToCurrency).
price?: { amount?: number | string; currency?: string };
type?: string;
status?: string;
// Surfaced to callers as ListingDetails.creationDate.
activationDate?: string;
endDate?: string;
// Surfaced to callers as ListingDetails.numberOfViews after numeric conversion.
metrics?: { views?: number | string };
location?: { address?: string | null };
// Any other field present in the payload is accepted and ignored.
[k: string]: unknown;
}
/** Normalized listing data returned by parseListing and collected by fetchKijijiItems. */
type ListingDetails = {
// Listing URL; made absolute by parseListing when the raw value does not start with "http".
url: string;
title: string;
description?: string;
// Present only when a price amount could be formatted.
listingPrice?: {
// Output of formatCentsToCurrency (grouped, two decimals).
amountFormatted: string;
// Numeric amount as read from the payload; omitted when not finite.
cents?: number;
currency?: string;
};
// Copied from the payload's `type`.
listingType?: string;
// Copied from the payload's `status`.
listingStatus?: string;
// Copied from the payload's `activationDate`.
creationDate?: string;
endDate?: string;
// Taken from the payload's `metrics.views`; omitted when not a finite number.
numberOfViews?: number;
// Taken from the payload's `location.address`; null when absent.
address?: string | null;
};
// ----------------------------- Utilities -----------------------------
// Characters that slugify() replaces with a hyphen instead of dropping.
// NOTE(review): the empty-string entry can never equal a one-character string, so it is dead as written.
// This file is flagged as containing ambiguous Unicode characters, so an invisible separator may have been
// lost from that literal — confirm against the original bytes.
// NOTE(review): slugify() runs unidecode() before this lookup; presumably non-ASCII entries such as "—"
// are already transliterated by then — verify whether they can ever match.
const SEPS = new Set([" ", "", "—", "/", ":", ";", ",", ".", "-"]);
/**
 * Builds a URL slug from free text for use in a search path.
 *
 * The input is transliterated with `unidecode` and lower-cased; `a-z` and
 * `0-9` are kept, every run of separator characters (see `SEPS`) between two
 * kept characters becomes a single "-", and all other characters are dropped.
 * Separators at the very start or end of the input do not produce a hyphen,
 * so the slug never begins or ends with "-".
 *
 * @param input - Arbitrary text, e.g. a user search query.
 * @returns The slug; "" when the input contains no letters or digits.
 */
export function slugify(input: string): string {
  const ascii = unidecode(input).toLowerCase();
  let slug = "";
  // Set when a separator was seen since the last kept character. The hyphen is
  // emitted lazily, only once another kept character follows, which is what
  // suppresses leading and trailing hyphens.
  let pendingHyphen = false;
  for (const ch of ascii) {
    const isAlphanumeric =
      (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");
    if (isAlphanumeric) {
      if (pendingHyphen && slug.length > 0) slug += "-";
      slug += ch;
      pendingHyphen = false;
    } else if (SEPS.has(ch)) {
      pendingHyphen = true;
    }
    // Anything else is dropped without affecting hyphen state.
  }
  return slug;
}
/**
 * Converts an amount in cents to a grouped, two-decimal number string,
 * e.g. `123456` -> `"1,234.56"` for "en-US". No currency symbol is added.
 *
 * @param num - Amount in cents. Strings are parsed as base-10 integers.
 * @param locale - BCP 47 locale tag controlling grouping and decimal separators.
 * @returns The formatted amount, or "" when `num` is missing or is not a
 *   finite number (NaN, Infinity, unparsable string).
 */
function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
): string {
  if (num == null) return "";
  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  // Reject NaN and +/-Infinity alike; Intl would otherwise render "∞".
  if (!Number.isFinite(cents)) return "";
  const formatter = new Intl.NumberFormat(locale, {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
    useGrouping: true,
  });
  return formatter.format(cents / 100);
}
/**
 * Type guard for "any non-null object" (arrays included), so that its
 * properties can be read as `unknown` values.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null) return false;
  return typeof value === "object";
}
/** Returns a promise that settles once `ms` milliseconds have elapsed. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/** Error raised for a non-successful HTTP response; carries the status code and the requested URL. */
class HttpError extends Error {
  public readonly status: number;
  public readonly url: string;

  constructor(message: string, status: number, url: string) {
    super(message);
    this.status = status;
    this.url = url;
    this.name = "HttpError";
  }
}
// ----------------------------- HTTP Client -----------------------------
/**
 * Fetches a page as HTML text, retrying transient failures.
 *
 * Retry policy (at most `maxRetries` retries after the first attempt):
 * - HTTP 429: waits `X-RateLimit-Reset` seconds when that header is numeric,
 *   otherwise a linear backoff, then retries.
 * - HTTP 5xx, and errors thrown while fetching or reading the body: linear
 *   backoff, then retry.
 * - Any other non-2xx status fails immediately with an HttpError; it is
 *   never retried.
 *
 * After a successful read the function sleeps `DELAY_MS` so that sequential
 * callers stay throttled.
 *
 * @param url - URL to request.
 * @param DELAY_MS - Pause in milliseconds applied after a successful response.
 * @param opts - Optional tuning: `maxRetries` (default 3), `retryBaseMs`
 *   (default 500), and `onRateInfo`, which receives the raw
 *   `X-RateLimit-Remaining` / `X-RateLimit-Reset` header values of every response.
 * @returns The response body as text.
 * @throws HttpError when the response that ends the attempts has a non-2xx
 *   status (including 429 / 5xx once retries are used up).
 * @throws The underlying error when the final attempt fails without a response.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const canRetry = attempt < maxRetries;
    const backoffMs = (attempt + 1) * retryBaseMs;
    try {
      const res = await fetch(url, {
        method: "GET",
        headers: {
          accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
          "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
          "cache-control": "no-cache",
          "upgrade-insecure-requests": "1",
          "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        },
      });
      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (res.ok) {
        const html = await res.text();
        // Per-request pause that keeps sequential callers at their request rate.
        await delay(DELAY_MS);
        return html;
      }

      if (canRetry && res.status === 429) {
        // NOTE(review): the header is interpreted as a relative number of
        // seconds; confirm the server does not send an epoch timestamp.
        const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
        const waitMs = Number.isFinite(resetSeconds)
          ? Math.max(0, resetSeconds * 1000)
          : backoffMs;
        await delay(waitMs);
        continue;
      }

      if (canRetry && res.status >= 500 && res.status < 600) {
        await delay(backoffMs);
        continue;
      }

      throw new HttpError(
        `Request failed with status ${res.status}`,
        res.status,
        url,
      );
    } catch (err) {
      // An HttpError here is a final verdict on the response (non-retryable
      // status, or retries used up); only transport-level failures are retried.
      if (err instanceof HttpError || !canRetry) throw err;
      await delay(backoffMs);
    }
  }
  // Not reachable in practice: the last attempt always returns or throws.
  throw new Error("Exhausted retries without response");
}
// ----------------------------- Parsing -----------------------------
/**
 * Reads the Apollo cache embedded in a page.
 *
 * Looks up the element with id `__NEXT_DATA__`, parses its text as JSON and
 * returns `props.pageProps.__APOLLO_STATE__` from it.
 *
 * @returns The Apollo state object, or `null` when the element is missing or
 *   empty, the JSON is invalid, or the state is not an object.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const payload =
    parseHTML(htmlString).document.getElementById("__NEXT_DATA__")?.textContent;
  if (!payload) return null;

  try {
    const state = JSON.parse(payload)?.props?.pageProps?.__APOLLO_STATE__;
    if (isRecord(state)) return state;
    return null;
  } catch {
    return null;
  }
}
/**
 * Turns a search-results page into a list of listing links.
 *
 * Every Apollo state entry whose key contains "Listing" and whose value is an
 * object with string `url` and `title` fields yields one result; relative
 * URLs are prefixed with `BASE_URL`.
 *
 * @returns The matches in Apollo-state order; empty when no state is found.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const state = extractApolloState(htmlString);
  if (state === null) return [];

  return Object.entries(state).flatMap(
    ([entityKey, entity]): SearchListing[] => {
      // Heuristic: listing entities carry "Listing" in their cache key.
      if (!entityKey.includes("Listing") || !isRecord(entity)) return [];

      const { url, title } = entity as ApolloSearchItem;
      if (typeof url !== "string" || typeof title !== "string") return [];

      const listingLink = url.startsWith("http") ? url : `${BASE_URL}${url}`;
      return [{ listingLink, name: title }];
    },
  );
}
/**
 * Turns a listing page into a ListingDetails object.
 *
 * The first Apollo state key containing "Listing" is taken as the listing
 * entry. Price amount and view count are converted with `Number`; the price
 * block is included only when the amount can be formatted.
 *
 * @returns The listing, or `null` when the Apollo state, the listing entry,
 *   its URL or its title is missing.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): ListingDetails | null {
  const state = extractApolloState(htmlString);
  if (state === null) return null;

  const rootKey = Object.keys(state).find((key) => key.includes("Listing"));
  if (rootKey === undefined) return null;

  const candidate = state[rootKey];
  if (!isRecord(candidate)) return null;
  const root = candidate as ApolloListingRoot;

  const { url: rawUrl, title } = root;
  let absoluteUrl = "";
  if (typeof rawUrl === "string") {
    absoluteUrl = rawUrl.startsWith("http") ? rawUrl : `${BASE_URL}${rawUrl}`;
  }
  if (absoluteUrl === "" || !title) return null;

  const rawAmount = root.price?.amount;
  const cents = rawAmount != null ? Number(rawAmount) : undefined;
  const amountFormatted = formatCentsToCurrency(cents);

  let listingPrice: ListingDetails["listingPrice"];
  if (amountFormatted) {
    listingPrice = {
      amountFormatted,
      cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
      currency: root.price?.currency,
    };
  }

  const rawViews = root.metrics?.views;
  const views = rawViews != null ? Number(rawViews) : undefined;
  const numberOfViews =
    views !== undefined && Number.isFinite(views) ? views : undefined;

  return {
    url: absoluteUrl,
    title,
    description: root.description,
    listingPrice,
    listingType: root.type,
    listingStatus: root.status,
    creationDate: root.activationDate,
    endDate: root.endDate,
    numberOfViews,
    address: root.location?.address ?? null,
  };
}
// ----------------------------- Main -----------------------------
/**
 * Runs a Kijiji search and scrapes the details of every listing it returns.
 *
 * The search uses the hard-coded `b-gta-greater-toronto-area` path with the
 * slugified query. Listing pages are fetched one after another, throttled to
 * `REQUESTS_PER_SECOND`, while a progress bar is drawn on the terminal. A
 * listing that fails to download is logged and skipped; a listing without a
 * price, or with a price of 0, is left out of the result.
 *
 * @param SEARCH_QUERY - Free-text query; slugified before being placed in the URL.
 * @param REQUESTS_PER_SECOND - Target request rate. Zero, negative or
 *   non-finite values fall back to 1.
 * @param BASE_URL - Site origin used for the search URL and for relative links.
 * @returns The scraped listings, or `undefined` when the search page yields
 *   no listings.
 * @throws Whatever `fetchHtml` throws for the search page itself.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
) {
  // A rate of 0 or NaN would otherwise turn into an Infinity/NaN timer delay.
  const requestsPerSecond =
    Number.isFinite(REQUESTS_PER_SECOND) && REQUESTS_PER_SECOND > 0
      ? REQUESTS_PER_SECOND
      : 1;
  const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));

  // Builds the rate-limit logger shared by the search and item requests.
  const rateInfoLogger =
    (label: string) =>
    (remaining: string | null, reset: string | null): void => {
      if (remaining && reset) {
        console.log(
          "\n" +
            `${label} - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
        );
      }
    };

  const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;
  console.log(`Fetching search: ${searchUrl}`);
  const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
    onRateInfo: rateInfoLogger("Search"),
  });

  const searchResults = parseSearch(searchHtml, BASE_URL);
  if (searchResults.length === 0) {
    console.warn("No search results parsed from page.");
    return;
  }

  // Deduplicate links
  const listingLinks = Array.from(
    new Set(searchResults.map((r) => r.listingLink)),
  );
  console.log(
    "\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
  );

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  let currentProgress = 0;
  progressBar.start(listingLinks.length, currentProgress);

  const items: ListingDetails[] = [];
  try {
    for (const link of listingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          onRateInfo: rateInfoLogger("Item"),
        });
        const parsed = parseListing(html, BASE_URL);
        // Keep only listings that carry a non-zero price.
        if (parsed?.listingPrice?.cents) items.push(parsed);
      } catch (err) {
        if (err instanceof HttpError) {
          console.error(
            "\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
          );
        } else {
          console.error(
            "\n" +
              `Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
          );
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }
  } finally {
    // Always release the bar: stops its redraw timer and restores the terminal.
    progressBar.stop();
  }

  console.log("\n" + `Parsed ${items.length} listings.`);
  return items;
}