398 lines
9.8 KiB
TypeScript
398 lines
9.8 KiB
TypeScript
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||
import { parseHTML } from "linkedom";
|
||
import unidecode from "unidecode";
|
||
import cliProgress from "cli-progress";
|
||
|
||
// const unidecode = require("unidecode");
|
||
|
||
// ----------------------------- Types -----------------------------
|
||
|
||
/** Raw HTML markup of a page; this is what `fetchHtml` resolves to. */
type HTMLString = string;
|
||
|
||
/** One hit extracted from a search results page. */
type SearchListing = {
  /** Listing title as found in the search page state. */
  name: string;
  /** URL of the listing's detail page. */
  listingLink: string;
};
|
||
|
||
/**
 * Shape of the `__APOLLO_STATE__` object embedded in a page: string keys mapped
 * to values whose structure is not known statically.
 */
type ApolloRecord = Record<string, unknown>;
|
||
|
||
/**
 * Loosely-typed view of a listing entry found in a search page's Apollo state.
 *
 * The declared field types are expectations only: the data comes from parsed
 * JSON, so callers must check types at runtime before relying on them.
 */
interface ApolloSearchItem {
  /** Listing URL; may be absolute or relative to the site root. */
  url?: string;
  /** Listing title. */
  title?: string;
  /** Any other properties present on the entity. */
  [k: string]: unknown;
}
|
||
|
||
/**
 * Loosely-typed view of the listing entity found in a listing page's Apollo
 * state, as consumed by `parseListing`.
 *
 * The declared field types are expectations only: the data comes from parsed
 * JSON and is not validated against this interface.
 */
interface ApolloListingRoot {
  /** Listing URL; may be absolute or relative to the site root. */
  url?: string;
  /** Listing title. */
  title?: string;
  /** Free-text description of the listing. */
  description?: string;
  /** Price; `parseListing` interprets `amount` as an amount in cents. */
  price?: { amount?: number | string; currency?: string };
  /** Copied to `ListingDetails.listingType`. */
  type?: string;
  /** Copied to `ListingDetails.listingStatus`. */
  status?: string;
  /** Copied to `ListingDetails.creationDate`. */
  activationDate?: string;
  /** Copied to `ListingDetails.endDate`. */
  endDate?: string;
  /** View counter; numeric strings are converted by `parseListing`. */
  metrics?: { views?: number | string };
  /** Location block; only `address` is read. */
  location?: { address?: string | null };
  /** Any other properties present on the entity. */
  [k: string]: unknown;
}
|
||
|
||
/** Normalized listing data produced by `parseListing`. */
type ListingDetails = {
  /** URL of the listing page. */
  url: string;
  /** Listing title. */
  title: string;
  /** Free-text description, when present in the page state. */
  description?: string;
  /** Price information; omitted when no usable amount was found. */
  listingPrice?: {
    /** Amount formatted with grouping and two fraction digits (no currency symbol). */
    amountFormatted: string;
    /** Amount in cents. */
    cents?: number;
    /** Currency code as reported by the page state. */
    currency?: string;
  };
  /** Source field: `type`. */
  listingType?: string;
  /** Source field: `status`. */
  listingStatus?: string;
  /** Source field: `activationDate`. */
  creationDate?: string;
  /** Source field: `endDate`. */
  endDate?: string;
  /** View counter, when it could be read as a finite number. */
  numberOfViews?: number;
  /** Address from the listing's location block, or `null` when absent. */
  address?: string | null;
};
|
||
|
||
// ----------------------------- Utilities -----------------------------
|
||
|
||
// Characters that slugify treats as word separators (emitted as "-").
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
||
|
||
/**
 * Slugifies a string for use as a search path segment.
 *
 * The input is transliterated with `unidecode` and lower-cased. Characters
 * `a-z` and `0-9` are kept, each run of separator characters (see `SEPS`)
 * becomes a single hyphen, and every other character is dropped. The result
 * never starts or ends with a hyphen.
 *
 * @param input - Arbitrary text, e.g. a user's search query.
 * @returns The slug; an empty string when nothing usable remains.
 */
export function slugify(input: string): string {
  const ascii = unidecode(input).toLowerCase();
  let slug = "";

  for (const ch of ascii) {
    const isAlphanumeric =
      (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");

    if (isAlphanumeric) {
      slug += ch;
    } else if (SEPS.has(ch) && slug !== "" && !slug.endsWith("-")) {
      // Skipping separators while the slug is still empty avoids a leading
      // hyphen; the endsWith check collapses consecutive separators.
      slug += "-";
    }
    // Anything else is dropped.
  }

  // At most one hyphen can be pending at the end; strip it.
  return slug.endsWith("-") ? slug.slice(0, -1) : slug;
}
|
||
|
||
/**
 * Formats an amount given in cents as a grouped decimal string with exactly
 * two fraction digits, e.g. `123456` -> `"1,234.56"` for `en-US`.
 *
 * No currency symbol or code is added; only the number is localized.
 *
 * @param num - Amount in cents. Strings are parsed as base-10 integers.
 * @param locale - BCP 47 locale tag passed to `Intl.NumberFormat`.
 * @returns The formatted amount, or `""` when `num` is missing or does not
 *   yield a finite number.
 */
function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
): string {
  if (num == null) return "";

  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  // isFinite rejects NaN and also +/-Infinity, which would otherwise be
  // rendered as "∞" instead of being treated as "no price".
  if (!Number.isFinite(cents)) return "";

  return new Intl.NumberFormat(locale, {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
    useGrouping: true,
  }).format(cents / 100);
}
|
||
|
||
/**
 * Type guard for plain key/value objects.
 *
 * @param value - Any value, typically the result of `JSON.parse`.
 * @returns `true` for non-null objects that are not arrays. Arrays are
 *   excluded because they do not behave like `Record<string, unknown>`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
||
|
||
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * `setTimeout` silently replaces delays that are negative, NaN or larger than
 * 2^31 - 1 ms with about 1 ms, so a very long wait would fire almost
 * immediately. The value is therefore clamped into the supported range first:
 * negative and NaN become 0, anything too large becomes the maximum.
 *
 * @param ms - Requested wait in milliseconds.
 */
async function delay(ms: number): Promise<void> {
  const MAX_TIMEOUT_MS = 2 ** 31 - 1;
  // Math.max/Math.min propagate NaN, so handle it explicitly.
  const waitMs = Number.isNaN(ms)
    ? 0
    : Math.min(Math.max(0, ms), MAX_TIMEOUT_MS);
  await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
}
|
||
|
||
/**
 * Error raised when an HTTP request completes with a non-success status.
 */
class HttpError extends Error {
  /**
   * @param message - Human-readable description of the failure.
   * @param status - HTTP status code of the response.
   * @param url - URL that was requested.
   */
  constructor(
    message: string,
    public readonly status: number,
    public readonly url: string,
  ) {
    super(message);
    // Restore the prototype chain so `instanceof HttpError` keeps working when
    // the code is compiled to ES5, where subclassing Error loses it.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "HttpError";
  }
}
|
||
|
||
// ----------------------------- HTTP Client -----------------------------
|
||
|
||
/**
 * Fetches a page's HTML with a basic retry strategy and a fixed pause after
 * each successful request.
 *
 * - 429, 5xx and thrown (e.g. network) errors are retried, up to `maxRetries`
 *   additional attempts.
 * - Any other non-success status fails immediately with an `HttpError`; it is
 *   not retried.
 * - On 429 the wait is `X-RateLimit-Reset` seconds when that header is
 *   numeric, otherwise a linear backoff of `(attempt + 1) * retryBaseMs`.
 *
 * @param url - Page to request.
 * @param DELAY_MS - Pause applied after a successful response, used by the
 *   caller to space out consecutive requests.
 * @param opts - Optional tuning: `maxRetries` (default 3), `retryBaseMs`
 *   (default 500) and `onRateInfo`, which is called for every response with
 *   the raw `X-RateLimit-Remaining` / `X-RateLimit-Reset` header values.
 * @returns The response body as text.
 * @throws HttpError when the final response has a non-success status.
 * @throws The underlying error when the last attempt fails before a response
 *   is obtained.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 500;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries;
    const backoffMs = (attempt + 1) * retryBaseMs;

    try {
      const res = await fetch(url, {
        method: "GET",
        headers: {
          accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
          "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
          "cache-control": "no-cache",
          "upgrade-insecure-requests": "1",
          "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        },
      });

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (res.ok) {
        const html = await res.text();
        // Pause after each successful request so the caller stays at or under
        // its configured request rate.
        await delay(DELAY_MS);
        return html;
      }

      const isRateLimited = res.status === 429;
      const isServerError = res.status >= 500 && res.status < 600;
      if (!(isRateLimited || isServerError) || isLastAttempt) {
        // Non-retryable status, or nothing left to retry with: report the
        // real HTTP status instead of waiting once more.
        throw new HttpError(
          `Request failed with status ${res.status}`,
          res.status,
          url,
        );
      }

      if (isRateLimited) {
        // Respect the server-provided reset interval when it is usable.
        const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
        await delay(
          Number.isFinite(resetSeconds)
            ? Math.max(0, resetSeconds * 1000)
            : backoffMs,
        );
      } else {
        await delay(backoffMs);
      }
    } catch (err) {
      // An HttpError is a deliberate final verdict from the block above; it
      // must not be swallowed and retried like a transient network failure.
      if (err instanceof HttpError || isLastAttempt) throw err;
      await delay(backoffMs);
    }
  }

  // Only reachable when the loop never runs (negative maxRetries).
  throw new Error("Exhausted retries without response");
}
|
||
|
||
// ----------------------------- Parsing -----------------------------
|
||
|
||
/**
 * Extracts `props.pageProps.__APOLLO_STATE__` from the JSON stored in a page's
 * `__NEXT_DATA__` element.
 *
 * @param htmlString - Full HTML of the page.
 * @returns The Apollo state object, or `null` when the element is missing or
 *   empty, its content is not valid JSON, or the expected path does not lead
 *   to an object.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const { document } = parseHTML(htmlString);
  const payload = document.getElementById("__NEXT_DATA__")?.textContent;
  if (!payload) return null;

  let node: unknown;
  try {
    node = JSON.parse(payload);
  } catch {
    return null;
  }

  // Walk the path one step at a time so the untrusted JSON is narrowed from
  // `unknown` rather than being accessed as `any`.
  for (const key of ["props", "pageProps", "__APOLLO_STATE__"]) {
    if (!isRecord(node)) return null;
    node = node[key];
  }
  return isRecord(node) ? node : null;
}
|
||
|
||
/**
 * Parses a search results page into a list of listing links.
 *
 * Every Apollo state entry whose key contains "Listing" and whose value has a
 * non-empty string `url` and a string `title` becomes one result. Relative
 * URLs are resolved against `BASE_URL`.
 *
 * @param htmlString - HTML of the search results page.
 * @param BASE_URL - Site origin used to build absolute links.
 * @returns The listings found; empty when the page has no usable state.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return [];

  // Normalized once so joining never produces a missing or doubled slash.
  const base = BASE_URL.replace(/\/+$/, "");

  const results: SearchListing[] = [];
  for (const [key, value] of Object.entries(apolloState)) {
    // Heuristic: listing entity keys contain "Listing".
    if (!key.includes("Listing") || !isRecord(value)) continue;

    const { url, title } = value;
    if (typeof url !== "string" || typeof title !== "string") continue;
    // An empty url would resolve to the site root, which is not a listing.
    if (url === "") continue;

    results.push({
      listingLink: url.startsWith("http")
        ? url
        : `${base}/${url.replace(/^\/+/, "")}`,
      name: title,
    });
  }
  return results;
}
|
||
|
||
/**
 * Parses a listing page into a `ListingDetails` object.
 *
 * The listing entity is the first Apollo state entry whose key contains
 * "Listing" and whose value carries a non-empty string `url` and `title`;
 * entries that match the key heuristic but lack those fields are skipped.
 *
 * @param htmlString - HTML of the listing page.
 * @param BASE_URL - Site origin used to resolve a relative listing URL.
 * @returns The parsed listing, or `null` when the page has no usable state or
 *   no entry qualifies as a listing.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): ListingDetails | null {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  let root: ApolloListingRoot | undefined;
  let url = "";
  let title = "";
  for (const [key, value] of Object.entries(apolloState)) {
    if (!key.includes("Listing") || !isRecord(value)) continue;
    const { url: candidateUrl, title: candidateTitle } = value;
    if (typeof candidateUrl !== "string" || candidateUrl === "") continue;
    if (typeof candidateTitle !== "string" || candidateTitle === "") continue;
    root = value as ApolloListingRoot;
    url = candidateUrl;
    title = candidateTitle;
    break;
  }
  if (!root) return null;

  // Converts a JSON number or numeric string to a finite number. Blank
  // strings are rejected explicitly because Number("") is 0.
  const toFiniteNumber = (raw: unknown): number | undefined => {
    if (typeof raw === "string") {
      if (raw.trim() === "") return undefined;
    } else if (typeof raw !== "number") {
      return undefined;
    }
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  const {
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
  } = root;

  const cents = toFiniteNumber(price?.amount);
  // Empty when there is no usable amount, which suppresses listingPrice below.
  const amountFormatted = formatCentsToCurrency(cents);

  const listingUrl = url.startsWith("http")
    ? url
    : `${BASE_URL.replace(/\/+$/, "")}/${url.replace(/^\/+/, "")}`;

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: amountFormatted
      ? { amountFormatted, cents, currency: price?.currency }
      : undefined,
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews: toFiniteNumber(metrics?.views),
    address: location?.address ?? null,
  };
}
|
||
|
||
// ----------------------------- Main -----------------------------
|
||
|
||
/**
 * Searches Kijiji (GTA region) for a query and fetches the details of every
 * listing found on the first results page.
 *
 * Listings are fetched sequentially so the configured request rate is
 * respected. A listing that fails to download is logged and skipped. Only
 * listings with a non-zero price are returned.
 *
 * @param SEARCH_QUERY - Free-text query; it is slugified into the search URL.
 * @param REQUESTS_PER_SECOND - Upper bound on the request rate. Values that
 *   are not finite and positive fall back to 1.
 * @param BASE_URL - Site origin.
 * @returns The parsed listings, or `undefined` when the search page yielded
 *   no results.
 * @throws When the search page itself cannot be fetched.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
): Promise<ListingDetails[] | undefined> {
  // 0, negative or NaN rates would produce an Infinity/NaN delay that timers
  // cannot honour, so fall back to the default rate instead.
  const requestsPerSecond =
    Number.isFinite(REQUESTS_PER_SECOND) && REQUESTS_PER_SECOND > 0
      ? REQUESTS_PER_SECOND
      : 1;
  const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));

  // Builds the rate-limit logger shared by the search and item requests.
  const logRateInfo =
    (label: string) =>
    (remaining: string | null, reset: string | null): void => {
      if (remaining && reset) {
        console.log(
          "\n" +
            `${label} - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
        );
      }
    };

  const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;

  console.log(`Fetching search: ${searchUrl}`);
  const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
    onRateInfo: logRateInfo("Search"),
  });

  const searchResults = parseSearch(searchHtml, BASE_URL);
  if (searchResults.length === 0) {
    console.warn("No search results parsed from page.");
    return;
  }

  // Deduplicate links
  const listingLinks = Array.from(
    new Set(searchResults.map((r) => r.listingLink)),
  );

  console.log(
    "\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
  );

  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );
  let currentProgress = 0;
  progressBar.start(listingLinks.length, currentProgress);

  const items: ListingDetails[] = [];
  try {
    for (const link of listingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          onRateInfo: logRateInfo("Item"),
        });
        const parsed = parseListing(html, BASE_URL);
        // Listings without a price, or priced at 0, are intentionally skipped.
        if (parsed?.listingPrice?.cents) items.push(parsed);
      } catch (err) {
        if (err instanceof HttpError) {
          console.error(
            "\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
          );
        } else {
          console.error(
            "\n" +
              `Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
          );
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }
  } finally {
    // Always release the bar so the terminal is restored.
    progressBar.stop();
  }

  console.log("\n" + `Parsed ${items.length} listings.`);
  return items;
}
|