refactor: improve Kijiji scraping architecture
Introduces types, utilities, and a dedicated HTTP client for improved structure and maintainability. Includes rate limit handling, retry logic, and error handling.
This commit is contained in:
422
src/kijiji.ts
422
src/kijiji.ts
@@ -1,159 +1,234 @@
|
|||||||
|
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
|
|
||||||
|
// ----------------------------- Types -----------------------------
|
||||||
|
|
||||||
|
type HTMLString = string;
|
||||||
|
|
||||||
type SearchListing = {
|
type SearchListing = {
|
||||||
name: string;
|
name: string;
|
||||||
listingLink: string;
|
listingLink: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
interface ApolloSearchState {
|
type ApolloRecord = Record<string, unknown>;
|
||||||
[key: string]: {
|
|
||||||
[key: string]: unknown;
|
interface ApolloSearchItem {
|
||||||
|
url?: string;
|
||||||
|
title?: string;
|
||||||
|
[k: string]: unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ApolloListingRoot {
|
||||||
|
url?: string;
|
||||||
|
title?: string;
|
||||||
|
description?: string;
|
||||||
|
price?: { amount?: number | string; currency?: string };
|
||||||
|
type?: string;
|
||||||
|
status?: string;
|
||||||
|
activationDate?: string;
|
||||||
|
endDate?: string;
|
||||||
|
metrics?: { views?: number | string };
|
||||||
|
location?: { address?: string | null };
|
||||||
|
[k: string]: unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
type ListingDetails = {
|
||||||
url: string;
|
url: string;
|
||||||
title: string;
|
title: string;
|
||||||
|
description?: string;
|
||||||
|
listingPrice?: {
|
||||||
|
amountFormatted: string;
|
||||||
|
cents?: number;
|
||||||
|
currency?: string;
|
||||||
};
|
};
|
||||||
}
|
listingType?: string;
|
||||||
|
listingStatus?: string;
|
||||||
|
creationDate?: string;
|
||||||
|
endDate?: string;
|
||||||
|
numberOfViews?: number;
|
||||||
|
address?: string | null;
|
||||||
|
};
|
||||||
|
|
||||||
interface ApolloListingState {
|
// ----------------------------- Config -----------------------------
|
||||||
[key: string]: any;
|
|
||||||
}
|
|
||||||
|
|
||||||
function formatCentsToCurrency(num: number | string, locale = "en-US") {
|
const REQUESTS_PER_SECOND = 1;
|
||||||
if (typeof num === "string") num = parseInt(num);
|
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
||||||
const numberInDollars = num / 100;
|
const BASE_URL = "https://www.kijiji.ca";
|
||||||
|
const SEARCH_QUERY = "playstation 5";
|
||||||
|
|
||||||
|
// ----------------------------- Utilities -----------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
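 * Converts an amount in cents to a locale-formatted decimal string with two fraction digits (e.g. 12345 -> "123.45"); no currency symbol is added. Returns "" when the input is null/undefined or parses to NaN.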
|
||||||
|
*/
|
||||||
|
function formatCentsToCurrency(
|
||||||
|
num: number | string | undefined,
|
||||||
|
locale = "en-US",
|
||||||
|
): string {
|
||||||
|
if (num == null) return "";
|
||||||
|
const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
|
||||||
|
if (Number.isNaN(cents)) return "";
|
||||||
|
const dollars = cents / 100;
|
||||||
const formatter = new Intl.NumberFormat(locale, {
|
const formatter = new Intl.NumberFormat(locale, {
|
||||||
minimumFractionDigits: 2,
|
minimumFractionDigits: 2,
|
||||||
maximumFractionDigits: 2,
|
maximumFractionDigits: 2,
|
||||||
useGrouping: true,
|
useGrouping: true,
|
||||||
});
|
});
|
||||||
|
return formatter.format(dollars);
|
||||||
return formatter.format(numberInDollars);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const searchQuery = "playstation 5";
|
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||||
const REQUESTS_PER_SECOND = 1;
|
return typeof value === "object" && value !== null;
|
||||||
const DELAY_MS = 1000 / REQUESTS_PER_SECOND;
|
}
|
||||||
|
|
||||||
// const exampleSearchHTML = Bun.file("./example-kijiji-search.html");
|
async function delay(ms: number): Promise<void> {
|
||||||
// const exampleSearchHTMLData = await exampleSearchHTML.text();
|
await new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
function parseSearch(htmlString: string) {
|
class HttpError extends Error {
|
||||||
const { document } = parseHTML(htmlString);
|
constructor(
|
||||||
const nextData = document.getElementById("__NEXT_DATA__");
|
message: string,
|
||||||
|
public readonly status: number,
|
||||||
if (!nextData) {
|
public readonly url: string,
|
||||||
console.error("Could not find __NEXT_DATA__ script element.");
|
) {
|
||||||
return [];
|
super(message);
|
||||||
|
this.name = "HttpError";
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------- HTTP Client -----------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
|
||||||
|
- Retries on 429 and 5xx
|
||||||
|
- On a 429, waits for X-RateLimit-Reset (interpreted as seconds) when the header is present; otherwise backs off by (attempt + 1) * retryBaseMs
|
||||||
|
*/
|
||||||
|
async function fetchHtml(
|
||||||
|
url: string,
|
||||||
|
opts?: {
|
||||||
|
maxRetries?: number;
|
||||||
|
retryBaseMs?: number;
|
||||||
|
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
||||||
|
},
|
||||||
|
): Promise<HTMLString> {
|
||||||
|
const maxRetries = opts?.maxRetries ?? 3;
|
||||||
|
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
||||||
try {
|
try {
|
||||||
if (!nextData.textContent) {
|
const res = await fetch(url, {
|
||||||
console.error("__NEXT_DATA__ element is empty!");
|
method: "GET",
|
||||||
return [];
|
|
||||||
}
|
|
||||||
const jsonData = JSON.parse(nextData.textContent);
|
|
||||||
const apolloState: ApolloSearchState =
|
|
||||||
jsonData.props.pageProps.__APOLLO_STATE__;
|
|
||||||
|
|
||||||
const listingsKeys: string[] = [];
|
|
||||||
|
|
||||||
for (const key in apolloState) {
|
|
||||||
if (key.includes("Listing")) {
|
|
||||||
listingsKeys.push(key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const searchListings: SearchListing[] = listingsKeys.map((key) => {
|
|
||||||
const listing = apolloState[key];
|
|
||||||
return {
|
|
||||||
listingLink: listing!.url,
|
|
||||||
name: listing!.title,
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
// console.log(searchListings);
|
|
||||||
|
|
||||||
return searchListings;
|
|
||||||
} catch (error) {
|
|
||||||
console.error("Error parsing __NEXT_DATA__:", error);
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const makeKijijiRequest = async <T>(url: string): Promise<T> => {
|
|
||||||
console.log(`Making a request at ${new Date()}`);
|
|
||||||
const response = await fetch(url, {
|
|
||||||
headers: {
|
headers: {
|
||||||
accept:
|
accept:
|
||||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||||
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
|
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
|
||||||
"cache-control": "max-age=0",
|
"cache-control": "no-cache",
|
||||||
priority: "u=0, i",
|
|
||||||
"sec-ch-ua": '"Not)A;Brand";v="8", "Chromium";v="138"',
|
|
||||||
"sec-ch-ua-mobile": "?0",
|
|
||||||
"sec-ch-ua-platform": '"Linux"',
|
|
||||||
"sec-fetch-dest": "document",
|
|
||||||
"sec-fetch-mode": "navigate",
|
|
||||||
"sec-fetch-site": "same-origin",
|
|
||||||
"sec-fetch-user": "?1",
|
|
||||||
"upgrade-insecure-requests": "1",
|
"upgrade-insecure-requests": "1",
|
||||||
|
"user-agent":
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
|
||||||
},
|
},
|
||||||
body: null,
|
|
||||||
method: "GET",
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const rateLimitRemaining = response.headers.get("X-RateLimit-Remaining");
|
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
|
||||||
const rateLimitReset = response.headers.get("X-RateLimit-Reset");
|
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
|
||||||
|
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
|
||||||
|
|
||||||
if (rateLimitRemaining !== null && rateLimitReset !== null) {
|
if (!res.ok) {
|
||||||
console.log(
|
// Respect 429 reset if provided
|
||||||
`Rate limit remaining: ${rateLimitRemaining}, Reset in: ${rateLimitReset} seconds`,
|
if (res.status === 429) {
|
||||||
|
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
|
||||||
|
const waitMs = Number.isFinite(resetSeconds)
|
||||||
|
? Math.max(0, resetSeconds * 1000)
|
||||||
|
: (attempt + 1) * retryBaseMs;
|
||||||
|
await delay(waitMs);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Retry on 5xx
|
||||||
|
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
|
||||||
|
await delay((attempt + 1) * retryBaseMs);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
throw new HttpError(
|
||||||
|
`Request failed with status ${res.status}`,
|
||||||
|
res.status,
|
||||||
|
url,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const data: T = (await response.text()) as T;
|
const html = await res.text();
|
||||||
return data;
|
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
||||||
};
|
await delay(DELAY_MS);
|
||||||
|
return html;
|
||||||
|
} catch (err) {
|
||||||
|
if (attempt >= maxRetries) throw err;
|
||||||
|
await delay((attempt + 1) * retryBaseMs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function delay(ms: number): Promise<void> {
|
throw new Error("Exhausted retries without response");
|
||||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// const exampleListing = await Bun.file("./examples/apollo_listing.json").json();
|
// ----------------------------- Parsing -----------------------------
|
||||||
// const exampleListingApolloState =
|
|
||||||
// exampleListing.props.pageProps.__APOLLO_STATE__;
|
|
||||||
|
|
||||||
const parseListing = (htmlString: string) => {
|
/**
|
||||||
|
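Safely extracts props.pageProps.__APOLLO_STATE__ from the __NEXT_DATA__ JSON embedded in a Kijiji page's HTML; returns null if the element is missing or empty, the JSON cannot be parsed, or the value is not an object.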
|
||||||
|
*/
|
||||||
|
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
|
||||||
const { document } = parseHTML(htmlString);
|
const { document } = parseHTML(htmlString);
|
||||||
const nextData = document.getElementById("__NEXT_DATA__");
|
const nextData = document.getElementById("__NEXT_DATA__");
|
||||||
|
if (!nextData || !nextData.textContent) return null;
|
||||||
|
|
||||||
if (!nextData) {
|
try {
|
||||||
console.error("Could not find __NEXT_DATA__ script element.");
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nextData.textContent) {
|
|
||||||
console.error("__NEXT_DATA__ element is empty!");
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
const jsonData = JSON.parse(nextData.textContent);
|
const jsonData = JSON.parse(nextData.textContent);
|
||||||
const apolloState: ApolloListingState =
|
const apollo = jsonData?.props?.pageProps?.__APOLLO_STATE__;
|
||||||
jsonData.props.pageProps.__APOLLO_STATE__;
|
return isRecord(apollo) ? apollo : null;
|
||||||
|
} catch {
|
||||||
const getListingId = (apolloState: { [key: string]: any }):
|
return null;
|
||||||
| string
|
|
||||||
| undefined => {
|
|
||||||
const apolloStateKeys = Object.keys(apolloState);
|
|
||||||
const key = apolloStateKeys.find((key) => key.includes("Listing"));
|
|
||||||
if (!key) return undefined;
|
|
||||||
return key;
|
|
||||||
};
|
|
||||||
|
|
||||||
const listingKey = getListingId(apolloState);
|
|
||||||
|
|
||||||
if (!listingKey) {
|
|
||||||
throw new Error("No listing key found in listing apolloState!");
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Parse search page apollo state into SearchListing[].
|
||||||
|
Filters keys likely to be listing entities and ensures url/title exist.
|
||||||
|
*/
|
||||||
|
function parseSearch(htmlString: HTMLString): SearchListing[] {
|
||||||
|
const apolloState = extractApolloState(htmlString);
|
||||||
|
if (!apolloState) return [];
|
||||||
|
|
||||||
|
const results: SearchListing[] = [];
|
||||||
|
for (const [key, value] of Object.entries(apolloState)) {
|
||||||
|
// Heuristic: Kijiji listing keys usually contain "Listing"
|
||||||
|
if (!key.includes("Listing")) continue;
|
||||||
|
if (!isRecord(value)) continue;
|
||||||
|
|
||||||
|
const item = value as ApolloSearchItem;
|
||||||
|
if (typeof item.url === "string" && typeof item.title === "string") {
|
||||||
|
results.push({
|
||||||
|
listingLink: item.url.startsWith("http")
|
||||||
|
? item.url
|
||||||
|
: `${BASE_URL}${item.url}`,
|
||||||
|
name: item.title,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Parse a listing page into a typed object.
|
||||||
|
*/
|
||||||
|
function parseListing(htmlString: HTMLString): ListingDetails | null {
|
||||||
|
const apolloState = extractApolloState(htmlString);
|
||||||
|
if (!apolloState) return null;
|
||||||
|
|
||||||
|
// Find the listing root key
|
||||||
|
const listingKey = Object.keys(apolloState).find((k) =>
|
||||||
|
k.includes("Listing"),
|
||||||
|
);
|
||||||
|
if (!listingKey) return null;
|
||||||
|
|
||||||
|
const root = apolloState[listingKey];
|
||||||
|
if (!isRecord(root)) return null;
|
||||||
|
|
||||||
const {
|
const {
|
||||||
url,
|
url,
|
||||||
@@ -165,66 +240,105 @@ const parseListing = (htmlString: string) => {
|
|||||||
activationDate,
|
activationDate,
|
||||||
endDate,
|
endDate,
|
||||||
metrics,
|
metrics,
|
||||||
// attributes,
|
|
||||||
location,
|
location,
|
||||||
} = apolloState[listingKey];
|
} = root as ApolloListingRoot;
|
||||||
|
|
||||||
const listingObject = {
|
const cents = price?.amount != null ? Number(price.amount) : undefined;
|
||||||
url,
|
const amountFormatted = formatCentsToCurrency(cents);
|
||||||
|
|
||||||
|
const numberOfViews =
|
||||||
|
metrics?.views != null ? Number(metrics.views) : undefined;
|
||||||
|
|
||||||
|
const listingUrl =
|
||||||
|
typeof url === "string"
|
||||||
|
? url.startsWith("http")
|
||||||
|
? url
|
||||||
|
: `${BASE_URL}${url}`
|
||||||
|
: "";
|
||||||
|
|
||||||
|
if (!listingUrl || !title) return null;
|
||||||
|
|
||||||
|
return {
|
||||||
|
url: listingUrl,
|
||||||
title,
|
title,
|
||||||
description,
|
description,
|
||||||
listingPrice: {
|
listingPrice: amountFormatted
|
||||||
amount: formatCentsToCurrency(price.amount),
|
? {
|
||||||
currency: price.currency,
|
amountFormatted,
|
||||||
},
|
cents: Number.isFinite(cents!) ? cents : undefined,
|
||||||
|
currency: price?.currency,
|
||||||
|
}
|
||||||
|
: undefined,
|
||||||
listingType: type,
|
listingType: type,
|
||||||
listingStatus: status,
|
listingStatus: status,
|
||||||
creationDate: activationDate,
|
creationDate: activationDate,
|
||||||
endDate,
|
endDate,
|
||||||
numberOfViews: metrics.views,
|
numberOfViews: Number.isFinite(numberOfViews!) ? numberOfViews : undefined,
|
||||||
// condition: attributes.all.find(
|
address: location?.address ?? null,
|
||||||
// (attr: { [key: string]: unknown }) => attr.canonicalName === "condition",
|
|
||||||
// ).canonicalValues[0],
|
|
||||||
address: location.address,
|
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
return listingObject;
|
// ----------------------------- Main -----------------------------
|
||||||
};
|
|
||||||
|
|
||||||
const searchHtml: string = await makeKijijiRequest(
|
async function main() {
|
||||||
`https://www.kijiji.ca/b-canada/${searchQuery}/k0l0?dc=true&view=list`,
|
const searchUrl = `${BASE_URL}/b-canada/${encodeURIComponent(SEARCH_QUERY)}/k0l0?dc=true&view=list`;
|
||||||
);
|
|
||||||
|
|
||||||
const searchResults = parseSearch(searchHtml);
|
console.log(`Fetching search: ${searchUrl}`);
|
||||||
|
const searchHtml = await fetchHtml(searchUrl, {
|
||||||
|
onRateInfo: (remaining, reset) => {
|
||||||
|
if (remaining && reset) {
|
||||||
|
console.log(
|
||||||
|
`Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
// if (searchResults.length === 0) {
|
const searchResults = parseSearch(searchHtml);
|
||||||
// throw new Error("Search didn't return an HTML!")
|
if (searchResults.length === 0) {
|
||||||
// }
|
console.warn("No search results parsed from page.");
|
||||||
// console.log(searchResults);
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const fetchAllWithRateLimit = async (links: string[]) => {
|
// Deduplicate links
|
||||||
const results: string[] = [];
|
const listingLinks = Array.from(
|
||||||
for (const link of links) {
|
new Set(searchResults.map((r) => r.listingLink)),
|
||||||
|
);
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`Found ${listingLinks.length} listing links. Fetching details...`,
|
||||||
|
);
|
||||||
|
|
||||||
|
const items: ListingDetails[] = [];
|
||||||
|
for (const link of listingLinks) {
|
||||||
try {
|
try {
|
||||||
const data: string = await makeKijijiRequest(link);
|
const html = await fetchHtml(link, {
|
||||||
// console.log(data);
|
onRateInfo: (remaining, reset) => {
|
||||||
results.push(data);
|
if (remaining && reset) {
|
||||||
} catch (error) {
|
console.log(
|
||||||
console.error(`Failed to fetch data from ${link}:`, error);
|
`Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
const parsed = parseListing(html);
|
||||||
|
if (parsed) items.push(parsed);
|
||||||
|
} catch (err) {
|
||||||
|
if (err instanceof HttpError) {
|
||||||
|
console.error(`Failed to fetch ${link} - ${err.status} ${err.message}`);
|
||||||
|
} else {
|
||||||
|
console.error(
|
||||||
|
`Failed to fetch ${link} - ${String((err as Error)?.message || err)}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
await delay(DELAY_MS);
|
|
||||||
}
|
}
|
||||||
return results;
|
|
||||||
};
|
|
||||||
|
|
||||||
const listingsLinks: string[] = searchResults.map((item) => {
|
console.log(`Parsed ${items.length} listings.`);
|
||||||
return item.listingLink;
|
console.log(items);
|
||||||
|
}
|
||||||
|
|
||||||
|
void main().catch((err) => {
|
||||||
|
console.error("Fatal error:", err);
|
||||||
|
process.exitCode = 1;
|
||||||
});
|
});
|
||||||
|
|
||||||
// console.log(listingsLinks);
|
|
||||||
|
|
||||||
const fetchResults = await fetchAllWithRateLimit(listingsLinks);
|
|
||||||
|
|
||||||
const itemsData = fetchResults.map((itemHtml) => parseListing(itemHtml));
|
|
||||||
|
|
||||||
console.log(itemsData);
|
|
||||||
|
|||||||
Reference in New Issue
Block a user