feat: extract Kijiji scraping logic into reusable function

This commit extracts the Kijiji scraping functionality into a reusable function, `fetchKijijiItems`, which makes it easier to integrate into other parts of the application and improves code modularity. The function accepts a search query, a requests-per-second rate (default 1), and a base URL (default `https://www.kijiji.ca`) as parameters, enabling customizable scraping.
This commit is contained in:
2025-09-17 22:03:24 -04:00
parent b7a61423c3
commit 46a8ac92cf
2 changed files with 32 additions and 17 deletions

View File

@@ -51,11 +51,6 @@ type ListingDetails = {
// ----------------------------- Config -----------------------------
const REQUESTS_PER_SECOND = 1;
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
const BASE_URL = "https://www.kijiji.ca";
const SEARCH_QUERY = "playstation 5";
// ----------------------------- Utilities -----------------------------
/**
@@ -105,6 +100,7 @@ class HttpError extends Error {
*/
async function fetchHtml(
url: string,
DELAY_MS: number,
opts?: {
maxRetries?: number;
retryBaseMs?: number;
@@ -191,7 +187,10 @@ function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
Parse search page apollo state into SearchListing[].
Filters keys likely to be listing entities and ensures url/title exist.
*/
function parseSearch(htmlString: HTMLString): SearchListing[] {
function parseSearch(
htmlString: HTMLString,
BASE_URL: string,
): SearchListing[] {
const apolloState = extractApolloState(htmlString);
if (!apolloState) return [];
@@ -217,7 +216,10 @@ function parseSearch(htmlString: HTMLString): SearchListing[] {
/**
Parse a listing page into a typed object.
*/
function parseListing(htmlString: HTMLString): ListingDetails | null {
function parseListing(
htmlString: HTMLString,
BASE_URL: string,
): ListingDetails | null {
const apolloState = extractApolloState(htmlString);
if (!apolloState) return null;
@@ -280,11 +282,17 @@ function parseListing(htmlString: HTMLString): ListingDetails | null {
// ----------------------------- Main -----------------------------
async function main() {
export default async function fetchKijijiItems(
SEARCH_QUERY: string,
REQUESTS_PER_SECOND = 1,
BASE_URL = "https://www.kijiji.ca",
) {
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
const searchUrl = `${BASE_URL}/b-canada/${encodeURIComponent(SEARCH_QUERY)}/k0l0?dc=true&view=list`;
console.log(`Fetching search: ${searchUrl}`);
const searchHtml = await fetchHtml(searchUrl, {
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
@@ -294,7 +302,7 @@ async function main() {
},
});
const searchResults = parseSearch(searchHtml);
const searchResults = parseSearch(searchHtml, BASE_URL);
if (searchResults.length === 0) {
console.warn("No search results parsed from page.");
return;
@@ -312,7 +320,7 @@ async function main() {
const items: ListingDetails[] = [];
for (const link of listingLinks) {
try {
const html = await fetchHtml(link, {
const html = await fetchHtml(link, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
@@ -321,7 +329,7 @@ async function main() {
}
},
});
const parsed = parseListing(html);
const parsed = parseListing(html, BASE_URL);
if (parsed) items.push(parsed);
} catch (err) {
if (err instanceof HttpError) {
@@ -335,10 +343,11 @@ async function main() {
}
console.log(`Parsed ${items.length} listings.`);
console.log(items);
return items;
// console.log(items);
}
void main().catch((err) => {
console.error("Fatal error:", err);
process.exitCode = 1;
});
// void main().catch((err) => {
// console.error("Fatal error:", err);
// process.exitCode = 1;
// });