feat: update kijiji scraper
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
18
src/index.ts
18
src/index.ts
@@ -26,8 +26,12 @@ const server = Bun.serve({
|
|||||||
{ status: 400 },
|
{ status: 400 },
|
||||||
);
|
);
|
||||||
|
|
||||||
const items = await fetchKijijiItems(SEARCH_QUERY, 5);
|
const items = await fetchKijijiItems(SEARCH_QUERY, 1, undefined, {}, {
|
||||||
if (!items)
|
includeImages: true,
|
||||||
|
sellerDataDepth: 'detailed',
|
||||||
|
includeClientSideData: false,
|
||||||
|
});
|
||||||
|
if (!items || items.length === 0)
|
||||||
return Response.json(
|
return Response.json(
|
||||||
{ message: "Search didn't return any results!" },
|
{ message: "Search didn't return any results!" },
|
||||||
{ status: 404 },
|
{ status: 404 },
|
||||||
@@ -85,11 +89,13 @@ const server = Bun.serve({
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Parse optional parameters with defaults
|
// Parse optional parameters with defaults
|
||||||
const minPrice = reqUrl.searchParams.get("minPrice")
|
const minPriceParam = reqUrl.searchParams.get("minPrice");
|
||||||
? parseInt(reqUrl.searchParams.get("minPrice")!)
|
const minPrice = minPriceParam
|
||||||
|
? Number.parseInt(minPriceParam, 10)
|
||||||
: undefined;
|
: undefined;
|
||||||
const maxPrice = reqUrl.searchParams.get("maxPrice")
|
const maxPriceParam = reqUrl.searchParams.get("maxPrice");
|
||||||
? parseInt(reqUrl.searchParams.get("maxPrice")!)
|
const maxPrice = maxPriceParam
|
||||||
|
? Number.parseInt(maxPriceParam, 10)
|
||||||
: undefined;
|
: undefined;
|
||||||
const strictMode = reqUrl.searchParams.get("strictMode") === "true";
|
const strictMode = reqUrl.searchParams.get("strictMode") === "true";
|
||||||
const exclusionsParam = reqUrl.searchParams.get("exclusions");
|
const exclusionsParam = reqUrl.searchParams.get("exclusions");
|
||||||
|
|||||||
668
src/kijiji.ts
668
src/kijiji.ts
@@ -26,16 +26,29 @@ interface ApolloListingRoot {
|
|||||||
url?: string;
|
url?: string;
|
||||||
title?: string;
|
title?: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
price?: { amount?: number | string; currency?: string };
|
price?: { amount?: number | string; currency?: string; type?: string };
|
||||||
type?: string;
|
type?: string;
|
||||||
status?: string;
|
status?: string;
|
||||||
activationDate?: string;
|
activationDate?: string;
|
||||||
endDate?: string;
|
endDate?: string;
|
||||||
metrics?: { views?: number | string };
|
metrics?: { views?: number | string };
|
||||||
location?: { address?: string | null };
|
location?: {
|
||||||
|
address?: string | null;
|
||||||
|
id?: number;
|
||||||
|
name?: string;
|
||||||
|
coordinates?: { latitude: number; longitude: number };
|
||||||
|
};
|
||||||
|
imageUrls?: string[];
|
||||||
|
imageCount?: number;
|
||||||
|
categoryId?: number;
|
||||||
|
adSource?: string;
|
||||||
|
flags?: { topAd?: boolean; priceDrop?: boolean };
|
||||||
|
posterInfo?: { posterId?: string; rating?: number };
|
||||||
|
attributes?: Array<{ canonicalName?: string; canonicalValues?: string[] }>;
|
||||||
[k: string]: unknown;
|
[k: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Keep existing interface for backward compatibility
|
||||||
type ListingDetails = {
|
type ListingDetails = {
|
||||||
url: string;
|
url: string;
|
||||||
title: string;
|
title: string;
|
||||||
@@ -53,10 +66,178 @@ type ListingDetails = {
|
|||||||
address?: string | null;
|
address?: string | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Detailed listing record: every field of `ListingDetails` plus the image,
 * category, flag, attribute, location and seller data declared below.
 */
interface DetailedListing extends ListingDetails {
  /** Image URLs for the listing (empty when images are not requested). */
  images: string[];
  /** Numeric category ID of the listing. */
  categoryId: number;
  /** Ad source label of the listing. */
  adSource: string;
  /** Promotional flags attached to the listing. */
  flags: {
    topAd: boolean;
    priceDrop: boolean;
  };
  /** Listing attributes keyed by canonical name, each with its canonical values. */
  attributes: Record<string, string[]>;
  /** Structured location of the listing. */
  location: {
    id: number;
    name: string;
    coordinates?: {
      latitude: number;
      longitude: number;
    };
  };
  /** Seller data; only present when the listing exposes a poster ID. */
  sellerInfo?: {
    posterId: string;
    rating?: number;
    accountType?: string;
    memberSince?: string;
    reviewCount?: number;
    reviewScore?: number;
  };
}
|
||||||
|
|
||||||
|
// Configuration interfaces

/** Options controlling which search pages are requested and how they are sorted. */
interface SearchOptions {
  /** Location ID or name. */
  location?: number | string;
  /** Category ID or name. */
  category?: number | string;
  /** Search keywords. */
  keywords?: string;
  /** Field to sort results by. */
  sortBy?: "relevancy" | "date" | "price" | "distance";
  /** Sort direction. */
  sortOrder?: "desc" | "asc";
  /** Maximum number of result pages to fetch. Default: 5. */
  maxPages?: number;
  /** Lower price bound. */
  priceMin?: number;
  /** Upper price bound. */
  priceMax?: number;
}
|
||||||
|
|
||||||
|
/** Options controlling how much data is collected for each individual listing. */
interface ListingFetchOptions {
  /** Whether to include listing image URLs. Default: true. */
  includeImages?: boolean;
  /** How much seller data to collect. Default: "detailed". */
  sellerDataDepth?: "basic" | "detailed" | "full";
  /** Whether to include client-side data. Default: false. */
  includeClientSideData?: boolean;
}
|
||||||
|
|
||||||
|
// ----------------------------- Constants & Mappings -----------------------------
|
||||||
|
|
||||||
|
/**
 * Location name -> Kijiji location ID (mappings from KIJIJI.md).
 * Keys are lowercase; multi-word names are separated by a single space.
 */
const LOCATION_MAPPINGS: Record<string, number> = {
  canada: 0,
  ontario: 9004,
  toronto: 1700273,
  gta: 1700272,
  oshawa: 1700275,
  quebec: 9001,
  "nova scotia": 9002,
  alberta: 9003,
  "new brunswick": 9005,
  manitoba: 9006,
  "british columbia": 9007,
  newfoundland: 9008,
  saskatchewan: 9009,
  territories: 9010,
  pei: 9011,
  "prince edward island": 9011,
};
|
||||||
|
|
||||||
|
/**
 * Category name -> Kijiji category ID (Buy & Sell main categories, from KIJIJI.md).
 * Keys are lowercase and hyphen-separated.
 */
const CATEGORY_MAPPINGS: Record<string, number> = {
  all: 0,
  "buy-sell": 10,
  "arts-collectibles": 12,
  audio: 767,
  "baby-items": 253,
  "bags-luggage": 931,
  bikes: 644,
  books: 109,
  cameras: 103,
  cds: 104,
  clothing: 274,
  computers: 16,
  "computer-accessories": 128,
  electronics: 29659001,
  "free-stuff": 17220001,
  furniture: 235,
  "garage-sales": 638,
  "health-special-needs": 140,
  "hobbies-crafts": 139,
  "home-appliances": 107,
  "home-indoor": 717,
  "home-outdoor": 727,
  jewellery: 133,
  "musical-instruments": 17,
  phones: 132,
  "sporting-goods": 111,
  tools: 110,
  "toys-games": 108,
  "tvs-video": 15093001,
  "video-games": 141,
  other: 26,
};
|
||||||
|
|
||||||
|
/** `SearchOptions.sortBy` value -> value emitted in the `sort` query parameter. */
const SORT_MAPPINGS: Record<string, string> = {
  relevancy: "MATCH",
  date: "DATE",
  price: "PRICE",
  distance: "DISTANCE",
};
|
||||||
|
|
||||||
|
// ----------------------------- Exports for Testing -----------------------------
|
||||||
|
// Note: These are exported for testing purposes only
|
||||||
|
|
||||||
|
export { resolveLocationId, resolveCategoryId, buildSearchUrl };
|
||||||
|
export { extractApolloState, parseSearch };
|
||||||
|
export { parseDetailedListing };
|
||||||
|
export { HttpError, NetworkError, ParseError, RateLimitError, ValidationError };
|
||||||
|
|
||||||
// ----------------------------- Utilities -----------------------------
|
// ----------------------------- Utilities -----------------------------
|
||||||
|
|
||||||
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
||||||
|
|
||||||
|
/**
 * Resolve a location ID from a numeric ID or a location name.
 *
 * Names are matched case-insensitively against `LOCATION_MAPPINGS`. Runs of
 * whitespace and hyphens are treated as a single separator, so
 * "Nova Scotia", "nova-scotia" and "  nova   scotia " all resolve to the
 * same entry regardless of whether the table key uses a space or a hyphen.
 *
 * @param location - Numeric location ID (returned unchanged) or location name.
 * @returns The resolved ID, or 0 (Canada) when the name is unknown or omitted.
 */
function resolveLocationId(location?: number | string): number {
  if (typeof location === "number") return location;
  if (typeof location !== "string") return 0; // Default to Canada

  const spaced = location.trim().toLowerCase().replace(/[\s-]+/g, " ");
  const hyphenated = spaced.replace(/ /g, "-");

  for (const key of [spaced, hyphenated]) {
    // Own-property check so inherited keys such as "constructor" never
    // leak a non-numeric value out of the lookup table.
    if (Object.prototype.hasOwnProperty.call(LOCATION_MAPPINGS, key)) {
      return LOCATION_MAPPINGS[key] ?? 0;
    }
  }

  return 0; // Default to Canada (0)
}
|
||||||
|
|
||||||
|
/**
 * Resolve a category ID from a numeric ID or a category name.
 *
 * Names are trimmed, lowercased and have whitespace runs replaced by a
 * hyphen before being looked up in `CATEGORY_MAPPINGS`, so "Video Games"
 * matches the "video-games" entry.
 *
 * @param category - Numeric category ID (returned unchanged) or category name.
 * @returns The resolved ID, or 0 (all categories) when the name is unknown or omitted.
 */
function resolveCategoryId(category?: number | string): number {
  if (typeof category === "number") return category;
  if (typeof category !== "string") return 0; // Default to all categories

  const normalized = category.trim().toLowerCase().replace(/\s+/g, "-");

  // Own-property check so inherited keys such as "constructor" never
  // leak a non-numeric value out of the lookup table.
  if (!Object.prototype.hasOwnProperty.call(CATEGORY_MAPPINGS, normalized)) {
    return 0; // Default to all categories
  }
  return CATEGORY_MAPPINGS[normalized] ?? 0;
}
|
||||||
|
|
||||||
|
/**
 * Build the search URL for a keyword query.
 *
 * The path has the form `/b-buy-sell/canada/<slug>/k0c<categoryId>l<locationId>`;
 * the category and location slugs are fixed, only the numeric IDs vary.
 *
 * NOTE(review): when `sortBy` is set the query string contains two `sort`
 * parameters (`sort=relevancyDesc` and the mapped value) — confirm which one
 * the site honours.
 * NOTE(review): `priceMin` / `priceMax` are accepted in `options` but are not
 * written to the URL.
 *
 * @param keywords - Search keywords; slugified into the path.
 * @param options - Search options plus an optional 1-based `page`.
 * @param BASE_URL - Site origin the URL is built on.
 * @returns The search URL including its query string.
 */
function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca",
): string {
  const locationId = resolveLocationId(options.location);
  const categoryId = resolveCategoryId(options.category);

  const path = `/b-buy-sell/canada/${slugify(keywords)}/k0c${categoryId}l${locationId}`;

  const queryParts = ["sort=relevancyDesc", "view=list"];
  if (options.sortBy) {
    queryParts.push(`sort=${SORT_MAPPINGS[options.sortBy]}`);
  }
  queryParts.push(`order=${options.sortOrder === "asc" ? "ASC" : "DESC"}`);
  // The first page is requested without an explicit page parameter.
  if (options.page && options.page > 1) {
    queryParts.push(`page=${options.page}`);
  }

  return `${BASE_URL}${path}?${queryParts.join("&")}`;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Slugifies a string for search
|
* Slugifies a string for search
|
||||||
*/
|
*/
|
||||||
@@ -67,13 +248,14 @@ export function slugify(input: string): string {
|
|||||||
|
|
||||||
for (let i = 0; i < s.length; i++) {
|
for (let i = 0; i < s.length; i++) {
|
||||||
const ch = s[i];
|
const ch = s[i];
|
||||||
const code = ch!.charCodeAt(0);
|
if (!ch) continue;
|
||||||
|
const code = ch.charCodeAt(0);
|
||||||
|
|
||||||
// a-z or 0-9
|
// a-z or 0-9
|
||||||
if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) {
|
if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) {
|
||||||
out.push(ch!);
|
out.push(ch);
|
||||||
lastHyphen = false;
|
lastHyphen = false;
|
||||||
} else if (SEPS.has(ch!)) {
|
} else if (SEPS.has(ch)) {
|
||||||
if (!lastHyphen) {
|
if (!lastHyphen) {
|
||||||
out.push("-");
|
out.push("-");
|
||||||
lastHyphen = true;
|
lastHyphen = true;
|
||||||
@@ -87,7 +269,7 @@ export function slugify(input: string): string {
|
|||||||
/**
|
/**
|
||||||
* Turns cents to localized currency string.
|
* Turns cents to localized currency string.
|
||||||
*/
|
*/
|
||||||
function formatCentsToCurrency(
|
export function formatCentsToCurrency(
|
||||||
num: number | string | undefined,
|
num: number | string | undefined,
|
||||||
locale = "en-US",
|
locale = "en-US",
|
||||||
): string {
|
): string {
|
||||||
@@ -96,21 +278,24 @@ function formatCentsToCurrency(
|
|||||||
if (Number.isNaN(cents)) return "";
|
if (Number.isNaN(cents)) return "";
|
||||||
const dollars = cents / 100;
|
const dollars = cents / 100;
|
||||||
const formatter = new Intl.NumberFormat(locale, {
|
const formatter = new Intl.NumberFormat(locale, {
|
||||||
|
style: 'currency',
|
||||||
|
currency: 'USD',
|
||||||
minimumFractionDigits: 2,
|
minimumFractionDigits: 2,
|
||||||
maximumFractionDigits: 2,
|
maximumFractionDigits: 2,
|
||||||
useGrouping: true,
|
|
||||||
});
|
});
|
||||||
return formatter.format(dollars);
|
return formatter.format(dollars);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Type guard for plain keyed objects.
 *
 * @returns true for any non-null, non-array value whose `typeof` is "object".
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
|
||||||
|
|
||||||
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms - Time to wait, in milliseconds.
 */
async function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
// ----------------------------- Error Classes -----------------------------
|
||||||
|
|
||||||
class HttpError extends Error {
|
class HttpError extends Error {
|
||||||
constructor(
|
constructor(
|
||||||
message: string,
|
message: string,
|
||||||
@@ -122,12 +307,52 @@ class HttpError extends Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Error for a request that failed without producing a response.
 * Carries the target URL and, when available, the underlying error.
 */
class NetworkError extends Error {
  /**
   * @param message - Human-readable description of the failure.
   * @param url - URL of the request that failed.
   * @param cause - Underlying error, if one was caught.
   */
  constructor(message: string, public readonly url: string, public readonly cause?: Error) {
    super(message);
    this.name = "NetworkError";
  }
}
|
||||||
|
|
||||||
|
/**
 * Error for content that could not be interpreted.
 * Optionally carries the offending data for diagnostics.
 */
class ParseError extends Error {
  /** The data that could not be parsed, when supplied. */
  public readonly data?: unknown;

  /**
   * @param message - Human-readable description of the failure.
   * @param data - The data that could not be parsed.
   */
  constructor(message: string, data?: unknown) {
    super(message);
    this.data = data;
    this.name = "ParseError";
  }
}
|
||||||
|
|
||||||
|
/**
 * Error for a request that was rejected because of rate limiting.
 * Carries the target URL and, when known, the reset time reported by the server.
 */
class RateLimitError extends Error {
  /** URL of the rate-limited request. */
  public readonly url: string;
  /** Reset time reported for the rate limit, when available. */
  public readonly resetTime?: number;

  /**
   * @param message - Human-readable description of the failure.
   * @param url - URL of the rate-limited request.
   * @param resetTime - Reset time reported for the rate limit.
   */
  constructor(message: string, url: string, resetTime?: number) {
    super(message);
    this.url = url;
    this.resetTime = resetTime;
    this.name = "RateLimitError";
  }
}
|
||||||
|
|
||||||
|
/** Error for input that failed validation. */
class ValidationError extends Error {
  /** @param message - Human-readable description of what was invalid. */
  constructor(message: string) {
    super(message);
    this.name = "ValidationError";
  }
}
|
||||||
|
|
||||||
// ----------------------------- HTTP Client -----------------------------
|
// ----------------------------- HTTP Client -----------------------------
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
|
Fetch HTML with enhanced retry strategy and exponential backoff.
|
||||||
- Retries on 429 and 5xx
|
- Retries on 429, 5xx, and network errors
|
||||||
- Respects X-RateLimit-Reset when present (seconds)
|
- Respects X-RateLimit-Reset when present (seconds)
|
||||||
|
- Exponential backoff with jitter
|
||||||
*/
|
*/
|
||||||
async function fetchHtml(
|
async function fetchHtml(
|
||||||
url: string,
|
url: string,
|
||||||
@@ -139,11 +364,13 @@ async function fetchHtml(
|
|||||||
},
|
},
|
||||||
): Promise<HTMLString> {
|
): Promise<HTMLString> {
|
||||||
const maxRetries = opts?.maxRetries ?? 3;
|
const maxRetries = opts?.maxRetries ?? 3;
|
||||||
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
const retryBaseMs = opts?.retryBaseMs ?? 1000;
|
||||||
|
|
||||||
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
||||||
try {
|
try {
|
||||||
// console.log(`Fetching: `, url);
|
const controller = new AbortController();
|
||||||
|
const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout
|
||||||
|
|
||||||
const res = await fetch(url, {
|
const res = await fetch(url, {
|
||||||
method: "GET",
|
method: "GET",
|
||||||
headers: {
|
headers: {
|
||||||
@@ -155,27 +382,40 @@ async function fetchHtml(
|
|||||||
"user-agent":
|
"user-agent":
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
|
||||||
},
|
},
|
||||||
|
signal: controller.signal,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
clearTimeout(timeoutId);
|
||||||
|
|
||||||
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
|
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
|
||||||
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
|
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
|
||||||
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
|
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
// Respect 429 reset if provided
|
// Handle rate limiting
|
||||||
if (res.status === 429) {
|
if (res.status === 429) {
|
||||||
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
|
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : Number.NaN;
|
||||||
const waitMs = Number.isFinite(resetSeconds)
|
const waitMs = Number.isFinite(resetSeconds)
|
||||||
? Math.max(0, resetSeconds * 1000)
|
? Math.max(0, resetSeconds * 1000)
|
||||||
: (attempt + 1) * retryBaseMs;
|
: calculateBackoffDelay(attempt, retryBaseMs);
|
||||||
|
|
||||||
|
if (attempt < maxRetries) {
|
||||||
await delay(waitMs);
|
await delay(waitMs);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Retry on 5xx
|
throw new RateLimitError(
|
||||||
|
`Rate limit exceeded for ${url}`,
|
||||||
|
url,
|
||||||
|
resetSeconds,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retry on server errors
|
||||||
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
|
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
|
||||||
await delay((attempt + 1) * retryBaseMs);
|
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new HttpError(
|
throw new HttpError(
|
||||||
`Request failed with status ${res.status}`,
|
`Request failed with status ${res.status}`,
|
||||||
res.status,
|
res.status,
|
||||||
@@ -184,16 +424,171 @@ async function fetchHtml(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const html = await res.text();
|
const html = await res.text();
|
||||||
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
|
||||||
|
// Respect per-request delay to maintain rate limiting
|
||||||
await delay(DELAY_MS);
|
await delay(DELAY_MS);
|
||||||
return html;
|
return html;
|
||||||
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (attempt >= maxRetries) throw err;
|
// Handle different error types
|
||||||
await delay((attempt + 1) * retryBaseMs);
|
if (err instanceof RateLimitError || err instanceof HttpError) {
|
||||||
|
throw err; // Re-throw known errors
|
||||||
|
}
|
||||||
|
|
||||||
|
if (err instanceof Error && err.name === 'AbortError') {
|
||||||
|
if (attempt < maxRetries) {
|
||||||
|
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
throw new NetworkError(`Request timeout for ${url}`, url, err);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Network or other errors
|
||||||
|
if (attempt < maxRetries) {
|
||||||
|
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
throw new NetworkError(
|
||||||
|
`Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
|
||||||
|
url,
|
||||||
|
err instanceof Error ? err : undefined
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new Error("Exhausted retries without response");
|
throw new NetworkError(`Exhausted retries without response for ${url}`, url);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Compute the wait before a retry: exponential backoff plus random jitter.
 *
 * The base wait doubles with every attempt (`baseMs * 2^attempt`); up to 10%
 * of that value is added as jitter, and the total is capped at 30 seconds.
 *
 * @param attempt - Zero-based retry attempt number.
 * @param baseMs - Wait for the first attempt, in milliseconds.
 * @returns Milliseconds to wait before the next attempt.
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const MAX_DELAY_MS = 30000;
  const backoffMs = baseMs * Math.pow(2, attempt);
  const jitterMs = Math.random() * 0.1 * backoffMs;
  return Math.min(backoffMs + jitterMs, MAX_DELAY_MS);
}
|
||||||
|
|
||||||
|
// ----------------------------- GraphQL Client -----------------------------
|
||||||
|
|
||||||
|
/**
 * POST a GraphQL query to `${BASE_URL}/anvil/api` and return its `data` payload.
 *
 * The request is aborted after 30 seconds, matching the timeout used for
 * HTML fetches.
 *
 * @param query - GraphQL query document.
 * @param variables - Variables for the query.
 * @param BASE_URL - Site origin hosting the GraphQL endpoint.
 * @returns The `data` member of the GraphQL response.
 * @throws HttpError when the endpoint answers with a non-2xx status.
 * @throws ParseError when the body is not a JSON object or reports GraphQL errors.
 * @throws NetworkError when the request fails or times out.
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca",
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout

  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "apollo-require-preflight": "true",
      },
      body: JSON.stringify({ query, variables }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new HttpError(
        `GraphQL request failed with status ${response.status}`,
        response.status,
        endpoint,
      );
    }

    let payload: unknown;
    try {
      payload = await response.json();
    } catch (err) {
      // Malformed JSON is a parse problem; anything else (abort, dropped
      // connection) is handled as a network failure by the outer catch.
      if (err instanceof SyntaxError) {
        throw new ParseError(`GraphQL response was not valid JSON: ${err.message}`);
      }
      throw err;
    }

    if (typeof payload !== "object" || payload === null) {
      throw new ParseError("GraphQL response was not a JSON object", payload);
    }

    const { errors, data } = payload as { errors?: unknown; data?: unknown };
    // An empty `errors` array carries no error and must not fail the call.
    const hasErrors = Array.isArray(errors) ? errors.length > 0 : Boolean(errors);
    if (hasErrors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(errors)}`, errors);
    }

    return data;
  } catch (err) {
    if (err instanceof HttpError || err instanceof ParseError) {
      throw err;
    }
    const reason = controller.signal.aborted
      ? "request timed out"
      : err instanceof Error
        ? err.message
        : String(err);
    throw new NetworkError(
      `Failed to fetch GraphQL data: ${reason}`,
      endpoint,
      err instanceof Error ? err : undefined,
    );
  } finally {
    clearTimeout(timeoutId);
  }
}
|
||||||
|
|
||||||
|
// GraphQL response interfaces

/** Shape read from the `data` payload of the review-summary query. */
interface GraphQLReviewResponse {
  user?: {
    reviewSummary?: {
      /** Number of reviews. */
      count?: number;
      /** Review score. */
      score?: number;
    };
  };
}
|
||||||
|
|
||||||
|
/** Shape read from the `data` payload of the profile-metrics query. */
interface GraphQLProfileResponse {
  user?: {
    memberSince?: string;
    accountType?: string;
  };
}
|
||||||
|
|
||||||
|
/** GraphQL query documents (from KIJIJI.md) used to look up seller data. */
const GRAPHQL_QUERIES = {
  /** Review count and score for the user identified by `$userId`. */
  getReviewSummary: `
    query GetReviewSummary($userId: String!) {
      user(id: $userId) {
        reviewSummary {
          count
          score
          __typename
        }
        __typename
      }
    }
  `,
  /** Membership date and account type for the user identified by `$profileId`. */
  getProfileMetrics: `
    query GetProfileMetrics($profileId: String!) {
      user(id: $profileId) {
        memberSince
        accountType
        __typename
      }
    }
  `,
} as const;
|
||||||
|
|
||||||
|
/**
 * Fetch additional seller data (review summary and profile metrics) via GraphQL.
 *
 * The two queries run concurrently and are evaluated independently: when one
 * fails, a warning is logged and the data from the other is still returned.
 * This function never rejects because of a failed lookup — seller data is
 * optional for basic functionality.
 *
 * @param posterId - Seller ID; sent as `userId` / `profileId` to the queries.
 * @param BASE_URL - Site origin hosting the GraphQL endpoint.
 * @returns Whichever fields could be retrieved with the expected type;
 *          missing or malformed fields are omitted.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca",
): Promise<{ reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string }> {
  const [reviewResult, profileResult] = await Promise.allSettled([
    fetchGraphQLData(GRAPHQL_QUERIES.getReviewSummary, { userId: posterId }, BASE_URL),
    fetchGraphQLData(GRAPHQL_QUERIES.getProfileMetrics, { profileId: posterId }, BASE_URL),
  ]);

  const details: {
    reviewCount?: number;
    reviewScore?: number;
    memberSince?: string;
    accountType?: string;
  } = {};

  for (const result of [reviewResult, profileResult]) {
    if (result.status === "rejected") {
      const reason: unknown = result.reason;
      console.warn(
        `Failed to fetch seller details for ${posterId}:`,
        reason instanceof Error ? reason.message : String(reason),
      );
    }
  }

  if (reviewResult.status === "fulfilled") {
    // The payload is untyped JSON: only copy fields that have the promised type.
    const summary = (reviewResult.value as GraphQLReviewResponse | null | undefined)?.user
      ?.reviewSummary;
    if (typeof summary?.count === "number") details.reviewCount = summary.count;
    if (typeof summary?.score === "number") details.reviewScore = summary.score;
  }

  if (profileResult.status === "fulfilled") {
    const user = (profileResult.value as GraphQLProfileResponse | null | undefined)?.user;
    if (typeof user?.memberSince === "string") details.memberSince = user.memberSince;
    if (typeof user?.accountType === "string") details.accountType = user.accountType;
  }

  return details;
}
|
||||||
|
|
||||||
// ----------------------------- Parsing -----------------------------
|
// ----------------------------- Parsing -----------------------------
|
||||||
@@ -299,7 +694,7 @@ function parseListing(
|
|||||||
listingPrice: amountFormatted
|
listingPrice: amountFormatted
|
||||||
? {
|
? {
|
||||||
amountFormatted,
|
amountFormatted,
|
||||||
cents: Number.isFinite(cents!) ? cents : undefined,
|
cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
|
||||||
currency: price?.currency,
|
currency: price?.currency,
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
@@ -307,84 +702,237 @@ function parseListing(
|
|||||||
listingStatus: status,
|
listingStatus: status,
|
||||||
creationDate: activationDate,
|
creationDate: activationDate,
|
||||||
endDate,
|
endDate,
|
||||||
numberOfViews: Number.isFinite(numberOfViews!) ? numberOfViews : undefined,
|
numberOfViews: numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
|
||||||
address: location?.address ?? null,
|
address: location?.address ?? null,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse a listing page into a detailed listing object with all available fields.
 *
 * @param htmlString - HTML of the listing page.
 * @param BASE_URL - Site origin; prefixed to relative listing URLs and used
 *                   for seller lookups.
 * @param options - Controls image extraction and how much seller data is collected.
 * @returns The detailed listing, or `null` when the page has no Apollo state,
 *          no listing entry, no URL or title, or no usable price.
 */
async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {},
): Promise<DetailedListing | null> {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key
  const listingKey = Object.keys(apolloState).find((k) => k.includes("Listing"));
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
    imageUrls,
    imageCount,
    categoryId,
    adSource,
    flags,
    posterInfo,
    attributes,
  } = root as ApolloListingRoot;

  const cents = price?.amount != null ? Number(price.amount) : undefined;
  const amountFormatted = formatCentsToCurrency(cents);

  const numberOfViews = metrics?.views != null ? Number(metrics.views) : undefined;

  // Relative listing URLs are resolved against the site origin.
  const listingUrl =
    typeof url === "string" ? (url.startsWith("http") ? url : `${BASE_URL}${url}`) : "";

  if (!listingUrl || typeof title !== "string" || !title) return null;

  // Only include fixed-price listings
  if (!amountFormatted || cents === undefined) return null;

  // Extract images if requested
  const images =
    options.includeImages !== false && Array.isArray(imageUrls)
      ? imageUrls.filter((imageUrl): imageUrl is string => typeof imageUrl === "string")
      : [];

  // Extract attributes as key-value pairs, keeping only string values so the
  // result matches the declared Record<string, string[]> shape.
  const attributeMap: Record<string, string[]> = {};
  if (Array.isArray(attributes)) {
    for (const attr of attributes) {
      if (
        typeof attr?.canonicalName === "string" &&
        attr.canonicalName &&
        Array.isArray(attr.canonicalValues)
      ) {
        attributeMap[attr.canonicalName] = attr.canonicalValues.filter(
          (value): value is string => typeof value === "string",
        );
      }
    }
  }

  // Coordinates come from untyped page state: expose them only when both
  // components are finite numbers.
  const rawCoordinates = location?.coordinates;
  const coordinates =
    rawCoordinates &&
    Number.isFinite(rawCoordinates.latitude) &&
    Number.isFinite(rawCoordinates.longitude)
      ? { latitude: rawCoordinates.latitude, longitude: rawCoordinates.longitude }
      : undefined;

  // Extract seller info based on depth setting
  let sellerInfo: DetailedListing["sellerInfo"];
  const depth = options.sellerDataDepth ?? "detailed";

  if (posterInfo?.posterId) {
    sellerInfo = {
      posterId: posterInfo.posterId,
      rating: typeof posterInfo.rating === "number" ? posterInfo.rating : undefined,
    };

    // Add more detailed info if requested and client-side data is enabled
    if ((depth === "detailed" || depth === "full") && options.includeClientSideData) {
      try {
        const additionalData = await fetchSellerDetails(posterInfo.posterId, BASE_URL);
        sellerInfo = {
          ...sellerInfo,
          ...additionalData,
        };
      } catch (err) {
        // GraphQL data is optional: log the reason and keep the basic seller info.
        console.warn(
          `Failed to fetch additional seller data for ${posterInfo.posterId}`,
          err instanceof Error ? err.message : String(err),
        );
      }
    }
  }

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: {
      amountFormatted,
      cents,
      currency: price?.currency,
    },
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews:
      numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
    address: location?.address ?? null,
    images,
    categoryId: typeof categoryId === "number" ? categoryId : 0,
    adSource: typeof adSource === "string" ? adSource : "UNKNOWN",
    flags: {
      topAd: flags?.topAd === true,
      priceDrop: flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof location?.id === "number" ? location.id : 0,
      name: typeof location?.name === "string" ? location.name : "Unknown",
      coordinates,
    },
    sellerInfo,
  };
}
|
||||||
|
|
||||||
// ----------------------------- Main -----------------------------
|
// ----------------------------- Main -----------------------------
|
||||||
|
|
||||||
export default async function fetchKijijiItems(
|
export default async function fetchKijijiItems(
|
||||||
SEARCH_QUERY: string,
|
SEARCH_QUERY: string,
|
||||||
REQUESTS_PER_SECOND = 1,
|
REQUESTS_PER_SECOND = 1,
|
||||||
BASE_URL = "https://www.kijiji.ca",
|
BASE_URL = "https://www.kijiji.ca",
|
||||||
|
searchOptions: SearchOptions = {},
|
||||||
|
listingOptions: ListingFetchOptions = {},
|
||||||
) {
|
) {
|
||||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
||||||
|
|
||||||
const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;
|
// Set defaults for configuration
|
||||||
|
const finalSearchOptions: Required<SearchOptions> = {
|
||||||
|
location: searchOptions.location ?? 1700272, // Default to GTA
|
||||||
|
category: searchOptions.category ?? 0, // Default to all categories
|
||||||
|
keywords: searchOptions.keywords ?? SEARCH_QUERY,
|
||||||
|
sortBy: searchOptions.sortBy ?? 'relevancy',
|
||||||
|
sortOrder: searchOptions.sortOrder ?? 'desc',
|
||||||
|
maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
|
||||||
|
priceMin: searchOptions.priceMin,
|
||||||
|
priceMax: searchOptions.priceMax,
|
||||||
|
};
|
||||||
|
|
||||||
console.log(`Fetching search: ${searchUrl}`);
|
const finalListingOptions: Required<ListingFetchOptions> = {
|
||||||
|
includeImages: listingOptions.includeImages ?? true,
|
||||||
|
sellerDataDepth: listingOptions.sellerDataDepth ?? 'detailed',
|
||||||
|
includeClientSideData: listingOptions.includeClientSideData ?? false,
|
||||||
|
};
|
||||||
|
|
||||||
|
const allListings: DetailedListing[] = [];
|
||||||
|
const seenUrls = new Set<string>();
|
||||||
|
|
||||||
|
// Fetch multiple pages
|
||||||
|
for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
|
||||||
|
const searchUrl = buildSearchUrl(finalSearchOptions.keywords, {
|
||||||
|
...finalSearchOptions,
|
||||||
|
// Add page parameter for pagination
|
||||||
|
...(page > 1 && { page }),
|
||||||
|
}, BASE_URL);
|
||||||
|
|
||||||
|
console.log(`Fetching search page ${page}: ${searchUrl}`);
|
||||||
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
console.log(
|
console.log(`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
|
||||||
"\n" +
|
|
||||||
`Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const searchResults = parseSearch(searchHtml, BASE_URL);
|
const searchResults = parseSearch(searchHtml, BASE_URL);
|
||||||
if (searchResults.length === 0) {
|
if (searchResults.length === 0) {
|
||||||
console.warn("No search results parsed from page.");
|
console.log(`No more results found on page ${page}. Stopping pagination.`);
|
||||||
return;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deduplicate links
|
// Deduplicate links across pages
|
||||||
const listingLinks = Array.from(
|
const newListingLinks = searchResults
|
||||||
new Set(searchResults.map((r) => r.listingLink)),
|
.map((r) => r.listingLink)
|
||||||
);
|
.filter((link) => !seenUrls.has(link));
|
||||||
|
|
||||||
console.log(
|
for (const link of newListingLinks) {
|
||||||
"\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
|
seenUrls.add(link);
|
||||||
);
|
}
|
||||||
|
|
||||||
|
console.log(`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`);
|
||||||
|
|
||||||
|
// Fetch details for this page's listings
|
||||||
const progressBar = new cliProgress.SingleBar(
|
const progressBar = new cliProgress.SingleBar(
|
||||||
{},
|
{},
|
||||||
cliProgress.Presets.shades_classic,
|
cliProgress.Presets.shades_classic,
|
||||||
);
|
);
|
||||||
const totalProgress = listingLinks.length;
|
const totalProgress = newListingLinks.length;
|
||||||
let currentProgress = 0;
|
let currentProgress = 0;
|
||||||
progressBar.start(totalProgress, currentProgress);
|
progressBar.start(totalProgress, currentProgress);
|
||||||
|
|
||||||
const items: ListingDetails[] = [];
|
for (const link of newListingLinks) {
|
||||||
for (const link of listingLinks) {
|
|
||||||
try {
|
try {
|
||||||
const html = await fetchHtml(link, DELAY_MS, {
|
const html = await fetchHtml(link, DELAY_MS, {
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
console.log(
|
console.log(`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
|
||||||
"\n" +
|
|
||||||
`Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
const parsed = parseListing(html, BASE_URL);
|
const parsed = await parseDetailedListing(html, BASE_URL, finalListingOptions);
|
||||||
if (parsed) {
|
if (parsed) {
|
||||||
if (parsed.listingPrice?.cents) items.push(parsed);
|
allListings.push(parsed);
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.error(
|
console.error(`\nFailed to fetch ${link}\n - ${err.status} ${err.message}`);
|
||||||
"\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
|
|
||||||
);
|
|
||||||
} else {
|
} else {
|
||||||
console.error(
|
console.error(`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`);
|
||||||
"\n" +
|
|
||||||
`Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
currentProgress++;
|
currentProgress++;
|
||||||
@@ -392,6 +940,14 @@ export default async function fetchKijijiItems(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log("\n" + `Parsed ${items.length} listings.`);
|
progressBar.stop();
|
||||||
return items;
|
|
||||||
|
// If we got fewer results than expected (40 per page), we've reached the end
|
||||||
|
if (searchResults.length < 40) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\nParsed ${allListings.length} detailed listings.`);
|
||||||
|
return allListings;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user