feat: update kijiji scraper
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
18
src/index.ts
18
src/index.ts
@@ -26,8 +26,12 @@ const server = Bun.serve({
|
||||
{ status: 400 },
|
||||
);
|
||||
|
||||
const items = await fetchKijijiItems(SEARCH_QUERY, 5);
|
||||
if (!items)
|
||||
const items = await fetchKijijiItems(SEARCH_QUERY, 1, undefined, {}, {
|
||||
includeImages: true,
|
||||
sellerDataDepth: 'detailed',
|
||||
includeClientSideData: false,
|
||||
});
|
||||
if (!items || items.length === 0)
|
||||
return Response.json(
|
||||
{ message: "Search didn't return any results!" },
|
||||
{ status: 404 },
|
||||
@@ -85,11 +89,13 @@ const server = Bun.serve({
|
||||
);
|
||||
|
||||
// Parse optional parameters with defaults
|
||||
const minPrice = reqUrl.searchParams.get("minPrice")
|
||||
? parseInt(reqUrl.searchParams.get("minPrice")!)
|
||||
const minPriceParam = reqUrl.searchParams.get("minPrice");
|
||||
const minPrice = minPriceParam
|
||||
? Number.parseInt(minPriceParam, 10)
|
||||
: undefined;
|
||||
const maxPrice = reqUrl.searchParams.get("maxPrice")
|
||||
? parseInt(reqUrl.searchParams.get("maxPrice")!)
|
||||
const maxPriceParam = reqUrl.searchParams.get("maxPrice");
|
||||
const maxPrice = maxPriceParam
|
||||
? Number.parseInt(maxPriceParam, 10)
|
||||
: undefined;
|
||||
const strictMode = reqUrl.searchParams.get("strictMode") === "true";
|
||||
const exclusionsParam = reqUrl.searchParams.get("exclusions");
|
||||
|
||||
668
src/kijiji.ts
668
src/kijiji.ts
@@ -26,16 +26,29 @@ interface ApolloListingRoot {
|
||||
url?: string;
|
||||
title?: string;
|
||||
description?: string;
|
||||
price?: { amount?: number | string; currency?: string };
|
||||
price?: { amount?: number | string; currency?: string; type?: string };
|
||||
type?: string;
|
||||
status?: string;
|
||||
activationDate?: string;
|
||||
endDate?: string;
|
||||
metrics?: { views?: number | string };
|
||||
location?: { address?: string | null };
|
||||
location?: {
|
||||
address?: string | null;
|
||||
id?: number;
|
||||
name?: string;
|
||||
coordinates?: { latitude: number; longitude: number };
|
||||
};
|
||||
imageUrls?: string[];
|
||||
imageCount?: number;
|
||||
categoryId?: number;
|
||||
adSource?: string;
|
||||
flags?: { topAd?: boolean; priceDrop?: boolean };
|
||||
posterInfo?: { posterId?: string; rating?: number };
|
||||
attributes?: Array<{ canonicalName?: string; canonicalValues?: string[] }>;
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
// Keep existing interface for backward compatibility
|
||||
type ListingDetails = {
|
||||
url: string;
|
||||
title: string;
|
||||
@@ -53,10 +66,178 @@ type ListingDetails = {
|
||||
address?: string | null;
|
||||
};
|
||||
|
||||
// New comprehensive interface for detailed listings
|
||||
interface DetailedListing extends ListingDetails {
|
||||
images: string[];
|
||||
categoryId: number;
|
||||
adSource: string;
|
||||
flags: {
|
||||
topAd: boolean;
|
||||
priceDrop: boolean;
|
||||
};
|
||||
attributes: Record<string, string[]>;
|
||||
location: {
|
||||
id: number;
|
||||
name: string;
|
||||
coordinates?: {
|
||||
latitude: number;
|
||||
longitude: number;
|
||||
};
|
||||
};
|
||||
sellerInfo?: {
|
||||
posterId: string;
|
||||
rating?: number;
|
||||
accountType?: string;
|
||||
memberSince?: string;
|
||||
reviewCount?: number;
|
||||
reviewScore?: number;
|
||||
};
|
||||
}
|
||||
|
||||
// Configuration interfaces
|
||||
interface SearchOptions {
|
||||
location?: number | string; // Location ID or name
|
||||
category?: number | string; // Category ID or name
|
||||
keywords?: string;
|
||||
sortBy?: 'relevancy' | 'date' | 'price' | 'distance';
|
||||
sortOrder?: 'desc' | 'asc';
|
||||
maxPages?: number; // Default: 5
|
||||
priceMin?: number;
|
||||
priceMax?: number;
|
||||
}
|
||||
|
||||
interface ListingFetchOptions {
|
||||
includeImages?: boolean; // Default: true
|
||||
sellerDataDepth?: 'basic' | 'detailed' | 'full'; // Default: 'detailed'
|
||||
includeClientSideData?: boolean; // Default: false
|
||||
}
|
||||
|
||||
// ----------------------------- Constants & Mappings -----------------------------
|
||||
|
||||
// Location mappings from KIJIJI.md
|
||||
const LOCATION_MAPPINGS: Record<string, number> = {
|
||||
'canada': 0,
|
||||
'ontario': 9004,
|
||||
'toronto': 1700273,
|
||||
'gta': 1700272,
|
||||
'oshawa': 1700275,
|
||||
'quebec': 9001,
|
||||
'nova scotia': 9002,
|
||||
'alberta': 9003,
|
||||
'new brunswick': 9005,
|
||||
'manitoba': 9006,
|
||||
'british columbia': 9007,
|
||||
'newfoundland': 9008,
|
||||
'saskatchewan': 9009,
|
||||
'territories': 9010,
|
||||
'pei': 9011,
|
||||
'prince edward island': 9011,
|
||||
};
|
||||
|
||||
// Category mappings from KIJIJI.md (Buy & Sell main categories)
|
||||
const CATEGORY_MAPPINGS: Record<string, number> = {
|
||||
'all': 0,
|
||||
'buy-sell': 10,
|
||||
'arts-collectibles': 12,
|
||||
'audio': 767,
|
||||
'baby-items': 253,
|
||||
'bags-luggage': 931,
|
||||
'bikes': 644,
|
||||
'books': 109,
|
||||
'cameras': 103,
|
||||
'cds': 104,
|
||||
'clothing': 274,
|
||||
'computers': 16,
|
||||
'computer-accessories': 128,
|
||||
'electronics': 29659001,
|
||||
'free-stuff': 17220001,
|
||||
'furniture': 235,
|
||||
'garage-sales': 638,
|
||||
'health-special-needs': 140,
|
||||
'hobbies-crafts': 139,
|
||||
'home-appliances': 107,
|
||||
'home-indoor': 717,
|
||||
'home-outdoor': 727,
|
||||
'jewellery': 133,
|
||||
'musical-instruments': 17,
|
||||
'phones': 132,
|
||||
'sporting-goods': 111,
|
||||
'tools': 110,
|
||||
'toys-games': 108,
|
||||
'tvs-video': 15093001,
|
||||
'video-games': 141,
|
||||
'other': 26,
|
||||
};
|
||||
|
||||
// Sort parameter mappings
|
||||
const SORT_MAPPINGS: Record<string, string> = {
|
||||
'relevancy': 'MATCH',
|
||||
'date': 'DATE',
|
||||
'price': 'PRICE',
|
||||
'distance': 'DISTANCE',
|
||||
};
|
||||
|
||||
// ----------------------------- Exports for Testing -----------------------------
|
||||
// Note: These are exported for testing purposes only
|
||||
|
||||
export { resolveLocationId, resolveCategoryId, buildSearchUrl };
|
||||
export { extractApolloState, parseSearch };
|
||||
export { parseDetailedListing };
|
||||
export { HttpError, NetworkError, ParseError, RateLimitError, ValidationError };
|
||||
|
||||
// ----------------------------- Utilities -----------------------------
|
||||
|
||||
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
||||
|
||||
/**
 * Resolve a Kijiji location ID from either a numeric ID or a location name.
 *
 * Names are matched case-insensitively against LOCATION_MAPPINGS. Runs of
 * whitespace and hyphens are collapsed to a single space before the lookup,
 * because the multi-word keys in that table are written with spaces
 * ("nova scotia", "british columbia", ...). This lets both "Nova Scotia" and
 * "nova-scotia" resolve.
 *
 * @param location Numeric location ID (returned unchanged) or a location name.
 * @returns The matching location ID, or 0 (Canada) when the name is unknown
 *          or no location was supplied.
 */
function resolveLocationId(location?: number | string): number {
  if (typeof location === 'number') return location;
  if (typeof location !== 'string') return 0; // Default to Canada

  const normalized = location.trim().toLowerCase().replace(/[\s-]+/g, ' ');

  // Own-property check so inherited keys such as "constructor" never match.
  const id = Object.prototype.hasOwnProperty.call(LOCATION_MAPPINGS, normalized)
    ? LOCATION_MAPPINGS[normalized]
    : undefined;

  return id ?? 0; // Default to Canada (0)
}
|
||||
|
||||
/**
 * Translate a category reference into a numeric Kijiji category ID.
 *
 * A number is passed through untouched. A string is lower-cased, has each
 * run of whitespace replaced by a hyphen, and is then looked up in
 * CATEGORY_MAPPINGS.
 *
 * @param category Numeric category ID or category name.
 * @returns The category ID, or 0 (all categories) when nothing matches or
 *          no category was given.
 */
function resolveCategoryId(category?: number | string): number {
  const ALL_CATEGORIES = 0;

  if (typeof category === 'string') {
    const slug = category.toLowerCase().replace(/\s+/g, '-');
    const mapped = CATEGORY_MAPPINGS[slug];
    return mapped ?? ALL_CATEGORIES;
  }

  return typeof category === 'number' ? category : ALL_CATEGORIES;
}
|
||||
|
||||
/**
 * Assemble the Kijiji search-results URL for a keyword query.
 *
 * The path is `/b-buy-sell/canada/<slugified keywords>/k0c<categoryId>l<locationId>`
 * and the query string always starts with `sort=relevancyDesc&view=list`.
 *
 * NOTE(review): the category and location path slugs are fixed placeholders
 * ("buy-sell" / "canada") regardless of the resolved IDs; only the numeric
 * IDs in the last path segment vary.
 * NOTE(review): when `sortBy` is set, a second `sort=` parameter is appended
 * after the fixed `sort=relevancyDesc` — confirm which one the site honours.
 * NOTE(review): `priceMin` / `priceMax` / `maxPages` / `keywords` from the
 * options object are not read here.
 *
 * @param keywords Free-text search terms; slugified into the path.
 * @param options  Search options plus an optional 1-based `page` number.
 * @param BASE_URL Site origin, without a trailing slash.
 * @returns The absolute search URL.
 */
function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca"
): string {
  const locId = resolveLocationId(options.location);
  const catId = resolveCategoryId(options.category);

  // Placeholder slugs: identical for every category / location for now.
  const path = `${BASE_URL}/b-buy-sell/canada/${slugify(keywords)}/k0c${catId}l${locId}`;

  const queryParts: string[] = ['sort=relevancyDesc', 'view=list'];

  if (options.sortBy) {
    queryParts.push(`sort=${SORT_MAPPINGS[options.sortBy]}`);
  }

  queryParts.push(`order=${options.sortOrder === 'asc' ? 'ASC' : 'DESC'}`);

  // Page 1 is implicit, so the parameter is only emitted for later pages.
  if (options.page && options.page > 1) {
    queryParts.push(`page=${options.page}`);
  }

  return `${path}?${queryParts.join('&')}`;
}
|
||||
|
||||
/**
|
||||
* Slugifies a string for search
|
||||
*/
|
||||
@@ -67,13 +248,14 @@ export function slugify(input: string): string {
|
||||
|
||||
for (let i = 0; i < s.length; i++) {
|
||||
const ch = s[i];
|
||||
const code = ch!.charCodeAt(0);
|
||||
if (!ch) continue;
|
||||
const code = ch.charCodeAt(0);
|
||||
|
||||
// a-z or 0-9
|
||||
if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) {
|
||||
out.push(ch!);
|
||||
out.push(ch);
|
||||
lastHyphen = false;
|
||||
} else if (SEPS.has(ch!)) {
|
||||
} else if (SEPS.has(ch)) {
|
||||
if (!lastHyphen) {
|
||||
out.push("-");
|
||||
lastHyphen = true;
|
||||
@@ -87,7 +269,7 @@ export function slugify(input: string): string {
|
||||
/**
|
||||
* Turns cents to localized currency string.
|
||||
*/
|
||||
function formatCentsToCurrency(
|
||||
export function formatCentsToCurrency(
|
||||
num: number | string | undefined,
|
||||
locale = "en-US",
|
||||
): string {
|
||||
@@ -96,21 +278,24 @@ function formatCentsToCurrency(
|
||||
if (Number.isNaN(cents)) return "";
|
||||
const dollars = cents / 100;
|
||||
const formatter = new Intl.NumberFormat(locale, {
|
||||
style: 'currency',
|
||||
currency: 'USD',
|
||||
minimumFractionDigits: 2,
|
||||
maximumFractionDigits: 2,
|
||||
useGrouping: true,
|
||||
});
|
||||
return formatter.format(dollars);
|
||||
}
|
||||
|
||||
/**
 * Type guard for plain, string-keyed object values.
 *
 * Accepts any non-null object that is not an array. Arrays are rejected
 * explicitly because `typeof [] === "object"` would otherwise let them pass.
 *
 * @param value Arbitrary value to test.
 * @returns True when `value` is a non-null, non-array object.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
||||
|
||||
/**
 * Pause the calling async function.
 *
 * @param ms Timeout passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function delay(ms: number): Promise<void> {
  const timerElapsed = new Promise<void>((done) => {
    setTimeout(done, ms);
  });
  await timerElapsed;
}
|
||||
|
||||
// ----------------------------- Error Classes -----------------------------
|
||||
|
||||
class HttpError extends Error {
|
||||
constructor(
|
||||
message: string,
|
||||
@@ -122,12 +307,52 @@ class HttpError extends Error {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Error carrying the URL of a request that failed, plus the underlying
 * error when one is available.
 */
class NetworkError extends Error {
  /** URL of the request that failed. */
  public readonly url: string;
  /** Underlying error, if any. */
  public readonly cause?: Error;

  /**
   * @param message Human-readable description.
   * @param url     URL of the request that failed.
   * @param cause   Optional underlying error.
   */
  constructor(message: string, url: string, cause?: Error) {
    super(message);
    this.url = url;
    this.cause = cause;
    this.name = "NetworkError";
  }
}
|
||||
|
||||
/**
 * Error for payloads that could not be interpreted; optionally keeps the
 * offending data for inspection.
 */
class ParseError extends Error {
  /** The data associated with the failure, if supplied. */
  public readonly data?: unknown;

  /**
   * @param message Human-readable description.
   * @param data    Optional payload related to the failure.
   */
  constructor(message: string, data?: unknown) {
    super(message);
    this.data = data;
    this.name = "ParseError";
  }
}
|
||||
|
||||
/**
 * Error for a rate-limited request; records the URL and, when supplied,
 * the reset value that accompanied the failure.
 */
class RateLimitError extends Error {
  /** URL of the rate-limited request. */
  public readonly url: string;
  /** Reset value passed by the thrower, if any. */
  public readonly resetTime?: number;

  /**
   * @param message   Human-readable description.
   * @param url       URL of the rate-limited request.
   * @param resetTime Optional reset value.
   */
  constructor(message: string, url: string, resetTime?: number) {
    super(message);
    this.url = url;
    this.resetTime = resetTime;
    this.name = "RateLimitError";
  }
}
|
||||
|
||||
/**
 * Error for invalid input; carries only a message.
 */
class ValidationError extends Error {
  /**
   * @param message Human-readable description of what was invalid.
   */
  constructor(message: string) {
    super(message);
    // Overwrite the inherited "Error" name so logs identify the failure kind.
    this.name = "ValidationError";
  }
}
|
||||
|
||||
// ----------------------------- HTTP Client -----------------------------
|
||||
|
||||
/**
|
||||
Fetch HTML with a basic retry strategy and simple rate-limit delay between calls.
|
||||
- Retries on 429 and 5xx
|
||||
Fetch HTML with enhanced retry strategy and exponential backoff.
|
||||
- Retries on 429, 5xx, and network errors
|
||||
- Respects X-RateLimit-Reset when present (seconds)
|
||||
- Exponential backoff with jitter
|
||||
*/
|
||||
async function fetchHtml(
|
||||
url: string,
|
||||
@@ -139,11 +364,13 @@ async function fetchHtml(
|
||||
},
|
||||
): Promise<HTMLString> {
|
||||
const maxRetries = opts?.maxRetries ?? 3;
|
||||
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
||||
const retryBaseMs = opts?.retryBaseMs ?? 1000;
|
||||
|
||||
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
||||
try {
|
||||
// console.log(`Fetching: `, url);
|
||||
const controller = new AbortController();
|
||||
const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout
|
||||
|
||||
const res = await fetch(url, {
|
||||
method: "GET",
|
||||
headers: {
|
||||
@@ -155,27 +382,40 @@ async function fetchHtml(
|
||||
"user-agent":
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
|
||||
},
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
clearTimeout(timeoutId);
|
||||
|
||||
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
|
||||
const rateLimitReset = res.headers.get("X-RateLimit-Reset");
|
||||
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
|
||||
|
||||
if (!res.ok) {
|
||||
// Respect 429 reset if provided
|
||||
// Handle rate limiting
|
||||
if (res.status === 429) {
|
||||
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN;
|
||||
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : Number.NaN;
|
||||
const waitMs = Number.isFinite(resetSeconds)
|
||||
? Math.max(0, resetSeconds * 1000)
|
||||
: (attempt + 1) * retryBaseMs;
|
||||
: calculateBackoffDelay(attempt, retryBaseMs);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
await delay(waitMs);
|
||||
continue;
|
||||
}
|
||||
// Retry on 5xx
|
||||
throw new RateLimitError(
|
||||
`Rate limit exceeded for ${url}`,
|
||||
url,
|
||||
resetSeconds,
|
||||
);
|
||||
}
|
||||
|
||||
// Retry on server errors
|
||||
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
|
||||
await delay((attempt + 1) * retryBaseMs);
|
||||
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||
continue;
|
||||
}
|
||||
|
||||
throw new HttpError(
|
||||
`Request failed with status ${res.status}`,
|
||||
res.status,
|
||||
@@ -184,16 +424,171 @@ async function fetchHtml(
|
||||
}
|
||||
|
||||
const html = await res.text();
|
||||
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
||||
|
||||
// Respect per-request delay to maintain rate limiting
|
||||
await delay(DELAY_MS);
|
||||
return html;
|
||||
|
||||
} catch (err) {
|
||||
if (attempt >= maxRetries) throw err;
|
||||
await delay((attempt + 1) * retryBaseMs);
|
||||
// Handle different error types
|
||||
if (err instanceof RateLimitError || err instanceof HttpError) {
|
||||
throw err; // Re-throw known errors
|
||||
}
|
||||
|
||||
if (err instanceof Error && err.name === 'AbortError') {
|
||||
if (attempt < maxRetries) {
|
||||
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||
continue;
|
||||
}
|
||||
throw new NetworkError(`Request timeout for ${url}`, url, err);
|
||||
}
|
||||
|
||||
// Network or other errors
|
||||
if (attempt < maxRetries) {
|
||||
await delay(calculateBackoffDelay(attempt, retryBaseMs));
|
||||
continue;
|
||||
}
|
||||
throw new NetworkError(
|
||||
`Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
|
||||
url,
|
||||
err instanceof Error ? err : undefined
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error("Exhausted retries without response");
|
||||
throw new NetworkError(`Exhausted retries without response for ${url}`, url);
|
||||
}
|
||||
|
||||
/**
 * Compute how long to wait before the next retry.
 *
 * The wait doubles with each attempt (`baseMs * 2^attempt`), gets up to 10%
 * random jitter added on top, and is capped at 30 seconds.
 *
 * @param attempt Zero-based retry attempt number.
 * @param baseMs  Wait for attempt 0, in milliseconds, before jitter.
 * @returns Wait time in milliseconds, never above 30000.
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const MAX_DELAY_MS = 30000;

  const uncapped = baseMs * Math.pow(2, attempt);
  const jitter = Math.random() * 0.1 * uncapped; // 10% jitter
  const withJitter = uncapped + jitter;

  return withJitter > MAX_DELAY_MS ? MAX_DELAY_MS : withJitter;
}
|
||||
|
||||
// ----------------------------- GraphQL Client -----------------------------
|
||||
|
||||
/**
 * POST a GraphQL query to `<BASE_URL>/anvil/api` and return its `data` field.
 *
 * The request is aborted after 30 seconds so a stalled connection cannot
 * block the caller indefinitely; the same timeout is used by the HTML
 * fetcher in this file.
 *
 * @param query     GraphQL document text.
 * @param variables Variables for the query.
 * @param BASE_URL  Site origin, without a trailing slash.
 * @returns The `data` member of the JSON response.
 * @throws HttpError    When the response status is not OK.
 * @throws ParseError   When the response is not a JSON object or carries a
 *                      truthy `errors` member.
 * @throws NetworkError For any other failure (connection error, timeout,
 *                      unreadable body).
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca"
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout

  try {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'apollo-require-preflight': 'true',
      },
      body: JSON.stringify({
        query,
        variables,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new HttpError(
        `GraphQL request failed with status ${response.status}`,
        response.status,
        endpoint
      );
    }

    const result: unknown = await response.json();

    // A JSON body of `null`, a string, a number, etc. cannot hold data/errors.
    if (typeof result !== 'object' || result === null) {
      throw new ParseError('GraphQL response was not a JSON object', result);
    }

    const { errors, data } = result as { errors?: unknown; data?: unknown };

    if (errors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(errors)}`, errors);
    }

    return data;
  } catch (err) {
    if (err instanceof HttpError || err instanceof ParseError) {
      throw err;
    }
    throw new NetworkError(
      `Failed to fetch GraphQL data: ${err instanceof Error ? err.message : String(err)}`,
      endpoint,
      err instanceof Error ? err : undefined
    );
  } finally {
    // Always release the timer, whether the request succeeded or failed.
    clearTimeout(timeoutId);
  }
}
|
||||
|
||||
// GraphQL response interfaces
|
||||
interface GraphQLReviewResponse {
|
||||
user?: {
|
||||
reviewSummary?: {
|
||||
count?: number;
|
||||
score?: number;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
interface GraphQLProfileResponse {
|
||||
user?: {
|
||||
memberSince?: string;
|
||||
accountType?: string;
|
||||
};
|
||||
}
|
||||
|
||||
// GraphQL queries from KIJIJI.md
|
||||
const GRAPHQL_QUERIES = {
|
||||
getReviewSummary: `
|
||||
query GetReviewSummary($userId: String!) {
|
||||
user(id: $userId) {
|
||||
reviewSummary {
|
||||
count
|
||||
score
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
`,
|
||||
getProfileMetrics: `
|
||||
query GetProfileMetrics($profileId: String!) {
|
||||
user(id: $profileId) {
|
||||
memberSince
|
||||
accountType
|
||||
__typename
|
||||
}
|
||||
}
|
||||
`,
|
||||
} as const;
|
||||
|
||||
/**
 * Fetch optional seller details (review summary and profile metrics) via
 * GraphQL.
 *
 * The two queries run concurrently and are settled independently, so a
 * failure of one does not discard the data returned by the other. Failures
 * are logged with `console.warn` and never thrown: this data is treated as
 * non-critical.
 *
 * @param posterId Seller ID, sent as `userId` / `profileId` to the queries.
 * @param BASE_URL Site origin, without a trailing slash.
 * @returns Whatever fields could be retrieved; an empty object when both
 *          queries fail.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca"
): Promise<{ reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string }> {
  const [reviewResult, profileResult] = await Promise.allSettled([
    fetchGraphQLData(GRAPHQL_QUERIES.getReviewSummary, { userId: posterId }, BASE_URL),
    fetchGraphQLData(GRAPHQL_QUERIES.getProfileMetrics, { profileId: posterId }, BASE_URL),
  ]);

  // Report each failed query, but keep going with whatever succeeded.
  for (const result of [reviewResult, profileResult]) {
    if (result.status === 'rejected') {
      const reason: unknown = result.reason;
      console.warn(
        `Failed to fetch seller details for ${posterId}:`,
        reason instanceof Error ? reason.message : String(reason)
      );
    }
  }

  if (reviewResult.status === 'rejected' && profileResult.status === 'rejected') {
    return {};
  }

  const reviewResponse =
    reviewResult.status === 'fulfilled'
      ? (reviewResult.value as GraphQLReviewResponse | null | undefined)
      : undefined;
  const profileResponse =
    profileResult.status === 'fulfilled'
      ? (profileResult.value as GraphQLProfileResponse | null | undefined)
      : undefined;

  return {
    reviewCount: reviewResponse?.user?.reviewSummary?.count,
    reviewScore: reviewResponse?.user?.reviewSummary?.score,
    memberSince: profileResponse?.user?.memberSince,
    accountType: profileResponse?.user?.accountType,
  };
}
|
||||
|
||||
// ----------------------------- Parsing -----------------------------
|
||||
@@ -299,7 +694,7 @@ function parseListing(
|
||||
listingPrice: amountFormatted
|
||||
? {
|
||||
amountFormatted,
|
||||
cents: Number.isFinite(cents!) ? cents : undefined,
|
||||
cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
|
||||
currency: price?.currency,
|
||||
}
|
||||
: undefined,
|
||||
@@ -307,84 +702,237 @@ function parseListing(
|
||||
listingStatus: status,
|
||||
creationDate: activationDate,
|
||||
endDate,
|
||||
numberOfViews: Number.isFinite(numberOfViews!) ? numberOfViews : undefined,
|
||||
numberOfViews: numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
|
||||
address: location?.address ?? null,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Parse a listing page into a detailed listing object.
 *
 * The listing is read from the Apollo state embedded in the page: the first
 * state key containing "Listing" is taken as the listing root.
 *
 * @param htmlString HTML of the listing page.
 * @param BASE_URL   Site origin; prefixed to relative listing URLs and used
 *                   for the optional seller GraphQL lookup.
 * @param options    Controls image extraction and how much seller data to
 *                   collect. Extra seller data is only fetched when
 *                   `includeClientSideData` is true and `sellerDataDepth` is
 *                   'detailed' or 'full' (default 'detailed').
 * @returns The parsed listing, or null when the page has no usable Apollo
 *          state, no listing root, no URL/title, or no finite price.
 */
async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {}
): Promise<DetailedListing | null> {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key
  const listingKey = Object.keys(apolloState).find((k) =>
    k.includes("Listing"),
  );
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
    imageUrls,
    categoryId,
    adSource,
    flags,
    posterInfo,
    attributes,
  } = root as ApolloListingRoot;

  const cents = price?.amount != null ? Number(price.amount) : undefined;
  const amountFormatted = formatCentsToCurrency(cents);

  const numberOfViews =
    metrics?.views != null ? Number(metrics.views) : undefined;

  // Relative URLs are resolved against BASE_URL; absolute ones are kept.
  let listingUrl = "";
  if (typeof url === "string") {
    listingUrl = url.startsWith("http") ? url : `${BASE_URL}${url}`;
  }

  if (!listingUrl || !title) return null;

  // Only include fixed-price listings. The finiteness check also rejects
  // amounts such as "Infinity" that would otherwise format successfully.
  if (!amountFormatted || cents === undefined || !Number.isFinite(cents)) {
    return null;
  }

  // Extract images if requested
  const images =
    options.includeImages !== false && Array.isArray(imageUrls)
      ? imageUrls.filter((imageUrl): imageUrl is string => typeof imageUrl === 'string')
      : [];

  // Extract attributes as key-value pairs. The cache is untyped at runtime,
  // so non-string values are dropped to honour Record<string, string[]>.
  const attributeMap: Record<string, string[]> = {};
  if (Array.isArray(attributes)) {
    for (const attr of attributes) {
      if (attr?.canonicalName && Array.isArray(attr.canonicalValues)) {
        attributeMap[attr.canonicalName] = attr.canonicalValues.filter(
          (value): value is string => typeof value === 'string',
        );
      }
    }
  }

  // Extract seller info based on depth setting
  let sellerInfo: DetailedListing['sellerInfo'];
  const depth = options.sellerDataDepth ?? 'detailed';

  if (posterInfo?.posterId) {
    sellerInfo = {
      posterId: posterInfo.posterId,
      rating: typeof posterInfo.rating === 'number' ? posterInfo.rating : undefined,
    };

    // Add more detailed info if requested and client-side data is enabled
    if ((depth === 'detailed' || depth === 'full') && options.includeClientSideData) {
      try {
        const additionalData = await fetchSellerDetails(posterInfo.posterId, BASE_URL);
        sellerInfo = {
          ...sellerInfo,
          ...additionalData,
        };
      } catch {
        // GraphQL data is optional: log and continue with the basic info.
        console.warn(`Failed to fetch additional seller data for ${posterInfo.posterId}`);
      }
    }
  }

  // Coordinates are only reported when both values are real numbers.
  const rawCoordinates = location?.coordinates;
  const coordinates =
    rawCoordinates &&
    typeof rawCoordinates.latitude === 'number' &&
    Number.isFinite(rawCoordinates.latitude) &&
    typeof rawCoordinates.longitude === 'number' &&
    Number.isFinite(rawCoordinates.longitude)
      ? {
          latitude: rawCoordinates.latitude,
          longitude: rawCoordinates.longitude,
        }
      : undefined;

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: {
      amountFormatted,
      cents,
      currency: price?.currency,
    },
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews: numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
    address: location?.address ?? null,
    images,
    categoryId: typeof categoryId === 'number' ? categoryId : 0,
    adSource: typeof adSource === 'string' ? adSource : 'UNKNOWN',
    flags: {
      topAd: flags?.topAd === true,
      priceDrop: flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof location?.id === 'number' ? location.id : 0,
      name: typeof location?.name === 'string' ? location.name : 'Unknown',
      coordinates,
    },
    sellerInfo,
  };
}
|
||||
|
||||
// ----------------------------- Main -----------------------------
|
||||
|
||||
export default async function fetchKijijiItems(
|
||||
SEARCH_QUERY: string,
|
||||
REQUESTS_PER_SECOND = 1,
|
||||
BASE_URL = "https://www.kijiji.ca",
|
||||
searchOptions: SearchOptions = {},
|
||||
listingOptions: ListingFetchOptions = {},
|
||||
) {
|
||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
||||
|
||||
const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;
|
||||
// Set defaults for configuration
|
||||
const finalSearchOptions: Required<SearchOptions> = {
|
||||
location: searchOptions.location ?? 1700272, // Default to GTA
|
||||
category: searchOptions.category ?? 0, // Default to all categories
|
||||
keywords: searchOptions.keywords ?? SEARCH_QUERY,
|
||||
sortBy: searchOptions.sortBy ?? 'relevancy',
|
||||
sortOrder: searchOptions.sortOrder ?? 'desc',
|
||||
maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
|
||||
priceMin: searchOptions.priceMin,
|
||||
priceMax: searchOptions.priceMax,
|
||||
};
|
||||
|
||||
console.log(`Fetching search: ${searchUrl}`);
|
||||
const finalListingOptions: Required<ListingFetchOptions> = {
|
||||
includeImages: listingOptions.includeImages ?? true,
|
||||
sellerDataDepth: listingOptions.sellerDataDepth ?? 'detailed',
|
||||
includeClientSideData: listingOptions.includeClientSideData ?? false,
|
||||
};
|
||||
|
||||
const allListings: DetailedListing[] = [];
|
||||
const seenUrls = new Set<string>();
|
||||
|
||||
// Fetch multiple pages
|
||||
for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
|
||||
const searchUrl = buildSearchUrl(finalSearchOptions.keywords, {
|
||||
...finalSearchOptions,
|
||||
// Add page parameter for pagination
|
||||
...(page > 1 && { page }),
|
||||
}, BASE_URL);
|
||||
|
||||
console.log(`Fetching search page ${page}: ${searchUrl}`);
|
||||
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
||||
onRateInfo: (remaining, reset) => {
|
||||
if (remaining && reset) {
|
||||
console.log(
|
||||
"\n" +
|
||||
`Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
||||
);
|
||||
console.log(`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
const searchResults = parseSearch(searchHtml, BASE_URL);
|
||||
if (searchResults.length === 0) {
|
||||
console.warn("No search results parsed from page.");
|
||||
return;
|
||||
console.log(`No more results found on page ${page}. Stopping pagination.`);
|
||||
break;
|
||||
}
|
||||
|
||||
// Deduplicate links
|
||||
const listingLinks = Array.from(
|
||||
new Set(searchResults.map((r) => r.listingLink)),
|
||||
);
|
||||
// Deduplicate links across pages
|
||||
const newListingLinks = searchResults
|
||||
.map((r) => r.listingLink)
|
||||
.filter((link) => !seenUrls.has(link));
|
||||
|
||||
console.log(
|
||||
"\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
|
||||
);
|
||||
for (const link of newListingLinks) {
|
||||
seenUrls.add(link);
|
||||
}
|
||||
|
||||
console.log(`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`);
|
||||
|
||||
// Fetch details for this page's listings
|
||||
const progressBar = new cliProgress.SingleBar(
|
||||
{},
|
||||
cliProgress.Presets.shades_classic,
|
||||
);
|
||||
const totalProgress = listingLinks.length;
|
||||
const totalProgress = newListingLinks.length;
|
||||
let currentProgress = 0;
|
||||
progressBar.start(totalProgress, currentProgress);
|
||||
|
||||
const items: ListingDetails[] = [];
|
||||
for (const link of listingLinks) {
|
||||
for (const link of newListingLinks) {
|
||||
try {
|
||||
const html = await fetchHtml(link, DELAY_MS, {
|
||||
onRateInfo: (remaining, reset) => {
|
||||
if (remaining && reset) {
|
||||
console.log(
|
||||
"\n" +
|
||||
`Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
|
||||
);
|
||||
console.log(`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
|
||||
}
|
||||
},
|
||||
});
|
||||
const parsed = parseListing(html, BASE_URL);
|
||||
const parsed = await parseDetailedListing(html, BASE_URL, finalListingOptions);
|
||||
if (parsed) {
|
||||
if (parsed.listingPrice?.cents) items.push(parsed);
|
||||
allListings.push(parsed);
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
console.error(
|
||||
"\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
|
||||
);
|
||||
console.error(`\nFailed to fetch ${link}\n - ${err.status} ${err.message}`);
|
||||
} else {
|
||||
console.error(
|
||||
"\n" +
|
||||
`Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
|
||||
);
|
||||
console.error(`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`);
|
||||
}
|
||||
} finally {
|
||||
currentProgress++;
|
||||
@@ -392,6 +940,14 @@ export default async function fetchKijijiItems(
|
||||
}
|
||||
}
|
||||
|
||||
console.log("\n" + `Parsed ${items.length} listings.`);
|
||||
return items;
|
||||
progressBar.stop();
|
||||
|
||||
// If we got fewer results than expected (40 per page), we've reached the end
|
||||
if (searchResults.length < 40) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\nParsed ${allListings.length} detailed listings.`);
|
||||
return allListings;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user