feat: update kijiji scraper

Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
2026-01-22 00:25:19 -05:00
parent bdf504ba37
commit 87aa31cf1b
2 changed files with 664 additions and 102 deletions

View File

@@ -26,8 +26,12 @@ const server = Bun.serve({
{ status: 400 }, { status: 400 },
); );
const items = await fetchKijijiItems(SEARCH_QUERY, 5); const items = await fetchKijijiItems(SEARCH_QUERY, 1, undefined, {}, {
if (!items) includeImages: true,
sellerDataDepth: 'detailed',
includeClientSideData: false,
});
if (!items || items.length === 0)
return Response.json( return Response.json(
{ message: "Search didn't return any results!" }, { message: "Search didn't return any results!" },
{ status: 404 }, { status: 404 },
@@ -85,11 +89,13 @@ const server = Bun.serve({
); );
// Parse optional parameters with defaults // Parse optional parameters with defaults
const minPrice = reqUrl.searchParams.get("minPrice") const minPriceParam = reqUrl.searchParams.get("minPrice");
? parseInt(reqUrl.searchParams.get("minPrice")!) const minPrice = minPriceParam
? Number.parseInt(minPriceParam, 10)
: undefined; : undefined;
const maxPrice = reqUrl.searchParams.get("maxPrice") const maxPriceParam = reqUrl.searchParams.get("maxPrice");
? parseInt(reqUrl.searchParams.get("maxPrice")!) const maxPrice = maxPriceParam
? Number.parseInt(maxPriceParam, 10)
: undefined; : undefined;
const strictMode = reqUrl.searchParams.get("strictMode") === "true"; const strictMode = reqUrl.searchParams.get("strictMode") === "true";
const exclusionsParam = reqUrl.searchParams.get("exclusions"); const exclusionsParam = reqUrl.searchParams.get("exclusions");

View File

@@ -26,16 +26,29 @@ interface ApolloListingRoot {
url?: string; url?: string;
title?: string; title?: string;
description?: string; description?: string;
price?: { amount?: number | string; currency?: string }; price?: { amount?: number | string; currency?: string; type?: string };
type?: string; type?: string;
status?: string; status?: string;
activationDate?: string; activationDate?: string;
endDate?: string; endDate?: string;
metrics?: { views?: number | string }; metrics?: { views?: number | string };
location?: { address?: string | null }; location?: {
address?: string | null;
id?: number;
name?: string;
coordinates?: { latitude: number; longitude: number };
};
imageUrls?: string[];
imageCount?: number;
categoryId?: number;
adSource?: string;
flags?: { topAd?: boolean; priceDrop?: boolean };
posterInfo?: { posterId?: string; rating?: number };
attributes?: Array<{ canonicalName?: string; canonicalValues?: string[] }>;
[k: string]: unknown; [k: string]: unknown;
} }
// Keep existing interface for backward compatibility
type ListingDetails = { type ListingDetails = {
url: string; url: string;
title: string; title: string;
@@ -53,10 +66,178 @@ type ListingDetails = {
address?: string | null; address?: string | null;
}; };
/**
 * Comprehensive listing shape produced by parseDetailedListing: the base
 * ListingDetails fields plus images, category, flags, attributes, structured
 * location and optional seller information.
 */
interface DetailedListing extends ListingDetails {
// Image URLs; parseDetailedListing leaves this empty when includeImages is false.
images: string[];
// parseDetailedListing falls back to 0 when the source value is not a number.
categoryId: number;
// parseDetailedListing falls back to 'UNKNOWN' when the source value is not a string.
adSource: string;
// Each flag is true only when the source flag is exactly `true`.
flags: {
topAd: boolean;
priceDrop: boolean;
};
// Maps an attribute's canonicalName to its canonicalValues.
attributes: Record<string, string[]>;
location: {
// parseDetailedListing falls back to 0 / 'Unknown' when id / name are missing.
id: number;
name: string;
coordinates?: {
latitude: number;
longitude: number;
};
};
// Only set when the listing exposes a posterId.
sellerInfo?: {
posterId: string;
rating?: number;
// The four fields below are filled from fetchSellerDetails (GraphQL) when that lookup is enabled.
accountType?: string;
memberSince?: string;
reviewCount?: number;
reviewScore?: number;
};
}
// Configuration interfaces
/**
 * Search-level options. Defaults noted below are the ones applied by
 * fetchKijijiItems when a field is omitted.
 */
interface SearchOptions {
location?: number | string; // Location ID or name (fetchKijijiItems default: 1700272, GTA)
category?: number | string; // Category ID or name (fetchKijijiItems default: 0, all categories)
keywords?: string; // fetchKijijiItems falls back to its SEARCH_QUERY argument
sortBy?: 'relevancy' | 'date' | 'price' | 'distance'; // Default: 'relevancy'
sortOrder?: 'desc' | 'asc'; // Default: 'desc'
maxPages?: number; // Default: 5
// NOTE(review): buildSearchUrl does not read priceMin/priceMax, so they currently
// appear to have no effect on the generated search URL - confirm intent.
priceMin?: number;
priceMax?: number;
}
/**
 * Per-listing options consumed by parseDetailedListing.
 */
interface ListingFetchOptions {
// When false, the parsed listing's `images` array is left empty.
includeImages?: boolean; // Default: true
// parseDetailedListing treats 'detailed' and 'full' identically; 'basic' skips the extra seller lookup.
sellerDataDepth?: 'basic' | 'detailed' | 'full'; // Default: 'detailed'
// Must be true (together with a non-'basic' depth) for the GraphQL seller lookup to run.
includeClientSideData?: boolean; // Default: false
}
// ----------------------------- Constants & Mappings -----------------------------
// Location mappings from KIJIJI.md
// Maps a lower-case location name to its numeric Kijiji location ID.
// Multi-word names are keyed with single spaces (not hyphens), so any lookup
// has to normalise its input to that exact form.
const LOCATION_MAPPINGS: Record<string, number> = {
'canada': 0,
'ontario': 9004,
'toronto': 1700273,
'gta': 1700272,
'oshawa': 1700275,
'quebec': 9001,
'nova scotia': 9002,
'alberta': 9003,
'new brunswick': 9005,
'manitoba': 9006,
'british columbia': 9007,
'newfoundland': 9008,
'saskatchewan': 9009,
'territories': 9010,
// 'pei' and 'prince edward island' are aliases for the same ID.
'pei': 9011,
'prince edward island': 9011,
};
// Category mappings from KIJIJI.md (Buy & Sell main categories)
// Maps a lower-case, hyphen-separated category slug to its numeric category ID.
// 'all' (0) is the value resolveCategoryId falls back to for unknown names.
const CATEGORY_MAPPINGS: Record<string, number> = {
'all': 0,
'buy-sell': 10,
'arts-collectibles': 12,
'audio': 767,
'baby-items': 253,
'bags-luggage': 931,
'bikes': 644,
'books': 109,
'cameras': 103,
'cds': 104,
'clothing': 274,
'computers': 16,
'computer-accessories': 128,
'electronics': 29659001,
'free-stuff': 17220001,
'furniture': 235,
'garage-sales': 638,
'health-special-needs': 140,
'hobbies-crafts': 139,
'home-appliances': 107,
'home-indoor': 717,
'home-outdoor': 727,
'jewellery': 133,
'musical-instruments': 17,
'phones': 132,
'sporting-goods': 111,
'tools': 110,
'toys-games': 108,
'tvs-video': 15093001,
'video-games': 141,
'other': 26,
};
// Sort parameter mappings
// Maps a SearchOptions.sortBy value to the token buildSearchUrl places in the
// `sort` query parameter.
const SORT_MAPPINGS: Record<string, string> = {
'relevancy': 'MATCH',
'date': 'DATE',
'price': 'PRICE',
'distance': 'DISTANCE',
};
// ----------------------------- Exports for Testing -----------------------------
// Note: These are exported for testing purposes only
// These lists may precede the declarations they name: ES module export lists
// are resolved statically, so declaration order within the module does not matter.
export { resolveLocationId, resolveCategoryId, buildSearchUrl };
export { extractApolloState, parseSearch };
export { parseDetailedListing };
export { HttpError, NetworkError, ParseError, RateLimitError, ValidationError };
// ----------------------------- Utilities ----------------------------- // ----------------------------- Utilities -----------------------------
const SEPS = new Set([" ", "", "—", "/", ":", ";", ",", ".", "-"]); const SEPS = new Set([" ", "", "—", "/", ":", ";", ",", ".", "-"]);
/**
 * Resolve a Kijiji location ID from a numeric ID or a location name.
 *
 * Names are matched case-insensitively against LOCATION_MAPPINGS. That table
 * spells multi-word names with spaces ("nova scotia"), so surrounding
 * whitespace is trimmed and every run of whitespace, hyphens or underscores
 * is folded to a single space before the lookup. "Nova Scotia",
 * "nova-scotia" and " NOVA_SCOTIA " therefore all resolve to the same entry.
 *
 * @param location - Numeric location ID, a digit-only string ID, or a name.
 * @returns The resolved location ID, or 0 (Canada) when nothing matches.
 */
function resolveLocationId(location?: number | string): number {
  if (typeof location === 'number') return location;
  if (typeof location !== 'string') return 0; // Default to Canada

  const trimmed = location.trim();
  // A digit-only string is an ID that arrived as text (e.g. from a query string).
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  const normalized = trimmed.toLowerCase().replace(/[\s_-]+/g, ' ');
  // Own-property check: a plain index lookup would also find inherited members
  // such as "constructor" and return a non-numeric value.
  if (!Object.prototype.hasOwnProperty.call(LOCATION_MAPPINGS, normalized)) {
    return 0; // Default to Canada
  }
  return LOCATION_MAPPINGS[normalized] ?? 0;
}
/**
 * Resolve a Kijiji category ID from a numeric ID or a category name.
 *
 * Names are matched case-insensitively against CATEGORY_MAPPINGS, whose keys
 * are hyphen-separated slugs ("computer-accessories"). Surrounding whitespace
 * is trimmed and every run of whitespace, hyphens or underscores is folded to
 * a single hyphen before the lookup.
 *
 * @param category - Numeric category ID, a digit-only string ID, or a name.
 * @returns The resolved category ID, or 0 (all categories) when nothing matches.
 */
function resolveCategoryId(category?: number | string): number {
  if (typeof category === 'number') return category;
  if (typeof category !== 'string') return 0; // Default to all categories

  const trimmed = category.trim();
  // A digit-only string is an ID that arrived as text (e.g. from a query string).
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  const normalized = trimmed.toLowerCase().replace(/[\s_-]+/g, '-');
  // Own-property check: a plain index lookup would also find inherited members
  // such as "constructor" and return a non-numeric value.
  if (!Object.prototype.hasOwnProperty.call(CATEGORY_MAPPINGS, normalized)) {
    return 0; // Default to all categories
  }
  return CATEGORY_MAPPINGS[normalized] ?? 0;
}
/**
 * Build the Kijiji search URL for a keyword query.
 *
 * The path is `/b-<categorySlug>/<locationSlug>/<slugified keywords>/k0c<categoryId>l<locationId>`
 * and the query string carries sort, view, order and (from page 2 onwards) the
 * page number.
 *
 * @param keywords - Free-text search terms; passed through slugify for the path.
 * @param options - Search options plus an optional 1-based `page`.
 * @param BASE_URL - Site origin prepended to the path.
 * @returns The full search URL.
 */
function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca"
): string {
  const locationId = resolveLocationId(options.location);
  const categoryId = resolveCategoryId(options.category);

  // NOTE(review): both slugs are constants regardless of the resolved IDs;
  // only the numeric IDs in the last path segment vary.
  const categorySlug = 'buy-sell';
  const locationSlug = 'canada';

  const pathSegments = [
    `b-${categorySlug}`,
    locationSlug,
    slugify(keywords),
    `k0c${categoryId}l${locationId}`,
  ];

  // NOTE(review): when sortBy is set the query string contains `sort` twice
  // (the fixed relevancyDesc value, then the mapped one) - confirm which one
  // the site honours. priceMin/priceMax are not applied here.
  const queryParts = ['sort=relevancyDesc', 'view=list'];
  if (options.sortBy) {
    queryParts.push(`sort=${SORT_MAPPINGS[options.sortBy]}`);
  }
  queryParts.push(`order=${options.sortOrder === 'asc' ? 'ASC' : 'DESC'}`);
  // Page 1 is implicit; the parameter is only added from page 2 onwards.
  if (options.page && options.page > 1) {
    queryParts.push(`page=${options.page}`);
  }

  return `${BASE_URL}/${pathSegments.join('/')}?${queryParts.join('&')}`;
}
/** /**
* Slugifies a string for search * Slugifies a string for search
*/ */
@@ -67,13 +248,14 @@ export function slugify(input: string): string {
for (let i = 0; i < s.length; i++) { for (let i = 0; i < s.length; i++) {
const ch = s[i]; const ch = s[i];
const code = ch!.charCodeAt(0); if (!ch) continue;
const code = ch.charCodeAt(0);
// a-z or 0-9 // a-z or 0-9
if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) { if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) {
out.push(ch!); out.push(ch);
lastHyphen = false; lastHyphen = false;
} else if (SEPS.has(ch!)) { } else if (SEPS.has(ch)) {
if (!lastHyphen) { if (!lastHyphen) {
out.push("-"); out.push("-");
lastHyphen = true; lastHyphen = true;
@@ -87,30 +269,33 @@ export function slugify(input: string): string {
/** /**
* Turns cents to localized currency string. * Turns cents to localized currency string.
*/ */
function formatCentsToCurrency( export function formatCentsToCurrency(
num: number | string | undefined, num: number | string | undefined,
locale = "en-US", locale = "en-US",
): string { ): string {
if (num == null) return ""; if (num == null) return "";
const cents = typeof num === "string" ? Number.parseInt(num, 10) : num; const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
if (Number.isNaN(cents)) return ""; if (Number.isNaN(cents)) return "";
const dollars = cents / 100; const dollars = cents / 100;
const formatter = new Intl.NumberFormat(locale, { const formatter = new Intl.NumberFormat(locale, {
style: 'currency',
currency: 'USD',
minimumFractionDigits: 2, minimumFractionDigits: 2,
maximumFractionDigits: 2, maximumFractionDigits: 2,
useGrouping: true,
}); });
return formatter.format(dollars); return formatter.format(dollars);
} }
function isRecord(value: unknown): value is Record<string, unknown> { function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === "object" && value !== null; return typeof value === "object" && value !== null && !Array.isArray(value);
} }
async function delay(ms: number): Promise<void> { async function delay(ms: number): Promise<void> {
await new Promise((resolve) => setTimeout(resolve, ms)); await new Promise((resolve) => setTimeout(resolve, ms));
} }
// ----------------------------- Error Classes -----------------------------
class HttpError extends Error { class HttpError extends Error {
constructor( constructor(
message: string, message: string,
@@ -122,12 +307,52 @@ class HttpError extends Error {
} }
} }
/**
 * Raised when a request fails at the transport level (connection failure,
 * timeout, exhausted retries) rather than with an HTTP status.
 */
class NetworkError extends Error {
  /** URL of the request that failed. */
  public readonly url: string;
  /** Underlying error, when one is available. */
  public readonly cause?: Error;

  constructor(message: string, url: string, cause?: Error) {
    super(message);
    this.url = url;
    this.cause = cause;
    this.name = "NetworkError";
  }
}
/**
 * Raised when a response was received but its content could not be
 * interpreted; optionally carries the offending payload.
 */
class ParseError extends Error {
  /** The data that failed to parse, if the thrower supplied it. */
  public readonly data?: unknown;

  constructor(message: string, data?: unknown) {
    super(message);
    this.data = data;
    this.name = "ParseError";
  }
}
/**
 * Raised when a request keeps being rate limited after all retries.
 */
class RateLimitError extends Error {
  /** URL of the rate-limited request. */
  public readonly url: string;
  /** Reset value reported by the server, when the thrower supplied one. */
  public readonly resetTime?: number;

  constructor(message: string, url: string, resetTime?: number) {
    super(message);
    this.url = url;
    this.resetTime = resetTime;
    this.name = "RateLimitError";
  }
}
/**
 * Error type for invalid input. Carries only a message; `name` is set so the
 * error can be told apart from a plain Error in logs.
 */
class ValidationError extends Error {
constructor(message: string) {
super(message);
this.name = "ValidationError";
}
}
// ----------------------------- HTTP Client ----------------------------- // ----------------------------- HTTP Client -----------------------------
/** /**
Fetch HTML with a basic retry strategy and simple rate-limit delay between calls. Fetch HTML with enhanced retry strategy and exponential backoff.
- Retries on 429 and 5xx - Retries on 429, 5xx, and network errors
- Respects X-RateLimit-Reset when present (seconds) - Respects X-RateLimit-Reset when present (seconds)
- Exponential backoff with jitter
*/ */
async function fetchHtml( async function fetchHtml(
url: string, url: string,
@@ -139,11 +364,13 @@ async function fetchHtml(
}, },
): Promise<HTMLString> { ): Promise<HTMLString> {
const maxRetries = opts?.maxRetries ?? 3; const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 500; const retryBaseMs = opts?.retryBaseMs ?? 1000;
for (let attempt = 0; attempt <= maxRetries; attempt++) { for (let attempt = 0; attempt <= maxRetries; attempt++) {
try { try {
// console.log(`Fetching: `, url); const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout
const res = await fetch(url, { const res = await fetch(url, {
method: "GET", method: "GET",
headers: { headers: {
@@ -155,27 +382,40 @@ async function fetchHtml(
"user-agent": "user-agent":
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
}, },
signal: controller.signal,
}); });
clearTimeout(timeoutId);
const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining"); const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
const rateLimitReset = res.headers.get("X-RateLimit-Reset"); const rateLimitReset = res.headers.get("X-RateLimit-Reset");
opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset); opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);
if (!res.ok) { if (!res.ok) {
// Respect 429 reset if provided // Handle rate limiting
if (res.status === 429) { if (res.status === 429) {
const resetSeconds = rateLimitReset ? Number(rateLimitReset) : NaN; const resetSeconds = rateLimitReset ? Number(rateLimitReset) : Number.NaN;
const waitMs = Number.isFinite(resetSeconds) const waitMs = Number.isFinite(resetSeconds)
? Math.max(0, resetSeconds * 1000) ? Math.max(0, resetSeconds * 1000)
: (attempt + 1) * retryBaseMs; : calculateBackoffDelay(attempt, retryBaseMs);
await delay(waitMs);
continue; if (attempt < maxRetries) {
await delay(waitMs);
continue;
}
throw new RateLimitError(
`Rate limit exceeded for ${url}`,
url,
resetSeconds,
);
} }
// Retry on 5xx
// Retry on server errors
if (res.status >= 500 && res.status < 600 && attempt < maxRetries) { if (res.status >= 500 && res.status < 600 && attempt < maxRetries) {
await delay((attempt + 1) * retryBaseMs); await delay(calculateBackoffDelay(attempt, retryBaseMs));
continue; continue;
} }
throw new HttpError( throw new HttpError(
`Request failed with status ${res.status}`, `Request failed with status ${res.status}`,
res.status, res.status,
@@ -184,22 +424,177 @@ async function fetchHtml(
} }
const html = await res.text(); const html = await res.text();
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
// Respect per-request delay to maintain rate limiting
await delay(DELAY_MS); await delay(DELAY_MS);
return html; return html;
} catch (err) { } catch (err) {
if (attempt >= maxRetries) throw err; // Handle different error types
await delay((attempt + 1) * retryBaseMs); if (err instanceof RateLimitError || err instanceof HttpError) {
throw err; // Re-throw known errors
}
if (err instanceof Error && err.name === 'AbortError') {
if (attempt < maxRetries) {
await delay(calculateBackoffDelay(attempt, retryBaseMs));
continue;
}
throw new NetworkError(`Request timeout for ${url}`, url, err);
}
// Network or other errors
if (attempt < maxRetries) {
await delay(calculateBackoffDelay(attempt, retryBaseMs));
continue;
}
throw new NetworkError(
`Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
url,
err instanceof Error ? err : undefined
);
} }
} }
throw new Error("Exhausted retries without response"); throw new NetworkError(`Exhausted retries without response for ${url}`, url);
}
/**
 * Compute the wait before the next retry: exponential backoff plus jitter.
 *
 * @param attempt - Zero-based retry attempt; the base delay doubles per attempt.
 * @param baseMs - Delay for attempt 0, in milliseconds.
 * @returns `baseMs * 2^attempt` plus up to 10% random jitter, capped at 30 000 ms.
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const MAX_DELAY_MS = 30000;
  const backoff = baseMs * Math.pow(2, attempt);
  // Jitter spreads out retries from clients that failed at the same moment.
  const withJitter = backoff + Math.random() * 0.1 * backoff;
  return withJitter > MAX_DELAY_MS ? MAX_DELAY_MS : withJitter;
}
// ----------------------------- GraphQL Client -----------------------------
/**
 * POST a GraphQL query to `${BASE_URL}/anvil/api` and return the `data` member
 * of the JSON response.
 *
 * The request is aborted after 30 seconds so a stalled connection cannot hang
 * the caller indefinitely.
 *
 * @param query - GraphQL query text.
 * @param variables - Variables sent alongside the query.
 * @param BASE_URL - Site origin the endpoint path is appended to.
 * @returns The response's `data` value (shape depends on the query).
 * @throws HttpError when the response status is not ok.
 * @throws ParseError when the body is not a JSON object or reports GraphQL errors.
 * @throws NetworkError for any other failure (connection error, timeout, invalid JSON).
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca"
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;
  const REQUEST_TIMEOUT_MS = 30000;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'apollo-require-preflight': 'true',
      },
      body: JSON.stringify({
        query,
        variables,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new HttpError(
        `GraphQL request failed with status ${response.status}`,
        response.status,
        endpoint
      );
    }

    const result: unknown = await response.json();
    // A body such as `null` or a bare string would otherwise fail on the
    // property access below and be misreported as a network problem.
    if (typeof result !== 'object' || result === null) {
      throw new ParseError('GraphQL response was not a JSON object', result);
    }

    const { errors, data } = result as { errors?: unknown; data?: unknown };
    if (errors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(errors)}`, errors);
    }

    return data;
  } catch (err) {
    if (err instanceof HttpError || err instanceof ParseError) {
      throw err;
    }
    throw new NetworkError(
      `Failed to fetch GraphQL data: ${err instanceof Error ? err.message : String(err)}`,
      endpoint,
      err instanceof Error ? err : undefined
    );
  } finally {
    // Always release the timer, whether the request succeeded or failed.
    clearTimeout(timeoutId);
  }
}
// GraphQL response interfaces
/**
 * Shape read from the `data` of the GetReviewSummary query. Every level is
 * optional because the payload is not validated before it is cast to this type.
 */
interface GraphQLReviewResponse {
user?: {
reviewSummary?: {
count?: number;
score?: number;
};
};
}
/**
 * Shape read from the `data` of the GetProfileMetrics query. Every level is
 * optional because the payload is not validated before it is cast to this type.
 */
interface GraphQLProfileResponse {
user?: {
memberSince?: string;
accountType?: string;
};
}
// GraphQL queries from KIJIJI.md
// Query documents sent by fetchSellerDetails. The variable names differ
// between the two queries ($userId vs $profileId); callers must pass the
// matching key in `variables`.
const GRAPHQL_QUERIES = {
// Review count and score for a user.
getReviewSummary: `
query GetReviewSummary($userId: String!) {
user(id: $userId) {
reviewSummary {
count
score
__typename
}
__typename
}
}
`,
// Membership date and account type for a user.
getProfileMetrics: `
query GetProfileMetrics($profileId: String!) {
user(id: $profileId) {
memberSince
accountType
__typename
}
}
`,
} as const;
/**
 * Fetch optional seller data (review summary and profile metrics) via GraphQL.
 *
 * The two queries run in parallel and are settled independently, so a failure
 * of one does not discard the data returned by the other. This is a
 * best-effort lookup: failures are logged with console.warn and never thrown.
 *
 * @param posterId - Seller ID, used as the user ID in both queries.
 * @param BASE_URL - Site origin passed through to fetchGraphQLData.
 * @returns Whatever fields could be retrieved; `{}` when both queries fail.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca"
): Promise<{ reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string }> {
  const [reviewOutcome, profileOutcome] = await Promise.allSettled([
    fetchGraphQLData(GRAPHQL_QUERIES.getReviewSummary, { userId: posterId }, BASE_URL),
    fetchGraphQLData(GRAPHQL_QUERIES.getProfileMetrics, { profileId: posterId }, BASE_URL),
  ]);

  // Not critical for basic functionality, so each failure is only logged.
  for (const outcome of [reviewOutcome, profileOutcome]) {
    if (outcome.status === 'rejected') {
      const reason: unknown = outcome.reason;
      console.warn(
        `Failed to fetch seller details for ${posterId}:`,
        reason instanceof Error ? reason.message : String(reason),
      );
    }
  }

  if (reviewOutcome.status === 'rejected' && profileOutcome.status === 'rejected') {
    return {};
  }

  const reviewSummary =
    reviewOutcome.status === 'fulfilled'
      ? (reviewOutcome.value as GraphQLReviewResponse | null | undefined)?.user?.reviewSummary
      : undefined;
  const profileUser =
    profileOutcome.status === 'fulfilled'
      ? (profileOutcome.value as GraphQLProfileResponse | null | undefined)?.user
      : undefined;

  return {
    reviewCount: reviewSummary?.count,
    reviewScore: reviewSummary?.score,
    memberSince: profileUser?.memberSince,
    accountType: profileUser?.accountType,
  };
}
// ----------------------------- Parsing ----------------------------- // ----------------------------- Parsing -----------------------------
/** /**
Extracts json.props.pageProps.__APOLLO_STATE__ safely from a Kijiji page HTML. Extracts json.props.pageProps.__APOLLO_STATE__ safely from a Kijiji page HTML.
*/ */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null { function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
const { document } = parseHTML(htmlString); const { document } = parseHTML(htmlString);
@@ -299,7 +694,7 @@ function parseListing(
listingPrice: amountFormatted listingPrice: amountFormatted
? { ? {
amountFormatted, amountFormatted,
cents: Number.isFinite(cents!) ? cents : undefined, cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
currency: price?.currency, currency: price?.currency,
} }
: undefined, : undefined,
@@ -307,91 +702,252 @@ function parseListing(
listingStatus: status, listingStatus: status,
creationDate: activationDate, creationDate: activationDate,
endDate, endDate,
numberOfViews: Number.isFinite(numberOfViews!) ? numberOfViews : undefined, numberOfViews: numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
address: location?.address ?? null, address: location?.address ?? null,
}; };
} }
/**
 * Parse a listing page's HTML into a DetailedListing.
 *
 * Returns null when the Apollo state or a listing root cannot be found, when
 * the listing has no URL or title, or when it has no usable price.
 *
 * @param htmlString - Raw HTML of the listing page.
 * @param BASE_URL - Origin prepended to relative listing URLs and passed to the seller lookup.
 * @param options - Controls image extraction and the optional GraphQL seller lookup.
 * @returns The parsed listing, or null when the page does not yield one.
 */
async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {}
): Promise<DetailedListing | null> {
  const state = extractApolloState(htmlString);
  if (!state) return null;

  // The first state key containing "Listing" is treated as the listing root.
  const rootKey = Object.keys(state).find((key) => key.includes("Listing"));
  if (!rootKey) return null;

  const candidate = state[rootKey];
  if (!isRecord(candidate)) return null;
  const listing = candidate as ApolloListingRoot;

  const priceInfo = listing.price;
  const locationInfo = listing.location;
  const seller = listing.posterInfo;
  const listingTitle = listing.title;

  // Price amount is converted to a number and formatted up front; both are checked below.
  let cents: number | undefined;
  if (priceInfo?.amount != null) {
    cents = Number(priceInfo.amount);
  }
  const amountFormatted = formatCentsToCurrency(cents);

  let viewCount: number | undefined;
  if (listing.metrics?.views != null) {
    viewCount = Number(listing.metrics.views);
  }

  // Relative URLs are resolved against BASE_URL; a non-string URL yields "".
  let listingUrl = "";
  if (typeof listing.url === "string") {
    listingUrl = listing.url.startsWith("http")
      ? listing.url
      : `${BASE_URL}${listing.url}`;
  }

  if (!listingUrl || !listingTitle) return null;

  // Only include fixed-price listings
  if (!amountFormatted || cents === undefined) return null;

  // Extract images if requested
  let images: string[] = [];
  if (options.includeImages !== false && Array.isArray(listing.imageUrls)) {
    images = listing.imageUrls.filter(
      (candidateUrl): candidateUrl is string => typeof candidateUrl === 'string',
    );
  }

  // Extract attributes as key-value pairs (a repeated name keeps the last values)
  const attributeMap: Record<string, string[]> = {};
  if (Array.isArray(listing.attributes)) {
    listing.attributes.forEach((attr) => {
      if (attr?.canonicalName && Array.isArray(attr.canonicalValues)) {
        attributeMap[attr.canonicalName] = attr.canonicalValues;
      }
    });
  }

  // Extract seller info based on depth setting
  const depth = options.sellerDataDepth ?? 'detailed';
  let sellerInfo: DetailedListing['sellerInfo'];

  if (seller?.posterId) {
    sellerInfo = {
      posterId: seller.posterId,
      rating: typeof seller.rating === 'number' ? seller.rating : undefined,
    };

    // The GraphQL lookup needs both a non-basic depth and client-side data enabled.
    const wantsExtraSellerData =
      (['detailed', 'full'] as string[]).includes(depth) && options.includeClientSideData;

    if (wantsExtraSellerData) {
      try {
        const extra = await fetchSellerDetails(seller.posterId, BASE_URL);
        sellerInfo = { ...sellerInfo, ...extra };
      } catch {
        // Silently fail - GraphQL data is optional
        console.warn(`Failed to fetch additional seller data for ${seller.posterId}`);
      }
    }
  }

  const hasFiniteViews = viewCount !== undefined && Number.isFinite(viewCount);
  const coordinates = locationInfo?.coordinates
    ? {
        latitude: locationInfo.coordinates.latitude,
        longitude: locationInfo.coordinates.longitude,
      }
    : undefined;

  return {
    url: listingUrl,
    title: listingTitle,
    description: listing.description,
    listingPrice: {
      amountFormatted,
      cents,
      currency: priceInfo?.currency,
    },
    listingType: listing.type,
    listingStatus: listing.status,
    creationDate: listing.activationDate,
    endDate: listing.endDate,
    numberOfViews: hasFiniteViews ? viewCount : undefined,
    address: locationInfo?.address ?? null,
    images,
    categoryId: typeof listing.categoryId === 'number' ? listing.categoryId : 0,
    adSource: typeof listing.adSource === 'string' ? listing.adSource : 'UNKNOWN',
    flags: {
      topAd: listing.flags?.topAd === true,
      priceDrop: listing.flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof locationInfo?.id === 'number' ? locationInfo.id : 0,
      name: typeof locationInfo?.name === 'string' ? locationInfo.name : 'Unknown',
      coordinates,
    },
    sellerInfo,
  };
}
// ----------------------------- Main ----------------------------- // ----------------------------- Main -----------------------------
export default async function fetchKijijiItems( export default async function fetchKijijiItems(
SEARCH_QUERY: string, SEARCH_QUERY: string,
REQUESTS_PER_SECOND = 1, REQUESTS_PER_SECOND = 1,
BASE_URL = "https://www.kijiji.ca", BASE_URL = "https://www.kijiji.ca",
searchOptions: SearchOptions = {},
listingOptions: ListingFetchOptions = {},
) { ) {
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`; // Set defaults for configuration
const finalSearchOptions: Required<SearchOptions> = {
location: searchOptions.location ?? 1700272, // Default to GTA
category: searchOptions.category ?? 0, // Default to all categories
keywords: searchOptions.keywords ?? SEARCH_QUERY,
sortBy: searchOptions.sortBy ?? 'relevancy',
sortOrder: searchOptions.sortOrder ?? 'desc',
maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
priceMin: searchOptions.priceMin,
priceMax: searchOptions.priceMax,
};
console.log(`Fetching search: ${searchUrl}`); const finalListingOptions: Required<ListingFetchOptions> = {
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, { includeImages: listingOptions.includeImages ?? true,
onRateInfo: (remaining, reset) => { sellerDataDepth: listingOptions.sellerDataDepth ?? 'detailed',
if (remaining && reset) { includeClientSideData: listingOptions.includeClientSideData ?? false,
console.log( };
"\n" +
`Search - Rate limit remaining: ${remaining}, reset in: ${reset}s`, const allListings: DetailedListing[] = [];
); const seenUrls = new Set<string>();
// Fetch multiple pages
for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
const searchUrl = buildSearchUrl(finalSearchOptions.keywords, {
...finalSearchOptions,
// Add page parameter for pagination
...(page > 1 && { page }),
}, BASE_URL);
console.log(`Fetching search page ${page}: ${searchUrl}`);
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
}
},
});
const searchResults = parseSearch(searchHtml, BASE_URL);
if (searchResults.length === 0) {
console.log(`No more results found on page ${page}. Stopping pagination.`);
break;
}
// Deduplicate links across pages
const newListingLinks = searchResults
.map((r) => r.listingLink)
.filter((link) => !seenUrls.has(link));
for (const link of newListingLinks) {
seenUrls.add(link);
}
console.log(`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`);
// Fetch details for this page's listings
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
const totalProgress = newListingLinks.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
for (const link of newListingLinks) {
try {
const html = await fetchHtml(link, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
}
},
});
const parsed = await parseDetailedListing(html, BASE_URL, finalListingOptions);
if (parsed) {
allListings.push(parsed);
}
} catch (err) {
if (err instanceof HttpError) {
console.error(`\nFailed to fetch ${link}\n - ${err.status} ${err.message}`);
} else {
console.error(`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`);
}
} finally {
currentProgress++;
progressBar.update(currentProgress);
} }
}, }
});
const searchResults = parseSearch(searchHtml, BASE_URL); progressBar.stop();
if (searchResults.length === 0) {
console.warn("No search results parsed from page.");
return;
}
// Deduplicate links // If we got fewer results than expected (40 per page), we've reached the end
const listingLinks = Array.from( if (searchResults.length < 40) {
new Set(searchResults.map((r) => r.listingLink)), break;
);
console.log(
"\n" + `Found ${listingLinks.length} listing links. Fetching details...`,
);
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
const totalProgress = listingLinks.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
const items: ListingDetails[] = [];
for (const link of listingLinks) {
try {
const html = await fetchHtml(link, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
"\n" +
`Item - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
);
}
},
});
const parsed = parseListing(html, BASE_URL);
if (parsed) {
if (parsed.listingPrice?.cents) items.push(parsed);
}
} catch (err) {
if (err instanceof HttpError) {
console.error(
"\n" + `Failed to fetch ${link}\n - ${err.status} ${err.message}`,
);
} else {
console.error(
"\n" +
`Failed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
);
}
} finally {
currentProgress++;
progressBar.update(currentProgress);
} }
} }
console.log("\n" + `Parsed ${items.length} listings.`); console.log(`\nParsed ${allListings.length} detailed listings.`);
return items; return allListings;
} }