// Listing metadata: 954 lines, 24 KiB, TypeScript
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||
import { parseHTML } from "linkedom";
|
||
import unidecode from "unidecode";
|
||
import cliProgress from "cli-progress";
|
||
|
||
// const unidecode = require("unidecode");
|
||
|
||
// ----------------------------- Types -----------------------------
|
||
|
||
/** Raw HTML markup of a page; a plain `string` alias used to label intent in signatures. */
type HTMLString = string;
|
||
|
||
/** One hit extracted from a search results page. */
type SearchListing = {
  /** Absolute URL of the listing page. */
  listingLink: string;
  /** Listing title as reported by the search page. */
  name: string;
};
|
||
|
||
/** Apollo cache object as extracted from a page: string keys mapped to values that still need narrowing. */
type ApolloRecord = Record<string, unknown>;
|
||
|
||
/**
 * Fields of an Apollo cache entry that the search parser reads.
 * Any other property stays available but untyped.
 */
interface ApolloSearchItem {
  [k: string]: unknown;
  /** Listing title. */
  title?: string;
  /** Listing URL, absolute or relative to the site root. */
  url?: string;
}
|
||
|
||
/**
 * Fields read from the listing entity of a listing page's Apollo state.
 * Every member is optional because the object is an unchecked cast of parsed JSON.
 */
interface ApolloListingRoot {
  // --- Core listing data ---
  url?: string;
  title?: string;
  description?: string;
  type?: string;
  status?: string;
  activationDate?: string;
  endDate?: string;

  /** Price block; the parsers in this file treat `amount` as a value in cents. */
  price?: {
    amount?: number | string;
    currency?: string;
    type?: string;
  };

  /** Usage counters; `views` may arrive as a number or a numeric string. */
  metrics?: {
    views?: number | string;
  };

  /** Where the item is offered. */
  location?: {
    address?: string | null;
    id?: number;
    name?: string;
    coordinates?: {
      latitude: number;
      longitude: number;
    };
  };

  // --- Media and classification ---
  imageUrls?: string[];
  imageCount?: number;
  categoryId?: number;
  adSource?: string;
  flags?: {
    topAd?: boolean;
    priceDrop?: boolean;
  };

  // --- Seller and free-form attributes ---
  posterInfo?: {
    posterId?: string;
    rating?: number;
  };
  attributes?: {
    canonicalName?: string;
    canonicalValues?: string[];
  }[];

  /** Anything else present on the entity. */
  [k: string]: unknown;
}
|
||
|
||
/**
 * Basic listing shape, kept for backward compatibility.
 * Only `url` and `title` are guaranteed; everything else depends on what the page exposes.
 */
type ListingDetails = {
  /** Absolute URL of the listing. */
  url: string;
  title: string;
  description?: string;
  /** Price information, present only when an amount could be formatted. */
  listingPrice?: {
    /** Human-readable amount produced by the currency formatter. */
    amountFormatted: string;
    cents?: number;
    currency?: string;
  };
  listingType?: string;
  listingStatus?: string;
  /** Taken from the listing's activation date. */
  creationDate?: string;
  endDate?: string;
  numberOfViews?: number;
  address?: string | null;
};
|
||
|
||
/**
 * Full listing shape: the basic details plus images, classification,
 * flags, attributes, location and optional seller information.
 */
interface DetailedListing extends ListingDetails {
  /** Image URLs; empty when image extraction is disabled or none were found. */
  images: string[];
  categoryId: number;
  adSource: string;
  flags: {
    topAd: boolean;
    priceDrop: boolean;
  };
  /** Attribute values keyed by the attribute's canonical name. */
  attributes: Record<string, string[]>;
  location: {
    id: number;
    name: string;
    coordinates?: {
      latitude: number;
      longitude: number;
    };
  };
  /** Seller data; the fields after `rating` come from the optional GraphQL lookups. */
  sellerInfo?: {
    posterId: string;
    rating?: number;
    accountType?: string;
    memberSince?: string;
    reviewCount?: number;
    reviewScore?: number;
  };
}
|
||
|
||
// Configuration interfaces
|
||
/** Options controlling which search pages are requested. */
interface SearchOptions {
  /** Location ID, or a location name to look up. */
  location?: number | string;
  /** Category ID, or a category name to look up. */
  category?: number | string;
  keywords?: string;
  sortBy?: "relevancy" | "date" | "price" | "distance";
  sortOrder?: "desc" | "asc";
  /** Maximum number of result pages to fetch. Default: 5 */
  maxPages?: number;
  priceMin?: number;
  priceMax?: number;
}
|
||
|
||
/** Options controlling how much detail is collected for each listing. */
interface ListingFetchOptions {
  /** Collect image URLs. Default: true */
  includeImages?: boolean;
  /** How much seller data to gather. Default: "detailed" */
  sellerDataDepth?: "basic" | "detailed" | "full";
  /** Allow the extra GraphQL lookups for seller data. Default: false */
  includeClientSideData?: boolean;
}
|
||
|
||
// ----------------------------- Constants & Mappings -----------------------------
|
||
|
||
/**
 * Lower-case location names mapped to numeric location IDs (from KIJIJI.md).
 * Note that multi-word names are keyed with spaces, not hyphens.
 */
const LOCATION_MAPPINGS: Record<string, number> = {
  canada: 0,
  ontario: 9004,
  toronto: 1700273,
  gta: 1700272,
  oshawa: 1700275,
  quebec: 9001,
  "nova scotia": 9002,
  alberta: 9003,
  "new brunswick": 9005,
  manitoba: 9006,
  "british columbia": 9007,
  newfoundland: 9008,
  saskatchewan: 9009,
  territories: 9010,
  pei: 9011,
  "prince edward island": 9011,
};
|
||
|
||
/**
 * Hyphenated, lower-case category names mapped to numeric category IDs
 * (Buy & Sell main categories, from KIJIJI.md).
 */
const CATEGORY_MAPPINGS: Record<string, number> = {
  all: 0,
  "buy-sell": 10,
  "arts-collectibles": 12,
  audio: 767,
  "baby-items": 253,
  "bags-luggage": 931,
  bikes: 644,
  books: 109,
  cameras: 103,
  cds: 104,
  clothing: 274,
  computers: 16,
  "computer-accessories": 128,
  electronics: 29659001,
  "free-stuff": 17220001,
  furniture: 235,
  "garage-sales": 638,
  "health-special-needs": 140,
  "hobbies-crafts": 139,
  "home-appliances": 107,
  "home-indoor": 717,
  "home-outdoor": 727,
  jewellery: 133,
  "musical-instruments": 17,
  phones: 132,
  "sporting-goods": 111,
  tools: 110,
  "toys-games": 108,
  "tvs-video": 15093001,
  "video-games": 141,
  other: 26,
};
|
||
|
||
/** Sort option names mapped to the value placed in the `sort` query parameter. */
const SORT_MAPPINGS: Record<string, string> = {
  relevancy: "MATCH",
  date: "DATE",
  price: "PRICE",
  distance: "DISTANCE",
};
|
||
|
||
// ----------------------------- Exports for Testing -----------------------------
|
||
// Note: These are exported for testing purposes only
|
||
|
||
export { resolveLocationId, resolveCategoryId, buildSearchUrl };
|
||
export { extractApolloState, parseSearch };
|
||
export { parseDetailedListing };
|
||
export { HttpError, NetworkError, ParseError, RateLimitError, ValidationError };
|
||
|
||
// ----------------------------- Utilities -----------------------------
|
||
|
||
/** Separator characters that slugify turns into a hyphen (runs of them collapse into one). */
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
||
|
||
/**
 * Resolve a location option to a numeric location ID.
 *
 * - A number is returned unchanged.
 * - A string made only of digits is treated as an ID.
 * - Any other string is looked up by name, case-insensitively and ignoring
 *   surrounding whitespace. Both the hyphenated form and the space-separated
 *   form are tried, because the mapping table keys multi-word names with spaces
 *   (for example "nova scotia").
 *
 * @param location - Location ID or location name.
 * @returns The resolved ID, or 0 (Canada) when nothing matches or no value is given.
 */
function resolveLocationId(location?: number | string): number {
  if (typeof location === 'number') return location;
  if (typeof location !== 'string') return 0; // Default to Canada

  const trimmed = location.trim();
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  const lowered = trimmed.toLowerCase();
  const candidates = [
    lowered.replace(/\s+/g, '-'), // historical lookup form
    lowered.replace(/[\s_-]+/g, ' '), // form actually used by multi-word keys
  ];

  for (const key of candidates) {
    // Own-property check so names like "constructor" cannot hit Object.prototype.
    if (Object.prototype.hasOwnProperty.call(LOCATION_MAPPINGS, key)) {
      return LOCATION_MAPPINGS[key] ?? 0;
    }
  }
  return 0; // Default to Canada
}
|
||
|
||
/**
 * Resolve a category option to a numeric category ID.
 *
 * - A number is returned unchanged.
 * - A string made only of digits is treated as an ID.
 * - Any other string is looked up by name, case-insensitively and ignoring
 *   surrounding whitespace. The whitespace-to-hyphen form is tried first, then
 *   a form where every run of non-alphanumeric characters becomes one hyphen,
 *   so "Toys & Games" finds "toys-games".
 *
 * @param category - Category ID or category name.
 * @returns The resolved ID, or 0 (all categories) when nothing matches or no value is given.
 */
function resolveCategoryId(category?: number | string): number {
  if (typeof category === 'number') return category;
  if (typeof category !== 'string') return 0; // Default to all categories

  const trimmed = category.trim();
  if (/^\d+$/.test(trimmed)) return Number(trimmed);

  const lowered = trimmed.toLowerCase();
  const candidates = [
    lowered.replace(/\s+/g, '-'), // historical lookup form
    lowered.replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, ''), // punctuation-tolerant form
  ];

  for (const key of candidates) {
    // Own-property check so names like "constructor" cannot hit Object.prototype.
    if (Object.prototype.hasOwnProperty.call(CATEGORY_MAPPINGS, key)) {
      return CATEGORY_MAPPINGS[key] ?? 0;
    }
  }
  return 0; // Default to all categories
}
|
||
|
||
/**
 * Assemble the URL of one search results page.
 *
 * The path is `/b-buy-sell/canada/<keyword slug>/k0c<categoryId>l<locationId>`;
 * the category and location slugs are fixed and only the numeric IDs vary.
 * The query string always starts with `sort=relevancyDesc&view=list`, then
 * appends the mapped `sort` value when `sortBy` is set, the `order`, and
 * `page` for pages after the first.
 *
 * NOTE(review): when `sortBy` is set the query carries two `sort` keys
 * (`relevancyDesc` plus the mapped value) — confirm which one the site honours.
 * NOTE(review): `priceMin` / `priceMax` are accepted in the options but not used here.
 *
 * @param keywords - Free-text search terms; slugified into the path.
 * @param options - Search options plus an optional 1-based `page`.
 * @param BASE_URL - Site origin the URL is built on.
 * @returns The complete search URL.
 */
function buildSearchUrl(
  keywords: string,
  options: SearchOptions & { page?: number },
  BASE_URL = "https://www.kijiji.ca"
): string {
  const { sortBy, sortOrder, page } = options;
  const locationId = resolveLocationId(options.location);
  const categoryId = resolveCategoryId(options.category);

  const pathSegments = [
    'b-buy-sell',
    'canada',
    slugify(keywords),
    `k0c${categoryId}l${locationId}`,
  ];

  const queryParts = ['sort=relevancyDesc', 'view=list'];
  if (sortBy) {
    queryParts.push(`sort=${SORT_MAPPINGS[sortBy]}`);
  }
  queryParts.push(`order=${sortOrder === 'asc' ? 'ASC' : 'DESC'}`);
  if (page && page > 1) {
    queryParts.push(`page=${page}`);
  }

  return `${BASE_URL}/${pathSegments.join('/')}?${queryParts.join('&')}`;
}
|
||
|
||
/**
 * Turn free text into a URL slug for search.
 *
 * The input is transliterated with `unidecode` and lower-cased. Letters a-z
 * and digits are kept, each run of separator characters becomes a single
 * hyphen, and every other character is dropped. Hyphens at the start or end
 * are not trimmed.
 */
export function slugify(input: string): string {
  const ascii = unidecode(input).toLowerCase();
  let slug = "";
  let previousWasHyphen = false;

  for (const ch of ascii) {
    const isLetter = ch >= "a" && ch <= "z";
    const isDigit = ch >= "0" && ch <= "9";

    if (isLetter || isDigit) {
      slug += ch;
      previousWasHyphen = false;
      continue;
    }

    // Dropped characters leave the hyphen state untouched, so "a -! b" still yields "a-b".
    if (SEPS.has(ch) && !previousWasHyphen) {
      slug += "-";
      previousWasHyphen = true;
    }
  }

  return slug;
}
|
||
|
||
/**
 * Format an amount given in cents as a localized currency string.
 *
 * @param num - Amount in cents. Strings are parsed as base-10 integers.
 * @param locale - BCP 47 locale used for formatting. Defaults to "en-US".
 * @param currency - ISO 4217 currency code. Defaults to "USD", which keeps the
 *   output of existing two-argument calls unchanged.
 * @returns The formatted amount with exactly two fraction digits, or an empty
 *   string when the input is missing or is not a finite number.
 * @throws RangeError if `locale` or `currency` is not accepted by `Intl.NumberFormat`.
 */
export function formatCentsToCurrency(
  num: number | string | undefined,
  locale = "en-US",
  currency = "USD",
): string {
  if (num == null) return "";

  const cents = typeof num === "string" ? Number.parseInt(num, 10) : num;
  // Rejects NaN and also ±Infinity, which would otherwise be rendered as "∞".
  if (!Number.isFinite(cents)) return "";

  const formatter = new Intl.NumberFormat(locale, {
    style: "currency",
    currency,
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
  });
  return formatter.format(cents / 100);
}
|
||
|
||
/** Type guard: true for non-null objects that are not arrays. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
||
|
||
/** Returns a promise that resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
// ----------------------------- Error Classes -----------------------------
|
||
|
||
/** Raised when a request is answered with a non-OK HTTP status. */
class HttpError extends Error {
  /** HTTP status code of the response. */
  public readonly status: number;
  /** URL that was requested. */
  public readonly url: string;

  constructor(message: string, status: number, url: string) {
    super(message);
    this.status = status;
    this.url = url;
    this.name = "HttpError";
  }
}
|
||
|
||
/** Raised when a request fails before a usable response is obtained (timeout, connection failure, ...). */
class NetworkError extends Error {
  /** URL that was requested. */
  public readonly url: string;
  /** Underlying error, when one is available. */
  public readonly cause?: Error;

  constructor(message: string, url: string, cause?: Error) {
    super(message);
    this.url = url;
    this.cause = cause;
    this.name = "NetworkError";
  }
}
|
||
|
||
/** Raised when received data cannot be interpreted. */
class ParseError extends Error {
  /** The offending data, when the thrower supplies it. */
  public readonly data?: unknown;

  constructor(message: string, data?: unknown) {
    super(message);
    this.data = data;
    this.name = "ParseError";
  }
}
|
||
|
||
/** Raised when a request is still rate limited after all retries. */
class RateLimitError extends Error {
  /** URL that was requested. */
  public readonly url: string;
  /** Reset value reported by the server, when one was supplied. */
  public readonly resetTime?: number;

  constructor(message: string, url: string, resetTime?: number) {
    super(message);
    this.url = url;
    this.resetTime = resetTime;
    this.name = "RateLimitError";
  }
}
|
||
|
||
/** Raised when caller-supplied input is not acceptable. */
class ValidationError extends Error {
  constructor(message: string) {
    super(message);
    Object.assign(this, { name: "ValidationError" });
  }
}
|
||
|
||
// ----------------------------- HTTP Client -----------------------------
|
||
|
||
/**
 * Fetch a page's HTML with retries and exponential backoff.
 *
 * - Retries on 429, 5xx, timeouts and network errors, up to `maxRetries` extra attempts.
 * - On 429, waits `X-RateLimit-Reset` seconds when the header holds a finite
 *   number; otherwise falls back to the computed backoff delay.
 * - Each attempt is aborted if no response arrives within 30 seconds.
 * - After a successful read, waits `DELAY_MS` before returning so callers are throttled.
 *
 * @param url - Address to request.
 * @param DELAY_MS - Pause applied after every successful request.
 * @param opts - Optional retry tuning (`maxRetries` defaults to 3, `retryBaseMs`
 *   to 1000) and an `onRateInfo` callback that receives the raw rate-limit
 *   headers of every response.
 * @returns The response body as text.
 * @throws RateLimitError when the final attempt is still answered with 429.
 * @throws HttpError for any other non-OK status that is not retried.
 * @throws NetworkError when the final attempt times out or fails at the network level.
 */
async function fetchHtml(
  url: string,
  DELAY_MS: number,
  opts?: {
    maxRetries?: number;
    retryBaseMs?: number;
    onRateInfo?: (remaining: string | null, reset: string | null) => void;
  },
): Promise<HTMLString> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 1000;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const canRetry = attempt < maxRetries;

    try {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout

      // Clear the timer whether the request settles or rejects; otherwise a
      // failed attempt leaves a 30s timer pending that keeps the process alive.
      const res = await fetch(url, {
        method: "GET",
        headers: {
          accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
          "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
          "cache-control": "no-cache",
          "upgrade-insecure-requests": "1",
          "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        },
        signal: controller.signal,
      }).finally(() => clearTimeout(timeoutId));

      const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining");
      const rateLimitReset = res.headers.get("X-RateLimit-Reset");
      opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset);

      if (!res.ok) {
        // Handle rate limiting
        if (res.status === 429) {
          const resetSeconds = rateLimitReset ? Number(rateLimitReset) : Number.NaN;
          // NOTE(review): the header is interpreted as a number of seconds to wait — confirm it is not an epoch timestamp.
          const waitMs = Number.isFinite(resetSeconds)
            ? Math.max(0, resetSeconds * 1000)
            : calculateBackoffDelay(attempt, retryBaseMs);

          if (canRetry) {
            await delay(waitMs);
            continue;
          }
          throw new RateLimitError(
            `Rate limit exceeded for ${url}`,
            url,
            resetSeconds,
          );
        }

        // Retry on server errors
        if (res.status >= 500 && res.status < 600 && canRetry) {
          await delay(calculateBackoffDelay(attempt, retryBaseMs));
          continue;
        }

        throw new HttpError(
          `Request failed with status ${res.status}`,
          res.status,
          url,
        );
      }

      const html = await res.text();

      // Respect per-request delay to maintain rate limiting
      await delay(DELAY_MS);
      return html;
    } catch (err) {
      // Errors raised deliberately above must reach the caller untouched.
      if (err instanceof RateLimitError || err instanceof HttpError) {
        throw err;
      }

      // Timeouts and network failures share the same retry policy.
      if (canRetry) {
        await delay(calculateBackoffDelay(attempt, retryBaseMs));
        continue;
      }

      if (err instanceof Error && err.name === 'AbortError') {
        throw new NetworkError(`Request timeout for ${url}`, url, err);
      }

      throw new NetworkError(
        `Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
        url,
        err instanceof Error ? err : undefined
      );
    }
  }

  // Only reachable when maxRetries is negative or NaN, so the loop never runs.
  throw new NetworkError(`Exhausted retries without response for ${url}`, url);
}
|
||
|
||
/**
 * Compute the wait before a retry: `baseMs` doubled once per attempt, plus up
 * to 10% random jitter, limited to 30 seconds.
 *
 * @param attempt - Zero-based attempt number.
 * @param baseMs - Delay used for the first attempt, in milliseconds.
 * @returns Delay in milliseconds.
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const CAP_MS = 30000;
  const doubled = baseMs * Math.pow(2, attempt);
  const withJitter = doubled + Math.random() * 0.1 * doubled;
  return Math.min(withJitter, CAP_MS);
}
|
||
|
||
// ----------------------------- GraphQL Client -----------------------------
|
||
|
||
/**
 * POST a GraphQL query to `<BASE_URL>/anvil/api` and return the `data` member
 * of the JSON response.
 *
 * @param query - GraphQL document text.
 * @param variables - Variables sent along with the query.
 * @param BASE_URL - Site origin hosting the endpoint.
 * @returns Whatever the response carries under `data`.
 * @throws HttpError when the response status is not OK.
 * @throws ParseError when the response contains an `errors` member.
 * @throws NetworkError for every other failure (request, serialization, JSON decoding).
 */
async function fetchGraphQLData(
  query: string,
  variables: Record<string, unknown>,
  BASE_URL = "https://www.kijiji.ca"
): Promise<unknown> {
  const endpoint = `${BASE_URL}/anvil/api`;

  try {
    const requestBody = JSON.stringify({ query, variables });
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'apollo-require-preflight': 'true',
      },
      body: requestBody,
    });

    if (!response.ok) {
      const failedStatus = response.status;
      throw new HttpError(
        `GraphQL request failed with status ${failedStatus}`,
        failedStatus,
        endpoint
      );
    }

    const payload = await response.json();
    const reportedErrors = payload.errors;
    if (reportedErrors) {
      throw new ParseError(`GraphQL errors: ${JSON.stringify(reportedErrors)}`, reportedErrors);
    }

    return payload.data;
  } catch (err) {
    // Errors raised above already carry the right type; pass them through.
    const alreadyClassified = err instanceof HttpError || err instanceof ParseError;
    if (alreadyClassified) {
      throw err;
    }

    const cause = err instanceof Error ? err : undefined;
    const detail = cause ? cause.message : String(err);
    throw new NetworkError(`Failed to fetch GraphQL data: ${detail}`, endpoint, cause);
  }
}
|
||
|
||
// GraphQL response interfaces
|
||
/** Expected shape of the `data` returned by the review summary query. */
interface GraphQLReviewResponse {
  user?: {
    /** Aggregated review figures for the user. */
    reviewSummary?: {
      score?: number;
      count?: number;
    };
  };
}
|
||
|
||
/** Expected shape of the `data` returned by the profile metrics query. */
interface GraphQLProfileResponse {
  user?: {
    accountType?: string;
    memberSince?: string;
  };
}
|
||
|
||
/**
 * GraphQL documents used for the optional seller lookups (from KIJIJI.md).
 *
 * - `getReviewSummary` takes `$userId` and selects the user's review count and score.
 * - `getProfileMetrics` takes `$profileId` and selects `memberSince` and `accountType`.
 *
 * The documents contain only GraphQL tokens and whitespace; stray separator
 * characters inside the template literals would make them unparseable.
 */
const GRAPHQL_QUERIES = {
  getReviewSummary: `
    query GetReviewSummary($userId: String!) {
      user(id: $userId) {
        reviewSummary {
          count
          score
          __typename
        }
        __typename
      }
    }
  `,
  getProfileMetrics: `
    query GetProfileMetrics($profileId: String!) {
      user(id: $profileId) {
        memberSince
        accountType
        __typename
      }
    }
  `,
} as const;
|
||
|
||
/**
 * Look up extra seller data through the two GraphQL queries, run in parallel.
 *
 * This is best-effort: if either request fails, a warning is logged and an
 * empty object is returned instead of throwing.
 *
 * @param posterId - Seller identifier, sent as `userId` and `profileId`.
 * @param BASE_URL - Site origin hosting the GraphQL endpoint.
 * @returns Review and profile fields that were present in the responses.
 */
async function fetchSellerDetails(
  posterId: string,
  BASE_URL = "https://www.kijiji.ca"
): Promise<{ reviewCount?: number; reviewScore?: number; memberSince?: string; accountType?: string }> {
  try {
    const reviewRequest = fetchGraphQLData(GRAPHQL_QUERIES.getReviewSummary, { userId: posterId }, BASE_URL);
    const profileRequest = fetchGraphQLData(GRAPHQL_QUERIES.getProfileMetrics, { profileId: posterId }, BASE_URL);
    const [reviewData, profileData] = await Promise.all([reviewRequest, profileRequest]);

    const summary = (reviewData as GraphQLReviewResponse)?.user?.reviewSummary;
    const profile = (profileData as GraphQLProfileResponse)?.user;

    return {
      reviewCount: summary?.count,
      reviewScore: summary?.score,
      memberSince: profile?.memberSince,
      accountType: profile?.accountType,
    };
  } catch (err) {
    // Seller extras are optional, so a failure is only reported, never propagated.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`Failed to fetch seller details for ${posterId}:`, reason);
    return {};
  }
}
|
||
|
||
// ----------------------------- Parsing -----------------------------
|
||
|
||
/**
 * Pull `props.pageProps.__APOLLO_STATE__` out of the `__NEXT_DATA__` element of a page.
 *
 * @param htmlString - Full HTML of the page.
 * @returns The Apollo state object, or null when the element is missing or
 *   empty, its content is not valid JSON, or the state is not a plain object.
 */
function extractApolloState(htmlString: HTMLString): ApolloRecord | null {
  const { document } = parseHTML(htmlString);
  const serialized = document.getElementById("__NEXT_DATA__")?.textContent;
  if (!serialized) {
    return null;
  }

  let apollo: unknown;
  try {
    apollo = JSON.parse(serialized)?.props?.pageProps?.__APOLLO_STATE__;
  } catch {
    return null;
  }

  return isRecord(apollo) ? apollo : null;
}
|
||
|
||
/**
 * Turn a search results page into a list of listing links.
 *
 * Scans the page's Apollo state for entries whose key contains "Listing" and
 * keeps those that are objects with string `url` and `title`. Relative URLs
 * are prefixed with `BASE_URL`.
 *
 * @param htmlString - Full HTML of the search page.
 * @param BASE_URL - Site origin used to absolutize relative URLs.
 * @returns The listings found; empty when the page has no Apollo state.
 */
function parseSearch(
  htmlString: HTMLString,
  BASE_URL: string,
): SearchListing[] {
  const state = extractApolloState(htmlString);
  if (state === null) {
    return [];
  }

  const listings: SearchListing[] = [];
  for (const key of Object.keys(state)) {
    const entry = state[key];
    // Heuristic: Kijiji listing keys usually contain "Listing"
    if (!key.includes("Listing") || !isRecord(entry)) {
      continue;
    }

    const { url, title } = entry as ApolloSearchItem;
    if (typeof url !== "string" || typeof title !== "string") {
      continue;
    }

    const isAbsolute = url.startsWith("http");
    listings.push({
      listingLink: isAbsolute ? url : `${BASE_URL}${url}`,
      name: title,
    });
  }
  return listings;
}
|
||
|
||
/**
 * Parse a listing page into the basic `ListingDetails` shape.
 *
 * The listing entity is the first Apollo state entry whose key contains
 * "Listing". Price and view counts are converted with `Number`; values that
 * are not finite are reported as undefined.
 *
 * @param htmlString - Full HTML of the listing page.
 * @param BASE_URL - Site origin used to absolutize a relative listing URL.
 * @returns The parsed details, or null when the page has no Apollo state, no
 *   listing entity, or the entity lacks a URL or title.
 */
function parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): ListingDetails | null {
  const state = extractApolloState(htmlString);
  if (!state) return null;

  // Find the listing root key
  const rootKey = Object.keys(state).find((key) => key.includes("Listing"));
  if (!rootKey) return null;

  const node = state[rootKey];
  if (!isRecord(node)) return null;

  const listing = node as ApolloListingRoot;
  const { title, price, metrics } = listing;

  let cents: number | undefined;
  if (price?.amount != null) {
    cents = Number(price.amount);
  }
  const amountFormatted = formatCentsToCurrency(cents);

  let views: number | undefined;
  if (metrics?.views != null) {
    views = Number(metrics.views);
  }

  let listingUrl = "";
  if (typeof listing.url === "string") {
    listingUrl = listing.url.startsWith("http")
      ? listing.url
      : `${BASE_URL}${listing.url}`;
  }

  if (!listingUrl || !title) return null;

  // The price block is only reported when an amount could be formatted.
  let listingPrice: ListingDetails["listingPrice"];
  if (amountFormatted) {
    listingPrice = {
      amountFormatted,
      cents: cents !== undefined && Number.isFinite(cents) ? cents : undefined,
      currency: price?.currency,
    };
  }

  const hasFiniteViews = views !== undefined && Number.isFinite(views);

  return {
    url: listingUrl,
    title,
    description: listing.description,
    listingPrice,
    listingType: listing.type,
    listingStatus: listing.status,
    creationDate: listing.activationDate,
    endDate: listing.endDate,
    numberOfViews: hasFiniteViews ? views : undefined,
    address: listing.location?.address ?? null,
  };
}
|
||
|
||
/**
 * Parse a listing page into a `DetailedListing` with all available fields.
 *
 * The listing entity is the first Apollo state entry whose key contains
 * "Listing". Listings without a finite numeric price are skipped. Values read
 * from the page are type-checked before being copied, because the page data
 * is an unchecked cast of scraped JSON.
 *
 * @param htmlString - Full HTML of the listing page.
 * @param BASE_URL - Site origin used to absolutize a relative listing URL and
 *   for the optional seller lookups.
 * @param options - `includeImages` (default true), `sellerDataDepth`
 *   (default 'detailed') and `includeClientSideData` (default false). Seller
 *   lookups run only when the depth is 'detailed' or 'full' and
 *   `includeClientSideData` is set.
 * @returns The parsed listing, or null when the page has no Apollo state, no
 *   listing entity, no URL or title, or no usable price.
 */
async function parseDetailedListing(
  htmlString: HTMLString,
  BASE_URL: string,
  options: ListingFetchOptions = {}
): Promise<DetailedListing | null> {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  // Find the listing root key
  const listingKey = Object.keys(apolloState).find((k) =>
    k.includes("Listing"),
  );
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
    imageUrls,
    categoryId,
    adSource,
    flags,
    posterInfo,
    attributes,
  } = root as ApolloListingRoot;

  const listingUrl =
    typeof url === "string"
      ? url.startsWith("http")
        ? url
        : `${BASE_URL}${url}`
      : "";

  if (!listingUrl || !title) return null;

  // Only include fixed-price listings: the amount must be a finite number.
  const cents = price?.amount != null ? Number(price.amount) : undefined;
  if (cents === undefined || !Number.isFinite(cents)) return null;

  const amountFormatted = formatCentsToCurrency(cents);
  if (!amountFormatted) return null;

  const numberOfViews =
    metrics?.views != null ? Number(metrics.views) : undefined;

  // Extract images if requested
  const images = options.includeImages !== false && Array.isArray(imageUrls)
    ? imageUrls.filter((imageUrl): imageUrl is string => typeof imageUrl === 'string')
    : [];

  // Extract attributes as key-value pairs. Object.fromEntries defines own
  // properties, so a scraped name such as "__proto__" cannot alter the prototype.
  const attributeEntries: Array<[string, string[]]> = [];
  if (Array.isArray(attributes)) {
    for (const attr of attributes) {
      const name = attr?.canonicalName;
      if (typeof name === 'string' && name && Array.isArray(attr.canonicalValues)) {
        attributeEntries.push([
          name,
          attr.canonicalValues.filter((value): value is string => typeof value === 'string'),
        ]);
      }
    }
  }
  const attributeMap: Record<string, string[]> = Object.fromEntries(attributeEntries);

  // Coordinates are reported only when both parts are numbers.
  const rawCoordinates = location?.coordinates;
  const coordinates =
    rawCoordinates &&
    typeof rawCoordinates.latitude === 'number' &&
    typeof rawCoordinates.longitude === 'number'
      ? { latitude: rawCoordinates.latitude, longitude: rawCoordinates.longitude }
      : undefined;

  // Extract seller info based on depth setting
  let sellerInfo: DetailedListing['sellerInfo'];
  const depth = options.sellerDataDepth ?? 'detailed';

  if (posterInfo?.posterId) {
    sellerInfo = {
      posterId: posterInfo.posterId,
      rating: typeof posterInfo.rating === 'number' ? posterInfo.rating : undefined,
    };

    // Add more detailed info if requested and client-side data is enabled
    if ((depth === 'detailed' || depth === 'full') && options.includeClientSideData) {
      try {
        const additionalData = await fetchSellerDetails(posterInfo.posterId, BASE_URL);
        sellerInfo = {
          ...sellerInfo,
          ...additionalData,
        };
      } catch {
        // GraphQL data is optional, so the listing is still returned without it.
        console.warn(`Failed to fetch additional seller data for ${posterInfo.posterId}`);
      }
    }
  }

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: {
      amountFormatted,
      cents,
      currency: price?.currency,
    },
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews: numberOfViews !== undefined && Number.isFinite(numberOfViews) ? numberOfViews : undefined,
    address: location?.address ?? null,
    images,
    categoryId: typeof categoryId === 'number' ? categoryId : 0,
    adSource: typeof adSource === 'string' ? adSource : 'UNKNOWN',
    flags: {
      topAd: flags?.topAd === true,
      priceDrop: flags?.priceDrop === true,
    },
    attributes: attributeMap,
    location: {
      id: typeof location?.id === 'number' ? location.id : 0,
      name: typeof location?.name === 'string' ? location.name : 'Unknown',
      coordinates,
    },
    sellerInfo,
  };
}
|
||
|
||
// ----------------------------- Main -----------------------------
|
||
|
||
/**
 * Search the site and return detailed data for every listing found.
 *
 * Search pages are fetched one after another, up to `maxPages`. For each page
 * the listing links not seen on earlier pages are fetched and parsed
 * sequentially, with a progress bar. A listing that fails to download or parse
 * is logged and skipped. Pagination stops early when a page yields no results
 * or fewer than 40.
 *
 * @param SEARCH_QUERY - Search terms, used when `searchOptions.keywords` is not set.
 * @param REQUESTS_PER_SECOND - Request rate; it determines the pause applied after each request.
 * @param BASE_URL - Site origin.
 * @param searchOptions - Search filters. Defaults: location 1700272 (GTA),
 *   category 0, sort by relevancy descending, 5 pages.
 * @param listingOptions - Detail options. Defaults: images included, seller
 *   depth 'detailed', no client-side data.
 * @returns All listings that could be parsed.
 * @throws ValidationError when `REQUESTS_PER_SECOND` is not a positive, finite number.
 * @throws HttpError | RateLimitError | NetworkError when a search page cannot be fetched.
 */
export default async function fetchKijijiItems(
  SEARCH_QUERY: string,
  REQUESTS_PER_SECOND = 1,
  BASE_URL = "https://www.kijiji.ca",
  searchOptions: SearchOptions = {},
  listingOptions: ListingFetchOptions = {},
): Promise<DetailedListing[]> {
  // A zero, negative or NaN rate would produce an Infinity/NaN delay and defeat throttling.
  if (!Number.isFinite(REQUESTS_PER_SECOND) || REQUESTS_PER_SECOND <= 0) {
    throw new ValidationError(
      `REQUESTS_PER_SECOND must be a positive, finite number (received ${REQUESTS_PER_SECOND})`,
    );
  }
  const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));

  // Options with a default become required; the price bounds stay optional
  // because no default exists for them.
  type ResolvedSearchOptions =
    Required<Omit<SearchOptions, 'priceMin' | 'priceMax'>> &
    Pick<SearchOptions, 'priceMin' | 'priceMax'>;

  // Set defaults for configuration
  const finalSearchOptions: ResolvedSearchOptions = {
    location: searchOptions.location ?? 1700272, // Default to GTA
    category: searchOptions.category ?? 0, // Default to all categories
    keywords: searchOptions.keywords ?? SEARCH_QUERY,
    sortBy: searchOptions.sortBy ?? 'relevancy',
    sortOrder: searchOptions.sortOrder ?? 'desc',
    maxPages: searchOptions.maxPages ?? 5, // Default to 5 pages
    priceMin: searchOptions.priceMin,
    priceMax: searchOptions.priceMax,
  };

  const finalListingOptions: Required<ListingFetchOptions> = {
    includeImages: listingOptions.includeImages ?? true,
    sellerDataDepth: listingOptions.sellerDataDepth ?? 'detailed',
    includeClientSideData: listingOptions.includeClientSideData ?? false,
  };

  const allListings: DetailedListing[] = [];
  const seenUrls = new Set<string>();

  // Fetch multiple pages
  for (let page = 1; page <= finalSearchOptions.maxPages; page++) {
    // The URL builder only emits a page parameter for pages after the first.
    const searchUrl = buildSearchUrl(
      finalSearchOptions.keywords,
      { ...finalSearchOptions, page },
      BASE_URL,
    );

    console.log(`Fetching search page ${page}: ${searchUrl}`);
    const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
      onRateInfo: (remaining, reset) => {
        if (remaining && reset) {
          console.log(`\nSearch - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
        }
      },
    });

    const searchResults = parseSearch(searchHtml, BASE_URL);
    if (searchResults.length === 0) {
      console.log(`No more results found on page ${page}. Stopping pagination.`);
      break;
    }

    // Deduplicate links across pages
    const newListingLinks = searchResults
      .map((r) => r.listingLink)
      .filter((link) => !seenUrls.has(link));

    for (const link of newListingLinks) {
      seenUrls.add(link);
    }

    console.log(`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`);

    // Fetch details for this page's listings
    const progressBar = new cliProgress.SingleBar(
      {},
      cliProgress.Presets.shades_classic,
    );
    const totalProgress = newListingLinks.length;
    let currentProgress = 0;
    progressBar.start(totalProgress, currentProgress);

    for (const link of newListingLinks) {
      try {
        const html = await fetchHtml(link, DELAY_MS, {
          onRateInfo: (remaining, reset) => {
            if (remaining && reset) {
              console.log(`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`);
            }
          },
        });
        const parsed = await parseDetailedListing(html, BASE_URL, finalListingOptions);
        if (parsed) {
          allListings.push(parsed);
        }
      } catch (err) {
        // One bad listing must not abort the whole run: report it and move on.
        if (err instanceof HttpError) {
          console.error(`\nFailed to fetch ${link}\n - ${err.status} ${err.message}`);
        } else {
          console.error(`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`);
        }
      } finally {
        currentProgress++;
        progressBar.update(currentProgress);
      }
    }

    progressBar.stop();

    // If we got fewer results than expected (40 per page), we've reached the end
    if (searchResults.length < 40) {
      break;
    }
  }

  console.log(`\nParsed ${allListings.length} detailed listings.`);
  return allListings;
}
|