feat: port upstream scraper improvements to monorepo

Kijiji improvements:
- Add error classes: NetworkError, ParseError, RateLimitError, ValidationError
- Add exponential backoff with jitter for retries
- Add request timeout (30s abort)
- Add pagination support (SearchOptions.maxPages)
- Add location/category mappings and resolution functions
- Add enhanced DetailedListing interface with images, seller info, attributes
- Add GraphQL client for seller details

Facebook improvements:
- Add parseFacebookCookieString() for parsing cookie strings
- Add ensureFacebookCookies() with env var fallback
- Add extractFacebookItemData() with multiple extraction paths
- Add fetchFacebookItem() for individual item fetching
- Add extraction metrics and API stability monitoring
- Add vehicle-specific field extraction
- Improve error handling with specific guidance for auth errors

Shared utilities:
- Update http.ts with new error classes and improved fetchHtml

Documentation:
- Port KIJIJI.md, FMARKETPLACE.md, AGENTS.md from upstream

Tests:
- Port kijiji-core, kijiji-integration, kijiji-utils tests
- Port facebook-core, facebook-integration tests
- Add test setup file

Scripts:
- Port parse-facebook-cookies.ts script

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-23 00:34:50 -05:00
parent 497c7995a2
commit 50d56201af
14 changed files with 4687 additions and 179 deletions

View File

@@ -1,87 +1,200 @@
/**
 * Error raised when a request completes but the server answers with a
 * non-success HTTP status.
 */
export class HttpError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param statusCode - HTTP status code returned by the server
   * @param url - URL that was requested, when known
   */
  constructor(
    message: string,
    public readonly statusCode: number,
    public readonly url?: string
  ) {
    super(message);
    this.name = "HttpError";
  }
}
/**
 * Raised when a request never produces an HTTP response, e.g. because it
 * timed out or the connection failed.
 */
export class NetworkError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param url - URL that was being requested
   * @param cause - Underlying error that triggered this one, if any
   */
  constructor(
    message: string,
    public readonly url: string,
    public readonly cause?: Error
  ) {
    super(message);
    this.name = "NetworkError";
  }
}
/**
 * Raised when fetched content cannot be interpreted as expected.
 */
export class ParseError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param data - Optional payload that failed to parse, kept for debugging
   */
  constructor(
    message: string,
    public readonly data?: unknown
  ) {
    super(message);
    this.name = "ParseError";
  }
}
/**
 * Raised when the remote server keeps rejecting requests because of rate
 * limiting.
 */
export class RateLimitError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param url - URL that was being requested
   * @param resetTime - Reset value reported by the server, if any
   */
  constructor(
    message: string,
    public readonly url: string,
    public readonly resetTime?: number
  ) {
    super(message);
    this.name = "RateLimitError";
  }
}
/**
 * Raised when a value fails validation.
 */
export class ValidationError extends Error {
  /**
   * @param message - Human-readable description of what was invalid
   */
  constructor(message: string) {
    super(message);
    this.name = "ValidationError";
  }
}
/**
 * Type guard for plain key/value containers.
 *
 * @param value - Any value
 * @returns `true` when `value` is a non-null object that is not an array
 */
export function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Compute how long to wait before a retry.
 *
 * The delay doubles with every attempt, gets up to 10% random jitter added
 * on top, and is finally capped at 30 seconds.
 *
 * @param attempt - Zero-based attempt index
 * @param baseMs - Delay for attempt 0, in milliseconds
 * @returns Delay in milliseconds
 */
function calculateBackoffDelay(attempt: number, baseMs: number): number {
  const maxDelayMs = 30000;
  const jitterRatio = 0.1;

  const exponentialDelay = baseMs * Math.pow(2, attempt);
  const jitter = Math.random() * jitterRatio * exponentialDelay;
  const jitteredDelay = exponentialDelay + jitter;

  return Math.min(jitteredDelay, maxDelayMs);
}
/**
 * Options for fetchHtml.
 * Every field is optional; fetchHtml supplies a default for the numeric ones.
 */
export interface FetchHtmlOptions {
/** Number of retries allowed after the first attempt (fetchHtml defaults to 3). */
maxRetries?: number;
/** Base delay in milliseconds fed to the exponential backoff (fetchHtml defaults to 1000). */
retryBaseMs?: number;
/** Milliseconds before an in-flight request is aborted (fetchHtml defaults to 30000). */
timeoutMs?: number;
/**
 * Callback invoked with the values of the `X-RateLimit-Remaining` and
 * `X-RateLimit-Reset` response headers for each response received.
 */
onRateInfo?: (remaining: string | null, reset: string | null) => void;
/** Extra request headers; spread after fetchHtml's defaults, so same-named keys here win. */
headers?: Record<string, string>;
}
/** Longest delay `setTimeout` honours; larger values overflow and fire almost immediately. */
const MAX_TIMER_DELAY_MS = 2147483647;

/** Resolve after `ms` milliseconds. */
function sleepMs(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Read a header expressed as a number of seconds; `undefined` when absent or not numeric. */
function parseHeaderSeconds(raw: string | null): number | undefined {
  if (raw === null || raw.trim() === "") {
    return undefined;
  }
  const seconds = Number(raw);
  return Number.isFinite(seconds) ? seconds : undefined;
}

/**
 * Fetch HTML content from a URL with automatic retries, a per-attempt timeout,
 * and exponential backoff.
 *
 * Retries happen on HTTP 429, on 5xx responses, on timeouts, and on network
 * failures, up to `opts.maxRetries` times. Other non-success statuses fail
 * immediately.
 *
 * @param url - The URL to fetch
 * @param delayMs - Delay in milliseconds applied after a successful request (rate limiting)
 * @param opts - Optional retry, timeout, header, and rate-info settings
 * @returns The HTML content as a string
 * @throws HttpError when the server answers with a non-retryable status, or a 5xx after the last retry
 * @throws RateLimitError when the server still answers 429 after the last retry
 * @throws NetworkError when the request times out or fails at the network level after the last retry
 */
export async function fetchHtml(
  url: string,
  delayMs: number,
  opts?: FetchHtmlOptions
): Promise<string> {
  const maxRetries = opts?.maxRetries ?? 3;
  const retryBaseMs = opts?.retryBaseMs ?? 1000;
  const timeoutMs = opts?.timeoutMs ?? 30000;

  const defaultHeaders: Record<string, string> = {
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "upgrade-insecure-requests": "1",
    "user-agent":
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
  };
  // Caller-supplied headers override the defaults.
  const headers = { ...defaultHeaders, ...opts?.headers };

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const canRetry = attempt < maxRetries;

    try {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

      let res: Response;
      let html = "";
      try {
        res = await fetch(url, {
          method: "GET",
          headers,
          signal: controller.signal,
        });

        opts?.onRateInfo?.(
          res.headers.get("X-RateLimit-Remaining"),
          res.headers.get("X-RateLimit-Reset")
        );

        // Read the body while the timeout is still armed so a stalled
        // download is aborted as well.
        if (res.ok) {
          html = await res.text();
        }
      } finally {
        // Always disarm the timer, including when fetch rejects; otherwise it
        // lingers for the full timeout and keeps the event loop alive.
        clearTimeout(timeoutId);
      }

      if (res.ok) {
        // Respect per-request delay to maintain rate limiting
        await sleepMs(delayMs);
        return html;
      }

      // Handle rate limiting
      if (res.status === 429) {
        const resetSeconds = parseHeaderSeconds(
          res.headers.get("X-RateLimit-Reset")
        );
        // Retry-After (delta-seconds form) is the standard hint on a 429;
        // fall back to X-RateLimit-Reset, then to computed backoff.
        const hintSeconds =
          parseHeaderSeconds(res.headers.get("Retry-After")) ?? resetSeconds;

        if (canRetry) {
          const waitMs =
            hintSeconds === undefined
              ? calculateBackoffDelay(attempt, retryBaseMs)
              : Math.min(Math.max(0, hintSeconds * 1000), MAX_TIMER_DELAY_MS);
          await sleepMs(waitMs);
          continue;
        }

        throw new RateLimitError(
          `Rate limit exceeded for ${url}`,
          url,
          resetSeconds
        );
      }

      // Retry on server errors
      if (res.status >= 500 && res.status < 600 && canRetry) {
        await sleepMs(calculateBackoffDelay(attempt, retryBaseMs));
        continue;
      }

      throw new HttpError(
        `Request failed with status ${res.status}`,
        res.status,
        url
      );
    } catch (err) {
      // Errors produced above are final; pass them through unchanged.
      if (
        err instanceof RateLimitError ||
        err instanceof HttpError ||
        err instanceof NetworkError
      ) {
        throw err;
      }

      // Timeouts and network failures share the same retry policy.
      if (canRetry) {
        await sleepMs(calculateBackoffDelay(attempt, retryBaseMs));
        continue;
      }

      // An abort triggered by the timeout surfaces as an "AbortError".
      if (err instanceof Error && err.name === "AbortError") {
        throw new NetworkError(`Request timeout for ${url}`, url, err);
      }

      throw new NetworkError(
        `Network error fetching ${url}: ${err instanceof Error ? err.message : String(err)}`,
        url,
        err instanceof Error ? err : undefined
      );
    }
  }

  // Only reachable when the loop never runs (e.g. a negative maxRetries).
  throw new NetworkError(`Exhausted retries without response for ${url}`, url);
}