From 6e50ebf90106d65b0e5cea4fbf72f77d82682083 Mon Sep 17 00:00:00 2001 From: Dmytro Stanchiev Date: Wed, 29 Apr 2026 13:14:20 -0400 Subject: [PATCH] refactor: share scraper http fetching --- packages/core/src/scrapers/ebay.ts | 34 +---- packages/core/src/scrapers/facebook.ts | 166 ++++++------------------- packages/core/src/utils/http.ts | 53 +++++--- packages/core/src/utils/logger.ts | 3 + packages/core/test/ebay-core.test.ts | 19 +++ packages/core/test/http.test.ts | 19 +++ 6 files changed, 121 insertions(+), 173 deletions(-) diff --git a/packages/core/src/scrapers/ebay.ts b/packages/core/src/scrapers/ebay.ts index 25254d7..a0d0864 100644 --- a/packages/core/src/scrapers/ebay.ts +++ b/packages/core/src/scrapers/ebay.ts @@ -9,7 +9,7 @@ import { ensureCookies, formatCookiesForHeader, } from "../utils/cookies"; -import { delay } from "../utils/delay"; +import { fetchHtml, HttpError } from "../utils/http"; import { logger } from "../utils/logger"; import { classifyUnstableListings } from "../utils/unstable"; @@ -102,17 +102,6 @@ function parseEbayPrice( return { cents, currency }; } -class HttpError extends Error { - constructor( - message: string, - public readonly status: number, - public readonly url: string, - ) { - super(message); - this.name = "HttpError"; - } -} - // ----------------------------- Parsing ----------------------------- /** @@ -500,22 +489,7 @@ export default async function fetchEbayItems( headers.Cookie = cookies; } - const res = await fetch(searchUrl, { - method: "GET", - headers, - }); - - if (!res.ok) { - throw new HttpError( - `Request failed with status ${res.status}`, - res.status, - searchUrl, - ); - } - - const searchHtml = await res.text(); - // Respect per-request delay to keep at or under REQUESTS_PER_SECOND - await delay(DELAY_MS); + const searchHtml = await fetchHtml(searchUrl, DELAY_MS, { headers }); logger.log(`\nParsing eBay listings...`); @@ -538,8 +512,8 @@ export default async function fetchEbayItems( return finalizeResults(filteredListings); } catch (err) { if (err instanceof HttpError) { - console.error( - `Failed to fetch eBay search (${err.status}): ${err.message}`, + logger.error( + `Failed to fetch eBay search (${err.statusCode}): ${err.message}`, ); return finalizeResults([]); } diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index b70d98c..12fb129 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -12,9 +12,8 @@ import { formatCookiesForHeader, parseCookieString, } from "../utils/cookies"; -import { delay } from "../utils/delay"; import { formatCentsToCurrency } from "../utils/format"; -import { isRecord } from "../utils/http"; +import { fetchHtml, HttpError, isRecord, RateLimitError } from "../utils/http"; import { logger } from "../utils/logger"; import { classifyUnstableListings } from "../utils/unstable"; @@ -219,17 +218,6 @@ export async function ensureFacebookCookies(): Promise { return ensureCookies(FACEBOOK_COOKIE_CONFIG); } -class HttpError extends Error { - constructor( - message: string, - public readonly status: number, - public readonly url: string, - ) { - super(message); - this.name = "HttpError"; - } -} - // ----------------------------- Extraction Metrics ----------------------------- /** @@ -274,112 +262,21 @@ function logExtractionMetrics(success: boolean, itemId?: string) { // ----------------------------- HTTP Client ----------------------------- -/** - Fetch HTML with a basic retry strategy and simple rate-limit delay between calls. - - Retries on 429 and 5xx - - Respects X-RateLimit-Reset when present (seconds) - - Supports custom cookies for Facebook authentication -*/ -async function fetchHtml( - url: string, - DELAY_MS: number, - opts?: { - maxRetries?: number; - retryBaseMs?: number; - onRateInfo?: (remaining: string | null, reset: string | null) => void; - cookies?: string; - }, -): Promise<{ html: HTMLString; responseUrl: string }> { - const maxRetries = opts?.maxRetries ?? 3; - const retryBaseMs = opts?.retryBaseMs ?? 500; - let lastRateLimitError: HttpError | null = null; - - for (let attempt = 0; attempt <= maxRetries; attempt++) { - try { - const headers: Record = { - accept: - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "accept-language": "en-GB,en-US;q=0.9,en;q=0.8", - "accept-encoding": "gzip, deflate, br", - "cache-control": "no-cache", - "upgrade-insecure-requests": "1", - "sec-fetch-dest": "document", - "sec-fetch-mode": "navigate", - "sec-fetch-site": "none", - "sec-fetch-user": "?1", - "user-agent": - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - }; - - // Add cookies if provided - if (opts?.cookies) { - headers.cookie = opts.cookies; - } - - const res = await fetch(url, { - method: "GET", - headers, - }); - - const rateLimitRemaining = res.headers.get("X-RateLimit-Remaining"); - const rateLimitReset = res.headers.get("X-RateLimit-Reset"); - opts?.onRateInfo?.(rateLimitRemaining, rateLimitReset); - - if (!res.ok) { - // Respect 429 reset if provided - if (res.status === 429) { - lastRateLimitError = new HttpError( - `Request failed with status ${res.status}`, - res.status, - url, - ); - const resetSeconds = rateLimitReset - ? Number(rateLimitReset) - : Number.NaN; - const waitMs = Number.isFinite(resetSeconds) - ? Math.max(0, resetSeconds * 1000) - : (attempt + 1) * retryBaseMs; - if (attempt >= maxRetries) { - throw lastRateLimitError; - } - await delay(waitMs); - continue; - } - // For Facebook, 400 often means authentication required - // Don't retry 4xx client errors except 429 - if (res.status >= 400 && res.status < 500 && res.status !== 429) { - throw new HttpError( - `Request failed with status ${res.status} (Facebook may require authentication cookies for access)`, - res.status, - url, - ); - } - // Retry on 5xx - if (res.status >= 500 && res.status < 600 && attempt < maxRetries) { - await delay((attempt + 1) * retryBaseMs); - continue; - } - throw new HttpError( - `Request failed with status ${res.status}`, - res.status, - url, - ); - } - - const html = await res.text(); - // Respect per-request delay to keep at or under REQUESTS_PER_SECOND - await delay(DELAY_MS); - return { html, responseUrl: res.url || url }; - } catch (err) { - if (err instanceof HttpError) { - throw err; - } - if (attempt >= maxRetries) throw err; - await delay((attempt + 1) * retryBaseMs); - } - } - - throw lastRateLimitError ?? new Error("Exhausted retries without response"); +function createFacebookHeaders(cookies: string): Record { + return { + accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "en-GB,en-US;q=0.9,en;q=0.8", + "cache-control": "no-cache", + "upgrade-insecure-requests": "1", + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "none", + "sec-fetch-user": "?1", + "user-agent": + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + cookie: cookies, + }; } // ----------------------------- Parsing ----------------------------- @@ -1157,6 +1054,8 @@ export default async function fetchFacebookItems( try { const response = await fetchHtml(searchUrl, DELAY_MS, { maxRetries: 3, + includeResponseUrl: true, + headers: createFacebookHeaders(cookiesHeader), onRateInfo: (remaining, reset) => { if (remaining && reset) { logger.log( @@ -1164,22 +1063,27 @@ export default async function fetchFacebookItems( ); } }, - cookies: cookiesHeader, }); searchHtml = response.html; searchResponseUrl = response.responseUrl; } catch (err) { if (err instanceof HttpError) { logger.warn( - `\nFacebook marketplace access failed (${err.status}): ${err.message}`, + `\nFacebook marketplace access failed (${err.statusCode}): ${err.message}`, ); - if (err.status === 400 || err.status === 401 || err.status === 403) { + if (err.statusCode === 400 || err.statusCode === 401 || err.statusCode === 403) { logger.warn( "This might indicate invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.", ); } return finalizeResults([]); } + if (err instanceof RateLimitError) { + logger.warn( + `\nFacebook marketplace access rate limited: ${err.message}`, + ); + return finalizeResults([]); + } throw err; } @@ -1261,6 +1165,8 @@ export async function fetchFacebookItem( let itemResponseUrl = itemUrl; try { const response = await fetchHtml(itemUrl, 1000, { + includeResponseUrl: true, + headers: createFacebookHeaders(cookiesHeader), onRateInfo: (remaining, reset) => { if (remaining && reset) { logger.log( @@ -1268,18 +1174,17 @@ export async function fetchFacebookItem( ); } }, - cookies: cookiesHeader, }); itemHtml = response.html; itemResponseUrl = response.responseUrl; } catch (err) { if (err instanceof HttpError) { logger.warn( - `\nFacebook marketplace item access failed (${err.status}): ${err.message}`, + `\nFacebook marketplace item access failed (${err.statusCode}): ${err.message}`, ); // Enhanced error handling based on status codes - switch (err.status) { + switch (err.statusCode) { case 400: case 401: case 403: @@ -1305,10 +1210,19 @@ export async function fetchFacebookItem( ); break; default: - logger.warn(`Unexpected error status: ${err.status}`); + logger.warn(`Unexpected error status: ${err.statusCode}`); } return null; } + if (err instanceof RateLimitError) { + logger.warn( + `\nFacebook marketplace item rate limited for item ${itemId}: ${err.message}`, + ); + logger.warn( + "Rate limited: Too many requests. Facebook is blocking access temporarily.", + ); + return null; + } throw err; } diff --git a/packages/core/src/utils/http.ts b/packages/core/src/utils/http.ts index c624c97..e8ab2f2 100644 --- a/packages/core/src/utils/http.ts +++ b/packages/core/src/utils/http.ts @@ -1,3 +1,4 @@ +import type { HTMLString } from "../types/common"; import { delay } from "./delay"; /** Custom error class for HTTP-related failures */ @@ -60,10 +61,20 @@ export function isRecord(value: unknown): value is Record { /** * Calculate exponential backoff delay with jitter */ -function calculateBackoffDelay(attempt: number, baseMs: number): number { +function calculateBackoffDelay( + attempt: number, + baseMs: number, + jitter: () => number = Math.random, +): number { const exponentialDelay = baseMs * 2 ** attempt; - const jitter = Math.random() * 0.1 * exponentialDelay; // 10% jitter - return Math.min(exponentialDelay + jitter, 30000); // Cap at 30 seconds + const jitterDelay = jitter() * 0.1 * exponentialDelay; // 10% jitter + return Math.min(exponentialDelay + jitterDelay, 30000); // Cap at 30 seconds +} + +/** Result type when includeResponseUrl is true */ +export interface FetchHtmlResult { + html: HTMLString; + responseUrl: string; } /** Options for fetchHtml */ @@ -73,6 +84,8 @@ export interface FetchHtmlOptions { timeoutMs?: number; onRateInfo?: (remaining: string | null, reset: string | null) => void; headers?: Record; + includeResponseUrl?: boolean; + jitter?: () => number; } /** @@ -80,14 +93,24 @@ export interface FetchHtmlOptions { * @param url - The URL to fetch * @param delayMs - Delay in milliseconds between requests (rate limiting) * @param opts - Optional fetch options - * @returns The HTML content as a string + * @returns The HTML content as a string, or an object with html and responseUrl * @throws HttpError, NetworkError, or RateLimitError on failure */ +export async function fetchHtml( + url: string, + delayMs: number, + opts: FetchHtmlOptions & { includeResponseUrl: true }, +): Promise; export async function fetchHtml( url: string, delayMs: number, opts?: FetchHtmlOptions, -): Promise { +): Promise; +export async function fetchHtml( + url: string, + delayMs: number, + opts?: FetchHtmlOptions, +): Promise { const maxRetries = opts?.maxRetries ?? 3; const retryBaseMs = opts?.retryBaseMs ?? 1000; const timeoutMs = opts?.timeoutMs ?? 30000; @@ -138,10 +161,10 @@ export async function fetchHtml( : Number.NaN; const waitMs = Number.isFinite(resetSeconds) ? Math.max(0, resetSeconds * 1000) - : calculateBackoffDelay(attempt, retryBaseMs); + : calculateBackoffDelay(attempt, retryBaseMs, opts?.jitter ?? Math.random); if (attempt < maxRetries) { - await new Promise((resolve) => setTimeout(resolve, waitMs)); + await delay(waitMs); continue; } throw new RateLimitError( @@ -153,9 +176,7 @@ export async function fetchHtml( // Retry on server errors if (res.status >= 500 && res.status < 600 && attempt < maxRetries) { - await new Promise((resolve) => - setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs)), - ); + await delay(calculateBackoffDelay(attempt, retryBaseMs, opts?.jitter ?? Math.random)); continue; } @@ -170,7 +191,9 @@ export async function fetchHtml( // Respect per-request delay to maintain rate limiting await delay(delayMs); - return html; + return opts?.includeResponseUrl + ? { html, responseUrl: res.url || url } + : html; } catch (err) { // Re-throw known errors if ( @@ -183,9 +206,7 @@ export async function fetchHtml( if (err instanceof Error && err.name === "AbortError") { if (attempt < maxRetries) { - await new Promise((resolve) => - setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs)), - ); + await delay(calculateBackoffDelay(attempt, retryBaseMs, opts?.jitter ?? Math.random)); continue; } throw new NetworkError(`Request timeout for ${url}`, url, err); @@ -193,9 +214,7 @@ export async function fetchHtml( // Network or other errors if (attempt < maxRetries) { - await new Promise((resolve) => - setTimeout(resolve, calculateBackoffDelay(attempt, retryBaseMs)), - ); + await delay(calculateBackoffDelay(attempt, retryBaseMs, opts?.jitter ?? Math.random)); continue; } throw new NetworkError( diff --git a/packages/core/src/utils/logger.ts b/packages/core/src/utils/logger.ts index 30cd024..26c5c79 100644 --- a/packages/core/src/utils/logger.ts +++ b/packages/core/src/utils/logger.ts @@ -7,4 +7,7 @@ export const logger = { warn: (...args: Parameters) => { if (!isTest()) console.warn(...args); }, + error: (...args: Parameters) => { + if (!isTest()) console.error(...args); + }, }; diff --git a/packages/core/test/ebay-core.test.ts b/packages/core/test/ebay-core.test.ts index 2016fdb..69c8de1 100644 --- a/packages/core/test/ebay-core.test.ts +++ b/packages/core/test/ebay-core.test.ts @@ -32,6 +32,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(""), }), ) as unknown as typeof fetch; @@ -64,6 +65,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -88,6 +90,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -114,6 +117,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -146,6 +150,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -188,6 +193,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -214,6 +220,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -243,6 +250,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -272,6 +280,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -301,6 +310,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -343,6 +353,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -375,6 +386,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -407,6 +419,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -440,6 +453,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -467,6 +481,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -499,6 +514,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -529,6 +545,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -574,6 +591,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` @@ -612,6 +630,7 @@ describe("eBay Scraper Cookie Handling", () => { global.fetch = mock(() => Promise.resolve({ ok: true, + headers: { get: () => null }, text: () => Promise.resolve(` diff --git a/packages/core/test/http.test.ts b/packages/core/test/http.test.ts index 056b171..828c01b 100644 --- a/packages/core/test/http.test.ts +++ b/packages/core/test/http.test.ts @@ -38,4 +38,23 @@ describe("fetchHtml", () => { expect(scheduledDelays).not.toContain(1000); }); + + test("fetchHtml returns responseUrl when includeResponseUrl is true", async () => { + process.env.NODE_ENV = "test"; + global.fetch = mock(() => + Promise.resolve({ + ok: true, + status: 200, + url: "https://example.test/final", + headers: { get: () => null }, + text: () => Promise.resolve(""), + }), + ) as unknown as typeof fetch; + + const result = await fetchHtml("https://example.test", 0, { + includeResponseUrl: true, + }); + expect(result.html).toBe(""); + expect(result.responseUrl).toBe("https://example.test/final"); + }); });