perf: increase kijiji scraping request rate to 4 rps

Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
This commit is contained in:
2026-01-23 14:41:56 -05:00
parent f3839aba54
commit 65eb8d1724
3 changed files with 165 additions and 94 deletions

View File

@@ -756,51 +756,75 @@ export default async function fetchKijijiItems(
`\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`,
);
// Fetch details for this page's listings
const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
// Fetch details for this page's listings with controlled concurrency
const isTTY = process.stdout?.isTTY ?? false;
const progressBar = isTTY
? new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic)
: null;
const totalProgress = newListingLinks.length;
let currentProgress = 0;
progressBar.start(totalProgress, currentProgress);
progressBar?.start(totalProgress, currentProgress);
for (const link of newListingLinks) {
try {
const html = await fetchHtml(link, DELAY_MS, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
);
}
},
});
const parsed = await parseDetailedListing(
html,
BASE_URL,
finalListingOptions,
// Process in batches for controlled concurrency
const CONCURRENT_REQUESTS = REQUESTS_PER_SECOND * 2; // 2x rate for faster processing
const results: (DetailedListing | null)[] = [];
for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {
const batch = newListingLinks.slice(i, i + CONCURRENT_REQUESTS);
const batchPromises = batch.map(async (link) => {
try {
const html = await fetchHtml(link, 0, {
// No per-request delay, batch handles rate limit
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
`\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
);
}
},
});
const parsed = await parseDetailedListing(
html,
BASE_URL,
finalListingOptions,
);
return parsed;
} catch (err) {
if (err instanceof HttpError) {
console.error(
`\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
);
} else {
console.error(
`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
);
}
return null;
} finally {
currentProgress++;
progressBar?.update(currentProgress);
if (!progressBar) {
console.log(`Progress: ${currentProgress}/${totalProgress}`);
}
}
});
const batchResults = await Promise.all(batchPromises);
results.push(...batchResults);
// Wait between batches to respect rate limit
if (i + CONCURRENT_REQUESTS < newListingLinks.length) {
await new Promise((resolve) =>
setTimeout(resolve, DELAY_MS * batch.length),
);
if (parsed) {
allListings.push(parsed);
}
} catch (err) {
if (err instanceof HttpError) {
console.error(
`\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
);
} else {
console.error(
`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
);
}
} finally {
currentProgress++;
progressBar.update(currentProgress);
}
}
progressBar.stop();
allListings.push(
...results.filter((r): r is DetailedListing => r !== null),
);
progressBar?.stop();
// If we got fewer results than expected (40 per page), we've reached the end
if (searchResults.length < 40) {