refactor: increase kijiji scraping request rate to 4 rps
Signed-off-by: Dmytro Stanchiev <git@dmytros.dev>
@@ -756,51 +756,75 @@ export default async function fetchKijijiItems(
         `\nFound ${newListingLinks.length} new listing links on page ${page}. Total unique: ${seenUrls.size}`,
       );
 
-      // Fetch details for this page's listings
-      const progressBar = new cliProgress.SingleBar(
-        {},
-        cliProgress.Presets.shades_classic,
-      );
+      // Fetch details for this page's listings with controlled concurrency
+      const isTTY = process.stdout?.isTTY ?? false;
+      const progressBar = isTTY
+        ? new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic)
+        : null;
       const totalProgress = newListingLinks.length;
       let currentProgress = 0;
-      progressBar.start(totalProgress, currentProgress);
+      progressBar?.start(totalProgress, currentProgress);
 
-      for (const link of newListingLinks) {
-        try {
-          const html = await fetchHtml(link, DELAY_MS, {
-            onRateInfo: (remaining, reset) => {
-              if (remaining && reset) {
-                console.log(
-                  `\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
-                );
-              }
-            },
-          });
-          const parsed = await parseDetailedListing(
-            html,
-            BASE_URL,
-            finalListingOptions,
-          );
-          if (parsed) {
-            allListings.push(parsed);
-          }
-        } catch (err) {
-          if (err instanceof HttpError) {
-            console.error(
-              `\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
-            );
-          } else {
-            console.error(
-              `\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
-            );
-          }
-        } finally {
-          currentProgress++;
-          progressBar.update(currentProgress);
-        }
-      }
-
-      progressBar.stop();
+      // Process in batches for controlled concurrency
+      const CONCURRENT_REQUESTS = REQUESTS_PER_SECOND * 2; // 2x rate for faster processing
+      const results: (DetailedListing | null)[] = [];
+
+      for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {
+        const batch = newListingLinks.slice(i, i + CONCURRENT_REQUESTS);
+        const batchPromises = batch.map(async (link) => {
+          try {
+            const html = await fetchHtml(link, 0, {
+              // No per-request delay, batch handles rate limit
+              onRateInfo: (remaining, reset) => {
+                if (remaining && reset) {
+                  console.log(
+                    `\nItem - Rate limit remaining: ${remaining}, reset in: ${reset}s`,
+                  );
+                }
+              },
+            });
+            const parsed = await parseDetailedListing(
+              html,
+              BASE_URL,
+              finalListingOptions,
+            );
+            return parsed;
+          } catch (err) {
+            if (err instanceof HttpError) {
+              console.error(
+                `\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
+              );
+            } else {
+              console.error(
+                `\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
+              );
+            }
+            return null;
+          } finally {
+            currentProgress++;
+            progressBar?.update(currentProgress);
+            if (!progressBar) {
+              console.log(`Progress: ${currentProgress}/${totalProgress}`);
+            }
+          }
+        });
+
+        const batchResults = await Promise.all(batchPromises);
+        results.push(...batchResults);
+
+        // Wait between batches to respect rate limit
+        if (i + CONCURRENT_REQUESTS < newListingLinks.length) {
+          await new Promise((resolve) =>
+            setTimeout(resolve, DELAY_MS * batch.length),
+          );
+        }
+      }
+
+      allListings.push(
+        ...results.filter((r): r is DetailedListing => r !== null),
+      );
+
+      progressBar?.stop();
 
       // If we got fewer results than expected (40 per page), we've reached the end
       if (searchResults.length < 40) {