fix: preserve free results and request pacing

This commit is contained in:
2026-04-23 05:40:42 -04:00
parent 13c0fec305
commit eb37e8814e
4 changed files with 162 additions and 10 deletions

View File

@@ -1205,7 +1205,8 @@ export default async function fetchFacebookItems(
// Filter to only priced items (already done in parseFacebookAds) // Filter to only priced items (already done in parseFacebookAds)
const pricedItems = items.filter( const pricedItems = items.filter(
(item) => item.listingPrice?.cents && item.listingPrice.cents > 0, (item) =>
typeof item.listingPrice?.cents === "number" && item.listingPrice.cents >= 0,
); );
progressBar.update(totalProgress); progressBar.update(totalProgress);

View File

@@ -889,15 +889,15 @@ export default async function fetchKijijiItems(
progressBar?.start(totalProgress, currentProgress); progressBar?.start(totalProgress, currentProgress);
// Process in batches for controlled concurrency // Process in batches for controlled concurrency
const CONCURRENT_REQUESTS = Math.max(1, Math.floor(requestsPerSecond * 2)); // 2x rate for faster processing const CONCURRENT_REQUESTS = 1;
const results: (DetailedListing | null)[] = []; const results: (DetailedListing | null)[] = [];
for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) { for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {
const batch = newListingLinks.slice(i, i + CONCURRENT_REQUESTS); const batch = newListingLinks.slice(i, i + CONCURRENT_REQUESTS);
const batchPromises = batch.map(async (link) => { const batchPromises = batch.map(async (link) => {
try { try {
const html = await fetchHtml(link, 0, { const html = await fetchHtml(link, DELAY_MS, {
// No per-request delay, batch handles rate limit // Per-request delay keeps detail fetches within REQUESTS_PER_SECOND.
onRateInfo: (remaining, reset) => { onRateInfo: (remaining, reset) => {
if (remaining && reset) { if (remaining && reset) {
console.log( console.log(
@@ -936,12 +936,6 @@ export default async function fetchKijijiItems(
const batchResults = await Promise.all(batchPromises); const batchResults = await Promise.all(batchPromises);
results.push(...batchResults); results.push(...batchResults);
// Wait between batches to respect rate limit
if (i + CONCURRENT_REQUESTS < newListingLinks.length) {
await new Promise((resolve) =>
setTimeout(resolve, DELAY_MS * batch.length),
);
}
} }
allListings.push( allListings.push(

View File

@@ -571,6 +571,56 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(results).toHaveLength(1); expect(results).toHaveLength(1);
}); });
// Regression test: a listing priced at 0 cents must come back from the public
// fetchFacebookItems entrypoint instead of being filtered out as "unpriced".
test("preserves free listings through the public fetch entrypoint", async () => {
  // A single live marketplace listing whose price is zero and is labelled "FREE".
  const freeListing = {
    id: "free-1",
    marketplace_listing_title: "Free Chair",
    listing_price: {
      amount: "0.00",
      formatted_amount: "FREE",
      currency: "CAD",
    },
    is_live: true,
  };

  // Search payload with one result group holding one edge for the free listing.
  const searchPayload = {
    payload: {
      resultGroups: [{ edges: [{ node: { listing: freeListing } }] }],
    },
  };

  // Page shell: a marker script followed by a script carrying the JSON payload.
  const serializedPayload = JSON.stringify(searchPayload);
  const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${serializedPayload}</script></body></html>`;

  // Every fetch call resolves to the same successful search response.
  global.fetch = mock(async () => ({
    ok: true,
    text: async () => mockSearchHtml,
    url: "https://www.facebook.com/marketplace/toronto/search?query=chair",
    headers: {
      get: () => null,
    },
  }));

  const results = await fetchFacebookItems("chair", 1, "toronto", 25);

  // Exactly one item is expected, with the zero price and "FREE" label intact.
  const freeChairMatcher = expect.objectContaining({
    title: "Free Chair",
    listingPrice: expect.objectContaining({
      cents: 0,
      amountFormatted: "FREE",
    }),
  });
  expect(results).toEqual([freeChairMatcher]);
});
test("returns results and unstableResults when unstable mode is enabled", async () => { test("returns results and unstableResults when unstable mode is enabled", async () => {
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({ const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
payload: { payload: {

View File

@@ -321,6 +321,113 @@ describe("fetchKijijiItems", () => {
]); ]);
}); });
// Verifies that fetchKijijiItems never has more than one detail-page fetch in
// flight at a time. Note that the assertions below check concurrency only
// (peak in-flight detail requests === 1); they do not measure wall-clock
// spacing between requests.
test("respects REQUESTS_PER_SECOND without concurrent detail fetch bursts", async () => {
  // The three listings that appear in the search page and have a detail page.
  const detailPages = [
    { title: "One", slug: "v-one/k0l0" },
    { title: "Two", slug: "v-two/k0l0" },
    { title: "Three", slug: "v-three/k0l0" },
  ];

  // Builds an HTML page whose __NEXT_DATA__ script holds the given Apollo
  // state. Lines are joined with "\n" so the output does not depend on how
  // this test file is indented.
  const nextDataPage = (apolloState: Record<string, unknown>): string =>
    [
      "",
      "<html>",
      '<script id="__NEXT_DATA__" type="application/json">',
      JSON.stringify({
        props: {
          pageProps: {
            __APOLLO_STATE__: apolloState,
          },
        },
      }),
      "</script>",
      "</html>",
      "",
    ].join("\n");

  // Search results page: one "Listing:<n>" entry (url + title) per listing.
  const searchHtml = nextDataPage(
    Object.fromEntries(
      detailPages.map(({ title, slug }, index) => [
        `Listing:${index + 1}`,
        { url: `/${slug}`, title },
      ]),
    ),
  );

  // Detail page for a single active, fixed-price offer.
  const listingHtml = (title: string, slug: string): string =>
    nextDataPage({
      "Listing:detail": {
        url: `/${slug}`,
        title,
        price: { amount: 10000, currency: "CAD", type: "FIXED" },
        type: "OFFER",
        status: "ACTIVE",
      },
    });

  // In-flight counter for detail requests and the highest value it reached.
  let activeDetailRequests = 0;
  let maxActiveDetailRequests = 0;

  global.fetch = mock(async (input: string | URL | Request) => {
    // A Request stringifies to "[object Request]", so read its `url` field;
    // strings and URL objects stringify to the address itself.
    const url = input instanceof Request ? input.url : input.toString();

    // Minimal successful response exposing only the fields this mock provides.
    const respondWith = (body: string) => ({
      ok: true,
      text: () => Promise.resolve(body),
      headers: { get: () => null },
      url,
    });

    // Presumably the search-results path for this query; it is served
    // immediately and is not counted as a detail request.
    if (url.includes("/k0c0l1700272")) {
      return respondWith(searchHtml);
    }

    // Any other request counts as a detail fetch. Holding it open for 5 ms
    // gives overlapping requests a window in which to be observed.
    activeDetailRequests++;
    maxActiveDetailRequests = Math.max(
      maxActiveDetailRequests,
      activeDetailRequests,
    );
    await new Promise((resolve) => setTimeout(resolve, 5));
    activeDetailRequests--;

    const page = detailPages.find(({ slug }) => url.endsWith(`/${slug}`));
    if (!page) {
      throw new Error(`Unexpected URL: ${url}`);
    }
    return respondWith(listingHtml(page.title, page.slug));
  }) as typeof fetch;

  const results = await fetchKijijiItems(
    "phone",
    1,
    "https://www.kijiji.ca",
    { maxPages: 1 },
  );

  // All three listings are returned, and detail fetches never overlapped.
  expect(results).toHaveLength(3);
  expect(maxActiveDetailRequests).toBe(1);
});
test("classifies the filtered Kijiji result set in unstable mode", async () => { test("classifies the filtered Kijiji result set in unstable mode", async () => {
const searchHtml = ` const searchHtml = `
<html> <html>