fix: align scraper unstable mode behavior

This commit is contained in:
2026-04-22 23:36:00 -04:00
parent c7fc8352ac
commit 08edfa8097
5 changed files with 54 additions and 18 deletions

View File

@@ -1,5 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import type { import type {
HTMLString,
UnstableListingBuckets, UnstableListingBuckets,
UnstableListingModeOptions, UnstableListingModeOptions,
} from "../types/common"; } from "../types/common";
@@ -114,7 +115,7 @@ function parseEbayListings(
if (!href.startsWith("http")) { if (!href.startsWith("http")) {
href = href.startsWith("//") href = href.startsWith("//")
? `https:${href}` ? `https:${href}`
: `https://www.ebay.com${href}`; : `https://www.ebay.ca${href}`;
} }
// Find the container - go up several levels to find the item container // Find the container - go up several levels to find the item container
@@ -397,6 +398,8 @@ export default async function fetchEbayItems(
} = {}, } = {},
unstableMode: UnstableListingModeOptions = {}, unstableMode: UnstableListingModeOptions = {},
) { ) {
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
const finalizeResults = ( const finalizeResults = (
listings: EbayListingDetails[], listings: EbayListingDetails[],
): EbayListingDetails[] | UnstableListingBuckets<EbayListingDetails> => { ): EbayListingDetails[] | UnstableListingBuckets<EbayListingDetails> => {
@@ -436,7 +439,7 @@ export default async function fetchEbayItems(
const searchUrl = `https://www.ebay.ca/sch/i.html?${urlParams.toString()}`; const searchUrl = `https://www.ebay.ca/sch/i.html?${urlParams.toString()}`;
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
console.log(`Fetching eBay search: ${searchUrl}`); console.log(`Fetching eBay search: ${searchUrl}`);

View File

@@ -1086,6 +1086,8 @@ export default async function fetchFacebookItems(
MAX_ITEMS = 25, MAX_ITEMS = 25,
unstableMode: UnstableListingModeOptions = {}, unstableMode: UnstableListingModeOptions = {},
) { ) {
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
const finalizeResults = ( const finalizeResults = (
listings: FacebookListingDetails[], listings: FacebookListingDetails[],
): FacebookListingDetails[] | UnstableListingBuckets<FacebookListingDetails> => { ): FacebookListingDetails[] | UnstableListingBuckets<FacebookListingDetails> => {
@@ -1093,7 +1095,11 @@ export default async function fetchFacebookItems(
return listings.slice(0, MAX_ITEMS); return listings.slice(0, MAX_ITEMS);
} }
return classifyUnstableListings(listings.slice(0, MAX_ITEMS)); const classified = classifyUnstableListings(listings);
return {
results: classified.results.slice(0, MAX_ITEMS),
unstableResults: classified.unstableResults,
};
}; };
const cookies = await ensureFacebookCookies(); const cookies = await ensureFacebookCookies();
@@ -1107,7 +1113,7 @@ export default async function fetchFacebookItems(
); );
} }
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
// Encode search query for URL // Encode search query for URL
const encodedQuery = encodeURIComponent(SEARCH_QUERY); const encodedQuery = encodeURIComponent(SEARCH_QUERY);

View File

@@ -725,6 +725,8 @@ export default async function fetchKijijiItems(
listingOptions: ListingFetchOptions = {}, listingOptions: ListingFetchOptions = {},
unstableMode: UnstableListingModeOptions = {}, unstableMode: UnstableListingModeOptions = {},
) { ) {
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
const finalizeResults = ( const finalizeResults = (
listings: DetailedListing[], listings: DetailedListing[],
): DetailedListing[] | UnstableListingBuckets<DetailedListing> => { ): DetailedListing[] | UnstableListingBuckets<DetailedListing> => {
@@ -735,7 +737,7 @@ export default async function fetchKijijiItems(
return classifyUnstableListings(listings); return classifyUnstableListings(listings);
}; };
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
// Load Kijiji cookies (optional - helps bypass bot detection) // Load Kijiji cookies (optional - helps bypass bot detection)
const cookies = await loadCookiesOptional( const cookies = await loadCookiesOptional(
@@ -824,7 +826,7 @@ export default async function fetchKijijiItems(
progressBar?.start(totalProgress, currentProgress); progressBar?.start(totalProgress, currentProgress);
// Process in batches for controlled concurrency // Process in batches for controlled concurrency
const CONCURRENT_REQUESTS = REQUESTS_PER_SECOND * 2; // 2x rate for faster processing const CONCURRENT_REQUESTS = Math.max(1, Math.floor(requestsPerSecond * 2)); // 2x rate for faster processing
const results: (DetailedListing | null)[] = []; const results: (DetailedListing | null)[] = [];
for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) { for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {

View File

@@ -38,9 +38,7 @@ describe("eBay Scraper Cookie Handling", () => {
const warnMock = mock(() => {}); const warnMock = mock(() => {});
console.warn = warnMock; console.warn = warnMock;
await fetchEbayItems("laptop", 1000, { await fetchEbayItems("laptop", 1000);
cookies: "s=from-request",
});
expect(global.fetch).toHaveBeenCalledTimes(1); expect(global.fetch).toHaveBeenCalledTimes(1);
@@ -53,6 +51,30 @@ describe("eBay Scraper Cookie Handling", () => {
); );
}); });
test("keeps relative item links on the ebay.ca host", async () => {
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () =>
Promise.resolve(`
<html><body>
<li class="s-item">
<a href="/itm/123"></a>
<h3>Stable Laptop Bundle</h3>
<span class="s-item__price">CA $100.00</span>
</li>
</body></html>
`),
}),
) as typeof fetch;
const results = await fetchEbayItems("laptop", 1000);
expect(results).toEqual([
expect.objectContaining({ url: "https://www.ebay.ca/itm/123" }),
]);
});
test("returns results and unstableResults when unstable mode is enabled", async () => { test("returns results and unstableResults when unstable mode is enabled", async () => {
global.fetch = mock(() => global.fetch = mock(() =>
Promise.resolve({ Promise.resolve({

View File

@@ -521,7 +521,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
}); });
}); });
test("unstable mode keeps MAX_ITEMS as the classification boundary", async () => { test("unstable mode classifies before the final MAX_ITEMS limit", async () => {
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({ const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
payload: { payload: {
resultGroups: [ resultGroups: [
@@ -545,10 +545,10 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
node: { node: {
listing: { listing: {
id: "2", id: "2",
marketplace_listing_title: "Boundary Cheap Chair", marketplace_listing_title: "Second Boundary Stable Chair",
listing_price: { listing_price: {
amount: "50.00", amount: "110.00",
formatted_amount: "CA$50", formatted_amount: "CA$110",
currency: "CAD", currency: "CAD",
}, },
is_live: true, is_live: true,
@@ -559,10 +559,10 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
node: { node: {
listing: { listing: {
id: "3", id: "3",
marketplace_listing_title: "Past Boundary Chair", marketplace_listing_title: "Past Boundary Cheap Chair",
listing_price: { listing_price: {
amount: "110.00", amount: "70.00",
formatted_amount: "CA$110", formatted_amount: "CA$70",
currency: "CAD", currency: "CAD",
}, },
is_live: true, is_live: true,
@@ -591,9 +591,12 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
}); });
expect(results).toEqual({ expect(results).toEqual({
results: [expect.objectContaining({ title: "Boundary Stable Chair" })], results: [
expect.objectContaining({ title: "Boundary Stable Chair" }),
expect.objectContaining({ title: "Second Boundary Stable Chair" }),
],
unstableResults: [ unstableResults: [
expect.objectContaining({ title: "Boundary Cheap Chair" }), expect.objectContaining({ title: "Past Boundary Cheap Chair" }),
], ],
}); });
}); });