fix: align scraper unstable mode behavior
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import type {
|
import type {
|
||||||
|
HTMLString,
|
||||||
UnstableListingBuckets,
|
UnstableListingBuckets,
|
||||||
UnstableListingModeOptions,
|
UnstableListingModeOptions,
|
||||||
} from "../types/common";
|
} from "../types/common";
|
||||||
@@ -114,7 +115,7 @@ function parseEbayListings(
|
|||||||
if (!href.startsWith("http")) {
|
if (!href.startsWith("http")) {
|
||||||
href = href.startsWith("//")
|
href = href.startsWith("//")
|
||||||
? `https:${href}`
|
? `https:${href}`
|
||||||
: `https://www.ebay.com${href}`;
|
: `https://www.ebay.ca${href}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the container - go up several levels to find the item container
|
// Find the container - go up several levels to find the item container
|
||||||
@@ -397,6 +398,8 @@ export default async function fetchEbayItems(
|
|||||||
} = {},
|
} = {},
|
||||||
unstableMode: UnstableListingModeOptions = {},
|
unstableMode: UnstableListingModeOptions = {},
|
||||||
) {
|
) {
|
||||||
|
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
|
||||||
|
|
||||||
const finalizeResults = (
|
const finalizeResults = (
|
||||||
listings: EbayListingDetails[],
|
listings: EbayListingDetails[],
|
||||||
): EbayListingDetails[] | UnstableListingBuckets<EbayListingDetails> => {
|
): EbayListingDetails[] | UnstableListingBuckets<EbayListingDetails> => {
|
||||||
@@ -436,7 +439,7 @@ export default async function fetchEbayItems(
|
|||||||
|
|
||||||
const searchUrl = `https://www.ebay.ca/sch/i.html?${urlParams.toString()}`;
|
const searchUrl = `https://www.ebay.ca/sch/i.html?${urlParams.toString()}`;
|
||||||
|
|
||||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
|
||||||
|
|
||||||
console.log(`Fetching eBay search: ${searchUrl}`);
|
console.log(`Fetching eBay search: ${searchUrl}`);
|
||||||
|
|
||||||
|
|||||||
@@ -1086,6 +1086,8 @@ export default async function fetchFacebookItems(
|
|||||||
MAX_ITEMS = 25,
|
MAX_ITEMS = 25,
|
||||||
unstableMode: UnstableListingModeOptions = {},
|
unstableMode: UnstableListingModeOptions = {},
|
||||||
) {
|
) {
|
||||||
|
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
|
||||||
|
|
||||||
const finalizeResults = (
|
const finalizeResults = (
|
||||||
listings: FacebookListingDetails[],
|
listings: FacebookListingDetails[],
|
||||||
): FacebookListingDetails[] | UnstableListingBuckets<FacebookListingDetails> => {
|
): FacebookListingDetails[] | UnstableListingBuckets<FacebookListingDetails> => {
|
||||||
@@ -1093,7 +1095,11 @@ export default async function fetchFacebookItems(
|
|||||||
return listings.slice(0, MAX_ITEMS);
|
return listings.slice(0, MAX_ITEMS);
|
||||||
}
|
}
|
||||||
|
|
||||||
return classifyUnstableListings(listings.slice(0, MAX_ITEMS));
|
const classified = classifyUnstableListings(listings);
|
||||||
|
return {
|
||||||
|
results: classified.results.slice(0, MAX_ITEMS),
|
||||||
|
unstableResults: classified.unstableResults,
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
const cookies = await ensureFacebookCookies();
|
const cookies = await ensureFacebookCookies();
|
||||||
@@ -1107,7 +1113,7 @@ export default async function fetchFacebookItems(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
|
||||||
|
|
||||||
// Encode search query for URL
|
// Encode search query for URL
|
||||||
const encodedQuery = encodeURIComponent(SEARCH_QUERY);
|
const encodedQuery = encodeURIComponent(SEARCH_QUERY);
|
||||||
|
|||||||
@@ -725,6 +725,8 @@ export default async function fetchKijijiItems(
|
|||||||
listingOptions: ListingFetchOptions = {},
|
listingOptions: ListingFetchOptions = {},
|
||||||
unstableMode: UnstableListingModeOptions = {},
|
unstableMode: UnstableListingModeOptions = {},
|
||||||
) {
|
) {
|
||||||
|
const requestsPerSecond = REQUESTS_PER_SECOND > 0 ? REQUESTS_PER_SECOND : 1;
|
||||||
|
|
||||||
const finalizeResults = (
|
const finalizeResults = (
|
||||||
listings: DetailedListing[],
|
listings: DetailedListing[],
|
||||||
): DetailedListing[] | UnstableListingBuckets<DetailedListing> => {
|
): DetailedListing[] | UnstableListingBuckets<DetailedListing> => {
|
||||||
@@ -735,7 +737,7 @@ export default async function fetchKijijiItems(
|
|||||||
return classifyUnstableListings(listings);
|
return classifyUnstableListings(listings);
|
||||||
};
|
};
|
||||||
|
|
||||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
const DELAY_MS = Math.max(1, Math.floor(1000 / requestsPerSecond));
|
||||||
|
|
||||||
// Load Kijiji cookies (optional - helps bypass bot detection)
|
// Load Kijiji cookies (optional - helps bypass bot detection)
|
||||||
const cookies = await loadCookiesOptional(
|
const cookies = await loadCookiesOptional(
|
||||||
@@ -824,7 +826,7 @@ export default async function fetchKijijiItems(
|
|||||||
progressBar?.start(totalProgress, currentProgress);
|
progressBar?.start(totalProgress, currentProgress);
|
||||||
|
|
||||||
// Process in batches for controlled concurrency
|
// Process in batches for controlled concurrency
|
||||||
const CONCURRENT_REQUESTS = REQUESTS_PER_SECOND * 2; // 2x rate for faster processing
|
const CONCURRENT_REQUESTS = Math.max(1, Math.floor(requestsPerSecond * 2)); // 2x rate for faster processing
|
||||||
const results: (DetailedListing | null)[] = [];
|
const results: (DetailedListing | null)[] = [];
|
||||||
|
|
||||||
for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {
|
for (let i = 0; i < newListingLinks.length; i += CONCURRENT_REQUESTS) {
|
||||||
|
|||||||
@@ -38,9 +38,7 @@ describe("eBay Scraper Cookie Handling", () => {
|
|||||||
const warnMock = mock(() => {});
|
const warnMock = mock(() => {});
|
||||||
console.warn = warnMock;
|
console.warn = warnMock;
|
||||||
|
|
||||||
await fetchEbayItems("laptop", 1000, {
|
await fetchEbayItems("laptop", 1000);
|
||||||
cookies: "s=from-request",
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(global.fetch).toHaveBeenCalledTimes(1);
|
expect(global.fetch).toHaveBeenCalledTimes(1);
|
||||||
|
|
||||||
@@ -53,6 +51,30 @@ describe("eBay Scraper Cookie Handling", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("keeps relative item links on the ebay.ca host", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<li class="s-item">
|
||||||
|
<a href="/itm/123"></a>
|
||||||
|
<h3>Stable Laptop Bundle</h3>
|
||||||
|
<span class="s-item__price">CA $100.00</span>
|
||||||
|
</li>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("laptop", 1000);
|
||||||
|
|
||||||
|
expect(results).toEqual([
|
||||||
|
expect.objectContaining({ url: "https://www.ebay.ca/itm/123" }),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
|
|||||||
@@ -521,7 +521,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
test("unstable mode keeps MAX_ITEMS as the classification boundary", async () => {
|
test("unstable mode classifies before the final MAX_ITEMS limit", async () => {
|
||||||
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||||
payload: {
|
payload: {
|
||||||
resultGroups: [
|
resultGroups: [
|
||||||
@@ -545,10 +545,10 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
node: {
|
node: {
|
||||||
listing: {
|
listing: {
|
||||||
id: "2",
|
id: "2",
|
||||||
marketplace_listing_title: "Boundary Cheap Chair",
|
marketplace_listing_title: "Second Boundary Stable Chair",
|
||||||
listing_price: {
|
listing_price: {
|
||||||
amount: "50.00",
|
amount: "110.00",
|
||||||
formatted_amount: "CA$50",
|
formatted_amount: "CA$110",
|
||||||
currency: "CAD",
|
currency: "CAD",
|
||||||
},
|
},
|
||||||
is_live: true,
|
is_live: true,
|
||||||
@@ -559,10 +559,10 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
node: {
|
node: {
|
||||||
listing: {
|
listing: {
|
||||||
id: "3",
|
id: "3",
|
||||||
marketplace_listing_title: "Past Boundary Chair",
|
marketplace_listing_title: "Past Boundary Cheap Chair",
|
||||||
listing_price: {
|
listing_price: {
|
||||||
amount: "110.00",
|
amount: "70.00",
|
||||||
formatted_amount: "CA$110",
|
formatted_amount: "CA$70",
|
||||||
currency: "CAD",
|
currency: "CAD",
|
||||||
},
|
},
|
||||||
is_live: true,
|
is_live: true,
|
||||||
@@ -591,9 +591,12 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
expect(results).toEqual({
|
expect(results).toEqual({
|
||||||
results: [expect.objectContaining({ title: "Boundary Stable Chair" })],
|
results: [
|
||||||
|
expect.objectContaining({ title: "Boundary Stable Chair" }),
|
||||||
|
expect.objectContaining({ title: "Second Boundary Stable Chair" }),
|
||||||
|
],
|
||||||
unstableResults: [
|
unstableResults: [
|
||||||
expect.objectContaining({ title: "Boundary Cheap Chair" }),
|
expect.objectContaining({ title: "Past Boundary Cheap Chair" }),
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user