fix: finalize scraper unstable mode integration
This commit is contained in:
@@ -40,6 +40,16 @@ export interface EbayListingDetails {
|
||||
|
||||
// Matches text that begins (after optional whitespace) with a currency marker:
// either "CA$" / "C$" (whitespace allowed before the "$"), or one of $, £, €, ¥.
// Anchored at the start only; nothing after the marker is validated.
// NOTE(review): presumably used to recognise price strings in scraped eBay
// markup — the call sites are not visible here, confirm before relying on it.
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C)\s*\$|\s*[$£€¥])/u;
|
||||
|
||||
/**
 * Reduces an eBay item URL to a stable key suitable for de-duplication.
 *
 * Query string and fragment (tracking parameters such as `_trkparms` or
 * `hash`) are always dropped. Relative URLs are resolved against
 * `https://www.ebay.ca`.
 *
 * Item URLs come in two shapes: `/itm/<id>` and `/itm/<title-slug>/<id>`.
 * Both are reduced to `<origin>/itm/<id>` so that the two shapes of the same
 * item compare equal, and two different items sharing a title slug do not.
 *
 * @param url Absolute or site-relative URL taken from a listing anchor.
 * @returns `<origin>/itm/<id>` when a numeric item id is present; otherwise
 *   `<origin>/itm/<first-segment>` for other `/itm/` paths; otherwise
 *   `<origin><pathname>`. If the URL cannot be parsed, `url` is returned as-is.
 */
function canonicalizeEbayItemUrl(url: string): string {
  try {
    const { origin, pathname } = new URL(url, "https://www.ebay.ca");

    // Prefer the bare `/itm/<id>` form first so URLs that already had a
    // numeric first segment keep exactly the key they had before; only then
    // look for the id behind a single title-slug segment.
    const itemId =
      pathname.match(/\/itm\/(\d+)(?:\/|$)/)?.[1] ??
      pathname.match(/\/itm\/[^/]+\/(\d+)(?:\/|$)/)?.[1];
    if (itemId !== undefined) {
      return `${origin}/itm/${itemId}`;
    }

    // No numeric id found: fall back to the first segment after `/itm/`,
    // or to the whole path for non-item URLs.
    const itemPath = pathname.match(/\/itm\/[^/?#]+/);
    return itemPath ? `${origin}${itemPath[0]}` : `${origin}${pathname}`;
  } catch {
    // Unparseable input: leave it untouched so the caller can still use it
    // as a (non-canonical) de-duplication key.
    return url;
  }
}
|
||||
|
||||
// ----------------------------- Utilities -----------------------------
|
||||
|
||||
/**
|
||||
@@ -121,7 +131,8 @@ function parseEbayListings(
|
||||
: `https://www.ebay.ca${href}`;
|
||||
}
|
||||
|
||||
if (seenUrls.has(href)) continue;
|
||||
const canonicalUrl = canonicalizeEbayItemUrl(href);
|
||||
if (seenUrls.has(canonicalUrl)) continue;
|
||||
|
||||
// Find the container - go up several levels to find the item container
|
||||
// Modern eBay uses complex nested structures (often 5-10 levels deep)
|
||||
@@ -334,7 +345,7 @@ function parseEbayListings(
|
||||
};
|
||||
|
||||
results.push(listing);
|
||||
seenUrls.add(href);
|
||||
seenUrls.add(canonicalUrl);
|
||||
} catch (err) {
|
||||
console.warn(`Error parsing eBay listing: ${err}`);
|
||||
}
|
||||
|
||||
@@ -1289,13 +1289,15 @@ export async function fetchFacebookItem(
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.unavailable) {
|
||||
const itemData = extractFacebookItemData(itemHtml);
|
||||
|
||||
if (classification.unavailable && !itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.kind !== "item") {
|
||||
if (classification.kind !== "item" && !itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(
|
||||
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
||||
@@ -1303,7 +1305,6 @@ export async function fetchFacebookItem(
|
||||
return null;
|
||||
}
|
||||
|
||||
const itemData = extractFacebookItemData(itemHtml);
|
||||
if (!itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
|
||||
|
||||
@@ -292,10 +292,14 @@ export function buildSearchUrl(
|
||||
? SORT_MAPPINGS[options.sortBy]
|
||||
: "relevancyDesc";
|
||||
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
||||
const priceMinParam =
|
||||
typeof options.priceMin === "number" ? `&priceMin=${options.priceMin}` : "";
|
||||
const priceMaxParam =
|
||||
typeof options.priceMax === "number" ? `&priceMax=${options.priceMax}` : "";
|
||||
const pageParam =
|
||||
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
||||
|
||||
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
|
||||
url += `?sort=${sortValue}&view=list&order=${sortOrder}${priceMinParam}${priceMaxParam}${pageParam}`;
|
||||
|
||||
return url;
|
||||
}
|
||||
@@ -954,26 +958,12 @@ export default async function fetchKijijiItems(
|
||||
matchesPriceFilters(listing, finalSearchOptions),
|
||||
);
|
||||
|
||||
const finalListings = unstableMode.hideUnstableResults
|
||||
? (() => {
|
||||
const classified = classifyUnstableListings(allListings);
|
||||
return {
|
||||
results: classified.results.filter((listing) =>
|
||||
matchesPriceFilters(listing, finalSearchOptions),
|
||||
),
|
||||
unstableResults: classified.unstableResults.filter((listing) =>
|
||||
matchesPriceFilters(listing, finalSearchOptions),
|
||||
),
|
||||
};
|
||||
})()
|
||||
: filteredListings;
|
||||
|
||||
console.log(
|
||||
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
||||
);
|
||||
return unstableMode.hideUnstableResults
|
||||
? finalListings
|
||||
: finalizeResults(finalListings);
|
||||
? finalizeResults(allListings)
|
||||
: finalizeResults(filteredListings);
|
||||
}
|
||||
|
||||
// Re-export error classes for convenience
|
||||
|
||||
@@ -101,6 +101,36 @@ describe("eBay Scraper Cookie Handling", () => {
|
||||
);
|
||||
});
|
||||
|
||||
test("deduplicates tracking variants of the same item URL", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(`
|
||||
<html><body>
|
||||
<li class="s-item">
|
||||
<a href="/itm/123?_trkparms=foo"></a>
|
||||
<h3>Stable Laptop Bundle</h3>
|
||||
<span class="s-item__price">CA $100.00</span>
|
||||
</li>
|
||||
<li class="s-item">
|
||||
<a href="https://www.ebay.ca/itm/123?hash=item123"></a>
|
||||
<h3>Stable Laptop Bundle</h3>
|
||||
<span class="s-item__price">CA $100.00</span>
|
||||
</li>
|
||||
</body></html>
|
||||
`),
|
||||
}),
|
||||
) as typeof fetch;
|
||||
|
||||
const results = await fetchEbayItems("laptop", 1000);
|
||||
|
||||
expect(results).toHaveLength(1);
|
||||
expect(results[0]).toEqual(
|
||||
expect.objectContaining({ url: "https://www.ebay.ca/itm/123?_trkparms=foo" }),
|
||||
);
|
||||
});
|
||||
|
||||
test("treats bare dollar prices as CAD on ebay.ca", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
|
||||
@@ -358,6 +358,53 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
);
|
||||
});
|
||||
|
||||
test("should parse structured data even when an unavailable banner is present", async () => {
|
||||
const unavailableStructuredHtml = `
|
||||
<html><body>
|
||||
<div>This listing is no longer available.</div>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
listing: {
|
||||
id: "458",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Recovered Item",
|
||||
formatted_price: { text: "CA$120" },
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
currency: "CAD",
|
||||
amount_with_offset: "120.00",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(unavailableStructuredHtml),
|
||||
url: "https://www.facebook.com/marketplace/item/458/",
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const result = await fetchFacebookItem("458");
|
||||
|
||||
expect(result).toEqual(
|
||||
expect.objectContaining({
|
||||
title: "Recovered Item",
|
||||
listingStatus: "ACTIVE",
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
test("should handle successful item extraction", async () => {
|
||||
const mockData = {
|
||||
require: [
|
||||
|
||||
@@ -138,6 +138,16 @@ describe("URL Construction", () => {
|
||||
expect(priceUrl).toContain("order=DESC");
|
||||
});
|
||||
|
||||
test("includes price filters in the generated search URL", () => {
|
||||
const url = buildSearchUrl("iphone", {
|
||||
priceMin: 8000,
|
||||
priceMax: 10000,
|
||||
});
|
||||
|
||||
expect(url).toContain("priceMin=8000");
|
||||
expect(url).toContain("priceMax=10000");
|
||||
});
|
||||
|
||||
test("should handle string location/category inputs", () => {
|
||||
const url = buildSearchUrl("iphone", {
|
||||
location: "toronto",
|
||||
@@ -292,7 +302,7 @@ describe("fetchKijijiItems", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
test("applies price filters to unstable-mode buckets", async () => {
|
||||
test("classifies the full parsed Kijiji set in unstable mode", async () => {
|
||||
const searchHtml = `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
@@ -336,7 +346,7 @@ describe("fetchKijijiItems", () => {
|
||||
global.fetch = mock((input: string | URL | Request) => {
|
||||
const url = typeof input === "string" ? input : input.toString();
|
||||
|
||||
if (url.includes("/k0c0l1700272")) {
|
||||
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(searchHtml),
|
||||
@@ -389,11 +399,11 @@ describe("fetchKijijiItems", () => {
|
||||
expect.objectContaining({ title: "Stable Listing One" }),
|
||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||
],
|
||||
unstableResults: [],
|
||||
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||
});
|
||||
});
|
||||
|
||||
test("unstable mode keeps out-of-range stable listings out of final results", async () => {
|
||||
test("uses URL price filters so out-of-range listings do not influence Kijiji classification", async () => {
|
||||
const searchHtml = `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
@@ -403,8 +413,7 @@ describe("fetchKijijiItems", () => {
|
||||
__APOLLO_STATE__: {
|
||||
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
|
||||
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
|
||||
"Listing:3": { url: "/v-out-of-range/k0l0", title: "Out Of Range Stable" },
|
||||
"Listing:4": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
||||
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -438,7 +447,7 @@ describe("fetchKijijiItems", () => {
|
||||
global.fetch = mock((input: string | URL | Request) => {
|
||||
const url = typeof input === "string" ? input : input.toString();
|
||||
|
||||
if (url.includes("/k0c0l1700272")) {
|
||||
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000") && url.includes("priceMax=15000")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(searchHtml),
|
||||
@@ -465,15 +474,6 @@ describe("fetchKijijiItems", () => {
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-out-of-range/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(listingHtml("Out Of Range Stable", 20000, "v-out-of-range/k0l0")),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-unstable/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
@@ -500,7 +500,7 @@ describe("fetchKijijiItems", () => {
|
||||
expect.objectContaining({ title: "Stable Listing One" }),
|
||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||
],
|
||||
unstableResults: [],
|
||||
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user