fix: finalize scraper unstable mode integration

This commit is contained in:
2026-04-23 00:20:21 -04:00
parent 55faee7dd5
commit 881c2ddf8c
6 changed files with 118 additions and 39 deletions

View File

@@ -40,6 +40,16 @@ export interface EbayListingDetails {
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C)\s*\$|\s*[$£¥])/u;
/**
 * Reduce an eBay listing link to a stable key suitable for de-duplication.
 *
 * Relative links are resolved against `https://www.ebay.ca`. The query string
 * and fragment are always dropped, so tracking variants of one link compare equal.
 *
 * Item links are keyed by their numeric listing id when one is present, so that
 * `/itm/123` and `/itm/Some-Title/123` map to the same key while
 * `/itm/Some-Title/123` and `/itm/Some-Title/456` stay distinct.
 *
 * @param url - Absolute or site-relative listing URL.
 * @returns The canonical URL, or `url` unchanged when it cannot be parsed.
 */
function canonicalizeEbayItemUrl(url: string): string {
  try {
    const parsed = new URL(url, "https://www.ebay.ca");

    // Prefer the numeric id, optionally preceded by a single title-slug segment.
    // Keying on the first segment alone would merge different listings that
    // happen to share a title slug.
    const idMatch = parsed.pathname.match(/\/itm\/(?:[^/]+\/)?(\d+)(?=\/|$)/);
    if (idMatch) {
      return `${parsed.origin}/itm/${idMatch[1]}`;
    }

    // No numeric id found: fall back to the first segment after /itm/, and for
    // non-item paths to the full pathname.
    const segmentMatch = parsed.pathname.match(/\/itm\/[^/?#]+/);
    return segmentMatch
      ? `${parsed.origin}${segmentMatch[0]}`
      : `${parsed.origin}${parsed.pathname}`;
  } catch {
    // Unparseable input is returned verbatim so callers can still use it as a key.
    return url;
  }
}
// ----------------------------- Utilities -----------------------------
/**
@@ -121,7 +131,8 @@ function parseEbayListings(
: `https://www.ebay.ca${href}`;
}
if (seenUrls.has(href)) continue;
const canonicalUrl = canonicalizeEbayItemUrl(href);
if (seenUrls.has(canonicalUrl)) continue;
// Find the container - go up several levels to find the item container
// Modern eBay uses complex nested structures (often 5-10 levels deep)
@@ -334,7 +345,7 @@ function parseEbayListings(
};
results.push(listing);
seenUrls.add(href);
seenUrls.add(canonicalUrl);
} catch (err) {
console.warn(`Error parsing eBay listing: ${err}`);
}

View File

@@ -1289,13 +1289,15 @@ export async function fetchFacebookItem(
return null;
}
if (classification.unavailable) {
const itemData = extractFacebookItemData(itemHtml);
if (classification.unavailable && !itemData) {
logExtractionMetrics(false, itemId);
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
return null;
}
if (classification.kind !== "item") {
if (classification.kind !== "item" && !itemData) {
logExtractionMetrics(false, itemId);
console.warn(
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
@@ -1303,7 +1305,6 @@ export async function fetchFacebookItem(
return null;
}
const itemData = extractFacebookItemData(itemHtml);
if (!itemData) {
logExtractionMetrics(false, itemId);

View File

@@ -292,10 +292,14 @@ export function buildSearchUrl(
? SORT_MAPPINGS[options.sortBy]
: "relevancyDesc";
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
const priceMinParam =
typeof options.priceMin === "number" ? `&priceMin=${options.priceMin}` : "";
const priceMaxParam =
typeof options.priceMax === "number" ? `&priceMax=${options.priceMax}` : "";
const pageParam =
options.page && options.page > 1 ? `&page=${options.page}` : "";
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
url += `?sort=${sortValue}&view=list&order=${sortOrder}${priceMinParam}${priceMaxParam}${pageParam}`;
return url;
}
@@ -954,26 +958,12 @@ export default async function fetchKijijiItems(
matchesPriceFilters(listing, finalSearchOptions),
);
const finalListings = unstableMode.hideUnstableResults
? (() => {
const classified = classifyUnstableListings(allListings);
return {
results: classified.results.filter((listing) =>
matchesPriceFilters(listing, finalSearchOptions),
),
unstableResults: classified.unstableResults.filter((listing) =>
matchesPriceFilters(listing, finalSearchOptions),
),
};
})()
: filteredListings;
console.log(
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
);
return unstableMode.hideUnstableResults
? finalListings
: finalizeResults(finalListings);
? finalizeResults(allListings)
: finalizeResults(filteredListings);
}
// Re-export error classes for convenience

View File

@@ -101,6 +101,36 @@ describe("eBay Scraper Cookie Handling", () => {
);
});
test("deduplicates tracking variants of the same item URL", async () => {
  // Two result cards point at /itm/123: the first through a relative link with a
  // `_trkparms` query, the second through an absolute link with a `hash` query.
  const duplicateListingsHtml = `
<html><body>
<li class="s-item">
<a href="/itm/123?_trkparms=foo"></a>
<h3>Stable Laptop Bundle</h3>
<span class="s-item__price">CA $100.00</span>
</li>
<li class="s-item">
<a href="https://www.ebay.ca/itm/123?hash=item123"></a>
<h3>Stable Laptop Bundle</h3>
<span class="s-item__price">CA $100.00</span>
</li>
</body></html>
`;
  const stubbedResponse = {
    ok: true,
    text: () => Promise.resolve(duplicateListingsHtml),
  };
  global.fetch = mock(() => Promise.resolve(stubbedResponse)) as typeof fetch;

  const listings = await fetchEbayItems("laptop", 1000);

  // Only one listing is returned, and it carries the first link as its URL.
  expect(listings).toHaveLength(1);
  const [firstListing] = listings;
  expect(firstListing).toEqual(
    expect.objectContaining({ url: "https://www.ebay.ca/itm/123?_trkparms=foo" }),
  );
});
test("treats bare dollar prices as CAD on ebay.ca", async () => {
global.fetch = mock(() =>
Promise.resolve({

View File

@@ -358,6 +358,53 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
);
});
test("should parse structured data even when an unavailable banner is present", async () => {
  // Structured listing payload that reports the item as live.
  const liveListingPayload = {
    payload: {
      listing: {
        id: "458",
        __typename: "GroupCommerceProductItem",
        marketplace_listing_title: "Recovered Item",
        formatted_price: { text: "CA$120" },
        listing_price: {
          amount: "120.00",
          currency: "CAD",
          amount_with_offset: "120.00",
        },
        is_live: true,
      },
    },
  };

  // The page shows a "no longer available" banner alongside the payload above.
  const unavailableStructuredHtml = `
<html><body>
<div>This listing is no longer available.</div>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify(liveListingPayload)}
</script>
</body></html>
`;

  const stubbedResponse = {
    ok: true,
    text: () => Promise.resolve(unavailableStructuredHtml),
    url: "https://www.facebook.com/marketplace/item/458/",
    headers: {
      get: () => null,
    },
  };
  global.fetch = mock(() => Promise.resolve(stubbedResponse));

  const item = await fetchFacebookItem("458");

  // The structured data is still parsed and the item is reported as active.
  const expectedFields = {
    title: "Recovered Item",
    listingStatus: "ACTIVE",
  };
  expect(item).toEqual(expect.objectContaining(expectedFields));
});
test("should handle successful item extraction", async () => {
const mockData = {
require: [

View File

@@ -138,6 +138,16 @@ describe("URL Construction", () => {
expect(priceUrl).toContain("order=DESC");
});
test("includes price filters in the generated search URL", () => {
  const priceBounds = { priceMin: 8000, priceMax: 10000 };
  const searchUrl = buildSearchUrl("iphone", priceBounds);

  // Both bounds must appear as query parameters; checked in min, max order.
  for (const expectedParam of ["priceMin=8000", "priceMax=10000"]) {
    expect(searchUrl).toContain(expectedParam);
  }
});
test("should handle string location/category inputs", () => {
const url = buildSearchUrl("iphone", {
location: "toronto",
@@ -292,7 +302,7 @@ describe("fetchKijijiItems", () => {
]);
});
test("applies price filters to unstable-mode buckets", async () => {
test("classifies the full parsed Kijiji set in unstable mode", async () => {
const searchHtml = `
<html>
<script id="__NEXT_DATA__" type="application/json">
@@ -336,7 +346,7 @@ describe("fetchKijijiItems", () => {
global.fetch = mock((input: string | URL | Request) => {
const url = typeof input === "string" ? input : input.toString();
if (url.includes("/k0c0l1700272")) {
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(searchHtml),
@@ -389,11 +399,11 @@ describe("fetchKijijiItems", () => {
expect.objectContaining({ title: "Stable Listing One" }),
expect.objectContaining({ title: "Stable Listing Two" }),
],
unstableResults: [],
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
});
});
test("unstable mode keeps out-of-range stable listings out of final results", async () => {
test("uses URL price filters so out-of-range listings do not influence Kijiji classification", async () => {
const searchHtml = `
<html>
<script id="__NEXT_DATA__" type="application/json">
@@ -403,8 +413,7 @@ describe("fetchKijijiItems", () => {
__APOLLO_STATE__: {
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
"Listing:3": { url: "/v-out-of-range/k0l0", title: "Out Of Range Stable" },
"Listing:4": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
},
},
},
@@ -438,7 +447,7 @@ describe("fetchKijijiItems", () => {
global.fetch = mock((input: string | URL | Request) => {
const url = typeof input === "string" ? input : input.toString();
if (url.includes("/k0c0l1700272")) {
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000") && url.includes("priceMax=15000")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(searchHtml),
@@ -465,15 +474,6 @@ describe("fetchKijijiItems", () => {
});
}
if (url.endsWith("/v-out-of-range/k0l0")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(listingHtml("Out Of Range Stable", 20000, "v-out-of-range/k0l0")),
headers: { get: () => null },
url,
});
}
if (url.endsWith("/v-unstable/k0l0")) {
return Promise.resolve({
ok: true,
@@ -500,7 +500,7 @@ describe("fetchKijijiItems", () => {
expect.objectContaining({ title: "Stable Listing One" }),
expect.objectContaining({ title: "Stable Listing Two" }),
],
unstableResults: [],
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
});
});