fix: finalize scraper unstable mode integration
This commit is contained in:
@@ -40,6 +40,16 @@ export interface EbayListingDetails {
|
|||||||
|
|
||||||
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C)\s*\$|\s*[$£€¥])/u;
|
const EBAY_PRICE_TEXT_RE = /^(?:\s*(?:CA|C)\s*\$|\s*[$£€¥])/u;
|
||||||
|
|
||||||
|
/**
 * Reduces an eBay item link to a stable key so that tracking variants of the
 * same listing (`?_trkparms=…`, `?hash=…`) compare equal.
 *
 * The key is built from the numeric item id when one is present, which covers
 * both link shapes:
 *   - `/itm/<itemId>`
 *   - `/itm/<title-slug>/<itemId>`
 *
 * Keying on the first path segment after `/itm/` alone would collapse every
 * slug-style link that shares a title into one entry, so distinct listings
 * would be treated as duplicates.
 *
 * @param url Absolute or site-relative item URL. Relative values are resolved
 *   against `https://www.ebay.ca`.
 * @returns `<origin>/itm/<itemId>` when a numeric id is found; otherwise
 *   `<origin>` plus the first `/itm/<segment>` (or the whole pathname when
 *   there is no `/itm/` segment). If the input cannot be parsed as a URL it is
 *   returned unchanged.
 */
function canonicalizeEbayItemUrl(url: string): string {
  try {
    const parsed = new URL(url, "https://www.ebay.ca");
    const path = parsed.pathname;

    // Prefer the numeric id, optionally preceded by a single title-slug segment.
    const idMatch = path.match(/\/itm\/(?:[^/]+\/)?(\d+)(?:\/|$)/);
    if (idMatch) {
      return `${parsed.origin}/itm/${idMatch[1]}`;
    }

    // No numeric id: fall back to the first segment after /itm/, or to the
    // bare pathname. `pathname` never carries a query string or fragment.
    const segmentMatch = path.match(/\/itm\/[^/]+/);
    return segmentMatch
      ? `${parsed.origin}${segmentMatch[0]}`
      : `${parsed.origin}${path}`;
  } catch {
    // Unparseable input: hand it back untouched so callers can still use it as a key.
    return url;
  }
}
|
||||||
|
|
||||||
// ----------------------------- Utilities -----------------------------
|
// ----------------------------- Utilities -----------------------------
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -121,7 +131,8 @@ function parseEbayListings(
|
|||||||
: `https://www.ebay.ca${href}`;
|
: `https://www.ebay.ca${href}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (seenUrls.has(href)) continue;
|
const canonicalUrl = canonicalizeEbayItemUrl(href);
|
||||||
|
if (seenUrls.has(canonicalUrl)) continue;
|
||||||
|
|
||||||
// Find the container - go up several levels to find the item container
|
// Find the container - go up several levels to find the item container
|
||||||
// Modern eBay uses complex nested structures (often 5-10 levels deep)
|
// Modern eBay uses complex nested structures (often 5-10 levels deep)
|
||||||
@@ -334,7 +345,7 @@ function parseEbayListings(
|
|||||||
};
|
};
|
||||||
|
|
||||||
results.push(listing);
|
results.push(listing);
|
||||||
seenUrls.add(href);
|
seenUrls.add(canonicalUrl);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(`Error parsing eBay listing: ${err}`);
|
console.warn(`Error parsing eBay listing: ${err}`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1289,13 +1289,15 @@ export async function fetchFacebookItem(
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (classification.unavailable) {
|
const itemData = extractFacebookItemData(itemHtml);
|
||||||
|
|
||||||
|
if (classification.unavailable && !itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (classification.kind !== "item") {
|
if (classification.kind !== "item" && !itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
console.warn(
|
console.warn(
|
||||||
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
||||||
@@ -1303,7 +1305,6 @@ export async function fetchFacebookItem(
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const itemData = extractFacebookItemData(itemHtml);
|
|
||||||
if (!itemData) {
|
if (!itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
|
|
||||||
|
|||||||
@@ -292,10 +292,14 @@ export function buildSearchUrl(
|
|||||||
? SORT_MAPPINGS[options.sortBy]
|
? SORT_MAPPINGS[options.sortBy]
|
||||||
: "relevancyDesc";
|
: "relevancyDesc";
|
||||||
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
||||||
|
const priceMinParam =
|
||||||
|
typeof options.priceMin === "number" ? `&priceMin=${options.priceMin}` : "";
|
||||||
|
const priceMaxParam =
|
||||||
|
typeof options.priceMax === "number" ? `&priceMax=${options.priceMax}` : "";
|
||||||
const pageParam =
|
const pageParam =
|
||||||
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
||||||
|
|
||||||
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
|
url += `?sort=${sortValue}&view=list&order=${sortOrder}${priceMinParam}${priceMaxParam}${pageParam}`;
|
||||||
|
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
@@ -954,26 +958,12 @@ export default async function fetchKijijiItems(
|
|||||||
matchesPriceFilters(listing, finalSearchOptions),
|
matchesPriceFilters(listing, finalSearchOptions),
|
||||||
);
|
);
|
||||||
|
|
||||||
const finalListings = unstableMode.hideUnstableResults
|
|
||||||
? (() => {
|
|
||||||
const classified = classifyUnstableListings(allListings);
|
|
||||||
return {
|
|
||||||
results: classified.results.filter((listing) =>
|
|
||||||
matchesPriceFilters(listing, finalSearchOptions),
|
|
||||||
),
|
|
||||||
unstableResults: classified.unstableResults.filter((listing) =>
|
|
||||||
matchesPriceFilters(listing, finalSearchOptions),
|
|
||||||
),
|
|
||||||
};
|
|
||||||
})()
|
|
||||||
: filteredListings;
|
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
||||||
);
|
);
|
||||||
return unstableMode.hideUnstableResults
|
return unstableMode.hideUnstableResults
|
||||||
? finalListings
|
? finalizeResults(allListings)
|
||||||
: finalizeResults(finalListings);
|
: finalizeResults(filteredListings);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Re-export error classes for convenience
|
// Re-export error classes for convenience
|
||||||
|
|||||||
@@ -101,6 +101,36 @@ describe("eBay Scraper Cookie Handling", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("deduplicates tracking variants of the same item URL", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<li class="s-item">
|
||||||
|
<a href="/itm/123?_trkparms=foo"></a>
|
||||||
|
<h3>Stable Laptop Bundle</h3>
|
||||||
|
<span class="s-item__price">CA $100.00</span>
|
||||||
|
</li>
|
||||||
|
<li class="s-item">
|
||||||
|
<a href="https://www.ebay.ca/itm/123?hash=item123"></a>
|
||||||
|
<h3>Stable Laptop Bundle</h3>
|
||||||
|
<span class="s-item__price">CA $100.00</span>
|
||||||
|
</li>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("laptop", 1000);
|
||||||
|
|
||||||
|
expect(results).toHaveLength(1);
|
||||||
|
expect(results[0]).toEqual(
|
||||||
|
expect.objectContaining({ url: "https://www.ebay.ca/itm/123?_trkparms=foo" }),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test("treats bare dollar prices as CAD on ebay.ca", async () => {
|
test("treats bare dollar prices as CAD on ebay.ca", async () => {
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
|
|||||||
@@ -358,6 +358,53 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should parse structured data even when an unavailable banner is present", async () => {
|
||||||
|
const unavailableStructuredHtml = `
|
||||||
|
<html><body>
|
||||||
|
<div>This listing is no longer available.</div>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
listing: {
|
||||||
|
id: "458",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Recovered Item",
|
||||||
|
formatted_price: { text: "CA$120" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "120.00",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(unavailableStructuredHtml),
|
||||||
|
url: "https://www.facebook.com/marketplace/item/458/",
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await fetchFacebookItem("458");
|
||||||
|
|
||||||
|
expect(result).toEqual(
|
||||||
|
expect.objectContaining({
|
||||||
|
title: "Recovered Item",
|
||||||
|
listingStatus: "ACTIVE",
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle successful item extraction", async () => {
|
test("should handle successful item extraction", async () => {
|
||||||
const mockData = {
|
const mockData = {
|
||||||
require: [
|
require: [
|
||||||
|
|||||||
@@ -138,6 +138,16 @@ describe("URL Construction", () => {
|
|||||||
expect(priceUrl).toContain("order=DESC");
|
expect(priceUrl).toContain("order=DESC");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("includes price filters in the generated search URL", () => {
|
||||||
|
const url = buildSearchUrl("iphone", {
|
||||||
|
priceMin: 8000,
|
||||||
|
priceMax: 10000,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(url).toContain("priceMin=8000");
|
||||||
|
expect(url).toContain("priceMax=10000");
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle string location/category inputs", () => {
|
test("should handle string location/category inputs", () => {
|
||||||
const url = buildSearchUrl("iphone", {
|
const url = buildSearchUrl("iphone", {
|
||||||
location: "toronto",
|
location: "toronto",
|
||||||
@@ -292,7 +302,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("applies price filters to unstable-mode buckets", async () => {
|
test("classifies the full parsed Kijiji set in unstable mode", async () => {
|
||||||
const searchHtml = `
|
const searchHtml = `
|
||||||
<html>
|
<html>
|
||||||
<script id="__NEXT_DATA__" type="application/json">
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
@@ -336,7 +346,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
global.fetch = mock((input: string | URL | Request) => {
|
global.fetch = mock((input: string | URL | Request) => {
|
||||||
const url = typeof input === "string" ? input : input.toString();
|
const url = typeof input === "string" ? input : input.toString();
|
||||||
|
|
||||||
if (url.includes("/k0c0l1700272")) {
|
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000")) {
|
||||||
return Promise.resolve({
|
return Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
text: () => Promise.resolve(searchHtml),
|
text: () => Promise.resolve(searchHtml),
|
||||||
@@ -389,11 +399,11 @@ describe("fetchKijijiItems", () => {
|
|||||||
expect.objectContaining({ title: "Stable Listing One" }),
|
expect.objectContaining({ title: "Stable Listing One" }),
|
||||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||||
],
|
],
|
||||||
unstableResults: [],
|
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
test("unstable mode keeps out-of-range stable listings out of final results", async () => {
|
test("uses URL price filters so out-of-range listings do not influence Kijiji classification", async () => {
|
||||||
const searchHtml = `
|
const searchHtml = `
|
||||||
<html>
|
<html>
|
||||||
<script id="__NEXT_DATA__" type="application/json">
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
@@ -403,8 +413,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
__APOLLO_STATE__: {
|
__APOLLO_STATE__: {
|
||||||
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
|
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
|
||||||
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
|
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
|
||||||
"Listing:3": { url: "/v-out-of-range/k0l0", title: "Out Of Range Stable" },
|
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
||||||
"Listing:4": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@@ -438,7 +447,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
global.fetch = mock((input: string | URL | Request) => {
|
global.fetch = mock((input: string | URL | Request) => {
|
||||||
const url = typeof input === "string" ? input : input.toString();
|
const url = typeof input === "string" ? input : input.toString();
|
||||||
|
|
||||||
if (url.includes("/k0c0l1700272")) {
|
if (url.includes("/k0c0l1700272") && url.includes("priceMin=8000") && url.includes("priceMax=15000")) {
|
||||||
return Promise.resolve({
|
return Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
text: () => Promise.resolve(searchHtml),
|
text: () => Promise.resolve(searchHtml),
|
||||||
@@ -465,15 +474,6 @@ describe("fetchKijijiItems", () => {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (url.endsWith("/v-out-of-range/k0l0")) {
|
|
||||||
return Promise.resolve({
|
|
||||||
ok: true,
|
|
||||||
text: () => Promise.resolve(listingHtml("Out Of Range Stable", 20000, "v-out-of-range/k0l0")),
|
|
||||||
headers: { get: () => null },
|
|
||||||
url,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (url.endsWith("/v-unstable/k0l0")) {
|
if (url.endsWith("/v-unstable/k0l0")) {
|
||||||
return Promise.resolve({
|
return Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
@@ -500,7 +500,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
expect.objectContaining({ title: "Stable Listing One" }),
|
expect.objectContaining({ title: "Stable Listing One" }),
|
||||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||||
],
|
],
|
||||||
unstableResults: [],
|
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user