fix: tighten scraper parsing behavior
This commit is contained in:
@@ -62,7 +62,7 @@ function parseEbayPrice(
|
|||||||
const cents = Math.round(dollars * 100);
|
const cents = Math.round(dollars * 100);
|
||||||
|
|
||||||
// Extract currency - look for common formats like "CAD", "USD", "C $", "$CA", etc.
|
// Extract currency - look for common formats like "CAD", "USD", "C $", "$CA", etc.
|
||||||
let currency = "USD"; // Default
|
let currency = "CAD"; // Default for ebay.ca
|
||||||
|
|
||||||
if (
|
if (
|
||||||
cleaned.toUpperCase().includes("CAD") ||
|
cleaned.toUpperCase().includes("CAD") ||
|
||||||
@@ -70,7 +70,7 @@ function parseEbayPrice(
|
|||||||
cleaned.includes("C $")
|
cleaned.includes("C $")
|
||||||
) {
|
) {
|
||||||
currency = "CAD";
|
currency = "CAD";
|
||||||
} else if (cleaned.toUpperCase().includes("USD") || cleaned.includes("$")) {
|
} else if (cleaned.toUpperCase().includes("USD")) {
|
||||||
currency = "USD";
|
currency = "USD";
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,6 +101,7 @@ function parseEbayListings(
|
|||||||
): EbayListingDetails[] {
|
): EbayListingDetails[] {
|
||||||
const { document } = parseHTML(htmlString);
|
const { document } = parseHTML(htmlString);
|
||||||
const results: EbayListingDetails[] = [];
|
const results: EbayListingDetails[] = [];
|
||||||
|
const seenUrls = new Set<string>();
|
||||||
|
|
||||||
// Find all listing links by looking for eBay item URLs (/itm/)
|
// Find all listing links by looking for eBay item URLs (/itm/)
|
||||||
const linkElements = document.querySelectorAll('a[href*="itm/"]');
|
const linkElements = document.querySelectorAll('a[href*="itm/"]');
|
||||||
@@ -118,6 +119,8 @@ function parseEbayListings(
|
|||||||
: `https://www.ebay.ca${href}`;
|
: `https://www.ebay.ca${href}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (seenUrls.has(href)) continue;
|
||||||
|
|
||||||
// Find the container - go up several levels to find the item container
|
// Find the container - go up several levels to find the item container
|
||||||
// Modern eBay uses complex nested structures (often 5-10 levels deep)
|
// Modern eBay uses complex nested structures (often 5-10 levels deep)
|
||||||
let container: Element | null = linkElement;
|
let container: Element | null = linkElement;
|
||||||
@@ -329,6 +332,7 @@ function parseEbayListings(
|
|||||||
};
|
};
|
||||||
|
|
||||||
results.push(listing);
|
results.push(listing);
|
||||||
|
seenUrls.add(href);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(`Error parsing eBay listing: ${err}`);
|
console.warn(`Error parsing eBay listing: ${err}`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -202,6 +202,14 @@ const SORT_MAPPINGS: Record<string, string> = {
|
|||||||
distance: "DISTANCE",
|
distance: "DISTANCE",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const LOCATION_SLUGS = Object.fromEntries(
|
||||||
|
Object.entries(LOCATION_MAPPINGS).map(([slug, id]) => [id, slug.replace(/\s+/g, "-")]),
|
||||||
|
) as Record<number, string>;
|
||||||
|
|
||||||
|
const CATEGORY_SLUGS = Object.fromEntries(
|
||||||
|
Object.entries(CATEGORY_MAPPINGS).map(([slug, id]) => [id, slug.replace(/\s+/g, "-")]),
|
||||||
|
) as Record<number, string>;
|
||||||
|
|
||||||
// ----------------------------- Utilities -----------------------------
|
// ----------------------------- Utilities -----------------------------
|
||||||
|
|
||||||
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]);
|
||||||
@@ -241,8 +249,8 @@ export function buildSearchUrl(
|
|||||||
const locationId = resolveLocationId(options.location);
|
const locationId = resolveLocationId(options.location);
|
||||||
const categoryId = resolveCategoryId(options.category);
|
const categoryId = resolveCategoryId(options.category);
|
||||||
|
|
||||||
const categorySlug = categoryId === 0 ? "buy-sell" : "buy-sell";
|
const categorySlug = CATEGORY_SLUGS[categoryId] ?? "buy-sell";
|
||||||
const locationSlug = locationId === 0 ? "canada" : "canada";
|
const locationSlug = LOCATION_SLUGS[locationId] ?? "canada";
|
||||||
|
|
||||||
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
||||||
|
|
||||||
@@ -893,8 +901,28 @@ export default async function fetchKijijiItems(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\nParsed ${allListings.length} detailed listings.`);
|
const filteredListings = allListings.filter((listing) => {
|
||||||
return finalizeResults(allListings);
|
const cents = listing.listingPrice?.cents;
|
||||||
|
|
||||||
|
if (typeof cents !== "number") return false;
|
||||||
|
if (
|
||||||
|
typeof finalSearchOptions.priceMin === "number" &&
|
||||||
|
cents < finalSearchOptions.priceMin
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
typeof finalSearchOptions.priceMax === "number" &&
|
||||||
|
cents > finalSearchOptions.priceMax
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`\nParsed ${filteredListings.length} detailed listings.`);
|
||||||
|
return finalizeResults(filteredListings);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Re-export error classes for convenience
|
// Re-export error classes for convenience
|
||||||
|
|||||||
@@ -75,6 +75,58 @@ describe("eBay Scraper Cookie Handling", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("deduplicates repeated item links from the same card", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<li class="s-item">
|
||||||
|
<a href="/itm/123"><span>Open</span></a>
|
||||||
|
<a href="/itm/123"><span>Image</span></a>
|
||||||
|
<h3>Stable Laptop Bundle</h3>
|
||||||
|
<span class="s-item__price">CA $100.00</span>
|
||||||
|
</li>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("laptop", 1000);
|
||||||
|
|
||||||
|
expect(results).toHaveLength(1);
|
||||||
|
expect(results[0]).toEqual(
|
||||||
|
expect.objectContaining({ url: "https://www.ebay.ca/itm/123" }),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("treats bare dollar prices as CAD on ebay.ca", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () =>
|
||||||
|
Promise.resolve(`
|
||||||
|
<html><body>
|
||||||
|
<li class="s-item">
|
||||||
|
<a href="/itm/123"></a>
|
||||||
|
<h3>Stable Laptop Bundle</h3>
|
||||||
|
<span class="s-item__price">$100.00</span>
|
||||||
|
</li>
|
||||||
|
</body></html>
|
||||||
|
`),
|
||||||
|
}),
|
||||||
|
) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchEbayItems("laptop", 1000);
|
||||||
|
|
||||||
|
expect(results).toEqual([
|
||||||
|
expect.objectContaining({
|
||||||
|
listingPrice: expect.objectContaining({ currency: "CAD" }),
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ describe("URL Construction", () => {
|
|||||||
sortOrder: "desc",
|
sortOrder: "desc",
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(url).toContain("b-buy-sell/canada/iphone/k0c132l1700272");
|
expect(url).toContain("b-phones/gta/iphone/k0c132l1700272");
|
||||||
expect(url).toContain("sort=relevancyDesc");
|
expect(url).toContain("sort=relevancyDesc");
|
||||||
expect(url).toContain("order=DESC");
|
expect(url).toContain("order=DESC");
|
||||||
});
|
});
|
||||||
@@ -141,6 +141,7 @@ describe("URL Construction", () => {
|
|||||||
category: "phones",
|
category: "phones",
|
||||||
});
|
});
|
||||||
|
|
||||||
|
expect(url).toContain("/b-phones/toronto/");
|
||||||
expect(url).toContain("k0c132l1700273"); // phones + toronto
|
expect(url).toContain("k0c132l1700273"); // phones + toronto
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -184,6 +185,110 @@ describe("Error Classes", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
describe("fetchKijijiItems", () => {
|
describe("fetchKijijiItems", () => {
|
||||||
|
test("filters fetched listings by priceMin and priceMax", async () => {
|
||||||
|
const searchHtml = `
|
||||||
|
<html>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
${JSON.stringify({
|
||||||
|
props: {
|
||||||
|
pageProps: {
|
||||||
|
__APOLLO_STATE__: {
|
||||||
|
"Listing:1": {
|
||||||
|
url: "/v-low/k0l0",
|
||||||
|
title: "Low Listing",
|
||||||
|
},
|
||||||
|
"Listing:2": {
|
||||||
|
url: "/v-mid/k0l0",
|
||||||
|
title: "Mid Listing",
|
||||||
|
},
|
||||||
|
"Listing:3": {
|
||||||
|
url: "/v-high/k0l0",
|
||||||
|
title: "High Listing",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const listingHtml = (title: string, amount: number, slug: string) => `
|
||||||
|
<html>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
${JSON.stringify({
|
||||||
|
props: {
|
||||||
|
pageProps: {
|
||||||
|
__APOLLO_STATE__: {
|
||||||
|
"Listing:detail": {
|
||||||
|
url: `/${slug}`,
|
||||||
|
title,
|
||||||
|
price: { amount, currency: "CAD", type: "FIXED" },
|
||||||
|
type: "OFFER",
|
||||||
|
status: "ACTIVE",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock((input: string | URL | Request) => {
|
||||||
|
const url = typeof input === "string" ? input : input.toString();
|
||||||
|
|
||||||
|
if (url.includes("/k0c0l1700272")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(searchHtml),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-low/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("Low Listing", 7000, "v-low/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-mid/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("Mid Listing", 9000, "v-mid/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-high/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("High Listing", 12000, "v-high/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error(`Unexpected URL: ${url}`);
|
||||||
|
}) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchKijijiItems(
|
||||||
|
"phone",
|
||||||
|
1000,
|
||||||
|
"https://www.kijiji.ca",
|
||||||
|
{ maxPages: 1, priceMin: 8000, priceMax: 10000 },
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(results).toEqual([
|
||||||
|
expect.objectContaining({ title: "Mid Listing" }),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||||
const searchHtml = `
|
const searchHtml = `
|
||||||
<html>
|
<html>
|
||||||
@@ -237,7 +342,7 @@ describe("fetchKijijiItems", () => {
|
|||||||
global.fetch = mock((input: string | URL | Request) => {
|
global.fetch = mock((input: string | URL | Request) => {
|
||||||
const url = typeof input === "string" ? input : input.toString();
|
const url = typeof input === "string" ? input : input.toString();
|
||||||
|
|
||||||
if (url.includes("/b-buy-sell/")) {
|
if (url.includes("/k0c0l1700272")) {
|
||||||
return Promise.resolve({
|
return Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
text: () => Promise.resolve(searchHtml),
|
text: () => Promise.resolve(searchHtml),
|
||||||
|
|||||||
Reference in New Issue
Block a user