fix: tighten scraper edge case handling
This commit is contained in:
@@ -361,6 +361,9 @@ async function fetchHtml(
|
||||
await delay(DELAY_MS);
|
||||
return { html, responseUrl: res.url || url };
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
throw err;
|
||||
}
|
||||
if (attempt >= maxRetries) throw err;
|
||||
await delay((attempt + 1) * retryBaseMs);
|
||||
}
|
||||
@@ -1286,7 +1289,7 @@ export async function fetchFacebookItem(
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||
if (classification.unavailable) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
@@ -1304,6 +1307,11 @@ export async function fetchFacebookItem(
|
||||
if (!itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
|
||||
if (itemHtml.includes("This item has been sold")) {
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||
);
|
||||
|
||||
@@ -254,18 +254,31 @@ export function buildSearchUrl(
|
||||
|
||||
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
||||
|
||||
const sortParam = options.sortBy
|
||||
? `&sort=${SORT_MAPPINGS[options.sortBy]}`
|
||||
: "";
|
||||
const sortValue =
|
||||
options.sortBy && options.sortBy !== "relevancy"
|
||||
? SORT_MAPPINGS[options.sortBy]
|
||||
: "relevancyDesc";
|
||||
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
||||
const pageParam =
|
||||
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
||||
|
||||
url += `?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`;
|
||||
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
 * Locates the first Apollo state key that identifies a listing entity
 * accepted by `predicate`.
 *
 * Only keys that begin with `"Listing:"` are considered, and the stored value
 * must pass `isRecord` before `predicate` is consulted.
 *
 * @param apolloState - Apollo state object whose keys are scanned.
 * @param predicate - Extra check applied to each candidate record.
 * @returns The first matching key in `Object.keys` order, or `undefined`
 *   when no entry qualifies.
 */
function findApolloListingKey(
  apolloState: ApolloRecord,
  predicate: (value: Record<string, unknown>) => boolean,
): string | undefined {
  for (const candidateKey of Object.keys(apolloState)) {
    // Skip anything that is not keyed with the "Listing:" prefix.
    if (!candidateKey.startsWith("Listing:")) continue;

    const entry = apolloState[candidateKey];
    if (isRecord(entry) && predicate(entry)) {
      return candidateKey;
    }
  }

  return undefined;
}
|
||||
|
||||
/**
|
||||
* Slugifies a string for Kijiji search URLs
|
||||
*/
|
||||
@@ -497,9 +510,9 @@ function _parseListing(
|
||||
const apolloState = extractApolloState(htmlString);
|
||||
if (!apolloState) return null;
|
||||
|
||||
// Find the listing root key
|
||||
const listingKey = Object.keys(apolloState).find((k) =>
|
||||
k.includes("Listing"),
|
||||
const listingKey = findApolloListingKey(
|
||||
apolloState,
|
||||
(value) => typeof value.url === "string" && typeof value.title === "string",
|
||||
);
|
||||
if (!listingKey) return null;
|
||||
|
||||
@@ -570,9 +583,12 @@ export async function parseDetailedListing(
|
||||
const apolloState = extractApolloState(htmlString);
|
||||
if (!apolloState) return null;
|
||||
|
||||
// Find the listing root key
|
||||
const listingKey = Object.keys(apolloState).find((k) =>
|
||||
k.includes("Listing"),
|
||||
const listingKey = findApolloListingKey(
|
||||
apolloState,
|
||||
(value) =>
|
||||
typeof value.url === "string" &&
|
||||
typeof value.title === "string" &&
|
||||
isRecord(value.price),
|
||||
);
|
||||
if (!listingKey) return null;
|
||||
|
||||
@@ -921,8 +937,12 @@ export default async function fetchKijijiItems(
|
||||
return true;
|
||||
});
|
||||
|
||||
console.log(`\nParsed ${filteredListings.length} detailed listings.`);
|
||||
return finalizeResults(filteredListings);
|
||||
console.log(
|
||||
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
||||
);
|
||||
return finalizeResults(
|
||||
unstableMode.hideUnstableResults ? allListings : filteredListings,
|
||||
);
|
||||
}
|
||||
|
||||
// Re-export error classes for convenience
|
||||
|
||||
@@ -177,6 +177,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
try {
|
||||
const result = await fetchFacebookItem("123");
|
||||
expect(result).toBeNull();
|
||||
expect(global.fetch).toHaveBeenCalledTimes(1);
|
||||
expect(warnMock).toHaveBeenCalledWith(
|
||||
"Authentication error: Invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
|
||||
);
|
||||
@@ -309,6 +310,54 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
expect(result?.listingStatus).toBe("SOLD");
|
||||
});
|
||||
|
||||
// Checks that fetchFacebookItem("457") still returns a parsed item when the
// page contains both the "This item has been sold" text and an embedded
// listing payload with is_sold: true. The expected result carries the payload
// title and listingStatus "SOLD".
test("should still parse sold items when structured data exists", async () => {
|
||||
const soldStructuredHtml = `
|
||||
<html><body>
|
||||
<div>This item has been sold</div>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
listing: {
|
||||
id: "457",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Structured Sold Item",
|
||||
formatted_price: { text: "CA$90" },
|
||||
listing_price: {
|
||||
amount: "90.00",
|
||||
currency: "CAD",
|
||||
amount_with_offset: "90.00",
|
||||
},
|
||||
is_sold: true,
|
||||
is_live: false,
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
// Replace global.fetch with a mock that resolves to an ok response whose
// text() yields the HTML above; headers.get always returns null.
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(soldStructuredHtml),
|
||||
url: "https://www.facebook.com/marketplace/item/457/",
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const result = await fetchFacebookItem("457");
|
||||
|
||||
// A non-null result with the structured title and SOLD status is required.
expect(result).toEqual(
|
||||
expect.objectContaining({
|
||||
title: "Structured Sold Item",
|
||||
listingStatus: "SOLD",
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
test("should handle successful item extraction", async () => {
|
||||
const mockData = {
|
||||
require: [
|
||||
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
default as fetchKijijiItems,
|
||||
type DetailedListing,
|
||||
NetworkError,
|
||||
parseDetailedListing,
|
||||
ParseError,
|
||||
RateLimitError,
|
||||
resolveCategoryId,
|
||||
@@ -124,6 +125,7 @@ describe("URL Construction", () => {
|
||||
sortBy: "date",
|
||||
sortOrder: "asc",
|
||||
});
|
||||
expect(dateUrl.match(/sort=/g)?.length).toBe(1);
|
||||
expect(dateUrl).toContain("sort=DATE");
|
||||
expect(dateUrl).toContain("order=ASC");
|
||||
|
||||
@@ -289,6 +291,141 @@ describe("fetchKijijiItems", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
// Runs fetchKijijiItems against a mocked search page with three listings and
// three mocked detail pages (amounts 10000, 11000 and 7000), using
// priceMin: 8000 and hideUnstableResults: true. The expected return value is
// an object with the two "Stable" listings under `results` and the
// "Unstable Listing" (amount 7000, below priceMin) under `unstableResults`.
test("classifies unstable mode using all parsed listings before price filtering", async () => {
|
||||
const searchHtml = `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
${JSON.stringify({
|
||||
props: {
|
||||
pageProps: {
|
||||
__APOLLO_STATE__: {
|
||||
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
|
||||
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
|
||||
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
||||
},
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</html>
|
||||
`;
|
||||
|
||||
const listingHtml = (title: string, amount: number, slug: string) => `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
${JSON.stringify({
|
||||
props: {
|
||||
pageProps: {
|
||||
__APOLLO_STATE__: {
|
||||
"Listing:detail": {
|
||||
url: `/${slug}`,
|
||||
title,
|
||||
price: { amount, currency: "CAD", type: "FIXED" },
|
||||
type: "OFFER",
|
||||
status: "ACTIVE",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</html>
|
||||
`;
|
||||
|
||||
// Route the mocked fetch by URL: the search URL returns searchHtml, each
// detail URL returns its own listing page, and any other URL throws.
global.fetch = mock((input: string | URL | Request) => {
|
||||
const url = typeof input === "string" ? input : input.toString();
|
||||
|
||||
if (url.includes("/k0c0l1700272")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(searchHtml),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-stable-one/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0")),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-stable-two/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0")),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-unstable/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(listingHtml("Unstable Listing", 7000, "v-unstable/k0l0")),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
throw new Error(`Unexpected URL: ${url}`);
|
||||
}) as typeof fetch;
|
||||
|
||||
// The last argument enables hideUnstableResults; priceMin is 8000.
const results = await fetchKijijiItems(
|
||||
"phone",
|
||||
1000,
|
||||
"https://www.kijiji.ca",
|
||||
{ maxPages: 1, priceMin: 8000 },
|
||||
{},
|
||||
{ hideUnstableResults: true },
|
||||
);
|
||||
|
||||
// toEqual is given the full { results, unstableResults } shape.
expect(results).toEqual({
|
||||
results: [
|
||||
expect.objectContaining({ title: "Stable Listing One" }),
|
||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||
],
|
||||
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||
});
|
||||
});
|
||||
|
||||
// Feeds parseDetailedListing an Apollo state that holds a
// "SearchListingCard:1" entry (url and title only) ahead of a "Listing:detail"
// entry (url, title and price), and expects the parsed result to carry the
// "Detailed Listing" title rather than the card's.
test("parseDetailedListing ignores non-root listing-like entities", async () => {
|
||||
const html = `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
${JSON.stringify({
|
||||
props: {
|
||||
pageProps: {
|
||||
__APOLLO_STATE__: {
|
||||
"SearchListingCard:1": {
|
||||
url: "/v-card/k0l0",
|
||||
title: "Card Listing",
|
||||
},
|
||||
"Listing:detail": {
|
||||
url: "/v-detailed/k0l0",
|
||||
title: "Detailed Listing",
|
||||
price: { amount: 10000, currency: "CAD", type: "FIXED" },
|
||||
type: "OFFER",
|
||||
status: "ACTIVE",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</html>
|
||||
`;
|
||||
|
||||
// No fetch mock is installed in this test; the HTML string is passed in directly.
const result = await parseDetailedListing(html, "https://www.kijiji.ca");
|
||||
|
||||
expect(result).toEqual(
|
||||
expect.objectContaining({ title: "Detailed Listing" }),
|
||||
);
|
||||
});
|
||||
|
||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||
const searchHtml = `
|
||||
<html>
|
||||
|
||||
Reference in New Issue
Block a user