fix: tighten scraper edge case handling

This commit is contained in:
2026-04-22 23:46:52 -04:00
parent 6f9d4db419
commit b5e14e686a
4 changed files with 227 additions and 13 deletions

View File

@@ -361,6 +361,9 @@ async function fetchHtml(
await delay(DELAY_MS);
return { html, responseUrl: res.url || url };
} catch (err) {
if (err instanceof HttpError) {
throw err;
}
if (attempt >= maxRetries) throw err;
await delay((attempt + 1) * retryBaseMs);
}
@@ -1286,7 +1289,7 @@ export async function fetchFacebookItem(
return null;
}
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
if (classification.unavailable) {
logExtractionMetrics(false, itemId);
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
return null;
@@ -1304,6 +1307,11 @@ export async function fetchFacebookItem(
if (!itemData) {
logExtractionMetrics(false, itemId);
if (itemHtml.includes("This item has been sold")) {
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
return null;
}
console.warn(
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
);

View File

@@ -254,18 +254,31 @@ export function buildSearchUrl(
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
const sortParam = options.sortBy
? `&sort=${SORT_MAPPINGS[options.sortBy]}`
: "";
const sortValue =
options.sortBy && options.sortBy !== "relevancy"
? SORT_MAPPINGS[options.sortBy]
: "relevancyDesc";
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
const pageParam =
options.page && options.page > 1 ? `&page=${options.page}` : "";
url += `?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`;
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
return url;
}
/**
 * Finds the first Apollo state key that names a listing entity and whose
 * value passes the caller's check.
 *
 * Keys are visited in `Object.keys` order. A key qualifies only when it
 * starts with "Listing:", its value satisfies `isRecord`, and `predicate`
 * accepts that value.
 *
 * @param apolloState - Apollo state object to scan.
 * @param predicate - Additional test applied to each candidate record.
 * @returns The first qualifying key, or `undefined` when none qualifies.
 */
function findApolloListingKey(
  apolloState: ApolloRecord,
  predicate: (value: Record<string, unknown>) => boolean,
): string | undefined {
  for (const candidateKey of Object.keys(apolloState)) {
    // Only keys carrying the "Listing:" prefix are considered at all.
    if (!candidateKey.startsWith("Listing:")) continue;

    const entry = apolloState[candidateKey];
    if (isRecord(entry) && predicate(entry)) {
      return candidateKey;
    }
  }
  return undefined;
}
/**
* Slugifies a string for Kijiji search URLs
*/
@@ -497,9 +510,9 @@ function _parseListing(
const apolloState = extractApolloState(htmlString);
if (!apolloState) return null;
// Find the listing root key
const listingKey = Object.keys(apolloState).find((k) =>
k.includes("Listing"),
const listingKey = findApolloListingKey(
apolloState,
(value) => typeof value.url === "string" && typeof value.title === "string",
);
if (!listingKey) return null;
@@ -570,9 +583,12 @@ export async function parseDetailedListing(
const apolloState = extractApolloState(htmlString);
if (!apolloState) return null;
// Find the listing root key
const listingKey = Object.keys(apolloState).find((k) =>
k.includes("Listing"),
const listingKey = findApolloListingKey(
apolloState,
(value) =>
typeof value.url === "string" &&
typeof value.title === "string" &&
isRecord(value.price),
);
if (!listingKey) return null;
@@ -921,8 +937,12 @@ export default async function fetchKijijiItems(
return true;
});
console.log(`\nParsed ${filteredListings.length} detailed listings.`);
return finalizeResults(filteredListings);
console.log(
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
);
return finalizeResults(
unstableMode.hideUnstableResults ? allListings : filteredListings,
);
}
// Re-export error classes for convenience

View File

@@ -177,6 +177,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
try {
const result = await fetchFacebookItem("123");
expect(result).toBeNull();
expect(global.fetch).toHaveBeenCalledTimes(1);
expect(warnMock).toHaveBeenCalledWith(
"Authentication error: Invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
);
@@ -309,6 +310,54 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
expect(result?.listingStatus).toBe("SOLD");
});
// Regression test: a page that shows the "This item has been sold" banner but
// also carries a structured listing payload must still yield a parsed item
// (reported with listingStatus "SOLD") rather than a null result.
test("should still parse sold items when structured data exists", async () => {
// Fixture: the sold banner text, a script containing the
// XCometMarketplacePermalinkController marker, and a script holding a JSON
// payload whose listing is flagged is_sold: true / is_live: false.
const soldStructuredHtml = `
<html><body>
<div>This item has been sold</div>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
listing: {
id: "457",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Structured Sold Item",
formatted_price: { text: "CA$90" },
listing_price: {
amount: "90.00",
currency: "CAD",
amount_with_offset: "90.00",
},
is_sold: true,
is_live: false,
},
},
})}
</script>
</body></html>
`;
// Replace fetch with a stub that always resolves to a successful response
// serving the fixture HTML; every header lookup returns null.
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () => Promise.resolve(soldStructuredHtml),
url: "https://www.facebook.com/marketplace/item/457/",
headers: {
get: () => null,
},
}),
);
const result = await fetchFacebookItem("457");
// Only title and listingStatus are pinned; objectContaining leaves every
// other field of the result unchecked.
expect(result).toEqual(
expect.objectContaining({
title: "Structured Sold Item",
listingStatus: "SOLD",
}),
);
});
test("should handle successful item extraction", async () => {
const mockData = {
require: [

View File

@@ -4,6 +4,7 @@ import {
default as fetchKijijiItems,
type DetailedListing,
NetworkError,
parseDetailedListing,
ParseError,
RateLimitError,
resolveCategoryId,
@@ -124,6 +125,7 @@ describe("URL Construction", () => {
sortBy: "date",
sortOrder: "asc",
});
expect(dateUrl.match(/sort=/g)?.length).toBe(1);
expect(dateUrl).toContain("sort=DATE");
expect(dateUrl).toContain("order=ASC");
@@ -289,6 +291,141 @@ describe("fetchKijijiItems", () => {
]);
});
// Checks that, with hideUnstableResults enabled and a priceMin of 8000, the
// listing whose detail page reports amount 7000 is still returned under
// unstableResults instead of disappearing from the output.
// NOTE(review): how a listing is judged "unstable" is decided outside this
// test; only the final results/unstableResults split is asserted here.
test("classifies unstable mode using all parsed listings before price filtering", async () => {
// Search page fixture: Apollo state with three Listing:* entries, each
// carrying only a url and a title.
const searchHtml = `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
props: {
pageProps: {
__APOLLO_STATE__: {
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
},
},
},
})}
</script>
</html>
`;
// Detail page fixture builder: one Listing:detail entry with the given
// title, price amount and url slug (currency CAD, type FIXED, status ACTIVE).
const listingHtml = (title: string, amount: number, slug: string) => `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
props: {
pageProps: {
__APOLLO_STATE__: {
"Listing:detail": {
url: `/${slug}`,
title,
price: { amount, currency: "CAD", type: "FIXED" },
type: "OFFER",
status: "ACTIVE",
},
},
},
},
})}
</script>
</html>
`;
// Route each request by URL to the matching fixture. Any URL that is not
// recognised throws, so an unexpected request fails the test.
global.fetch = mock((input: string | URL | Request) => {
const url = typeof input === "string" ? input : input.toString();
// Presumably the search results URL built for this query - confirm against
// buildSearchUrl.
if (url.includes("/k0c0l1700272")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(searchHtml),
headers: { get: () => null },
url,
});
}
if (url.endsWith("/v-stable-one/k0l0")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0")),
headers: { get: () => null },
url,
});
}
if (url.endsWith("/v-stable-two/k0l0")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0")),
headers: { get: () => null },
url,
});
}
// The only fixture whose amount (7000) is numerically below priceMin (8000).
// NOTE(review): assumes amount and priceMin share the same unit - confirm.
if (url.endsWith("/v-unstable/k0l0")) {
return Promise.resolve({
ok: true,
text: () => Promise.resolve(listingHtml("Unstable Listing", 7000, "v-unstable/k0l0")),
headers: { get: () => null },
url,
});
}
throw new Error(`Unexpected URL: ${url}`);
}) as typeof fetch;
// NOTE(review): the meaning of the positional arguments is defined by
// fetchKijijiItems, whose signature is not visible here - confirm.
const results = await fetchKijijiItems(
"phone",
1000,
"https://www.kijiji.ca",
{ maxPages: 1, priceMin: 8000 },
{},
{ hideUnstableResults: true },
);
// Exact shape is asserted: two entries in results (in this order) and a
// single entry in unstableResults.
expect(results).toEqual({
results: [
expect.objectContaining({ title: "Stable Listing One" }),
expect.objectContaining({ title: "Stable Listing Two" }),
],
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
});
});
// Checks that parseDetailedListing picks the Listing:detail entry even though
// an earlier Apollo state entry also has a url and a title. That earlier key,
// SearchListingCard:1, contains the substring "Listing" but does not start
// with "Listing:" and has no price.
test("parseDetailedListing ignores non-root listing-like entities", async () => {
// Fixture: the decoy card entry is placed before the real listing entry.
const html = `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
props: {
pageProps: {
__APOLLO_STATE__: {
"SearchListingCard:1": {
url: "/v-card/k0l0",
title: "Card Listing",
},
"Listing:detail": {
url: "/v-detailed/k0l0",
title: "Detailed Listing",
price: { amount: 10000, currency: "CAD", type: "FIXED" },
type: "OFFER",
status: "ACTIVE",
},
},
},
},
})}
</script>
</html>
`;
const result = await parseDetailedListing(html, "https://www.kijiji.ca");
// Only the title is pinned; a result built from the decoy entry would carry
// "Card Listing" and fail this assertion.
expect(result).toEqual(
expect.objectContaining({ title: "Detailed Listing" }),
);
});
test("returns results and unstableResults when unstable mode is enabled", async () => {
const searchHtml = `
<html>