fix: tighten scraper edge case handling
This commit is contained in:
@@ -361,6 +361,9 @@ async function fetchHtml(
|
|||||||
await delay(DELAY_MS);
|
await delay(DELAY_MS);
|
||||||
return { html, responseUrl: res.url || url };
|
return { html, responseUrl: res.url || url };
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
if (err instanceof HttpError) {
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
if (attempt >= maxRetries) throw err;
|
if (attempt >= maxRetries) throw err;
|
||||||
await delay((attempt + 1) * retryBaseMs);
|
await delay((attempt + 1) * retryBaseMs);
|
||||||
}
|
}
|
||||||
@@ -1286,7 +1289,7 @@ export async function fetchFacebookItem(
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
if (classification.unavailable) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||||
return null;
|
return null;
|
||||||
@@ -1304,6 +1307,11 @@ export async function fetchFacebookItem(
|
|||||||
if (!itemData) {
|
if (!itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
|
|
||||||
|
if (itemHtml.includes("This item has been sold")) {
|
||||||
|
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
console.warn(
|
console.warn(
|
||||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -254,18 +254,31 @@ export function buildSearchUrl(
|
|||||||
|
|
||||||
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
||||||
|
|
||||||
const sortParam = options.sortBy
|
const sortValue =
|
||||||
? `&sort=${SORT_MAPPINGS[options.sortBy]}`
|
options.sortBy && options.sortBy !== "relevancy"
|
||||||
: "";
|
? SORT_MAPPINGS[options.sortBy]
|
||||||
|
: "relevancyDesc";
|
||||||
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
||||||
const pageParam =
|
const pageParam =
|
||||||
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
||||||
|
|
||||||
url += `?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`;
|
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
|
||||||
|
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the first key of `apolloState` that starts with `"Listing:"` and
 * whose value is a record accepted by `predicate`.
 *
 * Keys are visited in `Object.keys` enumeration order, so when several
 * entries qualify the earliest one wins.
 *
 * NOTE(review): `ApolloRecord` and `isRecord` are declared outside this view;
 * `isRecord` is presumably a plain-object type guard — confirm.
 *
 * @param apolloState - Map of state keys to entities to search.
 * @param predicate - Extra shape check applied to each `Listing:` entry's value.
 * @returns The matching key, or `undefined` when no entry qualifies.
 */
function findApolloListingKey(
|
||||||
|
apolloState: ApolloRecord,
|
||||||
|
predicate: (value: Record<string, unknown>) => boolean,
|
||||||
|
): string | undefined {
|
||||||
|
return Object.keys(apolloState).find((key) => {
|
||||||
|
// Exact prefix match: keys that merely contain "Listing" (for example "SearchListingCard:1") are rejected.
if (!key.startsWith("Listing:")) return false;
|
||||||
|
|
||||||
|
const value = apolloState[key];
|
||||||
|
return isRecord(value) && predicate(value);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Slugifies a string for Kijiji search URLs
|
* Slugifies a string for Kijiji search URLs
|
||||||
*/
|
*/
|
||||||
@@ -497,9 +510,9 @@ function _parseListing(
|
|||||||
const apolloState = extractApolloState(htmlString);
|
const apolloState = extractApolloState(htmlString);
|
||||||
if (!apolloState) return null;
|
if (!apolloState) return null;
|
||||||
|
|
||||||
// Find the listing root key
|
const listingKey = findApolloListingKey(
|
||||||
const listingKey = Object.keys(apolloState).find((k) =>
|
apolloState,
|
||||||
k.includes("Listing"),
|
(value) => typeof value.url === "string" && typeof value.title === "string",
|
||||||
);
|
);
|
||||||
if (!listingKey) return null;
|
if (!listingKey) return null;
|
||||||
|
|
||||||
@@ -570,9 +583,12 @@ export async function parseDetailedListing(
|
|||||||
const apolloState = extractApolloState(htmlString);
|
const apolloState = extractApolloState(htmlString);
|
||||||
if (!apolloState) return null;
|
if (!apolloState) return null;
|
||||||
|
|
||||||
// Find the listing root key
|
const listingKey = findApolloListingKey(
|
||||||
const listingKey = Object.keys(apolloState).find((k) =>
|
apolloState,
|
||||||
k.includes("Listing"),
|
(value) =>
|
||||||
|
typeof value.url === "string" &&
|
||||||
|
typeof value.title === "string" &&
|
||||||
|
isRecord(value.price),
|
||||||
);
|
);
|
||||||
if (!listingKey) return null;
|
if (!listingKey) return null;
|
||||||
|
|
||||||
@@ -921,8 +937,12 @@ export default async function fetchKijijiItems(
|
|||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`\nParsed ${filteredListings.length} detailed listings.`);
|
console.log(
|
||||||
return finalizeResults(filteredListings);
|
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
||||||
|
);
|
||||||
|
return finalizeResults(
|
||||||
|
unstableMode.hideUnstableResults ? allListings : filteredListings,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Re-export error classes for convenience
|
// Re-export error classes for convenience
|
||||||
|
|||||||
@@ -177,6 +177,7 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
try {
|
try {
|
||||||
const result = await fetchFacebookItem("123");
|
const result = await fetchFacebookItem("123");
|
||||||
expect(result).toBeNull();
|
expect(result).toBeNull();
|
||||||
|
expect(global.fetch).toHaveBeenCalledTimes(1);
|
||||||
expect(warnMock).toHaveBeenCalledWith(
|
expect(warnMock).toHaveBeenCalledWith(
|
||||||
"Authentication error: Invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
|
"Authentication error: Invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
|
||||||
);
|
);
|
||||||
@@ -309,6 +310,54 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
expect(result?.listingStatus).toBe("SOLD");
|
expect(result?.listingStatus).toBe("SOLD");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should still parse sold items when structured data exists", async () => {
|
||||||
|
const soldStructuredHtml = `
|
||||||
|
<html><body>
|
||||||
|
<div>This item has been sold</div>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
listing: {
|
||||||
|
id: "457",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Structured Sold Item",
|
||||||
|
formatted_price: { text: "CA$90" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "90.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "90.00",
|
||||||
|
},
|
||||||
|
is_sold: true,
|
||||||
|
is_live: false,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(soldStructuredHtml),
|
||||||
|
url: "https://www.facebook.com/marketplace/item/457/",
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await fetchFacebookItem("457");
|
||||||
|
|
||||||
|
expect(result).toEqual(
|
||||||
|
expect.objectContaining({
|
||||||
|
title: "Structured Sold Item",
|
||||||
|
listingStatus: "SOLD",
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle successful item extraction", async () => {
|
test("should handle successful item extraction", async () => {
|
||||||
const mockData = {
|
const mockData = {
|
||||||
require: [
|
require: [
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import {
|
|||||||
default as fetchKijijiItems,
|
default as fetchKijijiItems,
|
||||||
type DetailedListing,
|
type DetailedListing,
|
||||||
NetworkError,
|
NetworkError,
|
||||||
|
parseDetailedListing,
|
||||||
ParseError,
|
ParseError,
|
||||||
RateLimitError,
|
RateLimitError,
|
||||||
resolveCategoryId,
|
resolveCategoryId,
|
||||||
@@ -124,6 +125,7 @@ describe("URL Construction", () => {
|
|||||||
sortBy: "date",
|
sortBy: "date",
|
||||||
sortOrder: "asc",
|
sortOrder: "asc",
|
||||||
});
|
});
|
||||||
|
expect(dateUrl.match(/sort=/g)?.length).toBe(1);
|
||||||
expect(dateUrl).toContain("sort=DATE");
|
expect(dateUrl).toContain("sort=DATE");
|
||||||
expect(dateUrl).toContain("order=ASC");
|
expect(dateUrl).toContain("order=ASC");
|
||||||
|
|
||||||
@@ -289,6 +291,141 @@ describe("fetchKijijiItems", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("classifies unstable mode using all parsed listings before price filtering", async () => {
|
||||||
|
const searchHtml = `
|
||||||
|
<html>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
${JSON.stringify({
|
||||||
|
props: {
|
||||||
|
pageProps: {
|
||||||
|
__APOLLO_STATE__: {
|
||||||
|
"Listing:1": { url: "/v-stable-one/k0l0", title: "Stable Listing One" },
|
||||||
|
"Listing:2": { url: "/v-stable-two/k0l0", title: "Stable Listing Two" },
|
||||||
|
"Listing:3": { url: "/v-unstable/k0l0", title: "Unstable Listing" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const listingHtml = (title: string, amount: number, slug: string) => `
|
||||||
|
<html>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
${JSON.stringify({
|
||||||
|
props: {
|
||||||
|
pageProps: {
|
||||||
|
__APOLLO_STATE__: {
|
||||||
|
"Listing:detail": {
|
||||||
|
url: `/${slug}`,
|
||||||
|
title,
|
||||||
|
price: { amount, currency: "CAD", type: "FIXED" },
|
||||||
|
type: "OFFER",
|
||||||
|
status: "ACTIVE",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock((input: string | URL | Request) => {
|
||||||
|
const url = typeof input === "string" ? input : input.toString();
|
||||||
|
|
||||||
|
if (url.includes("/k0c0l1700272")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(searchHtml),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-stable-one/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-stable-two/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url.endsWith("/v-unstable/k0l0")) {
|
||||||
|
return Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
text: () => Promise.resolve(listingHtml("Unstable Listing", 7000, "v-unstable/k0l0")),
|
||||||
|
headers: { get: () => null },
|
||||||
|
url,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error(`Unexpected URL: ${url}`);
|
||||||
|
}) as typeof fetch;
|
||||||
|
|
||||||
|
const results = await fetchKijijiItems(
|
||||||
|
"phone",
|
||||||
|
1000,
|
||||||
|
"https://www.kijiji.ca",
|
||||||
|
{ maxPages: 1, priceMin: 8000 },
|
||||||
|
{},
|
||||||
|
{ hideUnstableResults: true },
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(results).toEqual({
|
||||||
|
results: [
|
||||||
|
expect.objectContaining({ title: "Stable Listing One" }),
|
||||||
|
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||||
|
],
|
||||||
|
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("parseDetailedListing ignores non-root listing-like entities", async () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
${JSON.stringify({
|
||||||
|
props: {
|
||||||
|
pageProps: {
|
||||||
|
__APOLLO_STATE__: {
|
||||||
|
"SearchListingCard:1": {
|
||||||
|
url: "/v-card/k0l0",
|
||||||
|
title: "Card Listing",
|
||||||
|
},
|
||||||
|
"Listing:detail": {
|
||||||
|
url: "/v-detailed/k0l0",
|
||||||
|
title: "Detailed Listing",
|
||||||
|
price: { amount: 10000, currency: "CAD", type: "FIXED" },
|
||||||
|
type: "OFFER",
|
||||||
|
status: "ACTIVE",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = await parseDetailedListing(html, "https://www.kijiji.ca");
|
||||||
|
|
||||||
|
expect(result).toEqual(
|
||||||
|
expect.objectContaining({ title: "Detailed Listing" }),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||||
const searchHtml = `
|
const searchHtml = `
|
||||||
<html>
|
<html>
|
||||||
|
|||||||
Reference in New Issue
Block a user