fix: tighten scraper edge case handling
This commit is contained in:
@@ -361,6 +361,9 @@ async function fetchHtml(
|
||||
await delay(DELAY_MS);
|
||||
return { html, responseUrl: res.url || url };
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
throw err;
|
||||
}
|
||||
if (attempt >= maxRetries) throw err;
|
||||
await delay((attempt + 1) * retryBaseMs);
|
||||
}
|
||||
@@ -1286,7 +1289,7 @@ export async function fetchFacebookItem(
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||
if (classification.unavailable) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
@@ -1304,6 +1307,11 @@ export async function fetchFacebookItem(
|
||||
if (!itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
|
||||
if (itemHtml.includes("This item has been sold")) {
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||
);
|
||||
|
||||
@@ -254,18 +254,31 @@ export function buildSearchUrl(
|
||||
|
||||
let url = `${BASE_URL}/b-${categorySlug}/${locationSlug}/${slugify(keywords)}/k0c${categoryId}l${locationId}`;
|
||||
|
||||
const sortParam = options.sortBy
|
||||
? `&sort=${SORT_MAPPINGS[options.sortBy]}`
|
||||
: "";
|
||||
const sortValue =
|
||||
options.sortBy && options.sortBy !== "relevancy"
|
||||
? SORT_MAPPINGS[options.sortBy]
|
||||
: "relevancyDesc";
|
||||
const sortOrder = options.sortOrder === "asc" ? "ASC" : "DESC";
|
||||
const pageParam =
|
||||
options.page && options.page > 1 ? `&page=${options.page}` : "";
|
||||
|
||||
url += `?sort=relevancyDesc&view=list${sortParam}&order=${sortOrder}${pageParam}`;
|
||||
url += `?sort=${sortValue}&view=list&order=${sortOrder}${pageParam}`;
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
 * Locates the first Apollo state entry whose key begins with "Listing:" and
 * whose value passes both `isRecord` and the caller-supplied predicate.
 *
 * Keys are examined in `Object.keys` order; scanning stops at the first match.
 *
 * @param apolloState - Apollo state object whose keys are scanned.
 * @param predicate - Extra check applied to a candidate value once it is known to be a record.
 * @returns The matching key, or `undefined` when no entry qualifies.
 */
function findApolloListingKey(
  apolloState: ApolloRecord,
  predicate: (value: Record<string, unknown>) => boolean,
): string | undefined {
  for (const candidateKey of Object.keys(apolloState)) {
    // Only entries in the "Listing:" namespace are candidates.
    if (!candidateKey.startsWith("Listing:")) continue;

    const entry = apolloState[candidateKey];
    if (isRecord(entry) && predicate(entry)) {
      return candidateKey;
    }
  }

  return undefined;
}
|
||||
|
||||
/**
|
||||
* Slugifies a string for Kijiji search URLs
|
||||
*/
|
||||
@@ -497,9 +510,9 @@ function _parseListing(
|
||||
const apolloState = extractApolloState(htmlString);
|
||||
if (!apolloState) return null;
|
||||
|
||||
// Find the listing root key
|
||||
const listingKey = Object.keys(apolloState).find((k) =>
|
||||
k.includes("Listing"),
|
||||
const listingKey = findApolloListingKey(
|
||||
apolloState,
|
||||
(value) => typeof value.url === "string" && typeof value.title === "string",
|
||||
);
|
||||
if (!listingKey) return null;
|
||||
|
||||
@@ -570,9 +583,12 @@ export async function parseDetailedListing(
|
||||
const apolloState = extractApolloState(htmlString);
|
||||
if (!apolloState) return null;
|
||||
|
||||
// Find the listing root key
|
||||
const listingKey = Object.keys(apolloState).find((k) =>
|
||||
k.includes("Listing"),
|
||||
const listingKey = findApolloListingKey(
|
||||
apolloState,
|
||||
(value) =>
|
||||
typeof value.url === "string" &&
|
||||
typeof value.title === "string" &&
|
||||
isRecord(value.price),
|
||||
);
|
||||
if (!listingKey) return null;
|
||||
|
||||
@@ -921,8 +937,12 @@ export default async function fetchKijijiItems(
|
||||
return true;
|
||||
});
|
||||
|
||||
console.log(`\nParsed ${filteredListings.length} detailed listings.`);
|
||||
return finalizeResults(filteredListings);
|
||||
console.log(
|
||||
`\nParsed ${unstableMode.hideUnstableResults ? allListings.length : filteredListings.length} detailed listings.`,
|
||||
);
|
||||
return finalizeResults(
|
||||
unstableMode.hideUnstableResults ? allListings : filteredListings,
|
||||
);
|
||||
}
|
||||
|
||||
// Re-export error classes for convenience
|
||||
|
||||
Reference in New Issue
Block a user