refactor: clean kijiji scraper internals
This commit is contained in:
@@ -11,6 +11,7 @@ import {
|
|||||||
formatCookiesForHeader,
|
formatCookiesForHeader,
|
||||||
loadCookiesOptional,
|
loadCookiesOptional,
|
||||||
} from "../utils/cookies";
|
} from "../utils/cookies";
|
||||||
|
import { delay } from "../utils/delay";
|
||||||
import { formatCentsToCurrency } from "../utils/format";
|
import { formatCentsToCurrency } from "../utils/format";
|
||||||
import {
|
import {
|
||||||
fetchHtml,
|
fetchHtml,
|
||||||
@@ -568,78 +569,6 @@ export function parseSearch(
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Parse a listing page into a typed object (backward compatible).
 *
 * Extracts the Apollo state from the page HTML, asks `findApolloListingKey`
 * for an entry whose value has string `url` and `title` fields, and maps that
 * entry onto `KijijiListingDetails`.
 *
 * @param htmlString - HTML of the listing page.
 * @param BASE_URL - Prefix prepended to the listing `url` when it does not
 *   already start with "http".
 * @returns The parsed listing, or `null` when the Apollo state, a matching
 *   entry, the resolved URL, or the title is missing.
 */
function _parseListing(
  htmlString: HTMLString,
  BASE_URL: string,
): KijijiListingDetails | null {
  const apolloState = extractApolloState(htmlString);
  if (!apolloState) return null;

  const listingKey = findApolloListingKey(
    apolloState,
    (value) => typeof value.url === "string" && typeof value.title === "string",
  );
  if (!listingKey) return null;

  const root = apolloState[listingKey];
  if (!isRecord(root)) return null;

  const {
    url,
    title,
    description,
    price,
    type,
    status,
    activationDate,
    endDate,
    metrics,
    location,
  } = root as ApolloListingRoot;

  // Coerce to a number and reject NaN/Infinity in one place, so a malformed
  // value is dropped *before* it is formatted or returned.
  const toFiniteNumber = (raw: unknown): number | undefined => {
    if (raw == null) return undefined;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  const cents = toFiniteNumber(price?.amount);
  const numberOfViews = toFiniteNumber(metrics?.views);

  // Only format a validated amount; a non-numeric amount yields no price at
  // all rather than a price object carrying a formatted NaN.
  const amountFormatted =
    cents !== undefined ? formatCentsToCurrency(cents, "en-CA") : undefined;

  // Relative URLs are resolved against BASE_URL; absolute ones pass through.
  const listingUrl =
    typeof url === "string"
      ? url.startsWith("http")
        ? url
        : `${BASE_URL}${url}`
      : "";

  if (!listingUrl || !title) return null;

  return {
    url: listingUrl,
    title,
    description,
    listingPrice: amountFormatted
      ? {
          amountFormatted,
          cents,
          currency: price?.currency,
        }
      : undefined,
    listingType: type,
    listingStatus: status,
    creationDate: activationDate,
    endDate,
    numberOfViews,
    address: location?.address ?? null,
  };
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse a listing page into a detailed object with all available fields
|
* Parse a listing page into a detailed object with all available fields
|
||||||
*/
|
*/
|
||||||
@@ -928,9 +857,7 @@ export default async function fetchKijijiItems(
|
|||||||
const batchPromises = batch.map(async (link, batchIndex) => {
|
const batchPromises = batch.map(async (link, batchIndex) => {
|
||||||
try {
|
try {
|
||||||
if (batchIndex > 0) {
|
if (batchIndex > 0) {
|
||||||
await new Promise((resolve) =>
|
await delay(DELAY_MS * batchIndex);
|
||||||
setTimeout(resolve, DELAY_MS * batchIndex),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const html = await fetchHtml(link, 0, {
|
const html = await fetchHtml(link, 0, {
|
||||||
@@ -952,11 +879,11 @@ export default async function fetchKijijiItems(
|
|||||||
return parsed;
|
return parsed;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.error(
|
logger.warn(
|
||||||
`\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
|
`\nFailed to fetch ${link}\n - ${err.statusCode} ${err.message}`,
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
console.error(
|
logger.warn(
|
||||||
`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
|
`\nFailed to fetch ${link}\n - ${String((err as Error)?.message || err)}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -974,7 +901,7 @@ export default async function fetchKijijiItems(
|
|||||||
results.push(...batchResults);
|
results.push(...batchResults);
|
||||||
|
|
||||||
if (i + CONCURRENT_REQUESTS < newListingLinks.length) {
|
if (i + CONCURRENT_REQUESTS < newListingLinks.length) {
|
||||||
await new Promise((resolve) => setTimeout(resolve, DELAY_MS));
|
await delay(DELAY_MS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user