feat: add unidecode dependency and slugify function for improved Kijiji search

This commit introduces the `unidecode` library to handle non-ASCII characters in search queries. A new `slugify` function converts the user's query into a URL-friendly string, producing more readable and reliable Kijiji search URLs.

It also addresses issues in Kijiji scraping: the listing filter now adds only valid listings that have a price, and error handling now produces more informative messages and keeps HTTP errors raised during listing-detail retrieval from terminating the process.

Additionally, this commit refactors the Kijiji search URL to build its query segment with the new `slugify` function, and adds debug logging for better traceability.
This commit is contained in:
2025-09-18 11:35:10 -04:00
parent add93dc6bd
commit cb1fb2bae6
3 changed files with 47 additions and 12 deletions

View File

@@ -5,9 +5,11 @@
"name": "sone4ka-tok", "name": "sone4ka-tok",
"dependencies": { "dependencies": {
"linkedom": "^0.18.12", "linkedom": "^0.18.12",
"unidecode": "^1.1.0",
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "latest", "@types/bun": "latest",
"@types/unidecode": "^1.1.0",
}, },
"peerDependencies": { "peerDependencies": {
"typescript": "^5", "typescript": "^5",
@@ -21,6 +23,8 @@
"@types/react": ["@types/react@19.1.9", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-WmdoynAX8Stew/36uTSVMcLJJ1KRh6L3IZRx1PZ7qJtBqT3dYTgyDTx8H1qoRghErydW7xw9mSJ3wS//tCRpFA=="], "@types/react": ["@types/react@19.1.9", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-WmdoynAX8Stew/36uTSVMcLJJ1KRh6L3IZRx1PZ7qJtBqT3dYTgyDTx8H1qoRghErydW7xw9mSJ3wS//tCRpFA=="],
"@types/unidecode": ["@types/unidecode@1.1.0", "", {}, "sha512-NTIsFsTe9WRek39/8DDj7KiQ0nU33DHMrKwNHcD1rKlUvn4N0Rc4Di8q/Xavs8bsDZmBa4MMtQA8+HNgwfxC/A=="],
"boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="],
"bun-types": ["bun-types@1.2.19", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-uAOTaZSPuYsWIXRpj7o56Let0g/wjihKCkeRqUBhlLVM/Bt+Fj9xTo+LhC1OV1XDaGkz4hNC80et5xgy+9KTHQ=="], "bun-types": ["bun-types@1.2.19", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-uAOTaZSPuYsWIXRpj7o56Let0g/wjihKCkeRqUBhlLVM/Bt+Fj9xTo+LhC1OV1XDaGkz4hNC80et5xgy+9KTHQ=="],
@@ -57,6 +61,8 @@
"undici-types": ["undici-types@7.8.0", "", {}, "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw=="], "undici-types": ["undici-types@7.8.0", "", {}, "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw=="],
"unidecode": ["unidecode@1.1.0", "", {}, "sha512-GIp57N6DVVJi8dpeIU6/leJGdv7W65ZSXFLFiNmxvexXkc0nXdqUvhA/qL9KqBKsILxMwg5MnmYNOIDJLb5JVA=="],
"dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],
} }
} }

View File

@@ -9,12 +9,14 @@
"type": "module", "type": "module",
"private": true, "private": true,
"devDependencies": { "devDependencies": {
"@types/bun": "latest" "@types/bun": "latest",
"@types/unidecode": "^1.1.0"
}, },
"peerDependencies": { "peerDependencies": {
"typescript": "^5" "typescript": "^5"
}, },
"dependencies": { "dependencies": {
"linkedom": "^0.18.12" "linkedom": "^0.18.12",
"unidecode": "^1.1.0"
} }
} }

View File

@@ -1,5 +1,8 @@
/* eslint-disable @typescript-eslint/no-explicit-any */ /* eslint-disable @typescript-eslint/no-explicit-any */
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import unidecode from "unidecode";
// const unidecode = require("unidecode");
// ----------------------------- Types ----------------------------- // ----------------------------- Types -----------------------------
@@ -49,10 +52,37 @@ type ListingDetails = {
address?: string | null; address?: string | null;
}; };
// ----------------------------- Config -----------------------------
// ----------------------------- Utilities ----------------------------- // ----------------------------- Utilities -----------------------------
/**
 * Characters treated as word separators by `slugify`. They are matched one
 * character at a time, after the input has been transliterated and lower-cased.
 */
const SEPS = new Set([" ", "\t", "\n", "\r", "—", "/", ":", ";", ",", ".", "-"]);

/**
 * Converts free text (e.g. a search query) into a lowercase, hyphen-separated slug.
 *
 * The input is transliterated with `unidecode` and lower-cased. Characters in
 * `a-z` and `0-9` are kept, each run of separator characters (see `SEPS`)
 * becomes a single `-`, and every other character is dropped without breaking
 * a separator run. The result never starts or ends with a hyphen.
 *
 * @param input - Raw text to slugify.
 * @returns The slug; an empty string when `input` contains no letters or digits.
 */
export function slugify(input: string): string {
  // NOTE(review): unidecode is expected to return an ASCII approximation of the
  // input — confirm against the package documentation.
  const ascii = unidecode(input).toLowerCase();
  const out: string[] = [];

  for (const ch of ascii) {
    const isAlphanumeric = (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");
    if (isAlphanumeric) {
      out.push(ch);
    } else if (SEPS.has(ch)) {
      // Emit a hyphen only between words: skip it at the very start and when
      // the previous emitted character is already a hyphen.
      if (out.length > 0 && out[out.length - 1] !== "-") {
        out.push("-");
      }
    }
    // Any other character is dropped.
  }

  // Runs are collapsed above, so at most one trailing hyphen can remain.
  if (out[out.length - 1] === "-") {
    out.pop();
  }

  return out.join("");
}
/** /**
* Turns cents to localized currency string. * Turns cents to localized currency string.
*/ */
@@ -112,6 +142,7 @@ async function fetchHtml(
for (let attempt = 0; attempt <= maxRetries; attempt++) { for (let attempt = 0; attempt <= maxRetries; attempt++) {
try { try {
console.log(`Fetching: `, url);
const res = await fetch(url, { const res = await fetch(url, {
method: "GET", method: "GET",
headers: { headers: {
@@ -289,7 +320,7 @@ export default async function fetchKijijiItems(
) { ) {
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${encodeURIComponent(SEARCH_QUERY)}/k0l0?dc=true&view=list`; const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`;
console.log(`Fetching search: ${searchUrl}`); console.log(`Fetching search: ${searchUrl}`);
const searchHtml = await fetchHtml(searchUrl, DELAY_MS, { const searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
@@ -330,7 +361,9 @@ export default async function fetchKijijiItems(
}, },
}); });
const parsed = parseListing(html, BASE_URL); const parsed = parseListing(html, BASE_URL);
if (parsed) items.push(parsed); if (parsed) {
if (parsed.listingPrice?.cents) items.push(parsed);
}
} catch (err) { } catch (err) {
if (err instanceof HttpError) { if (err instanceof HttpError) {
console.error(`Failed to fetch ${link} - ${err.status} ${err.message}`); console.error(`Failed to fetch ${link} - ${err.status} ${err.message}`);
@@ -344,10 +377,4 @@ export default async function fetchKijijiItems(
console.log(`Parsed ${items.length} listings.`); console.log(`Parsed ${items.length} listings.`);
return items; return items;
// console.log(items);
} }
// void main().catch((err) => {
// console.error("Fatal error:", err);
// process.exitCode = 1;
// });