From cb1fb2bae62c48a5b706f04c2ead1a7f265b188b Mon Sep 17 00:00:00 2001 From: Dmytro Stanchiev Date: Thu, 18 Sep 2025 11:35:10 -0400 Subject: [PATCH] feat: add unidecode dependency and slugify function for improved Kijiji search This commit introduces the `unidecode` library to handle non-ASCII characters in search queries. A `slugify` function is implemented to improve Kijiji search URL generation: it converts the user query to a URL-friendly string, creating more user-friendly and reliable URLs. It also addresses issues in Kijiji scraping: it improves the listing filtering logic to ensure only valid listings with prices are added, and enhances error handling to provide more informative messages and prevent process termination due to HTTP errors during listing detail retrieval. Additionally, it refactors the Kijiji search query URL, leveraging the new `slugify` function for enhanced URL generation using the search query. It also adds debugging logs for better traceability. --- bun.lock | 6 ++++++ package.json | 6 ++++-- src/kijiji.ts | 47 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/bun.lock b/bun.lock index db2e64d..6bc6405 100644 --- a/bun.lock +++ b/bun.lock @@ -5,9 +5,11 @@ "name": "sone4ka-tok", "dependencies": { "linkedom": "^0.18.12", + "unidecode": "^1.1.0", }, "devDependencies": { "@types/bun": "latest", + "@types/unidecode": "^1.1.0", }, "peerDependencies": { "typescript": "^5", @@ -21,6 +23,8 @@ "@types/react": ["@types/react@19.1.9", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-WmdoynAX8Stew/36uTSVMcLJJ1KRh6L3IZRx1PZ7qJtBqT3dYTgyDTx8H1qoRghErydW7xw9mSJ3wS//tCRpFA=="], + "@types/unidecode": ["@types/unidecode@1.1.0", "", {}, "sha512-NTIsFsTe9WRek39/8DDj7KiQ0nU33DHMrKwNHcD1rKlUvn4N0Rc4Di8q/Xavs8bsDZmBa4MMtQA8+HNgwfxC/A=="], + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], "bun-types": 
["bun-types@1.2.19", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-uAOTaZSPuYsWIXRpj7o56Let0g/wjihKCkeRqUBhlLVM/Bt+Fj9xTo+LhC1OV1XDaGkz4hNC80et5xgy+9KTHQ=="], @@ -57,6 +61,8 @@ "undici-types": ["undici-types@7.8.0", "", {}, "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw=="], + "unidecode": ["unidecode@1.1.0", "", {}, "sha512-GIp57N6DVVJi8dpeIU6/leJGdv7W65ZSXFLFiNmxvexXkc0nXdqUvhA/qL9KqBKsILxMwg5MnmYNOIDJLb5JVA=="], + "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], } } diff --git a/package.json b/package.json index 7e85a20..ef3496e 100644 --- a/package.json +++ b/package.json @@ -9,12 +9,14 @@ "type": "module", "private": true, "devDependencies": { - "@types/bun": "latest" + "@types/bun": "latest", + "@types/unidecode": "^1.1.0" }, "peerDependencies": { "typescript": "^5" }, "dependencies": { - "linkedom": "^0.18.12" + "linkedom": "^0.18.12", + "unidecode": "^1.1.0" } } diff --git a/src/kijiji.ts b/src/kijiji.ts index 3784da1..ad7dd20 100644 --- a/src/kijiji.ts +++ b/src/kijiji.ts @@ -1,5 +1,8 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ import { parseHTML } from "linkedom"; +import unidecode from "unidecode"; + +// const unidecode = require("unidecode"); // ----------------------------- Types ----------------------------- @@ -49,10 +52,37 @@ type ListingDetails = { address?: string | null; }; -// ----------------------------- Config ----------------------------- - // ----------------------------- Utilities ----------------------------- +const SEPS = new Set([" ", "–", "—", "/", ":", ";", ",", ".", "-"]); + +/** + * Slugifies a string for search + */ +export function slugify(input: string): string { + const s = unidecode(input).toLowerCase(); + const out: string[] = []; + let lastHyphen = false; + + for (let i = 0; i < s.length; i++) { + const 
ch = s[i]; + const code = ch!.charCodeAt(0); + + // a-z or 0-9 + if ((code >= 97 && code <= 122) || (code >= 48 && code <= 57)) { + out.push(ch!); + lastHyphen = false; + } else if (SEPS.has(ch!)) { + if (!lastHyphen) { + out.push("-"); + lastHyphen = true; + } + } + // else drop character + } + return out.join(""); +} + /** * Turns cents to localized currency string. */ @@ -112,6 +142,7 @@ async function fetchHtml( for (let attempt = 0; attempt <= maxRetries; attempt++) { try { + console.log(`Fetching: `, url); const res = await fetch(url, { method: "GET", headers: { @@ -289,7 +320,7 @@ export default async function fetchKijijiItems( ) { const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); - const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${encodeURIComponent(SEARCH_QUERY)}/k0l0?dc=true&view=list`; + const searchUrl = `${BASE_URL}/b-gta-greater-toronto-area/${slugify(SEARCH_QUERY)}/k0l1700272?sort=relevancyDesc&view=list`; console.log(`Fetching search: ${searchUrl}`); const searchHtml = await fetchHtml(searchUrl, DELAY_MS, { @@ -330,7 +361,9 @@ export default async function fetchKijijiItems( }, }); const parsed = parseListing(html, BASE_URL); - if (parsed) items.push(parsed); + if (parsed) { + if (parsed.listingPrice?.cents) items.push(parsed); + } } catch (err) { if (err instanceof HttpError) { console.error(`Failed to fetch ${link} - ${err.status} ${err.message}`); @@ -344,10 +377,4 @@ export default async function fetchKijijiItems( console.log(`Parsed ${items.length} listings.`); return items; - // console.log(items); } - -// void main().catch((err) => { -// console.error("Fatal error:", err); -// process.exitCode = 1; -// });