Files
ca-marketplace-scraper/packages/core/test/kijiji-integration.test.ts
Dmytro Stanchiev 50d56201af feat: port upstream scraper improvements to monorepo
Kijiji improvements:
- Add error classes: NetworkError, ParseError, RateLimitError, ValidationError
- Add exponential backoff with jitter for retries
- Add request timeout (30s abort)
- Add pagination support (SearchOptions.maxPages)
- Add location/category mappings and resolution functions
- Add enhanced DetailedListing interface with images, seller info, attributes
- Add GraphQL client for seller details

Facebook improvements:
- Add parseFacebookCookieString() for parsing cookie strings
- Add ensureFacebookCookies() with env var fallback
- Add extractFacebookItemData() with multiple extraction paths
- Add fetchFacebookItem() for individual item fetching
- Add extraction metrics and API stability monitoring
- Add vehicle-specific field extraction
- Improve error handling with specific guidance for auth errors

Shared utilities:
- Update http.ts with new error classes and improved fetchHtml

Documentation:
- Port KIJIJI.md, FMARKETPLACE.md, AGENTS.md from upstream

Tests:
- Port kijiji-core, kijiji-integration, kijiji-utils tests
- Port facebook-core, facebook-integration tests
- Add test setup file

Scripts:
- Port parse-facebook-cookies.ts script

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 00:34:50 -05:00

364 lines
11 KiB
TypeScript

import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import {
extractApolloState,
parseDetailedListing,
parseSearch,
} from "../src/scrapers/kijiji";
// Keep a handle on the real fetch implementation so the afterEach hook in the
// suite below can put it back after each test replaces it with a stub.
const originalFetch = global.fetch;
describe("HTML Parsing Integration", () => {
// Tripwire: every test starts with fetch swapped for a stub that throws, so an
// unexpected network call surfaces as an error instead of real traffic.
beforeEach(() => {
  const failUnmockedFetch = () => {
    throw new Error("fetch should be mocked in individual tests");
  };
  global.fetch = mock(failUnmockedFetch);
});
// Reinstate the fetch captured at module load so the stub cannot leak past
// the test that installed it.
afterEach(() => {
  global.fetch = originalFetch;
});
describe("extractApolloState", () => {
  // Happy path: the __NEXT_DATA__ script carries an Apollo cache under
  // props.pageProps.__APOLLO_STATE__, and that cache is what we expect back.
  test("should extract Apollo state from valid HTML", () => {
    const page =
      '<html><head><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"__APOLLO_STATE__":{"ROOT_QUERY":{"test":"value"}}}}}</script></head></html>';
    expect(extractApolloState(page)).toEqual({
      ROOT_QUERY: { test: "value" },
    });
  });

  // A page with no embedded data at all is expected to yield null.
  test("should return null for HTML without Apollo state", () => {
    const page = "<html><body>No data here</body></html>";
    expect(extractApolloState(page)).toBeNull();
  });

  // The script tag is present but its payload is not valid JSON; the
  // extractor is expected to report null rather than throw.
  test("should return null for malformed JSON", () => {
    const page =
      '<html><script id="__NEXT_DATA__" type="application/json">{"invalid": json}</script></html>';
    expect(extractApolloState(page)).toBeNull();
  });

  // Ordinary markup without any __NEXT_DATA__ script element.
  test("should handle missing __NEXT_DATA__ element", () => {
    const page = "<html><body><div>Content</div></body></html>";
    expect(extractApolloState(page)).toBeNull();
  });
});
describe("parseSearch", () => {
  // Base URL handed to parseSearch in every test of this group.
  const baseUrl = "https://www.kijiji.ca";

  /**
   * Builds the minimal page markup these tests feed to parseSearch: a
   * `__NEXT_DATA__` JSON script whose payload nests the given Apollo cache
   * under `props.pageProps.__APOLLO_STATE__`.
   *
   * The template's own lines are kept flush-left on purpose so the generated
   * HTML is exactly the string the tests previously built inline.
   */
  const pageWithApolloState = (apolloState: Record<string, unknown>): string => `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({ props: { pageProps: { __APOLLO_STATE__: apolloState } } })}
</script>
</html>
`;

  // Two "Listing:*" cache entries with relative URLs are expected to come back
  // as two results whose links are joined with the base URL; the ROOT_QUERY
  // entry must not produce a result.
  test("should parse search results from HTML", () => {
    const results = parseSearch(
      pageWithApolloState({
        "Listing:123": {
          url: "/v-iphone/k0l0",
          title: "iPhone 13 Pro",
        },
        "Listing:456": {
          url: "/v-samsung/k0l0",
          title: "Samsung Galaxy",
        },
        ROOT_QUERY: { test: "value" },
      }),
      baseUrl,
    );

    expect(results).toHaveLength(2);
    expect(results[0]).toEqual({
      name: "iPhone 13 Pro",
      listingLink: "https://www.kijiji.ca/v-iphone/k0l0",
    });
    expect(results[1]).toEqual({
      name: "Samsung Galaxy",
      listingLink: "https://www.kijiji.ca/v-samsung/k0l0",
    });
  });

  // A listing URL that is already absolute must be passed through unchanged
  // (no base URL prepended a second time).
  test("should handle absolute URLs", () => {
    const results = parseSearch(
      pageWithApolloState({
        "Listing:123": {
          url: "https://www.kijiji.ca/v-iphone/k0l0",
          title: "iPhone 13 Pro",
        },
      }),
      baseUrl,
    );

    // Guard the index access below: if the parser regresses to returning no
    // results, fail with a clear assertion instead of a TypeError on
    // `undefined.listingLink`.
    expect(results).toHaveLength(1);
    expect(results[0].listingLink).toBe(
      "https://www.kijiji.ca/v-iphone/k0l0",
    );
  });

  // Only the complete "Listing:*" entry is expected to survive: the second
  // listing has no title and the third key is not a "Listing:*" entry.
  test("should filter out invalid listings", () => {
    const results = parseSearch(
      pageWithApolloState({
        "Listing:123": {
          url: "/v-iphone/k0l0",
          title: "iPhone 13 Pro",
        },
        "Listing:456": {
          url: "/v-samsung/k0l0",
          // Missing title
        },
        "Other:789": {
          url: "/v-other/k0l0",
          title: "Other Item",
        },
      }),
      baseUrl,
    );

    expect(results).toHaveLength(1);
    expect(results[0].name).toBe("iPhone 13 Pro");
  });

  // Markup without any embedded data is expected to produce an empty list.
  test("should return empty array for invalid HTML", () => {
    const results = parseSearch("<html><body>Invalid</body></html>", baseUrl);
    expect(results).toEqual([]);
  });
});
describe("parseDetailedListing", () => {
  // Base URL handed to parseDetailedListing in every test of this group.
  const siteUrl = "https://www.kijiji.ca";

  /**
   * Renders the page markup used by these tests: a `__NEXT_DATA__` JSON
   * script whose payload holds a single "Listing:123" Apollo cache entry
   * under `props.pageProps.__APOLLO_STATE__`.
   *
   * The template's own lines stay flush-left so the generated HTML matches
   * the inline fixtures character for character.
   */
  const pageForListing = (listing: Record<string, unknown>): string => `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
  props: {
    pageProps: {
      __APOLLO_STATE__: {
        "Listing:123": listing,
      },
    },
  },
})}
</script>
</html>
`;

  // Fully populated cache entry -> fully populated parsed listing. The
  // expectation pins the field mapping the test relies on, e.g.
  // price.amount 80000 -> cents 80000 / "$800.00", activationDate ->
  // creationDate, posterInfo -> sellerInfo, and the attribute array ->
  // a name-to-values map.
  test("should parse detailed listing with all fields", async () => {
    const rawListing = {
      url: "/v-iphone-13-pro/k0l0",
      title: "iPhone 13 Pro 256GB",
      description: "Excellent condition iPhone 13 Pro",
      price: {
        amount: 80000,
        currency: "CAD",
        type: "FIXED",
      },
      type: "OFFER",
      status: "ACTIVE",
      activationDate: "2024-01-15T10:00:00.000Z",
      endDate: "2025-01-15T10:00:00.000Z",
      metrics: { views: 150 },
      location: {
        address: "Toronto, ON",
        id: 1700273,
        name: "Toronto",
        coordinates: {
          latitude: 43.6532,
          longitude: -79.3832,
        },
      },
      imageUrls: [
        "https://media.kijiji.ca/api/v1/image1.jpg",
        "https://media.kijiji.ca/api/v1/image2.jpg",
      ],
      imageCount: 2,
      categoryId: 132,
      adSource: "ORGANIC",
      flags: {
        topAd: false,
        priceDrop: true,
      },
      posterInfo: {
        posterId: "user123",
        rating: 4.8,
      },
      attributes: [
        {
          canonicalName: "forsaleby",
          canonicalValues: ["ownr"],
        },
        {
          canonicalName: "phonecarrier",
          canonicalValues: ["unlocked"],
        },
      ],
    };

    const expectedListing = {
      url: "https://www.kijiji.ca/v-iphone-13-pro/k0l0",
      title: "iPhone 13 Pro 256GB",
      description: "Excellent condition iPhone 13 Pro",
      listingPrice: {
        amountFormatted: "$800.00",
        cents: 80000,
        currency: "CAD",
      },
      listingType: "OFFER",
      listingStatus: "ACTIVE",
      creationDate: "2024-01-15T10:00:00.000Z",
      endDate: "2025-01-15T10:00:00.000Z",
      numberOfViews: 150,
      address: "Toronto, ON",
      images: [
        "https://media.kijiji.ca/api/v1/image1.jpg",
        "https://media.kijiji.ca/api/v1/image2.jpg",
      ],
      categoryId: 132,
      adSource: "ORGANIC",
      flags: {
        topAd: false,
        priceDrop: true,
      },
      attributes: {
        forsaleby: ["ownr"],
        phonecarrier: ["unlocked"],
      },
      location: {
        id: 1700273,
        name: "Toronto",
        coordinates: {
          latitude: 43.6532,
          longitude: -79.3832,
        },
      },
      sellerInfo: {
        posterId: "user123",
        rating: 4.8,
      },
    };

    const parsed = await parseDetailedListing(
      pageForListing(rawListing),
      siteUrl,
    );
    expect(parsed).toEqual(expectedListing);
  });

  // A "CONTACT" price with a null amount is expected to make the parser give
  // up on the listing and return null.
  test("should return null for contact-based pricing", async () => {
    const page = pageForListing({
      url: "/v-iphone/k0l0",
      title: "iPhone for Sale",
      price: {
        type: "CONTACT",
        amount: null,
      },
    });

    const parsed = await parseDetailedListing(page, siteUrl);
    expect(parsed).toBeNull();
  });

  // Only url, title and price.amount are supplied; the expectation spells out
  // the fallback the test requires for everything else. Note that toEqual
  // does not distinguish an `undefined` property from an absent one, so the
  // `undefined` entries below document intent rather than tighten the check.
  test("should handle missing optional fields", async () => {
    const page = pageForListing({
      url: "/v-iphone/k0l0",
      title: "iPhone 13",
      price: { amount: 50000 },
    });

    const parsed = await parseDetailedListing(page, siteUrl);
    expect(parsed).toEqual({
      url: "https://www.kijiji.ca/v-iphone/k0l0",
      title: "iPhone 13",
      description: undefined,
      listingPrice: {
        amountFormatted: "$500.00",
        cents: 50000,
        currency: undefined,
      },
      listingType: undefined,
      listingStatus: undefined,
      creationDate: undefined,
      endDate: undefined,
      numberOfViews: undefined,
      address: null,
      images: [],
      categoryId: 0,
      adSource: "UNKNOWN",
      flags: {
        topAd: false,
        priceDrop: false,
      },
      attributes: {},
      location: {
        id: 0,
        name: "Unknown",
        coordinates: undefined,
      },
      sellerInfo: undefined,
    });
  });
});
});