feat: add unstable mode to scraper results

This commit is contained in:
2026-04-22 23:23:31 -04:00
parent 8141de5b4b
commit 1ee41fb346
6 changed files with 364 additions and 12 deletions

View File

@@ -1,4 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import type { UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import { import {
type CookieConfig, type CookieConfig,
ensureCookies, ensureCookies,
@@ -362,7 +364,16 @@ export default async function fetchEbayItems(
buyItNowOnly?: boolean; buyItNowOnly?: boolean;
canadaOnly?: boolean; canadaOnly?: boolean;
} = {}, } = {},
unstableMode: UnstableListingModeOptions = {},
) { ) {
// Decides the shape handed back to the caller of fetchEbayItems.
// With `unstableMode.hideUnstableResults` unset the listings are returned
// untouched; otherwise whatever classifyUnstableListings produces for them is
// returned as-is (the tests expect an object with `results` and
// `unstableResults`).
const finalizeResults = (listings: EbayListingDetails[]) =>
  unstableMode.hideUnstableResults
    ? classifyUnstableListings(listings)
    : listings;
const { const {
minPrice = 0, minPrice = 0,
maxPrice = Number.MAX_SAFE_INTEGER, maxPrice = Number.MAX_SAFE_INTEGER,
@@ -452,13 +463,13 @@ export default async function fetchEbayItems(
}); });
console.log(`Parsed ${filteredListings.length} eBay listings.`); console.log(`Parsed ${filteredListings.length} eBay listings.`);
return filteredListings; return finalizeResults(filteredListings);
} catch (err) { } catch (err) {
if (err instanceof HttpError) { if (err instanceof HttpError) {
console.error( console.error(
`Failed to fetch eBay search (${err.status}): ${err.message}`, `Failed to fetch eBay search (${err.status}): ${err.message}`,
); );
return []; return finalizeResults([]);
} }
throw err; throw err;
} }

View File

@@ -1,6 +1,7 @@
import cliProgress from "cli-progress"; import cliProgress from "cli-progress";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import type { HTMLString } from "../types/common"; import type { HTMLString, UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import { import {
type Cookie, type Cookie,
type CookieConfig, type CookieConfig,
@@ -1065,7 +1066,20 @@ export default async function fetchFacebookItems(
REQUESTS_PER_SECOND = 1, REQUESTS_PER_SECOND = 1,
LOCATION = "toronto", LOCATION = "toronto",
MAX_ITEMS = 25, MAX_ITEMS = 25,
unstableMode: UnstableListingModeOptions = {},
) { ) {
// Decides the shape handed back to the caller of fetchFacebookItems.
// - Default: a plain array truncated to MAX_ITEMS.
// - hideUnstableResults: the listings are classified first, then only the
//   `results` bucket is truncated to MAX_ITEMS; `unstableResults` is passed
//   through at full length.
const finalizeResults = (listings: FacebookListingDetails[]) => {
  if (unstableMode.hideUnstableResults) {
    const { results, unstableResults } = classifyUnstableListings(listings);
    return {
      results: results.slice(0, MAX_ITEMS),
      unstableResults,
    };
  }

  return listings.slice(0, MAX_ITEMS);
};
const cookies = await ensureFacebookCookies(); const cookies = await ensureFacebookCookies();
// Format cookies for HTTP header // Format cookies for HTTP header
@@ -1114,7 +1128,7 @@ export default async function fetchFacebookItems(
"This might indicate invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.", "This might indicate invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
); );
} }
return []; return finalizeResults([]);
} }
throw err; throw err;
} }
@@ -1122,25 +1136,25 @@ export default async function fetchFacebookItems(
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl); const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
if (classification.authGated) { if (classification.authGated) {
console.warn("Facebook marketplace search redirected to login. Cookies may be expired."); console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
return []; return finalizeResults([]);
} }
if (classification.unavailable) { if (classification.unavailable) {
console.warn("Facebook marketplace search returned an unavailable route."); console.warn("Facebook marketplace search returned an unavailable route.");
return []; return finalizeResults([]);
} }
if (classification.kind !== "search") { if (classification.kind !== "search") {
console.warn( console.warn(
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`, `Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
); );
return []; return finalizeResults([]);
} }
const ads = extractFacebookMarketplaceData(searchHtml); const ads = extractFacebookMarketplaceData(searchHtml);
if (!ads || ads.length === 0) { if (!ads || ads.length === 0) {
console.warn("No ads parsed from Facebook marketplace page."); console.warn("No ads parsed from Facebook marketplace page.");
return []; return finalizeResults([]);
} }
console.log(`\nFound ${ads.length} raw ads. Processing...`); console.log(`\nFound ${ads.length} raw ads. Processing...`);
@@ -1164,7 +1178,7 @@ export default async function fetchFacebookItems(
progressBar.stop(); progressBar.stop();
console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`); console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
return pricedItems.slice(0, MAX_ITEMS); // Limit results return finalizeResults(pricedItems);
} }
/** /**

View File

@@ -1,7 +1,8 @@
import cliProgress from "cli-progress"; import cliProgress from "cli-progress";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import unidecode from "unidecode"; import unidecode from "unidecode";
import type { HTMLString } from "../types/common"; import type { HTMLString, UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import { import {
type CookieConfig, type CookieConfig,
formatCookiesForHeader, formatCookiesForHeader,
@@ -702,7 +703,16 @@ export default async function fetchKijijiItems(
BASE_URL = "https://www.kijiji.ca", BASE_URL = "https://www.kijiji.ca",
searchOptions: SearchOptions = {}, searchOptions: SearchOptions = {},
listingOptions: ListingFetchOptions = {}, listingOptions: ListingFetchOptions = {},
unstableMode: UnstableListingModeOptions = {},
) { ) {
// Decides the shape handed back to the caller of fetchKijijiItems: the raw
// listing array by default, or the output of classifyUnstableListings when
// `unstableMode.hideUnstableResults` is truthy.
const finalizeResults = (listings: DetailedListing[]) => {
  const splitOutUnstable = Boolean(unstableMode.hideUnstableResults);
  return splitOutUnstable ? classifyUnstableListings(listings) : listings;
};
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND)); const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
// Load Kijiji cookies (optional - helps bypass bot detection) // Load Kijiji cookies (optional - helps bypass bot detection)
@@ -860,7 +870,7 @@ export default async function fetchKijijiItems(
} }
console.log(`\nParsed ${allListings.length} detailed listings.`); console.log(`\nParsed ${allListings.length} detailed listings.`);
return allListings; return finalizeResults(allListings);
} }
// Re-export error classes for convenience // Re-export error classes for convenience

View File

@@ -38,4 +38,49 @@ describe("eBay Scraper Cookie Handling", () => {
"No valid eBay cookies found in EBAY_COOKIE. eBay may block requests without a raw Cookie header string.", "No valid eBay cookies found in EBAY_COOKIE. eBay may block requests without a raw Cookie header string.",
); );
}); });
test("returns results and unstableResults when unstable mode is enabled", async () => {
  // Search page with three result cards priced at CA $100, $110 and $70.
  // The assertions below expect the $70 card to land in `unstableResults`
  // (presumably because it is the price outlier; confirm against
  // utils/unstable).
  const searchPage = `
<html><body>
<li class="s-item">
<a href="https://www.ebay.ca/itm/1"></a>
<h3>Stable Laptop Bundle</h3>
<span class="s-item__price">CA $100.00</span>
</li>
<li class="s-item">
<a href="https://www.ebay.ca/itm/2"></a>
<h3>Another Laptop Bundle</h3>
<span class="s-item__price">CA $110.00</span>
</li>
<li class="s-item">
<a href="https://www.ebay.ca/itm/3"></a>
<h3>Cheap Laptop Bundle</h3>
<span class="s-item__price">CA $70.00</span>
</li>
</body></html>
`;

  // Every fetch call resolves to a fresh minimal response carrying the page.
  global.fetch = mock(() =>
    Promise.resolve({
      ok: true,
      text: () => Promise.resolve(searchPage),
    }),
  ) as typeof fetch;

  const outcome = await fetchEbayItems(
    "laptop",
    1000,
    {},
    { hideUnstableResults: true },
  );

  const stableTitles = ["Stable Laptop Bundle", "Another Laptop Bundle"];
  expect(outcome).toEqual({
    results: stableTitles.map((title) => expect.objectContaining({ title })),
    unstableResults: [
      expect.objectContaining({ title: "Cheap Laptop Bundle" }),
    ],
  });
});
}); });

View File

@@ -5,6 +5,7 @@ import {
extractFacebookBootstrapCandidates, extractFacebookBootstrapCandidates,
extractFacebookItemData, extractFacebookItemData,
extractFacebookMarketplaceData, extractFacebookMarketplaceData,
default as fetchFacebookItems,
fetchFacebookItem, fetchFacebookItem,
parseFacebookAds, parseFacebookAds,
parseFacebookCookieString, parseFacebookCookieString,
@@ -367,6 +368,143 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
}); });
}); });
describe("fetchFacebookItems", () => {
  let previousCookie: string | undefined;
  let previousFetch: typeof fetch;

  /** Minimal description of one marketplace listing in the fake search page. */
  type FakeListing = {
    id: string;
    title: string;
    amount: string;
    formattedAmount: string;
  };

  /**
   * Builds the search-page HTML the scraper is fed: a marker script naming the
   * search controller followed by a script holding the JSON bootstrap payload
   * with one edge per listing.
   */
  const buildSearchHtml = (listings: FakeListing[]) =>
    `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
      payload: {
        resultGroups: [
          {
            edges: listings.map(({ id, title, amount, formattedAmount }) => ({
              node: {
                listing: {
                  id,
                  marketplace_listing_title: title,
                  listing_price: {
                    amount,
                    formatted_amount: formattedAmount,
                    currency: "CAD",
                  },
                  is_live: true,
                },
              },
            })),
          },
        ],
      },
    })}</script></body></html>`;

  /** Replaces global.fetch with a mock that always answers with `html`. */
  const mockSearchResponse = (html: string) => {
    // Cast matches how the eBay and Kijiji suites install their fetch mocks.
    global.fetch = mock(() =>
      Promise.resolve({
        ok: true,
        text: () => Promise.resolve(html),
        url: "https://www.facebook.com/marketplace/toronto/search?query=chair",
        headers: {
          get: () => null,
        },
      }),
    ) as typeof fetch;
  };

  beforeEach(() => {
    // Captured per test so the restore below returns to whatever was installed
    // before this suite's test replaced it.
    previousFetch = global.fetch;
    previousCookie = process.env.FACEBOOK_COOKIE;
    process.env.FACEBOOK_COOKIE = "c_user=12345; xs=abc123";
  });

  afterEach(() => {
    // Undo the per-test fetch mock so it cannot leak into later tests.
    global.fetch = previousFetch;

    if (previousCookie === undefined) {
      delete process.env.FACEBOOK_COOKIE;
    } else {
      process.env.FACEBOOK_COOKIE = previousCookie;
    }
  });

  test("returns an array by default", async () => {
    mockSearchResponse(
      buildSearchHtml([
        {
          id: "1",
          title: "Stable Chair Listing",
          amount: "120.00",
          formattedAmount: "CA$120",
        },
      ]),
    );

    const results = await fetchFacebookItems("chair", 1, "toronto", 25);

    expect(Array.isArray(results)).toBe(true);
    expect(results).toHaveLength(1);
  });

  test("returns results and unstableResults when unstable mode is enabled", async () => {
    mockSearchResponse(
      buildSearchHtml([
        {
          id: "1",
          title: "Stable Chair Listing",
          amount: "100.00",
          formattedAmount: "CA$100",
        },
        {
          id: "2",
          title: "Another Stable Chair",
          amount: "110.00",
          formattedAmount: "CA$110",
        },
        {
          id: "3",
          title: "Suspiciously Cheap Chair",
          amount: "70.00",
          formattedAmount: "CA$70",
        },
      ]),
    );

    // MAX_ITEMS is 1 here: only `results` is expected to be capped, while the
    // unstable listing is still reported.
    const results = await fetchFacebookItems("chair", 1, "toronto", 1, {
      hideUnstableResults: true,
    });

    expect(results).toEqual({
      results: [expect.objectContaining({ title: "Stable Chair Listing" })],
      unstableResults: [
        expect.objectContaining({ title: "Suspiciously Cheap Chair" }),
      ],
    });
  });
});
describe("Data Extraction", () => { describe("Data Extraction", () => {
describe("extractFacebookItemData", () => { describe("extractFacebookItemData", () => {
test("extracts item details from Comet permalink bootstrap candidates", () => { test("extracts item details from Comet permalink bootstrap candidates", () => {

View File

@@ -1,6 +1,7 @@
import { describe, expect, test } from "bun:test"; import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import { import {
buildSearchUrl, buildSearchUrl,
default as fetchKijijiItems,
NetworkError, NetworkError,
ParseError, ParseError,
RateLimitError, RateLimitError,
@@ -9,6 +10,18 @@ import {
ValidationError, ValidationError,
} from "../src/scrapers/kijiji"; } from "../src/scrapers/kijiji";
// Remember the real fetch so it can be put back after every test.
const originalFetch = global.fetch;

// Before each test, install a fetch that throws, so any test that reaches the
// network without providing its own mock fails with an explicit message.
beforeEach(() => {
  const rejectUnmockedFetch = () => {
    throw new Error("fetch should be mocked in individual tests");
  };
  global.fetch = mock(rejectUnmockedFetch);
});

// After each test, drop whatever mock was installed.
afterEach(() => {
  global.fetch = originalFetch;
});
describe("Location and Category Resolution", () => { describe("Location and Category Resolution", () => {
describe("resolveLocationId", () => { describe("resolveLocationId", () => {
test("should return numeric IDs as-is", () => { test("should return numeric IDs as-is", () => {
@@ -155,3 +168,124 @@ describe("Error Classes", () => {
expect(error.name).toBe("ValidationError"); expect(error.name).toBe("ValidationError");
}); });
}); });
describe("fetchKijijiItems", () => {
  test("returns results and unstableResults when unstable mode is enabled", async () => {
    // Search page: a __NEXT_DATA__ script whose Apollo state lists three
    // listings by URL and title.
    const searchHtml = `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
      props: {
        pageProps: {
          __APOLLO_STATE__: {
            "Listing:1": {
              url: "/v-stable-one/k0l0",
              title: "Stable Listing One",
            },
            "Listing:2": {
              url: "/v-stable-two/k0l0",
              title: "Stable Listing Two",
            },
            "Listing:3": {
              url: "/v-unstable/k0l0",
              title: "Unstable Listing",
            },
          },
        },
      },
    })}
</script>
</html>
`;

    // Detail page for a single listing, in the same __NEXT_DATA__ envelope.
    const listingHtml = (title: string, amount: number, slug: string) => `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
      props: {
        pageProps: {
          __APOLLO_STATE__: {
            "Listing:detail": {
              url: `/${slug}`,
              title,
              price: { amount, currency: "CAD", type: "FIXED" },
              type: "OFFER",
              status: "ACTIVE",
            },
          },
        },
      },
    })}
</script>
</html>
`;

    // Detail pages keyed by the URL suffix that selects them. Amounts are
    // 10000 / 11000 / 7000 (presumably cents; confirm against the scraper).
    const listingPages = [
      {
        path: "/v-stable-one/k0l0",
        html: listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0"),
      },
      {
        path: "/v-stable-two/k0l0",
        html: listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0"),
      },
      {
        path: "/v-unstable/k0l0",
        html: listingHtml("Unstable Listing", 7000, "v-unstable/k0l0"),
      },
    ];

    // Minimal successful response carrying `html` for the requested `url`.
    const htmlResponse = (html: string, url: string) =>
      Promise.resolve({
        ok: true,
        text: () => Promise.resolve(html),
        headers: { get: () => null },
        url,
      });

    global.fetch = mock((input: string | URL | Request) => {
      // A Request stringifies to "[object Request]", so read its `url`
      // property instead of calling toString() on it.
      const url =
        typeof input === "string"
          ? input
          : input instanceof URL
            ? input.href
            : input.url;

      if (url.includes("/b-buy-sell/")) {
        return htmlResponse(searchHtml, url);
      }

      const page = listingPages.find(({ path }) => url.endsWith(path));
      if (page) {
        return htmlResponse(page.html, url);
      }

      // Any other request is a bug in the scraper or in this fixture.
      throw new Error(`Unexpected URL: ${url}`);
    }) as typeof fetch;

    const results = await fetchKijijiItems(
      "phone",
      1000,
      "https://www.kijiji.ca",
      { maxPages: 1 },
      {},
      { hideUnstableResults: true },
    );

    expect(results).toEqual({
      results: [
        expect.objectContaining({ title: "Stable Listing One" }),
        expect.objectContaining({ title: "Stable Listing Two" }),
      ],
      unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
    });
  });
});