feat: add unstable mode to scraper results

This commit is contained in:
2026-04-22 23:23:31 -04:00
parent 8141de5b4b
commit 1ee41fb346
6 changed files with 364 additions and 12 deletions

View File

@@ -1,4 +1,6 @@
import { parseHTML } from "linkedom";
import type { UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import {
type CookieConfig,
ensureCookies,
@@ -362,7 +364,16 @@ export default async function fetchEbayItems(
buyItNowOnly?: boolean;
canadaOnly?: boolean;
} = {},
unstableMode: UnstableListingModeOptions = {},
) {
// Shapes the value handed back to the caller according to the unstable-mode option:
// when `hideUnstableResults` is falsy the listings are returned untouched, otherwise
// whatever `classifyUnstableListings` produces for them is returned instead.
const finalizeResults = (listings: EbayListingDetails[]) =>
  unstableMode.hideUnstableResults
    ? classifyUnstableListings(listings)
    : listings;
const {
minPrice = 0,
maxPrice = Number.MAX_SAFE_INTEGER,
@@ -452,13 +463,13 @@ export default async function fetchEbayItems(
});
console.log(`Parsed ${filteredListings.length} eBay listings.`);
return filteredListings;
return finalizeResults(filteredListings);
} catch (err) {
if (err instanceof HttpError) {
console.error(
`Failed to fetch eBay search (${err.status}): ${err.message}`,
);
return [];
return finalizeResults([]);
}
throw err;
}

View File

@@ -1,6 +1,7 @@
import cliProgress from "cli-progress";
import { parseHTML } from "linkedom";
import type { HTMLString } from "../types/common";
import type { HTMLString, UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import {
type Cookie,
type CookieConfig,
@@ -1065,7 +1066,20 @@ export default async function fetchFacebookItems(
REQUESTS_PER_SECOND = 1,
LOCATION = "toronto",
MAX_ITEMS = 25,
unstableMode: UnstableListingModeOptions = {},
) {
// Produces the final return value for the Facebook scraper.
// - Unstable mode on: listings are classified first, then only the `results` bucket
//   is capped at MAX_ITEMS; `unstableResults` is passed through uncapped.
// - Unstable mode off: a plain array capped at MAX_ITEMS is returned.
const finalizeResults = (listings: FacebookListingDetails[]) => {
  if (unstableMode.hideUnstableResults) {
    const { results, unstableResults } = classifyUnstableListings(listings);
    return {
      results: results.slice(0, MAX_ITEMS),
      unstableResults,
    };
  }
  return listings.slice(0, MAX_ITEMS);
};
const cookies = await ensureFacebookCookies();
// Format cookies for HTTP header
@@ -1114,7 +1128,7 @@ export default async function fetchFacebookItems(
"This might indicate invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
);
}
return [];
return finalizeResults([]);
}
throw err;
}
@@ -1122,25 +1136,25 @@ export default async function fetchFacebookItems(
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
if (classification.authGated) {
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
return [];
return finalizeResults([]);
}
if (classification.unavailable) {
console.warn("Facebook marketplace search returned an unavailable route.");
return [];
return finalizeResults([]);
}
if (classification.kind !== "search") {
console.warn(
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
);
return [];
return finalizeResults([]);
}
const ads = extractFacebookMarketplaceData(searchHtml);
if (!ads || ads.length === 0) {
console.warn("No ads parsed from Facebook marketplace page.");
return [];
return finalizeResults([]);
}
console.log(`\nFound ${ads.length} raw ads. Processing...`);
@@ -1164,7 +1178,7 @@ export default async function fetchFacebookItems(
progressBar.stop();
console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
return pricedItems.slice(0, MAX_ITEMS); // Limit results
return finalizeResults(pricedItems);
}
/**

View File

@@ -1,7 +1,8 @@
import cliProgress from "cli-progress";
import { parseHTML } from "linkedom";
import unidecode from "unidecode";
import type { HTMLString } from "../types/common";
import type { HTMLString, UnstableListingModeOptions } from "../types/common";
import { classifyUnstableListings } from "../utils/unstable";
import {
type CookieConfig,
formatCookiesForHeader,
@@ -702,7 +703,16 @@ export default async function fetchKijijiItems(
BASE_URL = "https://www.kijiji.ca",
searchOptions: SearchOptions = {},
listingOptions: ListingFetchOptions = {},
unstableMode: UnstableListingModeOptions = {},
) {
// Final step applied to the scraped listings before returning them:
// with `hideUnstableResults` set, the output of `classifyUnstableListings` is
// returned; without it, the listings array is returned as-is.
const finalizeResults = (listings: DetailedListing[]) => {
  const { hideUnstableResults } = unstableMode;
  if (hideUnstableResults) {
    return classifyUnstableListings(listings);
  }
  return listings;
};
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
// Load Kijiji cookies (optional - helps bypass bot detection)
@@ -860,7 +870,7 @@ export default async function fetchKijijiItems(
}
console.log(`\nParsed ${allListings.length} detailed listings.`);
return allListings;
return finalizeResults(allListings);
}
// Re-export error classes for convenience

View File

@@ -38,4 +38,49 @@ describe("eBay Scraper Cookie Handling", () => {
"No valid eBay cookies found in EBAY_COOKIE. eBay may block requests without a raw Cookie header string.",
);
});
test("returns results and unstableResults when unstable mode is enabled", async () => {
  // Canned search page with three listings priced CA $100.00, CA $110.00 and CA $70.00.
  // The assertions below expect the CA $70.00 one to land in `unstableResults`.
  const searchPageHtml = `
<html><body>
<li class="s-item">
<a href="https://www.ebay.ca/itm/1"></a>
<h3>Stable Laptop Bundle</h3>
<span class="s-item__price">CA $100.00</span>
</li>
<li class="s-item">
<a href="https://www.ebay.ca/itm/2"></a>
<h3>Another Laptop Bundle</h3>
<span class="s-item__price">CA $110.00</span>
</li>
<li class="s-item">
<a href="https://www.ebay.ca/itm/3"></a>
<h3>Cheap Laptop Bundle</h3>
<span class="s-item__price">CA $70.00</span>
</li>
</body></html>
`;

  // Every fetch call resolves to a fresh minimal response carrying the canned page.
  global.fetch = mock(() =>
    Promise.resolve({
      ok: true,
      text: () => Promise.resolve(searchPageHtml),
    }),
  ) as typeof fetch;

  // Fourth argument switches the scraper into unstable mode.
  const outcome = await fetchEbayItems(
    "laptop",
    1000,
    {},
    { hideUnstableResults: true },
  );

  const expectedStableTitles = ["Stable Laptop Bundle", "Another Laptop Bundle"];
  expect(outcome).toEqual({
    results: expectedStableTitles.map((title) =>
      expect.objectContaining({ title }),
    ),
    unstableResults: [
      expect.objectContaining({ title: "Cheap Laptop Bundle" }),
    ],
  });
});
});

View File

@@ -5,6 +5,7 @@ import {
extractFacebookBootstrapCandidates,
extractFacebookItemData,
extractFacebookMarketplaceData,
default as fetchFacebookItems,
fetchFacebookItem,
parseFacebookAds,
parseFacebookCookieString,
@@ -367,6 +368,143 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
});
});
describe("fetchFacebookItems", () => {
  /** One marketplace listing as embedded in the mocked search page. */
  type MockListing = {
    id: string;
    title: string;
    amount: string;
    formattedAmount: string;
  };

  let previousCookie: string | undefined;
  let previousFetch: typeof fetch;

  beforeEach(() => {
    // Remember the ambient state so each test can freely replace it.
    previousCookie = process.env.FACEBOOK_COOKIE;
    previousFetch = global.fetch;
    process.env.FACEBOOK_COOKIE = "c_user=12345; xs=abc123";
  });

  afterEach(() => {
    // Restore fetch so the per-test mock does not leak into later tests.
    global.fetch = previousFetch;
    if (previousCookie === undefined) {
      delete process.env.FACEBOOK_COOKIE;
    } else {
      process.env.FACEBOOK_COOKIE = previousCookie;
    }
  });

  /**
   * Builds the HTML of a marketplace search page whose bootstrap script contains
   * the given listings, all priced in CAD and flagged as live.
   */
  const buildSearchHtml = (listings: MockListing[]) =>
    `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
      payload: {
        resultGroups: [
          {
            edges: listings.map(({ id, title, amount, formattedAmount }) => ({
              node: {
                listing: {
                  id,
                  marketplace_listing_title: title,
                  listing_price: {
                    amount,
                    formatted_amount: formattedAmount,
                    currency: "CAD",
                  },
                  is_live: true,
                },
              },
            })),
          },
        ],
      },
    })}</script></body></html>`;

  /** Replaces global fetch with a mock that always resolves to the given search page. */
  const mockSearchResponse = (html: string) => {
    global.fetch = mock(() =>
      Promise.resolve({
        ok: true,
        text: () => Promise.resolve(html),
        url: "https://www.facebook.com/marketplace/toronto/search?query=chair",
        headers: {
          get: () => null,
        },
      }),
    ) as typeof fetch;
  };

  test("returns an array by default", async () => {
    mockSearchResponse(
      buildSearchHtml([
        {
          id: "1",
          title: "Stable Chair Listing",
          amount: "120.00",
          formattedAmount: "CA$120",
        },
      ]),
    );

    const results = await fetchFacebookItems("chair", 1, "toronto", 25);

    expect(Array.isArray(results)).toBe(true);
    expect(results).toHaveLength(1);
  });

  test("returns results and unstableResults when unstable mode is enabled", async () => {
    mockSearchResponse(
      buildSearchHtml([
        {
          id: "1",
          title: "Stable Chair Listing",
          amount: "100.00",
          formattedAmount: "CA$100",
        },
        {
          id: "2",
          title: "Another Stable Chair",
          amount: "110.00",
          formattedAmount: "CA$110",
        },
        {
          id: "3",
          title: "Suspiciously Cheap Chair",
          amount: "70.00",
          formattedAmount: "CA$70",
        },
      ]),
    );

    // MAX_ITEMS is 1 here, so only the first stable listing is expected in `results`,
    // while the listing reported as unstable is still expected in `unstableResults`.
    const results = await fetchFacebookItems("chair", 1, "toronto", 1, {
      hideUnstableResults: true,
    });

    expect(results).toEqual({
      results: [expect.objectContaining({ title: "Stable Chair Listing" })],
      unstableResults: [
        expect.objectContaining({ title: "Suspiciously Cheap Chair" }),
      ],
    });
  });
});
describe("Data Extraction", () => {
describe("extractFacebookItemData", () => {
test("extracts item details from Comet permalink bootstrap candidates", () => {

View File

@@ -1,6 +1,7 @@
import { describe, expect, test } from "bun:test";
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import {
buildSearchUrl,
default as fetchKijijiItems,
NetworkError,
ParseError,
RateLimitError,
@@ -9,6 +10,18 @@ import {
ValidationError,
} from "../src/scrapers/kijiji";
// Keep a handle on the fetch implementation that was in place when this file loaded,
// so it can be put back after every test.
const originalFetch = global.fetch;
// Before each test, install a fetch stub that throws when called: a test that reaches
// fetch without installing its own mock fails with an explicit message.
beforeEach(() => {
global.fetch = mock(() => {
throw new Error("fetch should be mocked in individual tests");
});
});
// After each test, restore the saved fetch so neither the throwing stub nor a
// per-test mock outlives the test that installed it.
afterEach(() => {
global.fetch = originalFetch;
});
describe("Location and Category Resolution", () => {
describe("resolveLocationId", () => {
test("should return numeric IDs as-is", () => {
@@ -155,3 +168,124 @@ describe("Error Classes", () => {
expect(error.name).toBe("ValidationError");
});
});
describe("fetchKijijiItems", () => {
  test("returns results and unstableResults when unstable mode is enabled", async () => {
    // Search page: a __NEXT_DATA__ payload whose Apollo state lists three listings.
    const searchHtml = `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
  props: {
    pageProps: {
      __APOLLO_STATE__: {
        "Listing:1": {
          url: "/v-stable-one/k0l0",
          title: "Stable Listing One",
        },
        "Listing:2": {
          url: "/v-stable-two/k0l0",
          title: "Stable Listing Two",
        },
        "Listing:3": {
          url: "/v-unstable/k0l0",
          title: "Unstable Listing",
        },
      },
    },
  },
})}
</script>
</html>
`;

    // Detail page: a __NEXT_DATA__ payload describing a single active offer.
    const listingHtml = (title: string, amount: number, slug: string) => `
<html>
<script id="__NEXT_DATA__" type="application/json">
${JSON.stringify({
  props: {
    pageProps: {
      __APOLLO_STATE__: {
        "Listing:detail": {
          url: `/${slug}`,
          title,
          price: { amount, currency: "CAD", type: "FIXED" },
          type: "OFFER",
          status: "ACTIVE",
        },
      },
    },
  },
})}
</script>
</html>
`;

    // Detail pages the mock knows how to serve, matched by URL suffix in this order.
    const detailPages = [
      { slug: "v-stable-one/k0l0", title: "Stable Listing One", amount: 10000 },
      { slug: "v-stable-two/k0l0", title: "Stable Listing Two", amount: 11000 },
      { slug: "v-unstable/k0l0", title: "Unstable Listing", amount: 7000 },
    ];

    // Minimal response shape shared by every mocked page.
    const htmlResponse = (html: string, url: string) => ({
      ok: true,
      text: () => Promise.resolve(html),
      headers: { get: () => null },
      url,
    });

    global.fetch = mock((input: string | URL | Request) => {
      // A Request stringifies to "[object Request]", so read its `url` instead;
      // strings are used as-is and URL objects stringify to their href.
      const url =
        typeof input === "string"
          ? input
          : input instanceof Request
            ? input.url
            : input.toString();

      if (url.includes("/b-buy-sell/")) {
        return Promise.resolve(htmlResponse(searchHtml, url));
      }

      const page = detailPages.find(({ slug }) => url.endsWith(`/${slug}`));
      if (page) {
        return Promise.resolve(
          htmlResponse(listingHtml(page.title, page.amount, page.slug), url),
        );
      }

      // Any other request means the scraper fetched something this test did not plan for.
      throw new Error(`Unexpected URL: ${url}`);
    }) as typeof fetch;

    const results = await fetchKijijiItems(
      "phone",
      1000,
      "https://www.kijiji.ca",
      { maxPages: 1 },
      {},
      { hideUnstableResults: true },
    );

    expect(results).toEqual({
      results: [
        expect.objectContaining({ title: "Stable Listing One" }),
        expect.objectContaining({ title: "Stable Listing Two" }),
      ],
      unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
    });
  });
});