feat: add unstable mode to scraper results
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
import { parseHTML } from "linkedom";
|
||||
import type { UnstableListingModeOptions } from "../types/common";
|
||||
import { classifyUnstableListings } from "../utils/unstable";
|
||||
import {
|
||||
type CookieConfig,
|
||||
ensureCookies,
|
||||
@@ -362,7 +364,16 @@ export default async function fetchEbayItems(
|
||||
buyItNowOnly?: boolean;
|
||||
canadaOnly?: boolean;
|
||||
} = {},
|
||||
unstableMode: UnstableListingModeOptions = {},
|
||||
) {
|
||||
// Shapes the eBay scraper's return value according to `unstableMode`.
// When `hideUnstableResults` is unset or falsy, the listings array is returned untouched.
// Otherwise the listings are passed to `classifyUnstableListings` and its result is returned instead.
// NOTE(review): the test added in this commit expects `{ results, unstableResults }` in that mode,
// so the caller receives an array or an object depending on the flag — confirm against ../utils/unstable.
const finalizeResults = (listings: EbayListingDetails[]) => {
|
||||
if (!unstableMode.hideUnstableResults) {
|
||||
return listings;
|
||||
}
|
||||
|
||||
return classifyUnstableListings(listings);
|
||||
};
|
||||
|
||||
const {
|
||||
minPrice = 0,
|
||||
maxPrice = Number.MAX_SAFE_INTEGER,
|
||||
@@ -452,13 +463,13 @@ export default async function fetchEbayItems(
|
||||
});
|
||||
|
||||
console.log(`Parsed ${filteredListings.length} eBay listings.`);
|
||||
return filteredListings;
|
||||
return finalizeResults(filteredListings);
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
console.error(
|
||||
`Failed to fetch eBay search (${err.status}): ${err.message}`,
|
||||
);
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import cliProgress from "cli-progress";
|
||||
import { parseHTML } from "linkedom";
|
||||
import type { HTMLString } from "../types/common";
|
||||
import type { HTMLString, UnstableListingModeOptions } from "../types/common";
|
||||
import { classifyUnstableListings } from "../utils/unstable";
|
||||
import {
|
||||
type Cookie,
|
||||
type CookieConfig,
|
||||
@@ -1065,7 +1066,20 @@ export default async function fetchFacebookItems(
|
||||
REQUESTS_PER_SECOND = 1,
|
||||
LOCATION = "toronto",
|
||||
MAX_ITEMS = 25,
|
||||
unstableMode: UnstableListingModeOptions = {},
|
||||
) {
|
||||
// Shapes the Facebook scraper's return value according to `unstableMode`.
// When `hideUnstableResults` is unset or falsy, returns the first MAX_ITEMS listings as a plain array.
// Otherwise returns an object with `results` and `unstableResults` taken from `classifyUnstableListings`.
const finalizeResults = (listings: FacebookListingDetails[]) => {
|
||||
if (!unstableMode.hideUnstableResults) {
|
||||
return listings.slice(0, MAX_ITEMS);
|
||||
}
|
||||
|
||||
// Classification runs over the full list; the MAX_ITEMS cap is applied afterwards.
const classified = classifyUnstableListings(listings);
|
||||
return {
|
||||
results: classified.results.slice(0, MAX_ITEMS),
|
||||
// Passed through as-is: the MAX_ITEMS cap is not applied to unstableResults.
unstableResults: classified.unstableResults,
|
||||
};
|
||||
};
|
||||
|
||||
const cookies = await ensureFacebookCookies();
|
||||
|
||||
// Format cookies for HTTP header
|
||||
@@ -1114,7 +1128,7 @@ export default async function fetchFacebookItems(
|
||||
"This might indicate invalid or expired cookies. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.",
|
||||
);
|
||||
}
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
@@ -1122,25 +1136,25 @@ export default async function fetchFacebookItems(
|
||||
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
|
||||
if (classification.authGated) {
|
||||
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
|
||||
if (classification.unavailable) {
|
||||
console.warn("Facebook marketplace search returned an unavailable route.");
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
|
||||
if (classification.kind !== "search") {
|
||||
console.warn(
|
||||
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
|
||||
);
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
|
||||
const ads = extractFacebookMarketplaceData(searchHtml);
|
||||
if (!ads || ads.length === 0) {
|
||||
console.warn("No ads parsed from Facebook marketplace page.");
|
||||
return [];
|
||||
return finalizeResults([]);
|
||||
}
|
||||
|
||||
console.log(`\nFound ${ads.length} raw ads. Processing...`);
|
||||
@@ -1164,7 +1178,7 @@ export default async function fetchFacebookItems(
|
||||
progressBar.stop();
|
||||
|
||||
console.log(`\nParsed ${pricedItems.length} Facebook marketplace listings.`);
|
||||
return pricedItems.slice(0, MAX_ITEMS); // Limit results
|
||||
return finalizeResults(pricedItems);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import cliProgress from "cli-progress";
|
||||
import { parseHTML } from "linkedom";
|
||||
import unidecode from "unidecode";
|
||||
import type { HTMLString } from "../types/common";
|
||||
import type { HTMLString, UnstableListingModeOptions } from "../types/common";
|
||||
import { classifyUnstableListings } from "../utils/unstable";
|
||||
import {
|
||||
type CookieConfig,
|
||||
formatCookiesForHeader,
|
||||
@@ -702,7 +703,16 @@ export default async function fetchKijijiItems(
|
||||
BASE_URL = "https://www.kijiji.ca",
|
||||
searchOptions: SearchOptions = {},
|
||||
listingOptions: ListingFetchOptions = {},
|
||||
unstableMode: UnstableListingModeOptions = {},
|
||||
) {
|
||||
// Shapes the Kijiji scraper's return value according to `unstableMode`.
// When `hideUnstableResults` is unset or falsy, the detailed listings array is returned untouched.
// Otherwise the listings are passed to `classifyUnstableListings` and its result is returned instead.
// NOTE(review): the test added in this commit expects `{ results, unstableResults }` in that mode,
// so the caller receives an array or an object depending on the flag — confirm against ../utils/unstable.
const finalizeResults = (listings: DetailedListing[]) => {
|
||||
if (!unstableMode.hideUnstableResults) {
|
||||
return listings;
|
||||
}
|
||||
|
||||
return classifyUnstableListings(listings);
|
||||
};
|
||||
|
||||
const DELAY_MS = Math.max(1, Math.floor(1000 / REQUESTS_PER_SECOND));
|
||||
|
||||
// Load Kijiji cookies (optional - helps bypass bot detection)
|
||||
@@ -860,7 +870,7 @@ export default async function fetchKijijiItems(
|
||||
}
|
||||
|
||||
console.log(`\nParsed ${allListings.length} detailed listings.`);
|
||||
return allListings;
|
||||
return finalizeResults(allListings);
|
||||
}
|
||||
|
||||
// Re-export error classes for convenience
|
||||
|
||||
@@ -38,4 +38,49 @@ describe("eBay Scraper Cookie Handling", () => {
|
||||
"No valid eBay cookies found in EBAY_COOKIE. eBay may block requests without a raw Cookie header string.",
|
||||
);
|
||||
});
|
||||
|
||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(`
|
||||
<html><body>
|
||||
<li class="s-item">
|
||||
<a href="https://www.ebay.ca/itm/1"></a>
|
||||
<h3>Stable Laptop Bundle</h3>
|
||||
<span class="s-item__price">CA $100.00</span>
|
||||
</li>
|
||||
<li class="s-item">
|
||||
<a href="https://www.ebay.ca/itm/2"></a>
|
||||
<h3>Another Laptop Bundle</h3>
|
||||
<span class="s-item__price">CA $110.00</span>
|
||||
</li>
|
||||
<li class="s-item">
|
||||
<a href="https://www.ebay.ca/itm/3"></a>
|
||||
<h3>Cheap Laptop Bundle</h3>
|
||||
<span class="s-item__price">CA $70.00</span>
|
||||
</li>
|
||||
</body></html>
|
||||
`),
|
||||
}),
|
||||
) as typeof fetch;
|
||||
|
||||
const results = await fetchEbayItems(
|
||||
"laptop",
|
||||
1000,
|
||||
{},
|
||||
{ hideUnstableResults: true },
|
||||
);
|
||||
|
||||
expect(results).toEqual({
|
||||
results: [
|
||||
expect.objectContaining({ title: "Stable Laptop Bundle" }),
|
||||
expect.objectContaining({ title: "Another Laptop Bundle" }),
|
||||
],
|
||||
unstableResults: [
|
||||
expect.objectContaining({ title: "Cheap Laptop Bundle" }),
|
||||
],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
extractFacebookBootstrapCandidates,
|
||||
extractFacebookItemData,
|
||||
extractFacebookMarketplaceData,
|
||||
default as fetchFacebookItems,
|
||||
fetchFacebookItem,
|
||||
parseFacebookAds,
|
||||
parseFacebookCookieString,
|
||||
@@ -367,6 +368,143 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("fetchFacebookItems", () => {
|
||||
let previousCookie: string | undefined;
|
||||
|
||||
beforeEach(() => {
|
||||
previousCookie = process.env.FACEBOOK_COOKIE;
|
||||
process.env.FACEBOOK_COOKIE = "c_user=12345; xs=abc123";
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (previousCookie === undefined) {
|
||||
delete process.env.FACEBOOK_COOKIE;
|
||||
} else {
|
||||
process.env.FACEBOOK_COOKIE = previousCookie;
|
||||
}
|
||||
});
|
||||
|
||||
test("returns an array by default", async () => {
|
||||
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Stable Chair Listing",
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}</script></body></html>`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockSearchHtml),
|
||||
url: "https://www.facebook.com/marketplace/toronto/search?query=chair",
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("chair", 1, "toronto", 25);
|
||||
|
||||
expect(Array.isArray(results)).toBe(true);
|
||||
expect(results).toHaveLength(1);
|
||||
});
|
||||
|
||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Stable Chair Listing",
|
||||
listing_price: {
|
||||
amount: "100.00",
|
||||
formatted_amount: "CA$100",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "2",
|
||||
marketplace_listing_title: "Another Stable Chair",
|
||||
listing_price: {
|
||||
amount: "110.00",
|
||||
formatted_amount: "CA$110",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "3",
|
||||
marketplace_listing_title: "Suspiciously Cheap Chair",
|
||||
listing_price: {
|
||||
amount: "70.00",
|
||||
formatted_amount: "CA$70",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}</script></body></html>`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockSearchHtml),
|
||||
url: "https://www.facebook.com/marketplace/toronto/search?query=chair",
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("chair", 1, "toronto", 1, {
|
||||
hideUnstableResults: true,
|
||||
});
|
||||
|
||||
expect(results).toEqual({
|
||||
results: [expect.objectContaining({ title: "Stable Chair Listing" })],
|
||||
unstableResults: [
|
||||
expect.objectContaining({ title: "Suspiciously Cheap Chair" }),
|
||||
],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("Data Extraction", () => {
|
||||
describe("extractFacebookItemData", () => {
|
||||
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||
import {
|
||||
buildSearchUrl,
|
||||
default as fetchKijijiItems,
|
||||
NetworkError,
|
||||
ParseError,
|
||||
RateLimitError,
|
||||
@@ -9,6 +10,18 @@ import {
|
||||
ValidationError,
|
||||
} from "../src/scrapers/kijiji";
|
||||
|
||||
const originalFetch = global.fetch;
|
||||
|
||||
// Before every test, replace global.fetch with a mock that throws,
// so any test that reaches fetch without installing its own mock fails with an explicit message.
beforeEach(() => {
|
||||
global.fetch = mock(() => {
|
||||
throw new Error("fetch should be mocked in individual tests");
|
||||
});
|
||||
});
|
||||
|
||||
// After every test, put back the fetch implementation captured in `originalFetch` at module load.
afterEach(() => {
|
||||
global.fetch = originalFetch;
|
||||
});
|
||||
|
||||
describe("Location and Category Resolution", () => {
|
||||
describe("resolveLocationId", () => {
|
||||
test("should return numeric IDs as-is", () => {
|
||||
@@ -155,3 +168,124 @@ describe("Error Classes", () => {
|
||||
expect(error.name).toBe("ValidationError");
|
||||
});
|
||||
});
|
||||
|
||||
describe("fetchKijijiItems", () => {
|
||||
test("returns results and unstableResults when unstable mode is enabled", async () => {
|
||||
const searchHtml = `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
${JSON.stringify({
|
||||
props: {
|
||||
pageProps: {
|
||||
__APOLLO_STATE__: {
|
||||
"Listing:1": {
|
||||
url: "/v-stable-one/k0l0",
|
||||
title: "Stable Listing One",
|
||||
},
|
||||
"Listing:2": {
|
||||
url: "/v-stable-two/k0l0",
|
||||
title: "Stable Listing Two",
|
||||
},
|
||||
"Listing:3": {
|
||||
url: "/v-unstable/k0l0",
|
||||
title: "Unstable Listing",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</html>
|
||||
`;
|
||||
|
||||
// Test fixture: builds the HTML of a mocked listing detail page.
// The page carries a single __NEXT_DATA__ JSON script whose __APOLLO_STATE__ holds one
// "Listing:detail" entry with the given title, price amount (currency "CAD", type "FIXED")
// and a URL of `/${slug}`; the entry's type and status are fixed to "OFFER" and "ACTIVE".
const listingHtml = (title: string, amount: number, slug: string) => `
|
||||
<html>
|
||||
<script id="__NEXT_DATA__" type="application/json">
|
||||
${JSON.stringify({
|
||||
props: {
|
||||
pageProps: {
|
||||
__APOLLO_STATE__: {
|
||||
"Listing:detail": {
|
||||
url: `/${slug}`,
|
||||
title,
|
||||
price: { amount, currency: "CAD", type: "FIXED" },
|
||||
type: "OFFER",
|
||||
status: "ACTIVE",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</html>
|
||||
`;
|
||||
|
||||
global.fetch = mock((input: string | URL | Request) => {
|
||||
const url = typeof input === "string" ? input : input.toString();
|
||||
|
||||
if (url.includes("/b-buy-sell/")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(searchHtml),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-stable-one/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
listingHtml("Stable Listing One", 10000, "v-stable-one/k0l0"),
|
||||
),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-stable-two/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
listingHtml("Stable Listing Two", 11000, "v-stable-two/k0l0"),
|
||||
),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
if (url.endsWith("/v-unstable/k0l0")) {
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
listingHtml("Unstable Listing", 7000, "v-unstable/k0l0"),
|
||||
),
|
||||
headers: { get: () => null },
|
||||
url,
|
||||
});
|
||||
}
|
||||
|
||||
throw new Error(`Unexpected URL: ${url}`);
|
||||
}) as typeof fetch;
|
||||
|
||||
const results = await fetchKijijiItems(
|
||||
"phone",
|
||||
1000,
|
||||
"https://www.kijiji.ca",
|
||||
{ maxPages: 1 },
|
||||
{},
|
||||
{ hideUnstableResults: true },
|
||||
);
|
||||
|
||||
expect(results).toEqual({
|
||||
results: [
|
||||
expect.objectContaining({ title: "Stable Listing One" }),
|
||||
expect.objectContaining({ title: "Stable Listing Two" }),
|
||||
],
|
||||
unstableResults: [expect.objectContaining({ title: "Unstable Listing" })],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user