diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index 6ea2132..b177dda 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -283,7 +283,7 @@ async function fetchHtml( onRateInfo?: (remaining: string | null, reset: string | null) => void; cookies?: string; }, -): Promise { +): Promise<{ html: HTMLString; responseUrl: string }> { const maxRetries = opts?.maxRetries ?? 3; const retryBaseMs = opts?.retryBaseMs ?? 500; @@ -354,7 +354,7 @@ async function fetchHtml( const html = await res.text(); // Respect per-request delay to keep at or under REQUESTS_PER_SECOND await delay(DELAY_MS); - return html; + return { html, responseUrl: res.url || url }; } catch (err) { if (attempt >= maxRetries) throw err; await delay((attempt + 1) * retryBaseMs); @@ -394,6 +394,10 @@ export function classifyFacebookResponse( return { kind: "unavailable" as const, authGated: false, unavailable: true }; } + if (responseUrl.includes("/marketplace/item/")) { + return { kind: "item" as const, authGated: false, unavailable: false }; + } + if (htmlString.includes("XCometMarketplaceSearchController")) { return { kind: "search" as const, authGated: false, unavailable: false }; } @@ -1085,8 +1089,9 @@ export default async function fetchFacebookItems( console.log(`Using ${cookies.length} cookies for authentication`); let searchHtml: string; + let searchResponseUrl = searchUrl; try { - searchHtml = await fetchHtml(searchUrl, DELAY_MS, { + const response = await fetchHtml(searchUrl, DELAY_MS, { maxRetries: 3, onRateInfo: (remaining, reset) => { if (remaining && reset) { @@ -1097,6 +1102,8 @@ export default async function fetchFacebookItems( }, cookies: cookiesHeader, }); + searchHtml = response.html; + searchResponseUrl = response.responseUrl; } catch (err) { if (err instanceof HttpError) { console.warn( @@ -1112,6 +1119,24 @@ export default async function fetchFacebookItems( throw err; } + const classification = classifyFacebookResponse(searchHtml, searchResponseUrl); + if (classification.authGated) { + console.warn("Facebook marketplace search redirected to login. Cookies may be expired."); + return []; + } + + if (classification.unavailable) { + console.warn("Facebook marketplace search returned an unavailable route."); + return []; + } + + if (classification.kind !== "search") { + console.warn( + `Facebook marketplace search returned unexpected route kind: ${classification.kind}.`, + ); + return []; + } + const ads = extractFacebookMarketplaceData(searchHtml); if (!ads || ads.length === 0) { console.warn("No ads parsed from Facebook marketplace page."); @@ -1163,8 +1188,9 @@ export async function fetchFacebookItem( console.log(`Fetching Facebook marketplace item: ${itemUrl}`); let itemHtml: string; + let itemResponseUrl = itemUrl; try { - itemHtml = await fetchHtml(itemUrl, 1000, { + const response = await fetchHtml(itemUrl, 1000, { onRateInfo: (remaining, reset) => { if (remaining && reset) { console.log( @@ -1174,6 +1200,8 @@ export async function fetchFacebookItem( }, cookies: cookiesHeader, }); + itemHtml = response.html; + itemResponseUrl = response.responseUrl; } catch (err) { if (err instanceof HttpError) { console.warn( @@ -1214,26 +1242,32 @@ export async function fetchFacebookItem( throw err; } + const classification = classifyFacebookResponse(itemHtml, itemResponseUrl); + + if (classification.authGated) { + logExtractionMetrics(false, itemId); + console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`); + return null; + } + + if (classification.unavailable || itemHtml.includes("This item has been sold")) { + logExtractionMetrics(false, itemId); + console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`); + return null; + } + + if (classification.kind !== "item") { + logExtractionMetrics(false, itemId); + console.warn( + `Item ${itemId} returned unexpected route kind: ${classification.kind}.`, + ); + return null; + } + const itemData = extractFacebookItemData(itemHtml); if (!itemData) { logExtractionMetrics(false, itemId); - const classification = classifyFacebookResponse(itemHtml, itemUrl); - - if (classification.authGated) { - console.warn( - `Authentication failed for item ${itemId}. Cookies may be expired.`, - ); - return null; - } - - if (classification.unavailable || itemHtml.includes("This item has been sold")) { - console.warn( - `Item ${itemId} appears to be sold or removed from marketplace.`, - ); - return null; - } - console.warn( `No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`, ); diff --git a/packages/core/test/facebook-integration.test.ts b/packages/core/test/facebook-integration.test.ts index 22c7c60..df24bd0 100644 --- a/packages/core/test/facebook-integration.test.ts +++ b/packages/core/test/facebook-integration.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; -import fetchFacebookItems from "../src/scrapers/facebook"; +import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook"; // Mock fetch globally const originalFetch = global.fetch; @@ -125,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -180,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -221,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -254,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { expect(results).toEqual([]); }); + test("should return empty array for auth-gated search HTML", async () => { + const authGatedSearchHtml = ` + + + + + Vintage Lamp + CA$45 + Toronto, ON + + + + `; + + global.fetch = mock(() => + Promise.resolve({ + ok: true, + url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch", + text: () => Promise.resolve(authGatedSearchHtml), + headers: { + get: () => null, + }, + }), + ); + + const results = await fetchFacebookItems("lamp", 1, "toronto", 25); + expect(results).toEqual([]); + }); + + test("should return empty array when search request lands on unknown route", async () => { + const wrongRouteHtml = ``; + + global.fetch = mock(() => + Promise.resolve({ + ok: true, + url: "https://www.facebook.com/marketplace/toronto/", + text: () => Promise.resolve(wrongRouteHtml), + headers: { + get: () => null, + }, + }), + ); + + const results = await fetchFacebookItems("lamp", 1, "toronto", 25); + expect(results).toEqual([]); + }); + test("should handle network errors", async () => { global.fetch = mock(() => Promise.reject(new Error("Network error"))); @@ -320,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -393,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -462,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -533,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { ok: true, text: () => Promise.resolve( - ``, + ``, ), headers: { get: () => null, @@ -599,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => { expect(results).toEqual([]); }); }); + + describe("Item Fetch Function", () => { + test("should return null for unavailable item responses", async () => { + const unavailableItemHtml = ` + + + + + + `; + + global.fetch = mock(() => + Promise.resolve({ + ok: true, + url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1", + text: () => Promise.resolve(unavailableItemHtml), + headers: { + get: () => null, + }, + }), + ); + + const result = await fetchFacebookItem("123"); + expect(result).toBeNull(); + }); + }); });