From b072599bc63798292b497272e1cb5c4da241e03f Mon Sep 17 00:00:00 2001 From: Dmytro Stanchiev Date: Tue, 21 Apr 2026 23:31:45 -0400 Subject: [PATCH] refactor: add facebook response classification --- packages/core/src/scrapers/facebook.ts | 59 ++++++++--- packages/core/test/facebook-core.test.ts | 121 +++++++++++++++++++++++ 2 files changed, 167 insertions(+), 13 deletions(-) diff --git a/packages/core/src/scrapers/facebook.ts b/packages/core/src/scrapers/facebook.ts index d331471..6eca4cb 100644 --- a/packages/core/src/scrapers/facebook.ts +++ b/packages/core/src/scrapers/facebook.ts @@ -369,6 +369,45 @@ async function fetchHtml( // ----------------------------- Parsing ----------------------------- +export type FacebookResponseKind = + | "search" + | "item" + | "auth_gated" + | "unavailable" + | "unknown"; + +export function classifyFacebookResponse( + htmlString: HTMLString, + responseUrl: string, +) { + const authGated = + responseUrl.includes("/login/") || + htmlString.includes("You must log in") || + htmlString.includes("log in to continue"); + + if (authGated) { + return { kind: "auth_gated" as const, authGated: true, unavailable: false }; + } + + const unavailable = + responseUrl.includes("unavailable_product=1") || + htmlString.includes("This listing is no longer available") || + htmlString.includes("listing has been removed"); + if (unavailable) { + return { kind: "unavailable" as const, authGated: false, unavailable: true }; + } + + if (htmlString.includes("XCometMarketplaceSearchController")) { + return { kind: "search" as const, authGated: false, unavailable: false }; + } + + if (htmlString.includes("XCometMarketplacePermalinkController")) { + return { kind: "item" as const, authGated: false, unavailable: false }; + } + + return { kind: "unknown" as const, authGated: false, unavailable: false }; +} + /** Extract marketplace search data from Facebook page script tags */ @@ -970,25 +1009,19 @@ export async function fetchFacebookItem( const itemData = extractFacebookItemData(itemHtml); if (!itemData) { logExtractionMetrics(false, itemId); - // Enhanced checking for specific failure scenarios - if ( - itemHtml.includes("This listing is no longer available") || - itemHtml.includes("listing has been removed") || - itemHtml.includes("This item has been sold") - ) { + + const classification = classifyFacebookResponse(itemHtml, itemUrl); + + if (classification.authGated) { console.warn( - `Item ${itemId} appears to be sold or removed from marketplace.`, + `Authentication failed for item ${itemId}. Cookies may be expired.`, ); return null; } - if ( - itemHtml.includes("log in to Facebook") || - itemHtml.includes("You must log in") || - itemHtml.includes("authentication required") - ) { + if (classification.unavailable || itemHtml.includes("This item has been sold")) { console.warn( - `Authentication failed for item ${itemId}. Cookies may be expired.`, + `Item ${itemId} appears to be sold or removed from marketplace.`, ); return null; } diff --git a/packages/core/test/facebook-core.test.ts b/packages/core/test/facebook-core.test.ts index ea8a222..82bdd20 100644 --- a/packages/core/test/facebook-core.test.ts +++ b/packages/core/test/facebook-core.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; import { + classifyFacebookResponse, ensureFacebookCookies, extractFacebookItemData, extractFacebookMarketplaceData, @@ -571,6 +572,126 @@ describe("Facebook Marketplace Scraper Core Tests", () => { const result = extractFacebookMarketplaceData(html); expect(result).toBeNull(); }); + + test("classifies Comet search responses", () => { + const html = ` + + Marketplace + + + + + + `; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/marketplace/toronto/search?query=bike", + ), + ).toEqual({ + kind: "search", + authGated: false, + unavailable: false, + }); + }); + + test("classifies Comet item responses", () => { + const html = ` + + + + + + + `; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/marketplace/item/123/", + ), + ).toEqual({ + kind: "item", + authGated: false, + unavailable: false, + }); + }); + + test("classifies login-gated responses before parsing", () => { + const html = `You must log in to Facebook`; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F", + ), + ).toEqual({ + kind: "auth_gated", + authGated: true, + unavailable: false, + }); + }); + + test("classifies unavailable item responses", () => { + const html = `Marketplace`; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/marketplace/toronto/?unavailable_product=1", + ), + ).toEqual({ + kind: "unavailable", + authGated: false, + unavailable: true, + }); + }); + + test("classifies unknown responses when no signal is present", () => { + const html = `Some random page`; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/marketplace/toronto/", + ), + ).toEqual({ + kind: "unknown", + authGated: false, + unavailable: false, + }); + }); + + test("does not false-positive on incidental login text", () => { + const html = ``; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/marketplace/toronto/search?query=bike", + ), + ).toEqual({ + kind: "unknown", + authGated: false, + unavailable: false, + }); + }); + + test("detects auth gating from URL redirect", () => { + const html = `Redirecting...`; + + expect( + classifyFacebookResponse( + html, + "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F", + ), + ).toEqual({ + kind: "auth_gated", + authGated: true, + unavailable: false, + }); + }); }); });