refactor: add facebook response classification

This commit is contained in:
2026-04-21 23:31:45 -04:00
parent 2617afc62f
commit b072599bc6
2 changed files with 167 additions and 13 deletions

View File

@@ -369,6 +369,45 @@ async function fetchHtml(
// ----------------------------- Parsing ----------------------------- // ----------------------------- Parsing -----------------------------
/** Coarse category assigned to a fetched Facebook Marketplace response. */
export type FacebookResponseKind =
  | "search"
  | "item"
  | "auth_gated"
  | "unavailable"
  | "unknown";

/**
 * Outcome of {@link classifyFacebookResponse}.
 *
 * `authGated` and `unavailable` are convenience flags that mirror `kind`:
 * each is `true` only for the kind of the same name.
 */
export interface FacebookResponseClassification {
  kind: FacebookResponseKind;
  authGated: boolean;
  unavailable: boolean;
}

/**
 * Classify a Facebook response from its final URL and HTML body.
 *
 * Signals are checked in priority order and the first match wins:
 * 1. `auth_gated`  - the URL path is a login page, or the body contains a
 *    login prompt ("You must log in" / "log in to continue").
 * 2. `unavailable` - the URL carries `unavailable_product=1`, or the body
 *    says the listing is no longer available / has been removed.
 * 3. `search`      - the body names `XCometMarketplaceSearchController`.
 * 4. `item`        - the body names `XCometMarketplacePermalinkController`.
 * 5. `unknown`     - none of the above.
 *
 * @param htmlString  Response body to scan for marker strings.
 * @param responseUrl URL the response was served from. Relative URLs are
 *   resolved against `https://www.facebook.com`.
 * @returns The detected kind plus the matching boolean flags.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
): FacebookResponseClassification {
  // Derive the flags from the kind so the three fields can never disagree.
  const classified = (
    kind: FacebookResponseKind,
  ): FacebookResponseClassification => ({
    kind,
    authGated: kind === "auth_gated",
    unavailable: kind === "unavailable",
  });

  // Look at URL components individually: matching substrings of the whole
  // URL lets query-string content (e.g. `?query=a/login/b`) or a longer
  // value (`unavailable_product=10`) trigger a false classification.
  let redirectedToLogin: boolean;
  let redirectedAsUnavailable: boolean;
  try {
    const { pathname, searchParams } = new URL(
      responseUrl,
      "https://www.facebook.com",
    );
    redirectedToLogin =
      pathname.includes("/login/") || pathname.endsWith("/login.php");
    redirectedAsUnavailable = searchParams.get("unavailable_product") === "1";
  } catch {
    // Unparseable URL: fall back to plain substring matching.
    redirectedToLogin = responseUrl.includes("/login/");
    redirectedAsUnavailable = responseUrl.includes("unavailable_product=1");
  }

  if (
    redirectedToLogin ||
    htmlString.includes("You must log in") ||
    htmlString.includes("log in to continue")
  ) {
    return classified("auth_gated");
  }

  if (
    redirectedAsUnavailable ||
    htmlString.includes("This listing is no longer available") ||
    htmlString.includes("listing has been removed")
  ) {
    return classified("unavailable");
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return classified("search");
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return classified("item");
  }

  return classified("unknown");
}
/** /**
Extract marketplace search data from Facebook page script tags Extract marketplace search data from Facebook page script tags
*/ */
@@ -970,25 +1009,19 @@ export async function fetchFacebookItem(
const itemData = extractFacebookItemData(itemHtml); const itemData = extractFacebookItemData(itemHtml);
if (!itemData) { if (!itemData) {
logExtractionMetrics(false, itemId); logExtractionMetrics(false, itemId);
// Enhanced checking for specific failure scenarios
if ( const classification = classifyFacebookResponse(itemHtml, itemUrl);
itemHtml.includes("This listing is no longer available") ||
itemHtml.includes("listing has been removed") || if (classification.authGated) {
itemHtml.includes("This item has been sold")
) {
console.warn( console.warn(
`Item ${itemId} appears to be sold or removed from marketplace.`, `Authentication failed for item ${itemId}. Cookies may be expired.`,
); );
return null; return null;
} }
if ( if (classification.unavailable || itemHtml.includes("This item has been sold")) {
itemHtml.includes("log in to Facebook") ||
itemHtml.includes("You must log in") ||
itemHtml.includes("authentication required")
) {
console.warn( console.warn(
`Authentication failed for item ${itemId}. Cookies may be expired.`, `Item ${itemId} appears to be sold or removed from marketplace.`,
); );
return null; return null;
} }

View File

@@ -1,5 +1,6 @@
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import { import {
classifyFacebookResponse,
ensureFacebookCookies, ensureFacebookCookies,
extractFacebookItemData, extractFacebookItemData,
extractFacebookMarketplaceData, extractFacebookMarketplaceData,
@@ -571,6 +572,126 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
const result = extractFacebookMarketplaceData(html); const result = extractFacebookMarketplaceData(html);
expect(result).toBeNull(); expect(result).toBeNull();
}); });
test("classifies Comet search responses", () => {
  // Page whose only classification signal is the search controller marker.
  const searchPage = `
<html>
<head><title>Marketplace</title></head>
<body>
<script>"XCometMarketplaceSearchController"</script>
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
</body>
</html>
`;
  const searchUrl =
    "https://www.facebook.com/marketplace/toronto/search?query=bike";

  const classification = classifyFacebookResponse(searchPage, searchUrl);

  expect(classification).toEqual({
    kind: "search",
    authGated: false,
    unavailable: false,
  });
});
test("classifies Comet item responses", () => {
  // Page whose only classification signal is the permalink controller marker.
  const itemPage = `
<html>
<body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{"routing_namespace":"fb_comet"}</script>
</body>
</html>
`;
  const itemUrl = "https://www.facebook.com/marketplace/item/123/";

  const classification = classifyFacebookResponse(itemPage, itemUrl);

  expect(classification).toEqual({
    kind: "item",
    authGated: false,
    unavailable: false,
  });
});
test("classifies login-gated responses before parsing", () => {
  // Both signals are present here: a login URL and a login prompt in the body.
  const loginPage = `<html><body>You must log in to Facebook</body></html>`;
  const loginUrl =
    "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F";

  const classification = classifyFacebookResponse(loginPage, loginUrl);

  expect(classification).toEqual({
    kind: "auth_gated",
    authGated: true,
    unavailable: false,
  });
});
test("classifies unavailable item responses", () => {
  // The body carries no marker; only the URL parameter signals unavailability.
  const genericPage = `<html><body>Marketplace</body></html>`;
  const unavailableUrl =
    "https://www.facebook.com/marketplace/toronto/?unavailable_product=1";

  const classification = classifyFacebookResponse(genericPage, unavailableUrl);

  expect(classification).toEqual({
    kind: "unavailable",
    authGated: false,
    unavailable: true,
  });
});
test("classifies unknown responses when no signal is present", () => {
  // Neither the body nor the URL contains any recognised marker.
  const unrelatedPage = `<html><body>Some random page</body></html>`;
  const marketplaceUrl = "https://www.facebook.com/marketplace/toronto/";

  const classification = classifyFacebookResponse(unrelatedPage, marketplaceUrl);

  expect(classification).toEqual({
    kind: "unknown",
    authGated: false,
    unavailable: false,
  });
});
test("does not false-positive on incidental login text", () => {
  // Footer text mentions logging in but is not one of the gating phrases.
  const pageWithFooter = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
  const searchUrl =
    "https://www.facebook.com/marketplace/toronto/search?query=bike";

  const classification = classifyFacebookResponse(pageWithFooter, searchUrl);

  expect(classification).toEqual({
    kind: "unknown",
    authGated: false,
    unavailable: false,
  });
});
test("detects auth gating from URL redirect", () => {
  // The body has no login prompt; the login URL alone must trigger the gate.
  const redirectPage = `<html><body>Redirecting...</body></html>`;
  const loginUrl =
    "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F";

  const classification = classifyFacebookResponse(redirectPage, loginUrl);

  expect(classification).toEqual({
    kind: "auth_gated",
    authGated: true,
    unavailable: false,
  });
});
}); });
}); });