refactor: handle facebook route-aware failure states

This commit is contained in:
2026-04-22 11:48:47 -04:00
parent 7ddc96dfdf
commit 9070f76412
2 changed files with 173 additions and 28 deletions

View File

@@ -283,7 +283,7 @@ async function fetchHtml(
onRateInfo?: (remaining: string | null, reset: string | null) => void; onRateInfo?: (remaining: string | null, reset: string | null) => void;
cookies?: string; cookies?: string;
}, },
): Promise<HTMLString> { ): Promise<{ html: HTMLString; responseUrl: string }> {
const maxRetries = opts?.maxRetries ?? 3; const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 500; const retryBaseMs = opts?.retryBaseMs ?? 500;
@@ -354,7 +354,7 @@ async function fetchHtml(
const html = await res.text(); const html = await res.text();
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND // Respect per-request delay to keep at or under REQUESTS_PER_SECOND
await delay(DELAY_MS); await delay(DELAY_MS);
return html; return { html, responseUrl: res.url || url };
} catch (err) { } catch (err) {
if (attempt >= maxRetries) throw err; if (attempt >= maxRetries) throw err;
await delay((attempt + 1) * retryBaseMs); await delay((attempt + 1) * retryBaseMs);
@@ -394,6 +394,10 @@ export function classifyFacebookResponse(
return { kind: "unavailable" as const, authGated: false, unavailable: true }; return { kind: "unavailable" as const, authGated: false, unavailable: true };
} }
if (responseUrl.includes("/marketplace/item/")) {
return { kind: "item" as const, authGated: false, unavailable: false };
}
if (htmlString.includes("XCometMarketplaceSearchController")) { if (htmlString.includes("XCometMarketplaceSearchController")) {
return { kind: "search" as const, authGated: false, unavailable: false }; return { kind: "search" as const, authGated: false, unavailable: false };
} }
@@ -1085,8 +1089,9 @@ export default async function fetchFacebookItems(
console.log(`Using ${cookies.length} cookies for authentication`); console.log(`Using ${cookies.length} cookies for authentication`);
let searchHtml: string; let searchHtml: string;
let searchResponseUrl = searchUrl;
try { try {
searchHtml = await fetchHtml(searchUrl, DELAY_MS, { const response = await fetchHtml(searchUrl, DELAY_MS, {
maxRetries: 3, maxRetries: 3,
onRateInfo: (remaining, reset) => { onRateInfo: (remaining, reset) => {
if (remaining && reset) { if (remaining && reset) {
@@ -1097,6 +1102,8 @@ export default async function fetchFacebookItems(
}, },
cookies: cookiesHeader, cookies: cookiesHeader,
}); });
searchHtml = response.html;
searchResponseUrl = response.responseUrl;
} catch (err) { } catch (err) {
if (err instanceof HttpError) { if (err instanceof HttpError) {
console.warn( console.warn(
@@ -1112,6 +1119,24 @@ export default async function fetchFacebookItems(
throw err; throw err;
} }
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
if (classification.authGated) {
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
return [];
}
if (classification.unavailable) {
console.warn("Facebook marketplace search returned an unavailable route.");
return [];
}
if (classification.kind !== "search") {
console.warn(
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
);
return [];
}
const ads = extractFacebookMarketplaceData(searchHtml); const ads = extractFacebookMarketplaceData(searchHtml);
if (!ads || ads.length === 0) { if (!ads || ads.length === 0) {
console.warn("No ads parsed from Facebook marketplace page."); console.warn("No ads parsed from Facebook marketplace page.");
@@ -1163,8 +1188,9 @@ export async function fetchFacebookItem(
console.log(`Fetching Facebook marketplace item: ${itemUrl}`); console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
let itemHtml: string; let itemHtml: string;
let itemResponseUrl = itemUrl;
try { try {
itemHtml = await fetchHtml(itemUrl, 1000, { const response = await fetchHtml(itemUrl, 1000, {
onRateInfo: (remaining, reset) => { onRateInfo: (remaining, reset) => {
if (remaining && reset) { if (remaining && reset) {
console.log( console.log(
@@ -1174,6 +1200,8 @@ export async function fetchFacebookItem(
}, },
cookies: cookiesHeader, cookies: cookiesHeader,
}); });
itemHtml = response.html;
itemResponseUrl = response.responseUrl;
} catch (err) { } catch (err) {
if (err instanceof HttpError) { if (err instanceof HttpError) {
console.warn( console.warn(
@@ -1214,26 +1242,32 @@ export async function fetchFacebookItem(
throw err; throw err;
} }
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
if (classification.authGated) {
logExtractionMetrics(false, itemId);
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
return null;
}
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
logExtractionMetrics(false, itemId);
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
return null;
}
if (classification.kind !== "item") {
logExtractionMetrics(false, itemId);
console.warn(
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
);
return null;
}
const itemData = extractFacebookItemData(itemHtml); const itemData = extractFacebookItemData(itemHtml);
if (!itemData) { if (!itemData) {
logExtractionMetrics(false, itemId); logExtractionMetrics(false, itemId);
const classification = classifyFacebookResponse(itemHtml, itemUrl);
if (classification.authGated) {
console.warn(
`Authentication failed for item ${itemId}. Cookies may be expired.`,
);
return null;
}
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
console.warn(
`Item ${itemId} appears to be sold or removed from marketplace.`,
);
return null;
}
console.warn( console.warn(
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`, `No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
); );

View File

@@ -1,5 +1,5 @@
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import fetchFacebookItems from "../src/scrapers/facebook"; import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook";
// Mock fetch globally // Mock fetch globally
const originalFetch = global.fetch; const originalFetch = global.fetch;
@@ -125,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -180,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -221,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -254,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
expect(results).toEqual([]); expect(results).toEqual([]);
}); });
// Regression test: the mocked search response reports a Facebook login URL as
// its final URL while the body still carries the search-controller marker and
// a marketplace item link. The scraper is expected to return no listings.
// NOTE(review): presumably the login URL is what marks the response as
// auth-gated; the classifier's auth branch is not visible here — confirm.
test("should return empty array for auth-gated search HTML", async () => {
  // Joined with "\n"; the leading and trailing empty entries reproduce the
  // newline that opens and closes the original multi-line markup.
  const loginPageMarkup = [
    "",
    "<html>",
    "<body>",
    '<script>"XCometMarketplaceSearchController"</script>',
    '<a href="/marketplace/item/123456789/">',
    "<span>Vintage Lamp</span>",
    "<span>CA$45</span>",
    "<span>Toronto, ON</span>",
    "</a>",
    "</body>",
    "</html>",
    "",
  ].join("\n");

  // Build a fresh response object on every call to the mocked fetch.
  const buildLoginRedirect = () => ({
    ok: true,
    url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
    text: async () => loginPageMarkup,
    headers: {
      get: () => null,
    },
  });
  global.fetch = mock(async () => buildLoginRedirect());

  const listings = await fetchFacebookItems("lamp", 1, "toronto", 25);

  expect(listings).toEqual([]);
});
// Regression test: the mocked response lands on a marketplace URL and embeds a
// parseable listing payload, but the body does not contain the
// "XCometMarketplaceSearchController" marker. The scraper is expected to
// discard the payload and return no listings.
test("should return empty array when search request lands on unknown route", async () => {
  // A listing that would look valid if the page were parsed as search results.
  const leakedListing = {
    id: "1",
    marketplace_listing_title: "Leaked Search Result",
    listing_price: {
      amount: "75.00",
      formatted_amount: "CA$75",
      currency: "CAD",
    },
    is_live: true,
  };
  const embeddedPayload = JSON.stringify({
    payload: {
      resultGroups: [
        {
          edges: [{ node: { listing: leakedListing } }],
        },
      ],
    },
  });
  const wrongRouteHtml =
    "<html><body><script>" + embeddedPayload + "</script></body></html>";

  // Build a fresh response object on every call to the mocked fetch.
  const buildWrongRouteResponse = () => ({
    ok: true,
    url: "https://www.facebook.com/marketplace/toronto/",
    text: async () => wrongRouteHtml,
    headers: {
      get: () => null,
    },
  });
  global.fetch = mock(async () => buildWrongRouteResponse());

  const listings = await fetchFacebookItems("lamp", 1, "toronto", 25);

  expect(listings).toEqual([]);
});
test("should handle network errors", async () => { test("should handle network errors", async () => {
global.fetch = mock(() => Promise.reject(new Error("Network error"))); global.fetch = mock(() => Promise.reject(new Error("Network error")));
@@ -320,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -393,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -462,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -533,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true, ok: true,
text: () => text: () =>
Promise.resolve( Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`, `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
), ),
headers: { headers: {
get: () => null, get: () => null,
@@ -599,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
expect(results).toEqual([]); expect(results).toEqual([]);
}); });
}); });
// Tests for the single-item entry point, fetchFacebookItem.
describe("Item Fetch Function", () => {
  // The mocked response's final URL carries `unavailable_product=1` while the
  // body embeds a different, live listing. The scraper is expected to return
  // null rather than the embedded listing.
  // NOTE(review): presumably the `unavailable_product` query parameter is what
  // marks the route as unavailable; that branch is not visible here — confirm.
  test("should return null for unavailable item responses", async () => {
    const relatedListing = {
      id: "related-123",
      __typename: "GroupCommerceProductItem",
      marketplace_listing_title: "Related Listing",
      formatted_price: { text: "CA$90" },
      listing_price: {
        amount: "90.00",
        currency: "CAD",
        amount_with_offset: "90.00",
      },
      is_live: true,
    };
    const embeddedPayload = JSON.stringify({
      payload: {
        listing: relatedListing,
      },
    });
    // Joined with "\n"; the leading and trailing empty entries reproduce the
    // newline that opens and closes the original multi-line markup.
    const unavailableItemHtml = [
      "",
      "<html>",
      "<body>",
      "<script>" + embeddedPayload + "</script>",
      "</body>",
      "</html>",
      "",
    ].join("\n");

    // Build a fresh response object on every call to the mocked fetch.
    const buildUnavailableResponse = () => ({
      ok: true,
      url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
      text: async () => unavailableItemHtml,
      headers: {
        get: () => null,
      },
    });
    global.fetch = mock(async () => buildUnavailableResponse());

    const item = await fetchFacebookItem("123");

    expect(item).toBeNull();
  });
});
}); });