refactor: handle facebook route-aware failure states
This commit is contained in:
@@ -283,7 +283,7 @@ async function fetchHtml(
|
|||||||
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
||||||
cookies?: string;
|
cookies?: string;
|
||||||
},
|
},
|
||||||
): Promise<HTMLString> {
|
): Promise<{ html: HTMLString; responseUrl: string }> {
|
||||||
const maxRetries = opts?.maxRetries ?? 3;
|
const maxRetries = opts?.maxRetries ?? 3;
|
||||||
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
||||||
|
|
||||||
@@ -354,7 +354,7 @@ async function fetchHtml(
|
|||||||
const html = await res.text();
|
const html = await res.text();
|
||||||
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
||||||
await delay(DELAY_MS);
|
await delay(DELAY_MS);
|
||||||
return html;
|
return { html, responseUrl: res.url || url };
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (attempt >= maxRetries) throw err;
|
if (attempt >= maxRetries) throw err;
|
||||||
await delay((attempt + 1) * retryBaseMs);
|
await delay((attempt + 1) * retryBaseMs);
|
||||||
@@ -394,6 +394,10 @@ export function classifyFacebookResponse(
|
|||||||
return { kind: "unavailable" as const, authGated: false, unavailable: true };
|
return { kind: "unavailable" as const, authGated: false, unavailable: true };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (responseUrl.includes("/marketplace/item/")) {
|
||||||
|
return { kind: "item" as const, authGated: false, unavailable: false };
|
||||||
|
}
|
||||||
|
|
||||||
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||||
return { kind: "search" as const, authGated: false, unavailable: false };
|
return { kind: "search" as const, authGated: false, unavailable: false };
|
||||||
}
|
}
|
||||||
@@ -1085,8 +1089,9 @@ export default async function fetchFacebookItems(
|
|||||||
console.log(`Using ${cookies.length} cookies for authentication`);
|
console.log(`Using ${cookies.length} cookies for authentication`);
|
||||||
|
|
||||||
let searchHtml: string;
|
let searchHtml: string;
|
||||||
|
let searchResponseUrl = searchUrl;
|
||||||
try {
|
try {
|
||||||
searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
const response = await fetchHtml(searchUrl, DELAY_MS, {
|
||||||
maxRetries: 3,
|
maxRetries: 3,
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
@@ -1097,6 +1102,8 @@ export default async function fetchFacebookItems(
|
|||||||
},
|
},
|
||||||
cookies: cookiesHeader,
|
cookies: cookiesHeader,
|
||||||
});
|
});
|
||||||
|
searchHtml = response.html;
|
||||||
|
searchResponseUrl = response.responseUrl;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -1112,6 +1119,24 @@ export default async function fetchFacebookItems(
|
|||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
|
||||||
|
if (classification.authGated) {
|
||||||
|
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.unavailable) {
|
||||||
|
console.warn("Facebook marketplace search returned an unavailable route.");
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.kind !== "search") {
|
||||||
|
console.warn(
|
||||||
|
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
|
||||||
|
);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
const ads = extractFacebookMarketplaceData(searchHtml);
|
const ads = extractFacebookMarketplaceData(searchHtml);
|
||||||
if (!ads || ads.length === 0) {
|
if (!ads || ads.length === 0) {
|
||||||
console.warn("No ads parsed from Facebook marketplace page.");
|
console.warn("No ads parsed from Facebook marketplace page.");
|
||||||
@@ -1163,8 +1188,9 @@ export async function fetchFacebookItem(
|
|||||||
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
|
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
|
||||||
|
|
||||||
let itemHtml: string;
|
let itemHtml: string;
|
||||||
|
let itemResponseUrl = itemUrl;
|
||||||
try {
|
try {
|
||||||
itemHtml = await fetchHtml(itemUrl, 1000, {
|
const response = await fetchHtml(itemUrl, 1000, {
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
console.log(
|
console.log(
|
||||||
@@ -1174,6 +1200,8 @@ export async function fetchFacebookItem(
|
|||||||
},
|
},
|
||||||
cookies: cookiesHeader,
|
cookies: cookiesHeader,
|
||||||
});
|
});
|
||||||
|
itemHtml = response.html;
|
||||||
|
itemResponseUrl = response.responseUrl;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -1214,26 +1242,32 @@ export async function fetchFacebookItem(
|
|||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
|
||||||
|
|
||||||
|
if (classification.authGated) {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.kind !== "item") {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(
|
||||||
|
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const itemData = extractFacebookItemData(itemHtml);
|
const itemData = extractFacebookItemData(itemHtml);
|
||||||
if (!itemData) {
|
if (!itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
|
|
||||||
const classification = classifyFacebookResponse(itemHtml, itemUrl);
|
|
||||||
|
|
||||||
if (classification.authGated) {
|
|
||||||
console.warn(
|
|
||||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
|
||||||
console.warn(
|
|
||||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
console.warn(
|
console.warn(
|
||||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||||
import fetchFacebookItems from "../src/scrapers/facebook";
|
import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook";
|
||||||
|
|
||||||
// Mock fetch globally
|
// Mock fetch globally
|
||||||
const originalFetch = global.fetch;
|
const originalFetch = global.fetch;
|
||||||
@@ -125,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -180,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -221,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -254,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
expect(results).toEqual([]);
|
expect(results).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should return empty array for auth-gated search HTML", async () => {
|
||||||
|
const authGatedSearchHtml = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<a href="/marketplace/item/123456789/">
|
||||||
|
<span>Vintage Lamp</span>
|
||||||
|
<span>CA$45</span>
|
||||||
|
<span>Toronto, ON</span>
|
||||||
|
</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
|
||||||
|
text: () => Promise.resolve(authGatedSearchHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||||
|
expect(results).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("should return empty array when search request lands on unknown route", async () => {
|
||||||
|
const wrongRouteHtml = `<html><body><script>${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Leaked Search Result",
|
||||||
|
listing_price: {
|
||||||
|
amount: "75.00",
|
||||||
|
formatted_amount: "CA$75",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}</script></body></html>`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/marketplace/toronto/",
|
||||||
|
text: () => Promise.resolve(wrongRouteHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||||
|
expect(results).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle network errors", async () => {
|
test("should handle network errors", async () => {
|
||||||
global.fetch = mock(() => Promise.reject(new Error("Network error")));
|
global.fetch = mock(() => Promise.reject(new Error("Network error")));
|
||||||
|
|
||||||
@@ -320,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -393,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -462,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -533,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -599,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
expect(results).toEqual([]);
|
expect(results).toEqual([]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("Item Fetch Function", () => {
|
||||||
|
test("should return null for unavailable item responses", async () => {
|
||||||
|
const unavailableItemHtml = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
listing: {
|
||||||
|
id: "related-123",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Related Listing",
|
||||||
|
formatted_price: { text: "CA$90" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "90.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "90.00",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||||
|
text: () => Promise.resolve(unavailableItemHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await fetchFacebookItem("123");
|
||||||
|
expect(result).toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user