refactor: add facebook response classification
This commit is contained in:
@@ -369,6 +369,45 @@ async function fetchHtml(
|
||||
|
||||
// ----------------------------- Parsing -----------------------------
|
||||
|
||||
/** Coarse category assigned to a fetched Facebook page by {@link classifyFacebookResponse}. */
export type FacebookResponseKind =
  | "search"
  | "item"
  | "auth_gated"
  | "unavailable"
  | "unknown";

/**
 * Result of {@link classifyFacebookResponse}.
 *
 * `authGated` and `unavailable` mirror `kind`: each is `true` only when `kind`
 * is `"auth_gated"` or `"unavailable"` respectively.
 */
export interface FacebookResponseClassification {
  kind: FacebookResponseKind;
  authGated: boolean;
  unavailable: boolean;
}

/**
 * Split a URL string into the part before the query string and its parsed
 * query parameters, ignoring any `#fragment`.
 *
 * Done with plain string operations so relative or malformed URLs never throw.
 */
function splitResponseUrl(responseUrl: string): {
  path: string;
  query: URLSearchParams;
} {
  const hashAt = responseUrl.indexOf("#");
  const withoutFragment =
    hashAt === -1 ? responseUrl : responseUrl.slice(0, hashAt);

  const queryAt = withoutFragment.indexOf("?");
  if (queryAt === -1) {
    return { path: withoutFragment, query: new URLSearchParams() };
  }

  return {
    path: withoutFragment.slice(0, queryAt),
    query: new URLSearchParams(withoutFragment.slice(queryAt + 1)),
  };
}

/**
 * Classify a Facebook response from its HTML and URL using marker strings.
 *
 * Checks run in a fixed order and the first match wins:
 * 1. auth gate: `/login/` in the URL path, or one of the login-wall phrases in the HTML;
 * 2. unavailable listing: `unavailable_product=1` query parameter, or one of the
 *    "no longer available" / "has been removed" phrases in the HTML;
 * 3. search page: HTML mentions `XCometMarketplaceSearchController`;
 * 4. item page: HTML mentions `XCometMarketplacePermalinkController`;
 * 5. otherwise `"unknown"`.
 *
 * @param htmlString - Raw HTML of the response body.
 * @param responseUrl - URL associated with the response. Only its path is searched
 *   for `/login/`, and only its query parameters are searched for
 *   `unavailable_product`, so marker text embedded in another parameter's value
 *   or in the fragment does not trigger a match.
 * @returns The detected kind plus boolean flags mirroring the two failure kinds.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
): FacebookResponseClassification {
  const { path, query } = splitResponseUrl(responseUrl);

  // Auth gating is evaluated first so a login wall wins over any other marker.
  const authGated =
    path.includes("/login/") ||
    htmlString.includes("You must log in") ||
    htmlString.includes("log in to continue");

  if (authGated) {
    return { kind: "auth_gated", authGated: true, unavailable: false };
  }

  // Exact parameter comparison: `unavailable_product=10` must not count as `=1`.
  const unavailable =
    query.get("unavailable_product") === "1" ||
    htmlString.includes("This listing is no longer available") ||
    htmlString.includes("listing has been removed");

  if (unavailable) {
    return { kind: "unavailable", authGated: false, unavailable: true };
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return { kind: "search", authGated: false, unavailable: false };
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return { kind: "item", authGated: false, unavailable: false };
  }

  return { kind: "unknown", authGated: false, unavailable: false };
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
@@ -970,25 +1009,19 @@ export async function fetchFacebookItem(
|
||||
const itemData = extractFacebookItemData(itemHtml);
|
||||
if (!itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
// Enhanced checking for specific failure scenarios
|
||||
if (
|
||||
itemHtml.includes("This listing is no longer available") ||
|
||||
itemHtml.includes("listing has been removed") ||
|
||||
itemHtml.includes("This item has been sold")
|
||||
) {
|
||||
|
||||
const classification = classifyFacebookResponse(itemHtml, itemUrl);
|
||||
|
||||
if (classification.authGated) {
|
||||
console.warn(
|
||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
itemHtml.includes("log in to Facebook") ||
|
||||
itemHtml.includes("You must log in") ||
|
||||
itemHtml.includes("authentication required")
|
||||
) {
|
||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||
console.warn(
|
||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||
import {
|
||||
classifyFacebookResponse,
|
||||
ensureFacebookCookies,
|
||||
extractFacebookItemData,
|
||||
extractFacebookMarketplaceData,
|
||||
@@ -571,6 +572,126 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
const result = extractFacebookMarketplaceData(html);
|
||||
expect(result).toBeNull();
|
||||
});
|
||||
|
||||
test("classifies Comet search responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head><title>Marketplace</title></head>
|
||||
<body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "search",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies Comet item responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{"routing_namespace":"fb_comet"}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/item/123/",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "item",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies login-gated responses before parsing", () => {
|
||||
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "auth_gated",
|
||||
authGated: true,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies unavailable item responses", () => {
|
||||
const html = `<html><body>Marketplace</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unavailable",
|
||||
authGated: false,
|
||||
unavailable: true,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies unknown responses when no signal is present", () => {
|
||||
const html = `<html><body>Some random page</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unknown",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("does not false-positive on incidental login text", () => {
|
||||
const html = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unknown",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("detects auth gating from URL redirect", () => {
|
||||
const html = `<html><body>Redirecting...</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "auth_gated",
|
||||
authGated: true,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user