refactor: add facebook response classification
This commit is contained in:
@@ -369,6 +369,45 @@ async function fetchHtml(
|
|||||||
|
|
||||||
// ----------------------------- Parsing -----------------------------
|
// ----------------------------- Parsing -----------------------------
|
||||||
|
|
||||||
|
/** Coarse category assigned to a fetched Facebook Marketplace response. */
export type FacebookResponseKind =
  | "search"
  | "item"
  | "auth_gated"
  | "unavailable"
  | "unknown";

/**
 * Result of {@link classifyFacebookResponse}.
 *
 * `authGated` and `unavailable` are convenience flags that mirror `kind`:
 * `authGated` is true only for `"auth_gated"` and `unavailable` is true only
 * for `"unavailable"`.
 */
export type FacebookResponseClassification = {
  kind: FacebookResponseKind;
  authGated: boolean;
  unavailable: boolean;
};

/**
 * Classify a Facebook response from its HTML body and its response URL.
 *
 * Checks run in a fixed order and the first match wins:
 *   1. `auth_gated`  - the URL path is a login path, or the HTML contains
 *                      "You must log in" / "log in to continue".
 *   2. `unavailable` - the URL carries `unavailable_product=1`, or the HTML
 *                      contains one of the "listing removed" phrases.
 *   3. `search`      - the HTML mentions `XCometMarketplaceSearchController`.
 *   4. `item`        - the HTML mentions `XCometMarketplacePermalinkController`.
 *   5. `unknown`     - none of the above.
 *
 * HTML phrase matching is case-sensitive and substring based.
 *
 * @param htmlString  Response body to inspect.
 * @param responseUrl URL the response was served from; absolute or
 *                    site-relative.
 * @returns The detected kind plus the `authGated` / `unavailable` flags.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
): FacebookResponseClassification {
  let loginRedirect: boolean;
  let unavailableRedirect: boolean;
  try {
    // Match URL markers against the parsed path and query parameters so that
    // "/login/" inside a query string or fragment is not mistaken for a login
    // redirect, and "unavailable_product=10" is not mistaken for "=1".
    // The base only matters for site-relative URLs.
    const url = new URL(responseUrl, "https://www.facebook.com");
    // Accepts "/login", "/login/", "/login.php" and nested ".../login/..." paths.
    loginRedirect = /\/login(?:\.php)?(?:\/|$)/.test(url.pathname);
    unavailableRedirect = url.searchParams.get("unavailable_product") === "1";
  } catch {
    // Unparsable URL: fall back to plain substring checks on the raw string.
    loginRedirect = responseUrl.includes("/login/");
    unavailableRedirect = responseUrl.includes("unavailable_product=1");
  }

  const authGated =
    loginRedirect ||
    htmlString.includes("You must log in") ||
    htmlString.includes("log in to continue");
  if (authGated) {
    return { kind: "auth_gated", authGated: true, unavailable: false };
  }

  const unavailable =
    unavailableRedirect ||
    htmlString.includes("This listing is no longer available") ||
    htmlString.includes("listing has been removed");
  if (unavailable) {
    return { kind: "unavailable", authGated: false, unavailable: true };
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return { kind: "search", authGated: false, unavailable: false };
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return { kind: "item", authGated: false, unavailable: false };
  }

  return { kind: "unknown", authGated: false, unavailable: false };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace search data from Facebook page script tags
|
Extract marketplace search data from Facebook page script tags
|
||||||
*/
|
*/
|
||||||
@@ -970,25 +1009,19 @@ export async function fetchFacebookItem(
|
|||||||
const itemData = extractFacebookItemData(itemHtml);
|
const itemData = extractFacebookItemData(itemHtml);
|
||||||
if (!itemData) {
|
if (!itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
// Enhanced checking for specific failure scenarios
|
|
||||||
if (
|
const classification = classifyFacebookResponse(itemHtml, itemUrl);
|
||||||
itemHtml.includes("This listing is no longer available") ||
|
|
||||||
itemHtml.includes("listing has been removed") ||
|
if (classification.authGated) {
|
||||||
itemHtml.includes("This item has been sold")
|
|
||||||
) {
|
|
||||||
console.warn(
|
console.warn(
|
||||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
||||||
);
|
);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||||
itemHtml.includes("log in to Facebook") ||
|
|
||||||
itemHtml.includes("You must log in") ||
|
|
||||||
itemHtml.includes("authentication required")
|
|
||||||
) {
|
|
||||||
console.warn(
|
console.warn(
|
||||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
||||||
);
|
);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||||
import {
|
import {
|
||||||
|
classifyFacebookResponse,
|
||||||
ensureFacebookCookies,
|
ensureFacebookCookies,
|
||||||
extractFacebookItemData,
|
extractFacebookItemData,
|
||||||
extractFacebookMarketplaceData,
|
extractFacebookMarketplaceData,
|
||||||
@@ -571,6 +572,126 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
const result = extractFacebookMarketplaceData(html);
|
const result = extractFacebookMarketplaceData(html);
|
||||||
expect(result).toBeNull();
|
expect(result).toBeNull();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("classifies Comet search responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head><title>Marketplace</title></head>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "search",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies Comet item responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/item/123/",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "item",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies login-gated responses before parsing", () => {
|
||||||
|
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "auth_gated",
|
||||||
|
authGated: true,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies unavailable item responses", () => {
|
||||||
|
const html = `<html><body>Marketplace</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unavailable",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: true,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies unknown responses when no signal is present", () => {
|
||||||
|
const html = `<html><body>Some random page</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unknown",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("does not false-positive on incidental login text", () => {
|
||||||
|
const html = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unknown",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("detects auth gating from URL redirect", () => {
|
||||||
|
const html = `<html><body>Redirecting...</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "auth_gated",
|
||||||
|
authGated: true,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user