Compare commits
8 Commits
45cff20377
...
9070f76412
| Author | SHA1 | Date | |
|---|---|---|---|
| 9070f76412 | |||
| 7ddc96dfdf | |||
| 63ca006696 | |||
| c90ee54cc1 | |||
| cfd7619737 | |||
| b072599bc6 | |||
| 2617afc62f | |||
| ba889a1f9d |
772
docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md
Normal file
772
docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md
Normal file
@@ -0,0 +1,772 @@
|
||||
# Facebook Comet Rewrite Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Replace the legacy Facebook Marketplace scraper with a route-aware hybrid Comet-bootstrap parser for both search and item routes.
|
||||
|
||||
**Architecture:** Keep authenticated direct HTTP fetches as the transport. Classify each Facebook response first, then parse route-specific Comet bootstrap/state candidates, and fall back to rendered-HTML extraction only when bootstrap decoding cannot produce the expected search or item shape.
|
||||
|
||||
**Tech Stack:** Bun, TypeScript, `bun:test`, `linkedom`, existing shared cookie/http helpers
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Owns Facebook fetch flow, response classification, bootstrap candidate extraction, search parsing, item parsing, and HTML fallbacks.
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Owns unit coverage for response classification, bootstrap parsing, fallback parsing, and route-aware item/search extraction behavior.
|
||||
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||
- Owns higher-level fetch flow tests, auth/degradation behavior, and result shaping for search/item entrypoints.
|
||||
|
||||
### Task 1: Add Route Classification Coverage
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-core.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add these tests near the Facebook parser tests in `packages/core/test/facebook-core.test.ts`:
|
||||
|
||||
```ts
|
||||
test("classifies Comet search responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head><title>Marketplace</title></head>
|
||||
<body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/search?query=bike")).toEqual({
|
||||
kind: "search",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies Comet item responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{"routing_namespace":"fb_comet"}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/item/123/")).toEqual({
|
||||
kind: "item",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies login-gated responses before parsing", () => {
|
||||
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||
|
||||
expect(classifyFacebookResponse(html, "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F")).toEqual({
|
||||
kind: "auth_gated",
|
||||
authGated: true,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies unavailable item responses", () => {
|
||||
const html = `<html><body>Marketplace</body></html>`;
|
||||
|
||||
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/?unavailable_product=1")).toEqual({
|
||||
kind: "unavailable",
|
||||
authGated: false,
|
||||
unavailable: true,
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
|
||||
Expected: FAIL because `classifyFacebookResponse` does not exist yet.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Add this type and function near the parsing section in `packages/core/src/scrapers/facebook.ts`:
|
||||
|
||||
```ts
|
||||
type FacebookResponseKind = "search" | "item" | "auth_gated" | "unavailable" | "unknown";
|
||||
|
||||
export function classifyFacebookResponse(htmlString: HTMLString, responseUrl: string) {
|
||||
const authGated =
|
||||
responseUrl.includes("/login/") ||
|
||||
htmlString.includes("You must log in to Facebook") ||
|
||||
htmlString.includes("log in to Facebook");
|
||||
|
||||
if (authGated) {
|
||||
return { kind: "auth_gated" as const, authGated: true, unavailable: false };
|
||||
}
|
||||
|
||||
const unavailable = responseUrl.includes("unavailable_product=1");
|
||||
if (unavailable) {
|
||||
return { kind: "unavailable" as const, authGated: false, unavailable: true };
|
||||
}
|
||||
|
||||
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||
return { kind: "search" as const, authGated: false, unavailable: false };
|
||||
}
|
||||
|
||||
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||
return { kind: "item" as const, authGated: false, unavailable: false };
|
||||
}
|
||||
|
||||
return { kind: "unknown" as const, authGated: false, unavailable: false };
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||
git commit -m "refactor: add facebook response classification"
|
||||
```
|
||||
|
||||
### Task 2: Add Bootstrap Candidate Extraction
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-core.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add these tests:
|
||||
|
||||
```ts
|
||||
test("extracts Comet bootstrap candidates from script tags", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>{"routing_namespace":"fb_comet"}</script>
|
||||
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
|
||||
<script>not json</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const candidates = extractFacebookBootstrapCandidates(html);
|
||||
expect(candidates).toHaveLength(2);
|
||||
expect(candidates[1]).toEqual({
|
||||
data: {
|
||||
marketplace_search_bootstrap: {
|
||||
edges: [{ node: { listing: { id: "1" } } }],
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test("keeps candidate order stable for later scoring", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>{"marker":"first"}</script>
|
||||
<script>{"marker":"second"}</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const candidates = extractFacebookBootstrapCandidates(html);
|
||||
expect(candidates.map((candidate) => candidate.marker)).toEqual(["first", "second"]);
|
||||
});
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"`
|
||||
Expected: FAIL because `extractFacebookBootstrapCandidates` does not exist.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Add this helper near the parser utilities in `packages/core/src/scrapers/facebook.ts`:
|
||||
|
||||
```ts
|
||||
export function extractFacebookBootstrapCandidates(htmlString: HTMLString): Record<string, unknown>[] {
|
||||
const { document } = parseHTML(htmlString);
|
||||
const scripts = document.querySelectorAll("script");
|
||||
const candidates: Record<string, unknown>[] = [];
|
||||
|
||||
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
|
||||
const scriptText = script.textContent?.trim();
|
||||
if (!scriptText) continue;
|
||||
|
||||
try {
|
||||
const parsed = JSON.parse(scriptText);
|
||||
if (isRecord(parsed)) {
|
||||
candidates.push(parsed);
|
||||
}
|
||||
} catch {
|
||||
// Ignore non-JSON script bodies.
|
||||
}
|
||||
}
|
||||
|
||||
return candidates;
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||
git commit -m "refactor: add facebook bootstrap candidate extraction"
|
||||
```
|
||||
|
||||
### Task 3: Replace Search Parsing With Candidate Scoring
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-core.test.ts`
|
||||
- Test: `packages/core/test/facebook-integration.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add a core test for route-aware search extraction:
|
||||
|
||||
```ts
|
||||
test("extracts search results from Comet bootstrap candidates", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Bike",
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
},
|
||||
location: {
|
||||
reverse_geocode: {
|
||||
city_page: { display_name: "Toronto" },
|
||||
},
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const ads = extractFacebookMarketplaceData(html);
|
||||
expect(ads).toHaveLength(1);
|
||||
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
|
||||
});
|
||||
```
|
||||
|
||||
Replace one integration fixture with a current-shape search fixture:
|
||||
|
||||
```ts
|
||||
const mockSearchHtml = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "iPhone 13",
|
||||
listing_price: {
|
||||
amount: "500.00",
|
||||
formatted_amount: "CA$500",
|
||||
currency: "CAD",
|
||||
},
|
||||
location: { reverse_geocode: { city_page: { display_name: "Toronto" } } },
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}</script>
|
||||
</body></html>
|
||||
`;
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet bootstrap candidates"`
|
||||
Expected: FAIL because the current search extractor only understands legacy `marketplace_search` shapes.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Replace the search extraction internals in `extractFacebookMarketplaceData()` with a candidate search like this (the minimal version takes the first bootstrap candidate that yields a non-empty edge list):
|
||||
|
||||
```ts
|
||||
function findSearchEdges(candidate: unknown): FacebookEdge[] | null {
|
||||
if (Array.isArray(candidate)) {
|
||||
for (const item of candidate) {
|
||||
const result = findSearchEdges(item);
|
||||
if (result) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isRecord(candidate)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const directEdges = candidate.feed_units?.edges;
|
||||
if (Array.isArray(directEdges)) {
|
||||
return directEdges as FacebookEdge[];
|
||||
}
|
||||
|
||||
const resultGroups = candidate.resultGroups;
|
||||
if (Array.isArray(resultGroups)) {
|
||||
for (const group of resultGroups) {
|
||||
if (isRecord(group) && Array.isArray(group.edges)) {
|
||||
return group.edges as FacebookEdge[];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const value of Object.values(candidate)) {
|
||||
const result = findSearchEdges(value);
|
||||
if (result) return result;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
export function extractFacebookMarketplaceData(htmlString: HTMLString): FacebookAdNode[] | null {
|
||||
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||
|
||||
for (const candidate of candidates) {
|
||||
const edges = findSearchEdges(candidate);
|
||||
if (edges?.length) {
|
||||
return edges.map((edge) => ({ node: edge.node }));
|
||||
}
|
||||
}
|
||||
|
||||
console.warn("No marketplace data found in HTML response");
|
||||
return null;
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
|
||||
Expected: PASS for the rewritten search fixtures and existing unaffected tests.
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
|
||||
git commit -m "refactor: rewrite facebook search parser for comet bootstrap"
|
||||
```
|
||||
|
||||
### Task 4: Replace Item Parsing With Candidate Scoring
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-core.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Replace one old item fixture with a current-shape item fixture:
|
||||
|
||||
```ts
|
||||
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
listing: {
|
||||
id: "123",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Vintage Chair",
|
||||
formatted_price: { text: "CA$80" },
|
||||
listing_price: { amount: "80.00", currency: "CAD", amount_with_offset: "80.00" },
|
||||
redacted_description: { text: "Solid wood chair" },
|
||||
location_text: { text: "Toronto, ON" },
|
||||
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
|
||||
condition: "USED",
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const item = extractFacebookItemData(html);
|
||||
expect(item?.id).toBe("123");
|
||||
expect(item?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
});
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet permalink bootstrap"`
|
||||
Expected: FAIL because the current item extractor depends on legacy permalink markers.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Replace the item extraction internals with a semantic candidate finder like this:
|
||||
|
||||
```ts
|
||||
function findMarketplaceItemCandidate(candidate: unknown): FacebookMarketplaceItem | null {
|
||||
if (Array.isArray(candidate)) {
|
||||
for (const item of candidate) {
|
||||
const result = findMarketplaceItemCandidate(item);
|
||||
if (result) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isRecord(candidate)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
candidate.id &&
|
||||
candidate.__typename === "GroupCommerceProductItem" &&
|
||||
candidate.marketplace_listing_title
|
||||
) {
|
||||
return candidate as FacebookMarketplaceItem;
|
||||
}
|
||||
|
||||
for (const value of Object.values(candidate)) {
|
||||
const result = findMarketplaceItemCandidate(value);
|
||||
if (result) return result;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
export function extractFacebookItemData(htmlString: HTMLString): FacebookMarketplaceItem | null {
|
||||
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||
|
||||
for (const candidate of candidates) {
|
||||
const item = findMarketplaceItemCandidate(candidate);
|
||||
if (item) {
|
||||
return item;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts`
|
||||
Expected: PASS for current-shape item tests and remaining parser tests.
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||
git commit -m "refactor: rewrite facebook item parser for comet bootstrap"
|
||||
```
|
||||
|
||||
### Task 5: Add HTML Fallback Extraction
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-core.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add these fallback tests:
|
||||
|
||||
```ts
|
||||
test("falls back to rendered search HTML when bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<a href="https://www.facebook.com/marketplace/item/123/?ref=search">Vintage Lamp</a>
|
||||
<span>CA$45</span>
|
||||
<span>Toronto, ON</span>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const ads = extractFacebookMarketplaceData(html);
|
||||
const parsed = ads ? parseFacebookAds(ads) : [];
|
||||
expect(parsed[0].title).toBe("Vintage Lamp");
|
||||
expect(parsed[0].listingPrice?.amountFormatted).toBe("CA$45");
|
||||
});
|
||||
|
||||
test("falls back to rendered item HTML when bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<h1>Vintage Desk</h1>
|
||||
<span>CA$120</span>
|
||||
<span>Condition Used - Good</span>
|
||||
<div>Description Solid oak desk.</div>
|
||||
<div>Seller information Jordan</div>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const item = extractFacebookItemData(html);
|
||||
expect(item?.marketplace_listing_title).toBe("Vintage Desk");
|
||||
expect(item?.formatted_price?.text).toBe("CA$120");
|
||||
});
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
|
||||
Expected: FAIL because both extractors currently return `null` when no structured bootstrap candidate is found.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Add route-specific HTML fallback helpers in `packages/core/src/scrapers/facebook.ts`:
|
||||
|
||||
```ts
|
||||
function extractSearchFallback(htmlString: HTMLString): FacebookAdNode[] | null {
|
||||
const idMatch = htmlString.match(/marketplace\/item\/(\d+)/);
|
||||
const titleMatch = htmlString.match(/marketplace\/item\/\d+\/[^>]*>([^<]+)</);
|
||||
const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/);
|
||||
const cityMatch = htmlString.match(/([A-Z][a-z]+,\s*[A-Z]{2})/);
|
||||
|
||||
if (!idMatch || !titleMatch || !priceMatch) return null;
|
||||
|
||||
return [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: idMatch[1],
|
||||
marketplace_listing_title: titleMatch[1].trim(),
|
||||
listing_price: {
|
||||
amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||
formatted_amount: priceMatch[0],
|
||||
currency: "CAD",
|
||||
},
|
||||
location: cityMatch
|
||||
? { reverse_geocode: { city_page: { display_name: cityMatch[1].split(",")[0] } } }
|
||||
: undefined,
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
function extractItemFallback(htmlString: HTMLString): FacebookMarketplaceItem | null {
|
||||
const titleMatch = htmlString.match(/<h1[^>]*>([^<]+)<\/h1>/i);
|
||||
const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/);
|
||||
if (!titleMatch || !priceMatch) return null;
|
||||
|
||||
return {
|
||||
id: "fallback-item",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: titleMatch[1].trim(),
|
||||
formatted_price: { text: priceMatch[0] },
|
||||
listing_price: {
|
||||
amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||
currency: "CAD",
|
||||
amount_with_offset: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||
},
|
||||
redacted_description: { text: htmlString.includes("Description") ? htmlString.split("Description")[1].split("<")[0].trim() : "" },
|
||||
is_live: true,
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
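Then call `extractSearchFallback()` as the last step of `extractFacebookMarketplaceData()` and `extractItemFallback()` as the last step of `extractFacebookItemData()`, so each extractor only returns `null` (and, for search, logs the "No marketplace data found" warning) when its fallback also finds nothing.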
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||
git commit -m "refactor: add facebook html fallbacks"
|
||||
```
|
||||
|
||||
### Task 6: Wire Route-Aware Failures Into Entry Points
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||
- Test: `packages/core/test/facebook-integration.test.ts`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add these integration tests:
|
||||
|
||||
```ts
|
||||
test("returns empty search results for auth-gated search HTML", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
|
||||
text: () => Promise.resolve("<html><body>You must log in to Facebook</body></html>"),
|
||||
headers: { get: () => null },
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("bike", 1, "toronto", 25);
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
|
||||
test("returns null for unavailable item responses", async () => {
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||
text: () => Promise.resolve("<html><body>Marketplace</body></html>"),
|
||||
headers: { get: () => null },
|
||||
}),
|
||||
);
|
||||
|
||||
const item = await fetchFacebookItem("123");
|
||||
expect(item).toBeNull();
|
||||
});
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-integration.test.ts --test-name-pattern "auth-gated|unavailable"`
|
||||
Expected: FAIL because the entrypoints do not yet classify successful HTML responses by route/auth state.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Update both entrypoints to classify successful HTML before parsing:
|
||||
|
||||
```ts
|
||||
const responseClass = classifyFacebookResponse(searchHtml, searchUrl);
|
||||
if (responseClass.kind === "auth_gated") {
|
||||
console.warn("Facebook marketplace search is auth-gated. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.");
|
||||
return [];
|
||||
}
|
||||
|
||||
const itemResponseClass = classifyFacebookResponse(itemHtml, itemUrl);
|
||||
if (itemResponseClass.kind === "auth_gated") {
|
||||
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (itemResponseClass.kind === "unavailable") {
|
||||
console.warn(`Item ${itemId} appears to be unavailable in the marketplace.`);
|
||||
return null;
|
||||
}
|
||||
```
|
||||
|
||||
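`searchUrl` and `itemUrl` above must be the final response URL (after redirects), not the request URL, because `classifyFacebookResponse` checks that URL for `/login/` and `unavailable_product=1`. If the `fetchHtml` helper does not already return the final URL alongside the HTML, extend it to do so as the first change in this task.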
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-integration.test.ts`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-integration.test.ts
|
||||
git commit -m "refactor: handle facebook route-aware failure states"
|
||||
```
|
||||
|
||||
### Task 7: Run Full Verification And Live Probe
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/core/src/scrapers/facebook.ts` if small cleanup is required
|
||||
- Modify: `packages/core/test/facebook-core.test.ts` if small cleanup is required
|
||||
- Modify: `packages/core/test/facebook-integration.test.ts` if small cleanup is required
|
||||
|
||||
- [ ] **Step 1: Run focused Facebook tests**
|
||||
|
||||
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 2: Run broader core tests**
|
||||
|
||||
Run: `bun test packages/core/test`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 3: Run live authenticated Facebook probe**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
set -a && source .env && set +a && bun --eval 'import { fetchFacebookItems, fetchFacebookItem } from "./packages/core/src/index.ts";
|
||||
const results = await fetchFacebookItems("iphone", 1, "toronto", 3);
|
||||
console.log("SEARCH_COUNT=" + results.length);
|
||||
console.log(JSON.stringify(results[0] ?? null));
|
||||
if (results[0]?.url) {
|
||||
const match = results[0].url.match(/\/item\/(\d+)/);
|
||||
if (match) {
|
||||
const item = await fetchFacebookItem(match[1]);
|
||||
console.log(JSON.stringify(item));
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
Expected:
|
||||
|
||||
- search returns at least one result
|
||||
- item fetch returns non-null for the first live result when the route is not stale/unavailable
|
||||
|
||||
- [ ] **Step 4: Make any minimal cleanup needed to keep tests and live probe green**
|
||||
|
||||
If cleanup is needed, keep it limited to naming, dead-code removal caused by the rewrite, or small parser corrections directly exposed by the verification commands.
|
||||
|
||||
- [ ] **Step 5: Re-run verification**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts && bun test packages/core/test
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
|
||||
git commit -m "refactor: complete facebook comet scraper rewrite"
|
||||
```
|
||||
|
||||
## Self-Review
|
||||
|
||||
- Spec coverage: the plan covers classification, route-aware search parsing, route-aware item parsing, HTML fallbacks, explicit failure-state handling, test replacement, and live verification.
|
||||
- Placeholder scan: no `TODO`, `TBD`, or unspecified “handle appropriately” steps remain.
|
||||
- Type consistency: all planned functions and types use the same names across tasks: `classifyFacebookResponse`, `extractFacebookBootstrapCandidates`, `extractFacebookMarketplaceData`, and `extractFacebookItemData`.
|
||||
@@ -0,0 +1,226 @@
|
||||
# Facebook Comet Rewrite Design
|
||||
|
||||
## Summary
|
||||
|
||||
Replace the legacy Facebook Marketplace scraper with a route-aware implementation built around current Comet bootstrap markers and route-specific extraction.
|
||||
The new scraper will keep authenticated direct HTTP fetches as the primary transport, but it will stop treating legacy `require`, `__bbox`, and `marketplace_product_details_page` structures as the main parsing contract.
|
||||
|
||||
## Goals
|
||||
|
||||
- Replace both Facebook search and item-detail extraction with a current-shape parser.
|
||||
- Keep authenticated direct HTTP requests as the primary fetch strategy.
|
||||
- Parse route-specific Comet bootstrap/state payloads before falling back to rendered-HTML extraction.
|
||||
- Detect auth-gated, unavailable, and unknown responses explicitly.
|
||||
- Update tests so they model current route markers and failure modes instead of legacy page objects.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Reworking non-Facebook scrapers.
|
||||
- Converting the scraper to browser-only automation.
|
||||
- Preserving old parser behavior for `marketplace_product_details_page` or `__bbox`-driven item extraction.
|
||||
- Reverse-engineering every internal Facebook bootstrap payload shape exhaustively before implementation.
|
||||
|
||||
## Current State
|
||||
|
||||
The current implementation in `packages/core/src/scrapers/facebook.ts` still uses authenticated HTTP requests, which remains correct.
|
||||
The search path parses embedded script JSON and looks for `marketplace_search.feed_units.edges`.
|
||||
The item-detail path is centered on legacy extraction paths such as:
|
||||
|
||||
- `parsed.require[0][3].__bbox.result.data.viewer.marketplace_product_details_page.target`
|
||||
- nested `__bbox.require[...]` variations
|
||||
- recursive search through `parsed.require`
|
||||
|
||||
Live evidence gathered earlier in this session and by the isolated research subagent shows that current Facebook Marketplace pages are Comet route-driven and expose markers such as:
|
||||
|
||||
- `XCometMarketplaceSearchController`
|
||||
- `XCometMarketplacePermalinkController`
|
||||
- `"routing_namespace":"fb_comet"`
|
||||
- `"use_ssr_state_manager":true`
|
||||
- `ServerJS`
|
||||
- `Bootloader`
|
||||
- `data-sjs`
|
||||
- `data-btmanifest`
|
||||
|
||||
The same live investigation also showed that authenticated item pages no longer expose the old `marketplace_product_details_page` marker reliably, while live search still returns usable results.
|
||||
|
||||
## Chosen Approach
|
||||
|
||||
Use a hybrid Comet-bootstrap parser.
|
||||
|
||||
The scraper will:
|
||||
|
||||
1. Fetch authenticated HTML directly.
|
||||
2. Classify the response using current route and auth markers.
|
||||
3. Parse inline bootstrap/state payloads using route-specific probes.
|
||||
4. Fall back to rendered-HTML extraction only when bootstrap markers are present but the payload cannot be decoded into the expected search or item shape.
|
||||
|
||||
This keeps the cheaper direct-HTTP transport while shifting the parser contract from legacy page-object names to current Comet route structure.
|
||||
|
||||
## Design
|
||||
|
||||
### Route Classification
|
||||
|
||||
Add a small response-classification layer before data extraction.
|
||||
It should identify these states from the fetched response URL and HTML:
|
||||
|
||||
- `auth_gated`
|
||||
- `unavailable`
|
||||
- `search`
|
||||
- `item`
|
||||
- `unknown`
|
||||
|
||||
Signals to use:
|
||||
|
||||
- final URL containing `/login/` or login-shell text
|
||||
- final URL containing `unavailable_product=1`
|
||||
- search controller markers such as `XCometMarketplaceSearchController`
|
||||
- item controller markers such as `XCometMarketplacePermalinkController`
|
||||
- shared Comet markers such as `"routing_namespace":"fb_comet"`
|
||||
|
||||
This classification layer becomes the top-level contract for both fetch functions.
|
||||
|
||||
### Search Extraction
|
||||
|
||||
The search path will be rewritten around Comet search-route markers.
|
||||
|
||||
Primary behavior:
|
||||
|
||||
- fetch the Marketplace search HTML with auth cookies
|
||||
- confirm the response class is `search`
|
||||
- extract inline bootstrap/state blobs from script tags and page attributes
|
||||
- probe for route-specific search payloads associated with `XCometMarketplaceSearchController`
|
||||
- map decoded search results into summary listing records
|
||||
|
||||
Search summary fields should remain aligned with the current public output shape:
|
||||
|
||||
- item URL
|
||||
- title
|
||||
- formatted price and normalized cents when possible
|
||||
- city/address summary when present
|
||||
- seller summary when present in the search payload
|
||||
- category/status/media fields only when they are present with stable meaning
|
||||
|
||||
Fallback behavior:
|
||||
|
||||
- if search route markers are present but structured payload decoding fails, extract listing summaries from rendered HTML anchors and text patterns
|
||||
- use item links matching `/marketplace/item/<id>` as the anchor for fallback extraction
|
||||
- treat fallback results as summary-only data, not rich detail data
|
||||
|
||||
### Item Extraction
|
||||
|
||||
The item-detail path will be rewritten around the Comet permalink route.
|
||||
|
||||
Primary behavior:
|
||||
|
||||
- fetch the item permalink HTML with auth cookies
|
||||
- confirm the response class is `item`
|
||||
- extract inline bootstrap/state blobs from script tags and page attributes
|
||||
- probe for permalink payloads associated with `XCometMarketplacePermalinkController`
|
||||
- decode the richest recoverable item record and map it into `FacebookListingDetails`
|
||||
|
||||
Priority item fields:
|
||||
|
||||
- item ID and permalink URL
|
||||
- title
|
||||
- formatted price and normalized cents when possible
|
||||
- condition
|
||||
- description
|
||||
- listed age / creation date when derivable
|
||||
- approximate location
|
||||
- seller name and seller ID when present
|
||||
- listing status when the payload makes it explicit
|
||||
|
||||
Fallback behavior:
|
||||
|
||||
- if permalink route markers are present but no stable payload object is decodable, extract data from rendered HTML text structure
|
||||
- prioritize title, price, condition, description, location text, and seller module content
|
||||
- return partial item data when core user-facing fields are present rather than failing solely because deeper commerce metadata is missing
|
||||
|
||||
### Bootstrap Parsing Strategy
|
||||
|
||||
The parser should stop assuming a single stable JSON path.
|
||||
Instead, it should work in two phases:
|
||||
|
||||
1. Discover candidate bootstrap payloads.
|
||||
2. Score candidates against the expected route shape.
|
||||
|
||||
Candidate discovery inputs:
|
||||
|
||||
- raw `<script>` contents
|
||||
- `data-sjs` and related page attributes
|
||||
- `ServerJS` / `Bootloader` inline blobs
|
||||
- route controller names
|
||||
|
||||
Candidate scoring for search should favor objects that contain repeated result-card semantics, item IDs, listing links, titles, prices, or location summaries.
|
||||
Candidate scoring for item pages should favor objects that contain singular listing semantics, title, price, condition, description, location, seller, or permalink context.
|
||||
|
||||
The parser should not depend on one hard-coded object name surviving forever.
|
||||
Instead, it should look for route-specific semantic clusters and choose the strongest candidate.
|
||||
|
||||
### Legacy Removal
|
||||
|
||||
The old Facebook scraper should be removed as a primary strategy.
|
||||
Specifically:
|
||||
|
||||
- delete old item-detail extraction paths centered on `marketplace_product_details_page`
|
||||
- delete legacy-first `require` / `__bbox` navigation tables
|
||||
- delete tests whose only purpose is to preserve those legacy paths
|
||||
|
||||
If a minimal legacy compatibility branch remains, it must be a last-resort fallback behind the new route-aware parser and should not shape test fixtures or design decisions.
|
||||
|
||||
### Error Handling
|
||||
|
||||
Facebook responses should now fail with explicit route-aware outcomes:
|
||||
|
||||
1. Missing/invalid auth cookie input.
|
||||
2. Auth-gated response.
|
||||
3. Unavailable or stale item response.
|
||||
4. Search or item route detected, but no decodable data found.
|
||||
5. Unknown response shape.
|
||||
|
||||
Error messages should name the actual class of failure instead of implying that every parse miss is caused by expired cookies.
|
||||
|
||||
### Testing Strategy
|
||||
|
||||
Follow TDD for the rewrite.
|
||||
Write failing tests for the new route-aware parser before replacing production code.
|
||||
|
||||
Coverage targets:
|
||||
|
||||
1. Search responses classify correctly from current Comet controller markers.
|
||||
2. Item responses classify correctly from current Comet controller markers.
|
||||
3. Login-gated and unavailable responses are detected before parsing.
|
||||
4. Search bootstrap parsing produces summary listing results from current-shape fixtures.
|
||||
5. Item bootstrap parsing produces rich listing details from current-shape fixtures.
|
||||
6. Search fallback extraction works when route markers exist but structured payload decoding fails.
|
||||
7. Item fallback extraction works when route markers exist but structured payload decoding fails.
|
||||
8. Old legacy-only item fixtures are removed or rewritten so they no longer define the contract.
|
||||
|
||||
Verification target after implementation:
|
||||
|
||||
- `bun test packages/core/test/facebook-core.test.ts`
|
||||
- `bun test packages/core/test/facebook-integration.test.ts`
|
||||
- a live authenticated Facebook probe covering search and item routes
|
||||
|
||||
## Public API Surface
|
||||
|
||||
Keep the current public function names unless the rewrite proves that a signature change is required:
|
||||
|
||||
- `fetchFacebookItems(...)`
|
||||
- `fetchFacebookItem(...)`
|
||||
- `extractFacebookMarketplaceData(...)`
|
||||
- `extractFacebookItemData(...)`
|
||||
|
||||
The internals should change substantially, but callers should not need a new integration surface for this rewrite.
|
||||
|
||||
## Risks
|
||||
|
||||
- Facebook may change bootstrap payload naming again, so route/controller markers are more stable than exact nested object paths but still not guaranteed.
|
||||
- Search and item pages may each contain multiple partial payloads, making candidate ranking important.
|
||||
- Fallback rendered-HTML extraction may be noisier than bootstrap decoding and needs clear precedence rules.
|
||||
- Live fixtures can drift from production quickly, so tests must model route semantics rather than exact one-off payloads where possible.
|
||||
|
||||
## Rollout Notes
|
||||
|
||||
The code, fixtures, and tests should change together.
|
||||
There should be no mixed state where the implementation is Comet-aware but the tests still encode `marketplace_product_details_page` as the primary contract.
|
||||
@@ -75,13 +75,6 @@ interface FacebookEdge {
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
interface FacebookMarketplaceSearch {
|
||||
feed_units?: {
|
||||
edges?: FacebookEdge[];
|
||||
};
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
interface FacebookMarketplaceItem {
|
||||
// Basic identification
|
||||
id: string;
|
||||
@@ -173,6 +166,10 @@ interface FacebookMarketplaceItem {
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
/** Captures the numeric listing ID from a `/marketplace/item/<id>` path or URL. */
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;

/**
 * Matches a complete rendered price label: `$` or `CA$`, optional whitespace,
 * digits with optional thousands commas and optional two-digit cents, or the
 * word FREE. Matching is case-insensitive.
 */
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;

/** Matches text ending in a comma plus a two-letter uppercase code, e.g. "Toronto, ON". */
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
|
||||
|
||||
export interface FacebookListingDetails {
|
||||
url: string;
|
||||
title: string;
|
||||
@@ -286,7 +283,7 @@ async function fetchHtml(
|
||||
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
||||
cookies?: string;
|
||||
},
|
||||
): Promise<HTMLString> {
|
||||
): Promise<{ html: HTMLString; responseUrl: string }> {
|
||||
const maxRetries = opts?.maxRetries ?? 3;
|
||||
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
||||
|
||||
@@ -357,7 +354,7 @@ async function fetchHtml(
|
||||
const html = await res.text();
|
||||
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
||||
await delay(DELAY_MS);
|
||||
return html;
|
||||
return { html, responseUrl: res.url || url };
|
||||
} catch (err) {
|
||||
if (attempt >= maxRetries) throw err;
|
||||
await delay((attempt + 1) * retryBaseMs);
|
||||
@@ -369,223 +366,477 @@ async function fetchHtml(
|
||||
|
||||
// ----------------------------- Parsing -----------------------------
|
||||
|
||||
/** Coarse classification of a fetched Facebook Marketplace response. */
export type FacebookResponseKind =
  | "search"
  | "item"
  | "auth_gated"
  | "unavailable"
  | "unknown";

/**
 * Classify a fetched Facebook response from its URL and HTML body.
 *
 * Checks run in priority order and the first hit wins:
 * 1. auth gate (`/login/` in the URL or login prompt text in the HTML)
 * 2. unavailable listing (`unavailable_product=1` in the URL or removal text)
 * 3. item route by URL (`/marketplace/item/`)
 * 4. search route by `XCometMarketplaceSearchController` marker
 * 5. item route by `XCometMarketplacePermalinkController` marker
 *
 * All text matching is case-sensitive substring matching.
 *
 * @param htmlString - HTML body of the response.
 * @param responseUrl - URL reported for the response.
 * @returns The detected kind plus boolean flags mirroring the `auth_gated`
 *   and `unavailable` kinds.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
): { kind: FacebookResponseKind; authGated: boolean; unavailable: boolean } {
  // Single place that derives the convenience flags from the kind.
  const classified = (kind: FacebookResponseKind) => ({
    kind,
    authGated: kind === "auth_gated",
    unavailable: kind === "unavailable",
  });
  const htmlHasAny = (markers: readonly string[]): boolean =>
    markers.some((marker) => htmlString.includes(marker));

  if (
    responseUrl.includes("/login/") ||
    htmlHasAny(["You must log in", "log in to continue"])
  ) {
    return classified("auth_gated");
  }

  if (
    responseUrl.includes("unavailable_product=1") ||
    htmlHasAny(["This listing is no longer available", "listing has been removed"])
  ) {
    return classified("unavailable");
  }

  // The URL is checked before controller markers, so an item URL wins even
  // if the HTML also mentions the search controller.
  if (responseUrl.includes("/marketplace/item/")) {
    return classified("item");
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return classified("search");
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return classified("item");
  }

  return classified("unknown");
}
|
||||
|
||||
/**
 * Collect every `<script>` body in the page that parses as a JSON object.
 *
 * Script bodies that are empty, are not valid JSON, or decode to something
 * `isRecord` rejects are skipped silently.
 *
 * @param htmlString - HTML of the fetched page.
 * @returns Decoded objects in document order; empty when none qualify.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];
  const decodedObjects: Record<string, unknown>[] = [];

  for (const scriptElement of scriptElements) {
    const body = scriptElement.textContent?.trim();
    if (body) {
      try {
        const decoded = JSON.parse(body);
        if (isRecord(decoded)) {
          decodedObjects.push(decoded as Record<string, unknown>);
        }
      } catch {
        // not JSON (ordinary JavaScript etc.) — ignore this script
      }
    }
  }

  return decodedObjects;
}
|
||||
|
||||
/**
 * Type guard: true for a non-empty array in which every element is an object
 * whose `node` is an object that in turn has an object-valued `listing`.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }

  for (const entry of value) {
    const hasListing =
      isRecord(entry) && isRecord(entry.node) && isRecord(entry.node.listing);
    if (!hasListing) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Add a size bonus to a base score: one point per edge, capped at three.
 *
 * @param edges - Edge list being scored.
 * @param score - Base score accumulated so far.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const sizeBonus = edges.length < 3 ? edges.length : 3;
  return score + sizeBonus;
}
|
||||
|
||||
/**
 * Recursively search a decoded payload for the best-scoring list of search
 * result edges.
 *
 * Scoring: an edge list under `feed_units.edges` gets +2, one under a
 * `resultGroups[].edges` entry gets +4, descending through a key named
 * `payload` gets +1, and every edge list gets the `scoreSearchEdges` size
 * bonus. On equal scores the match found first is kept.
 *
 * @param candidate - Any decoded JSON value.
 * @param score - Score inherited from the path walked so far.
 * @returns The winning edges and their score, or `null` when none are found.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  type EdgeMatch = { edges: FacebookEdge[]; score: number };

  // Keeps the incumbent unless the contender scores strictly higher.
  const keepBetter = (
    incumbent: EdgeMatch | null,
    contender: EdgeMatch | null,
  ): EdgeMatch | null =>
    contender && (!incumbent || contender.score > incumbent.score)
      ? contender
      : incumbent;

  if (Array.isArray(candidate)) {
    return candidate.reduce<EdgeMatch | null>(
      (best, element) => keepBetter(best, findSearchEdges(element, score)),
      null,
    );
  }

  if (!isRecord(candidate)) {
    return null;
  }

  let best: EdgeMatch | null = null;

  const feedUnits = candidate.feed_units;
  if (isRecord(feedUnits) && isFacebookSearchEdgeArray(feedUnits.edges)) {
    best = {
      edges: feedUnits.edges,
      score: scoreSearchEdges(feedUnits.edges, score + 2),
    };
  }

  const groups = candidate.resultGroups;
  if (Array.isArray(groups)) {
    for (const group of groups) {
      if (!isRecord(group) || !isFacebookSearchEdgeArray(group.edges)) {
        continue;
      }
      best = keepBetter(best, {
        edges: group.edges,
        score: scoreSearchEdges(group.edges, score + 4),
      });
    }
  }

  // Descend into every property, including the ones inspected above.
  for (const [key, value] of Object.entries(candidate)) {
    const keyBonus = key === "payload" ? 1 : 0;
    best = keepBetter(best, findSearchEdges(value, score + keyBonus));
  }

  return best;
}
|
||||
|
||||
/** A listing-shaped object found inside a bootstrap candidate, with ranking data. */
interface FacebookMarketplaceItemMatch {
  /** The matched listing object. */
  item: FacebookMarketplaceItem;
  /** Ranking score computed from `path`. */
  score: number;
  /** Object keys traversed from the candidate root to reach `item`. */
  path: string[];
}
|
||||
|
||||
/**
 * Score the key path at which a listing-shaped object was found.
 *
 * Each of a fixed set of path segments adds a bonus once if present, a path
 * containing a segment with "recommend" or "related" in it loses 10 points,
 * and every path segment costs one point.
 *
 * @param path - Object keys leading to the matched object.
 */
function scoreMarketplaceItemPath(path: string[]): number {
  const segmentBonuses: ReadonlyArray<readonly [string, number]> = [
    ["payload", 2],
    ["viewer", 2],
    ["marketplace_product_details_page", 6],
    ["target", 8],
    ["listing", 6],
  ];

  const bonus = segmentBonuses.reduce(
    (total, [segment, points]) =>
      path.includes(segment) ? total + points : total,
    0,
  );

  const underSuggestions = path.some(
    (segment) => segment.includes("recommend") || segment.includes("related"),
  );
  const penalty = underSuggestions ? 10 : 0;

  return bonus - penalty - path.length;
}
|
||||
|
||||
/**
 * Walk a decoded payload depth-first and collect every object that looks like
 * a Marketplace listing: string `id`, `__typename` equal to
 * "GroupCommerceProductItem", and a string `marketplace_listing_title`.
 *
 * Array elements share their parent's path; object keys extend it. A matching
 * object is recorded before its own children are visited.
 *
 * @param candidate - Any decoded JSON value.
 * @param path - Keys already traversed to reach `candidate`.
 * @returns All matches in traversal order.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const found: FacebookMarketplaceItemMatch[] = [];

  const visit = (value: unknown, trail: string[]): void => {
    if (Array.isArray(value)) {
      for (const element of value) {
        visit(element, trail);
      }
      return;
    }

    if (!isRecord(value)) {
      return;
    }

    const looksLikeListing =
      typeof value.id === "string" &&
      value.__typename === "GroupCommerceProductItem" &&
      typeof value.marketplace_listing_title === "string";

    if (looksLikeListing) {
      found.push({
        item: value as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(trail),
        path: trail,
      });
    }

    for (const [key, child] of Object.entries(value)) {
      visit(child, [...trail, key]);
    }
  };

  visit(candidate, path);
  return found;
}
|
||||
|
||||
/**
 * Convert a rendered price label (e.g. "CA$1,250.00", "$40", "FREE") into a
 * normalized price record.
 *
 * @param priceText - Price text as rendered on the page.
 * @returns `amount` as a two-decimal string, `formatted_amount` as the trimmed
 *   input, and `currency`; or `null` when no numeric amount can be read.
 *   Empty text and "FREE" (any letter case) yield amount "0.00".
 *   `currency` is always "CAD", whatever symbol the label carries.
 */
function parseFacebookRenderedPrice(
  priceText: string,
): { amount: string; formatted_amount: string; currency: string } | null {
  const label = priceText.trim();

  if (!label || label.toUpperCase() === "FREE") {
    return {
      amount: "0.00",
      formatted_amount: label || "FREE",
      currency: "CAD",
    };
  }

  // The amount must start on a digit (as FACEBOOK_PRICE_TEXT_RE requires), so
  // a stray comma earlier in the label, e.g. "Now, $5", is not taken for it.
  const amountMatch = label.match(/\d[\d,]*(?:\.\d{2})?/);
  if (!amountMatch) {
    return null;
  }

  // Drop thousands separators before numeric parsing.
  const value = Number.parseFloat(amountMatch[0].replace(/,/g, ""));
  if (!Number.isFinite(value)) {
    return null;
  }

  return {
    amount: value.toFixed(2),
    formatted_amount: label,
    currency: "CAD",
  };
}
|
||||
|
||||
/**
 * Return the trimmed text of every element under `node` that matches
 * `selector`, in query order, skipping elements whose trimmed text is empty.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];

  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Read the listing ID out of an element's `href` attribute.
 *
 * @param element - Element to inspect; `null` is accepted.
 * @returns The ID captured by `FACEBOOK_ITEM_HREF_RE`, or `null` when the
 *   element is missing, has no `href`, or the `href` does not match.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href") || "";
  const idMatch = href.match(FACEBOOK_ITEM_HREF_RE);
  if (!idMatch) {
    return null;
  }
  return idMatch[1] ?? null;
}
|
||||
|
||||
/**
 * Work out which listing ID an item page is about, trying sources in order:
 * 1. the canonical `<link>` whose href contains `/marketplace/item/`
 * 2. the `og:url` meta tag
 * 3. an item link whose text contains the page's `<h1>` title
 * 4. the last item link on the page
 *
 * Steps 3 and 4 are attempted only when the page has a non-empty `<h1>`.
 *
 * @returns The listing ID, or `null` when none of the sources yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const openGraphUrl = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOpenGraph = openGraphUrl?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOpenGraph) {
    return fromOpenGraph;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // Prefer the link that repeats the page title; otherwise use the last link.
  const chosen =
    anchors.find((anchor) => anchor.textContent?.includes(heading)) ??
    anchors[anchors.length - 1] ??
    null;

  return extractMarketplaceItemIdFromElement(chosen);
}
|
||||
|
||||
/**
 * Find the listing description in rendered HTML.
 *
 * Looks for a `div`, `span`, `h2`, `h3`, or `p` whose trimmed text is exactly
 * "Description", then returns the trimmed text of the first following sibling
 * element that is non-empty and is not itself "Description".
 *
 * @returns The description text, or `undefined` when no such pair exists.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const LABEL_TEXT = "Description";
  const labelCandidates = Array.from(
    document.querySelectorAll("div, span, h2, h3, p"),
  );

  for (const element of labelCandidates) {
    if (element.textContent?.trim() !== LABEL_TEXT) {
      continue;
    }

    for (
      let next = element.nextElementSibling;
      next;
      next = next.nextElementSibling
    ) {
      const text = next.textContent?.trim();
      if (text && text !== LABEL_TEXT) {
        return text;
      }
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Build summary listings from rendered HTML when no structured search payload
 * could be decoded.
 *
 * Each `<a>` whose href contains `/marketplace/item/` is treated as one
 * result card. From the `span`/`div` texts inside it, the first price-shaped
 * text is the price, the first location-shaped text is the location, and the
 * first remaining text without a "/" is the title. A card is skipped when its
 * ID is missing or already emitted, or when title or price cannot be read.
 *
 * @param htmlString - HTML of the search page.
 * @returns One node per distinct listing ID, or `null` when none were built.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];
  const emittedIds = new Set<string>();
  const listings: FacebookAdNode[] = [];

  for (const anchor of anchors) {
    const href = anchor.getAttribute("href") || "";
    const listingId = href.match(FACEBOOK_ITEM_HREF_RE)?.[1];
    if (!listingId || emittedIds.has(listingId)) {
      continue;
    }

    const fragments = extractRenderedText(anchor, "span, div");
    const priceLabel = fragments.find((text) => FACEBOOK_PRICE_TEXT_RE.test(text));
    const cityLabel = fragments.find((text) => FACEBOOK_LOCATION_TEXT_RE.test(text));
    const heading = fragments.find(
      (text) => text !== priceLabel && text !== cityLabel && !text.includes("/"),
    );
    if (!heading || !priceLabel) {
      continue;
    }

    const price = parseFacebookRenderedPrice(priceLabel);
    if (!price) {
      continue;
    }

    const locationNode = cityLabel
      ? {
          reverse_geocode: {
            city_page: {
              display_name: cityLabel,
            },
          },
        }
      : undefined;

    listings.push({
      node: {
        listing: {
          id: listingId,
          marketplace_listing_title: heading,
          listing_price: price,
          location: locationNode,
          is_live: true,
        },
      },
    });
    // Recorded only after success, so a later card for the same ID can still
    // be used if this one was skipped.
    emittedIds.add(listingId);
  }

  return listings.length > 0 ? listings : null;
}
|
||||
|
||||
/**
 * Build a partial item record from rendered HTML when no structured item
 * payload could be decoded.
 *
 * Requires an `<h1>` title and a listing ID from
 * `extractFacebookPermalinkItemId`. Price, location, and description are
 * filled in when they can be read from the page text and left `undefined`
 * otherwise. The record is always marked `is_live: true`.
 *
 * @param htmlString - HTML of the item page.
 * @returns The partial item, or `null` when title or ID is missing.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);
  const heading = document.querySelector("h1")?.textContent?.trim();
  const listingId = extractFacebookPermalinkItemId(document);

  if (!heading || !listingId) {
    return null;
  }

  const fragments = extractRenderedText(document, "h1, span, div, p");
  const priceLabel = fragments.find((text) => FACEBOOK_PRICE_TEXT_RE.test(text));
  const price = priceLabel ? parseFacebookRenderedPrice(priceLabel) : null;
  const place = fragments.find(
    (text) =>
      text !== heading &&
      text !== priceLabel &&
      FACEBOOK_LOCATION_TEXT_RE.test(text),
  );
  const descriptionText = extractFacebookDescriptionText(document);

  const listingPrice = price
    ? {
        amount: price.amount,
        currency: price.currency,
        amount_with_offset: price.amount,
      }
    : undefined;

  return {
    id: listingId,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading,
    formatted_price: priceLabel ? { text: priceLabel } : undefined,
    listing_price: listingPrice,
    location_text: place ? { text: place } : undefined,
    redacted_description: descriptionText ? { text: descriptionText } : undefined,
    is_live: true,
  };
}
|
||||
|
||||
/**
|
||||
Extract marketplace search data from Facebook page script tags
|
||||
*/
|
||||
export function extractFacebookMarketplaceData(
|
||||
htmlString: HTMLString,
|
||||
): FacebookAdNode[] | null {
|
||||
const { document } = parseHTML(htmlString);
|
||||
const scripts = document.querySelectorAll("script");
|
||||
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||
let bestEdges: FacebookEdge[] | null = null;
|
||||
let bestScore = -1;
|
||||
|
||||
let marketplaceData: FacebookMarketplaceSearch | null = null;
|
||||
|
||||
// Find the script containing the require data with marketplace_search
|
||||
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
|
||||
const scriptText = script.textContent;
|
||||
if (!scriptText) continue;
|
||||
|
||||
try {
|
||||
const parsed = JSON.parse(scriptText);
|
||||
|
||||
// First check if this is the direct data structure (like in examples)
|
||||
if (parsed.require && Array.isArray(parsed.require)) {
|
||||
// Try multiple navigation paths to find marketplace_search
|
||||
const paths = [
|
||||
// Original path from example
|
||||
() =>
|
||||
parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data
|
||||
.marketplace_search,
|
||||
// Alternative path structure
|
||||
() =>
|
||||
parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
|
||||
// Another variation
|
||||
() => parsed.require[0][3][0].__bbox.result.data.marketplace_search,
|
||||
// Direct access for some responses
|
||||
() => {
|
||||
for (const item of parsed.require) {
|
||||
if (item && item.length >= 4 && item[3]) {
|
||||
const bbox = item[3]?.__bbox?.result?.data?.marketplace_search;
|
||||
if (bbox) return bbox;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
},
|
||||
];
|
||||
|
||||
for (const getData of paths) {
|
||||
try {
|
||||
const result = getData();
|
||||
if (
|
||||
result &&
|
||||
isRecord(result) &&
|
||||
(result as Record<string, unknown>).feed_units?.edges?.length > 0
|
||||
) {
|
||||
marketplaceData = result as FacebookMarketplaceSearch;
|
||||
break;
|
||||
}
|
||||
} catch {}
|
||||
for (const candidate of candidates) {
|
||||
const result = findSearchEdges(candidate);
|
||||
if (!result?.edges.length) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (marketplaceData) break;
|
||||
if (result.score > bestScore) {
|
||||
bestScore = result.score;
|
||||
bestEdges = result.edges;
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for direct marketplace_search in the parsed data
|
||||
if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) {
|
||||
const searchData =
|
||||
parsed.marketplace_search as FacebookMarketplaceSearch;
|
||||
const feedLength = searchData.feed_units?.edges?.length ?? 0;
|
||||
if (feedLength > 0) {
|
||||
marketplaceData = searchData;
|
||||
break;
|
||||
if (!bestEdges?.length) {
|
||||
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
|
||||
if (htmlFallback?.length) {
|
||||
console.log(
|
||||
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
|
||||
);
|
||||
return htmlFallback;
|
||||
}
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
|
||||
if (!marketplaceData?.feed_units?.edges?.length) {
|
||||
console.warn("No marketplace data found in HTML response");
|
||||
return null;
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`,
|
||||
`Successfully parsed ${bestEdges.length} Facebook marketplace listings`,
|
||||
);
|
||||
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
|
||||
return bestEdges.map((edge) => ({ node: edge.node }));
|
||||
}
|
||||
|
||||
/**
|
||||
Extract marketplace item details from Facebook item page HTML
|
||||
Updated for 2026 Facebook Marketplace API structure with multiple extraction paths
|
||||
Updated for 2026 Facebook Marketplace bootstrap candidates
|
||||
*/
|
||||
export function extractFacebookItemData(
|
||||
htmlString: HTMLString,
|
||||
): FacebookMarketplaceItem | null {
|
||||
const { document } = parseHTML(htmlString);
|
||||
const scripts = document.querySelectorAll("script");
|
||||
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||
let bestMatch: FacebookMarketplaceItemMatch | null = null;
|
||||
|
||||
for (const script of scripts) {
|
||||
const scriptText = script.textContent;
|
||||
if (!scriptText) continue;
|
||||
for (const candidate of candidates) {
|
||||
const matches = collectMarketplaceItemCandidates(candidate);
|
||||
|
||||
try {
|
||||
const parsed = JSON.parse(scriptText);
|
||||
|
||||
// Check for the require structure with marketplace product details
|
||||
if (parsed.require && Array.isArray(parsed.require)) {
|
||||
// Try multiple extraction paths discovered from reverse engineering
|
||||
const extractionPaths = [
|
||||
// Path 1: Primary path from current API structure
|
||||
() =>
|
||||
parsed.require[0][3].__bbox.result.data.viewer
|
||||
.marketplace_product_details_page.target,
|
||||
// Path 2: Alternative path with nested require
|
||||
() =>
|
||||
parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data
|
||||
.viewer.marketplace_product_details_page.target,
|
||||
// Path 3: Variation without the [0] index
|
||||
() =>
|
||||
parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data
|
||||
.viewer.marketplace_product_details_page.target,
|
||||
// Path 4-5: Additional fallback paths for edge cases
|
||||
() =>
|
||||
parsed.require[0][3][1]?.__bbox?.result?.data?.viewer
|
||||
?.marketplace_product_details_page?.target,
|
||||
() =>
|
||||
parsed.require[0][3][2]?.__bbox?.result?.data?.viewer
|
||||
?.marketplace_product_details_page?.target,
|
||||
];
|
||||
|
||||
let pathIndex = 0;
|
||||
for (const getPath of extractionPaths) {
|
||||
try {
|
||||
const targetData = getPath();
|
||||
for (const match of matches) {
|
||||
if (
|
||||
targetData &&
|
||||
typeof targetData === "object" &&
|
||||
targetData.id &&
|
||||
targetData.marketplace_listing_title &&
|
||||
targetData.__typename === "GroupCommerceProductItem"
|
||||
!bestMatch ||
|
||||
match.score > bestMatch.score ||
|
||||
(match.score === bestMatch.score && match.path.length < bestMatch.path.length)
|
||||
) {
|
||||
console.log(
|
||||
`Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`,
|
||||
);
|
||||
return targetData as FacebookMarketplaceItem;
|
||||
bestMatch = match;
|
||||
}
|
||||
} catch {
|
||||
// Path not found or invalid, try next path
|
||||
}
|
||||
pathIndex++;
|
||||
}
|
||||
|
||||
// Fallback: Search recursively for marketplace data in the parsed structure
|
||||
const findMarketplaceData = (
|
||||
obj: unknown,
|
||||
depth = 0,
|
||||
maxDepth = 10,
|
||||
): FacebookMarketplaceItem | null => {
|
||||
if (depth > maxDepth) return null; // Prevent infinite recursion
|
||||
if (isRecord(obj)) {
|
||||
// Check if this object matches the expected marketplace item structure
|
||||
const candidate = obj as Record<string, unknown>;
|
||||
if (
|
||||
candidate.marketplace_listing_title &&
|
||||
candidate.id &&
|
||||
candidate.__typename === "GroupCommerceProductItem" &&
|
||||
candidate.redacted_description
|
||||
) {
|
||||
return candidate as unknown as FacebookMarketplaceItem;
|
||||
}
|
||||
// Recursively search nested objects and arrays
|
||||
for (const key in obj) {
|
||||
const value = obj[key];
|
||||
if (isRecord(value) || Array.isArray(value)) {
|
||||
const result = findMarketplaceData(value, depth + 1, maxDepth);
|
||||
if (result) return result;
|
||||
}
|
||||
}
|
||||
} else if (Array.isArray(obj)) {
|
||||
// Search through arrays
|
||||
for (const item of obj) {
|
||||
const result = findMarketplaceData(item, depth + 1, maxDepth);
|
||||
if (result) return result;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
// Search through the entire require structure
|
||||
const recursiveResult = findMarketplaceData(parsed.require);
|
||||
if (recursiveResult) {
|
||||
console.log(
|
||||
"Successfully extracted Facebook item data using recursive search",
|
||||
);
|
||||
return recursiveResult;
|
||||
if (bestMatch) {
|
||||
return bestMatch.item;
|
||||
}
|
||||
|
||||
// Additional search in other potential locations
|
||||
if (
|
||||
parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page
|
||||
?.target
|
||||
) {
|
||||
const bboxData =
|
||||
parsed.__bbox.result.data.viewer.marketplace_product_details_page
|
||||
.target;
|
||||
if (
|
||||
bboxData &&
|
||||
typeof bboxData === "object" &&
|
||||
bboxData.id &&
|
||||
bboxData.marketplace_listing_title &&
|
||||
bboxData.__typename === "GroupCommerceProductItem"
|
||||
) {
|
||||
console.log(
|
||||
"Successfully extracted Facebook item data from __bbox structure",
|
||||
);
|
||||
return bboxData as FacebookMarketplaceItem;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch {}
|
||||
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||
return extractFacebookItemHtmlFallback(htmlString);
|
||||
}
|
||||
|
||||
return null;
|
||||
@@ -838,8 +1089,9 @@ export default async function fetchFacebookItems(
|
||||
console.log(`Using ${cookies.length} cookies for authentication`);
|
||||
|
||||
let searchHtml: string;
|
||||
let searchResponseUrl = searchUrl;
|
||||
try {
|
||||
searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
||||
const response = await fetchHtml(searchUrl, DELAY_MS, {
|
||||
maxRetries: 3,
|
||||
onRateInfo: (remaining, reset) => {
|
||||
if (remaining && reset) {
|
||||
@@ -850,6 +1102,8 @@ export default async function fetchFacebookItems(
|
||||
},
|
||||
cookies: cookiesHeader,
|
||||
});
|
||||
searchHtml = response.html;
|
||||
searchResponseUrl = response.responseUrl;
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
console.warn(
|
||||
@@ -865,6 +1119,24 @@ export default async function fetchFacebookItems(
|
||||
throw err;
|
||||
}
|
||||
|
||||
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
|
||||
if (classification.authGated) {
|
||||
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
|
||||
return [];
|
||||
}
|
||||
|
||||
if (classification.unavailable) {
|
||||
console.warn("Facebook marketplace search returned an unavailable route.");
|
||||
return [];
|
||||
}
|
||||
|
||||
if (classification.kind !== "search") {
|
||||
console.warn(
|
||||
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
|
||||
);
|
||||
return [];
|
||||
}
|
||||
|
||||
const ads = extractFacebookMarketplaceData(searchHtml);
|
||||
if (!ads || ads.length === 0) {
|
||||
console.warn("No ads parsed from Facebook marketplace page.");
|
||||
@@ -916,8 +1188,9 @@ export async function fetchFacebookItem(
|
||||
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
|
||||
|
||||
let itemHtml: string;
|
||||
let itemResponseUrl = itemUrl;
|
||||
try {
|
||||
itemHtml = await fetchHtml(itemUrl, 1000, {
|
||||
const response = await fetchHtml(itemUrl, 1000, {
|
||||
onRateInfo: (remaining, reset) => {
|
||||
if (remaining && reset) {
|
||||
console.log(
|
||||
@@ -927,6 +1200,8 @@ export async function fetchFacebookItem(
|
||||
},
|
||||
cookies: cookiesHeader,
|
||||
});
|
||||
itemHtml = response.html;
|
||||
itemResponseUrl = response.responseUrl;
|
||||
} catch (err) {
|
||||
if (err instanceof HttpError) {
|
||||
console.warn(
|
||||
@@ -967,31 +1242,31 @@ export async function fetchFacebookItem(
|
||||
throw err;
|
||||
}
|
||||
|
||||
const itemData = extractFacebookItemData(itemHtml);
|
||||
if (!itemData) {
|
||||
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
|
||||
|
||||
if (classification.authGated) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (classification.kind !== "item") {
|
||||
logExtractionMetrics(false, itemId);
|
||||
// Enhanced checking for specific failure scenarios
|
||||
if (
|
||||
itemHtml.includes("This listing is no longer available") ||
|
||||
itemHtml.includes("listing has been removed") ||
|
||||
itemHtml.includes("This item has been sold")
|
||||
) {
|
||||
console.warn(
|
||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
||||
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
itemHtml.includes("log in to Facebook") ||
|
||||
itemHtml.includes("You must log in") ||
|
||||
itemHtml.includes("authentication required")
|
||||
) {
|
||||
console.warn(
|
||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
const itemData = extractFacebookItemData(itemHtml);
|
||||
if (!itemData) {
|
||||
logExtractionMetrics(false, itemId);
|
||||
|
||||
console.warn(
|
||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||
import {
|
||||
classifyFacebookResponse,
|
||||
ensureFacebookCookies,
|
||||
extractFacebookBootstrapCandidates,
|
||||
extractFacebookItemData,
|
||||
extractFacebookMarketplaceData,
|
||||
fetchFacebookItem,
|
||||
@@ -367,43 +369,134 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
|
||||
describe("Data Extraction", () => {
|
||||
describe("extractFacebookItemData", () => {
|
||||
test("should extract item data from standard require structure", () => {
|
||||
const mockItemData = {
|
||||
id: "123456",
|
||||
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
listing: {
|
||||
id: "123",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Test Item",
|
||||
formatted_price: { text: "$100.00" },
|
||||
listing_price: { amount: "100.00", currency: "CAD" },
|
||||
marketplace_listing_title: "Vintage Chair",
|
||||
formatted_price: { text: "CA$80" },
|
||||
listing_price: {
|
||||
amount: "80.00",
|
||||
currency: "CAD",
|
||||
amount_with_offset: "80.00",
|
||||
},
|
||||
redacted_description: { text: "Solid wood chair" },
|
||||
location_text: { text: "Toronto, ON" },
|
||||
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
|
||||
condition: "USED",
|
||||
is_live: true,
|
||||
};
|
||||
const mockData = {
|
||||
require: [
|
||||
[
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
{
|
||||
__bbox: {
|
||||
result: {
|
||||
data: {
|
||||
viewer: {
|
||||
marketplace_product_details_page: {
|
||||
target: mockItemData,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
],
|
||||
};
|
||||
const html = `<html><body><script>${JSON.stringify(mockData)}</script></body></html>`;
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("123456");
|
||||
expect(result?.marketplace_listing_title).toBe("Test Item");
|
||||
expect(result?.id).toBe("123");
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
});
|
||||
|
||||
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<h1>Vintage Chair</h1>
|
||||
<span>CA$80</span>
|
||||
<div>Toronto, ON</div>
|
||||
<div>Description</div>
|
||||
<div>Solid wood chair</div>
|
||||
<a href="/marketplace/item/123/">View listing</a>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("123");
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
expect(result?.formatted_price?.text).toBe("CA$80");
|
||||
expect(result?.location_text?.text).toBe("Toronto, ON");
|
||||
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||
});
|
||||
|
||||
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head>
|
||||
<link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
|
||||
</head>
|
||||
<body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<a href="/marketplace/item/999/">
|
||||
<span>Related Chair</span>
|
||||
</a>
|
||||
<h1>Vintage Chair</h1>
|
||||
<span>CA$80</span>
|
||||
<div>Toronto, ON</div>
|
||||
<div>Message seller</div>
|
||||
<div>Seller details</div>
|
||||
<div>Description</div>
|
||||
<div>Solid wood chair</div>
|
||||
<a href="/marketplace/item/123/">View listing</a>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("123");
|
||||
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||
});
|
||||
|
||||
test("prefers the canonical permalink target over earlier decoy items", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
recommendation_units: [
|
||||
{
|
||||
listing: {
|
||||
id: "decoy-1",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Recommended Chair",
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
],
|
||||
target: {
|
||||
id: "real-123",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Canonical Chair",
|
||||
formatted_price: { text: "CA$120" },
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
currency: "CAD",
|
||||
amount_with_offset: "120.00",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookItemData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.id).toBe("real-123");
|
||||
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
|
||||
});
|
||||
|
||||
test("should handle missing item data", () => {
|
||||
@@ -545,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
);
|
||||
});
|
||||
|
||||
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>{invalid: json}</script>
|
||||
<a href="/marketplace/item/987654321/">
|
||||
<span>Vintage Bike</span>
|
||||
<span>CA$120</span>
|
||||
<span>Toronto, ON</span>
|
||||
</a>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const result = extractFacebookMarketplaceData(html);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result?.[0].node.listing.id).toBe("987654321");
|
||||
expect(result?.[0].node.listing.marketplace_listing_title).toBe(
|
||||
"Vintage Bike",
|
||||
);
|
||||
expect(result?.[0].node.listing.listing_price).toEqual({
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
});
|
||||
});
|
||||
|
||||
test("should handle empty search results", () => {
|
||||
const mockData = {
|
||||
require: [
|
||||
@@ -571,6 +691,305 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
||||
const result = extractFacebookMarketplaceData(html);
|
||||
expect(result).toBeNull();
|
||||
});
|
||||
|
||||
test("classifies Comet search responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head><title>Marketplace</title></head>
|
||||
<body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "search",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies Comet item responses", () => {
|
||||
const html = `
|
||||
<html>
|
||||
<body>
|
||||
<script>"XCometMarketplacePermalinkController"</script>
|
||||
<script>{"routing_namespace":"fb_comet"}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/item/123/",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "item",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies login-gated responses before parsing", () => {
|
||||
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "auth_gated",
|
||||
authGated: true,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies unavailable item responses", () => {
|
||||
const html = `<html><body>Marketplace</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unavailable",
|
||||
authGated: false,
|
||||
unavailable: true,
|
||||
});
|
||||
});
|
||||
|
||||
test("classifies unknown responses when no signal is present", () => {
|
||||
const html = `<html><body>Some random page</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unknown",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("does not false-positive on incidental login text", () => {
|
||||
const html = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "unknown",
|
||||
authGated: false,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
|
||||
test("detects auth gating from URL redirect", () => {
|
||||
const html = `<html><body>Redirecting...</body></html>`;
|
||||
|
||||
expect(
|
||||
classifyFacebookResponse(
|
||||
html,
|
||||
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F",
|
||||
),
|
||||
).toEqual({
|
||||
kind: "auth_gated",
|
||||
authGated: true,
|
||||
unavailable: false,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("extractFacebookBootstrapCandidates", () => {
|
||||
test("extracts Comet bootstrap candidates from script tags", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>{"routing_namespace":"fb_comet"}</script>
|
||||
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
|
||||
<script>not json</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const candidates = extractFacebookBootstrapCandidates(html);
|
||||
expect(candidates).toHaveLength(2);
|
||||
expect(candidates[1]).toEqual({
|
||||
data: {
|
||||
marketplace_search_bootstrap: {
|
||||
edges: [{ node: { listing: { id: "1" } } }],
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test("keeps candidate order stable for later scoring", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>{"marker":"first"}</script>
|
||||
<script>{"marker":"second"}</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const candidates = extractFacebookBootstrapCandidates(html);
|
||||
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
|
||||
});
|
||||
|
||||
test("extracts search results from Comet bootstrap candidates", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Bike",
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
},
|
||||
location: {
|
||||
reverse_geocode: {
|
||||
city_page: { display_name: "Toronto" },
|
||||
},
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const ads = extractFacebookMarketplaceData(html);
|
||||
expect(ads).toHaveLength(1);
|
||||
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
|
||||
});
|
||||
|
||||
test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
incidental: {
|
||||
feed_units: {
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "wrong-1",
|
||||
marketplace_listing_title: "Wrong Listing",
|
||||
listing_price: {
|
||||
amount: "1.00",
|
||||
formatted_amount: "CA$1",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "right-1",
|
||||
marketplace_listing_title: "Right Listing",
|
||||
listing_price: {
|
||||
amount: "250.00",
|
||||
formatted_amount: "CA$250",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const ads = extractFacebookMarketplaceData(html);
|
||||
expect(ads).toHaveLength(1);
|
||||
expect(ads?.[0].node.listing.id).toBe("right-1");
|
||||
});
|
||||
|
||||
test("rejects mixed edge arrays that contain non-listing entries", () => {
|
||||
const html = `
|
||||
<html><body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<script>
|
||||
${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Bike",
|
||||
listing_price: {
|
||||
amount: "120.00",
|
||||
formatted_amount: "CA$120",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
node: {
|
||||
story: {
|
||||
id: "not-a-listing",
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}
|
||||
</script>
|
||||
</body></html>
|
||||
`;
|
||||
|
||||
const ads = extractFacebookMarketplaceData(html);
|
||||
expect(ads).toBeNull();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||
import fetchFacebookItems from "../src/scrapers/facebook";
|
||||
import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook";
|
||||
|
||||
// Mock fetch globally
|
||||
const originalFetch = global.fetch;
|
||||
@@ -27,27 +27,19 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
|
||||
describe("Main Search Function", () => {
|
||||
test("should successfully fetch search results", async () => {
|
||||
const mockSearchData = {
|
||||
require: [
|
||||
[
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
__bbox: {
|
||||
result: {
|
||||
data: {
|
||||
marketplace_search: {
|
||||
feed_units: {
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "iPhone 13 Pro",
|
||||
marketplace_listing_title: "iPhone 13",
|
||||
listing_price: {
|
||||
amount: "800.00",
|
||||
formatted_amount: "$800.00",
|
||||
amount: "500.00",
|
||||
formatted_amount: "CA$500",
|
||||
currency: "CAD",
|
||||
},
|
||||
location: {
|
||||
@@ -55,49 +47,20 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
city_page: { display_name: "Toronto" },
|
||||
},
|
||||
},
|
||||
creation_time: 1640995200,
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "2",
|
||||
marketplace_listing_title: "Samsung Galaxy",
|
||||
listing_price: {
|
||||
amount: "600.00",
|
||||
formatted_amount: "$600.00",
|
||||
currency: "CAD",
|
||||
},
|
||||
location: {
|
||||
reverse_geocode: {
|
||||
city_page: { display_name: "Mississauga" },
|
||||
},
|
||||
},
|
||||
creation_time: 1640995300,
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
],
|
||||
};
|
||||
},
|
||||
})}</script></body></html>`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
text: () => Promise.resolve(mockSearchHtml),
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
@@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
|
||||
expect(results).toHaveLength(2);
|
||||
expect(results[0].title).toBe("iPhone 13 Pro");
|
||||
expect(results[1].title).toBe("Samsung Galaxy");
|
||||
expect(results).toHaveLength(1);
|
||||
expect(results[0].title).toBe("iPhone 13");
|
||||
});
|
||||
|
||||
test("should filter out items without price", async () => {
|
||||
@@ -163,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -218,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -259,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -292,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
|
||||
test("should return empty array for auth-gated search HTML", async () => {
|
||||
const authGatedSearchHtml = `
|
||||
<html>
|
||||
<body>
|
||||
<script>"XCometMarketplaceSearchController"</script>
|
||||
<a href="/marketplace/item/123456789/">
|
||||
<span>Vintage Lamp</span>
|
||||
<span>CA$45</span>
|
||||
<span>Toronto, ON</span>
|
||||
</a>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
|
||||
text: () => Promise.resolve(authGatedSearchHtml),
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
|
||||
test("should return empty array when search request lands on unknown route", async () => {
|
||||
const wrongRouteHtml = `<html><body><script>${JSON.stringify({
|
||||
payload: {
|
||||
resultGroups: [
|
||||
{
|
||||
edges: [
|
||||
{
|
||||
node: {
|
||||
listing: {
|
||||
id: "1",
|
||||
marketplace_listing_title: "Leaked Search Result",
|
||||
listing_price: {
|
||||
amount: "75.00",
|
||||
formatted_amount: "CA$75",
|
||||
currency: "CAD",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
})}</script></body></html>`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
url: "https://www.facebook.com/marketplace/toronto/",
|
||||
text: () => Promise.resolve(wrongRouteHtml),
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
|
||||
test("should handle network errors", async () => {
|
||||
global.fetch = mock(() => Promise.reject(new Error("Network error")));
|
||||
|
||||
@@ -358,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -431,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -500,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -571,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
ok: true,
|
||||
text: () =>
|
||||
Promise.resolve(
|
||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||
),
|
||||
headers: {
|
||||
get: () => null,
|
||||
@@ -637,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Item Fetch Function", () => {
|
||||
test("should return null for unavailable item responses", async () => {
|
||||
const unavailableItemHtml = `
|
||||
<html>
|
||||
<body>
|
||||
<script>${JSON.stringify({
|
||||
payload: {
|
||||
listing: {
|
||||
id: "related-123",
|
||||
__typename: "GroupCommerceProductItem",
|
||||
marketplace_listing_title: "Related Listing",
|
||||
formatted_price: { text: "CA$90" },
|
||||
listing_price: {
|
||||
amount: "90.00",
|
||||
currency: "CAD",
|
||||
amount_with_offset: "90.00",
|
||||
},
|
||||
is_live: true,
|
||||
},
|
||||
},
|
||||
})}</script>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
global.fetch = mock(() =>
|
||||
Promise.resolve({
|
||||
ok: true,
|
||||
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||
text: () => Promise.resolve(unavailableItemHtml),
|
||||
headers: {
|
||||
get: () => null,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const result = await fetchFacebookItem("123");
|
||||
expect(result).toBeNull();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user