Compare commits

...

8 Commits

5 changed files with 2090 additions and 325 deletions

View File

@@ -0,0 +1,772 @@
# Facebook Comet Rewrite Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Replace the legacy Facebook Marketplace scraper with a route-aware hybrid Comet-bootstrap parser for both search and item routes.
**Architecture:** Keep authenticated direct HTTP fetches as the transport. Classify each Facebook response first, then parse route-specific Comet bootstrap/state candidates, and fall back to rendered-HTML extraction only when bootstrap decoding cannot produce the expected search or item shape.
**Tech Stack:** Bun, TypeScript, `bun:test`, `linkedom`, existing shared cookie/http helpers
---
## File Structure
- Modify: `packages/core/src/scrapers/facebook.ts`
- Owns Facebook fetch flow, response classification, bootstrap candidate extraction, search parsing, item parsing, and HTML fallbacks.
- Modify: `packages/core/test/facebook-core.test.ts`
- Owns unit coverage for response classification, bootstrap parsing, fallback parsing, and route-aware item/search extraction behavior.
- Modify: `packages/core/test/facebook-integration.test.ts`
- Owns higher-level fetch flow tests, auth/degradation behavior, and result shaping for search/item entrypoints.
### Task 1: Add Route Classification Coverage
**Files:**
- Modify: `packages/core/test/facebook-core.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-core.test.ts`
- [ ] **Step 1: Write the failing tests**
Add these tests near the Facebook parser tests in `packages/core/test/facebook-core.test.ts`:
```ts
test("classifies Comet search responses", () => {
const html = `
<html>
<head><title>Marketplace</title></head>
<body>
<script>"XCometMarketplaceSearchController"</script>
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
</body>
</html>
`;
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/search?query=bike")).toEqual({
kind: "search",
authGated: false,
unavailable: false,
});
});
test("classifies Comet item responses", () => {
const html = `
<html>
<body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{"routing_namespace":"fb_comet"}</script>
</body>
</html>
`;
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/item/123/")).toEqual({
kind: "item",
authGated: false,
unavailable: false,
});
});
test("classifies login-gated responses before parsing", () => {
const html = `<html><body>You must log in to Facebook</body></html>`;
expect(classifyFacebookResponse(html, "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F")).toEqual({
kind: "auth_gated",
authGated: true,
unavailable: false,
});
});
test("classifies unavailable item responses", () => {
const html = `<html><body>Marketplace</body></html>`;
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/?unavailable_product=1")).toEqual({
kind: "unavailable",
authGated: false,
unavailable: true,
});
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
Expected: FAIL because `classifyFacebookResponse` does not exist yet.
- [ ] **Step 3: Write minimal implementation**
Add this type and function near the parsing section in `packages/core/src/scrapers/facebook.ts`:
```ts
/** Every outcome the response classifier can report. */
export type FacebookResponseKind = "search" | "item" | "auth_gated" | "unavailable" | "unknown";

/**
 * Result of classifying a fetched Facebook response.
 *
 * `authGated` and `unavailable` mirror `kind` so callers can branch on either
 * the discriminant or the boolean flags.
 */
export type FacebookResponseClassification =
  | { kind: "auth_gated"; authGated: true; unavailable: false }
  | { kind: "unavailable"; authGated: false; unavailable: true }
  | {
      kind: Exclude<FacebookResponseKind, "auth_gated" | "unavailable">;
      authGated: false;
      unavailable: false;
    };

/**
 * Classifies a fetched Facebook response before any data extraction runs.
 *
 * Checks are ordered by precedence: login gating, then the unavailable-item
 * redirect, then the Comet search controller marker, then the Comet permalink
 * controller marker. Anything else is reported as `unknown`.
 *
 * @param htmlString - Body of the fetched response.
 * @param responseUrl - URL the response was served from; the `/login/` and
 *   `unavailable_product=1` signals are read from this string.
 * @returns The response kind together with its auth/unavailable flags.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
): FacebookResponseClassification {
  // "You must log in to Facebook" contains "log in to Facebook", so a single
  // substring check covers both phrasings.
  const authGated =
    responseUrl.includes("/login/") || htmlString.includes("log in to Facebook");
  if (authGated) {
    return { kind: "auth_gated", authGated: true, unavailable: false };
  }

  if (responseUrl.includes("unavailable_product=1")) {
    return { kind: "unavailable", authGated: false, unavailable: true };
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return { kind: "search", authGated: false, unavailable: false };
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return { kind: "item", authGated: false, unavailable: false };
  }

  return { kind: "unknown", authGated: false, unavailable: false };
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
git commit -m "refactor: add facebook response classification"
```
### Task 2: Add Bootstrap Candidate Extraction
**Files:**
- Modify: `packages/core/test/facebook-core.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-core.test.ts`
- [ ] **Step 1: Write the failing tests**
Add these tests:
```ts
test("extracts Comet bootstrap candidates from script tags", () => {
const html = `
<html><body>
<script>{"routing_namespace":"fb_comet"}</script>
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
<script>not json</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates).toHaveLength(2);
expect(candidates[1]).toEqual({
data: {
marketplace_search_bootstrap: {
edges: [{ node: { listing: { id: "1" } } }],
},
},
});
});
test("keeps candidate order stable for later scoring", () => {
const html = `
<html><body>
<script>{"marker":"first"}</script>
<script>{"marker":"second"}</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates.map((candidate) => candidate.marker)).toEqual(["first", "second"]);
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"`
Expected: FAIL because `extractFacebookBootstrapCandidates` does not exist.
- [ ] **Step 3: Write minimal implementation**
Add this helper near the parser utilities in `packages/core/src/scrapers/facebook.ts`:
```ts
/**
 * Collects every `<script>` body in the document that parses as JSON and is
 * accepted by `isRecord`, preserving document order.
 *
 * Scripts that are empty, are not valid JSON, or parse to a value `isRecord`
 * rejects (for example a bare string) are skipped silently.
 *
 * @param htmlString - Raw HTML of a fetched Facebook page.
 * @returns Parsed candidate payloads in the order their scripts appear.
 */
export function extractFacebookBootstrapCandidates(htmlString: HTMLString): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptNodes = Array.from(document.querySelectorAll("script")) as HTMLScriptElement[];
  const found: Record<string, unknown>[] = [];

  for (const node of scriptNodes) {
    const body = node.textContent?.trim();
    if (!body) {
      continue;
    }

    let decoded: unknown;
    try {
      decoded = JSON.parse(body);
    } catch {
      // Not a JSON payload (plain JavaScript or other text); skip it.
      continue;
    }

    if (isRecord(decoded)) {
      found.push(decoded);
    }
  }

  return found;
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
git commit -m "refactor: add facebook bootstrap candidate extraction"
```
### Task 3: Replace Search Parsing With Candidate Scoring
**Files:**
- Modify: `packages/core/test/facebook-core.test.ts`
- Modify: `packages/core/test/facebook-integration.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-core.test.ts`
- Test: `packages/core/test/facebook-integration.test.ts`
- [ ] **Step 1: Write the failing tests**
Add a core test for route-aware search extraction:
```ts
test("extracts search results from Comet bootstrap candidates", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "Bike",
listing_price: {
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Toronto" },
},
},
is_live: true,
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
expect(ads).toHaveLength(1);
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
});
```
Replace one integration fixture with a current-shape search fixture:
```ts
const mockSearchHtml = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>${JSON.stringify({
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "iPhone 13",
listing_price: {
amount: "500.00",
formatted_amount: "CA$500",
currency: "CAD",
},
location: { reverse_geocode: { city_page: { display_name: "Toronto" } } },
is_live: true,
},
},
},
],
},
],
},
})}</script>
</body></html>
`;
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet bootstrap candidates"`
Expected: FAIL because the current search extractor only understands legacy `marketplace_search` shapes.
- [ ] **Step 3: Write minimal implementation**
Replace the search extraction internals in `extractFacebookMarketplaceData()` with candidate scoring like this:
```ts
/**
 * Depth-first search of a decoded bootstrap payload for search-result edges.
 *
 * At each object, two shapes are probed in order:
 *   1. `feed_units.edges`
 *   2. the first entry of `resultGroups` whose `edges` array is non-empty
 * If neither matches, every property value is searched recursively.
 *
 * Empty `edges` arrays are skipped rather than returned, so an empty leading
 * group cannot hide a populated group elsewhere in the same payload.
 *
 * @param candidate - Any decoded JSON value (object, array, or primitive).
 * @returns The first non-empty edge list found, or `null` when there is none.
 */
function findSearchEdges(candidate: unknown): FacebookEdge[] | null {
  if (Array.isArray(candidate)) {
    for (const item of candidate) {
      const result = findSearchEdges(item);
      if (result) return result;
    }
    return null;
  }

  if (!isRecord(candidate)) {
    return null;
  }

  // `feed_units` is `unknown` after narrowing, so it must be narrowed itself
  // before `edges` can be read (assumes `isRecord` is a type guard — confirm).
  const feedUnits = candidate.feed_units;
  if (isRecord(feedUnits) && Array.isArray(feedUnits.edges) && feedUnits.edges.length > 0) {
    return feedUnits.edges as FacebookEdge[];
  }

  const resultGroups = candidate.resultGroups;
  if (Array.isArray(resultGroups)) {
    for (const group of resultGroups) {
      if (isRecord(group) && Array.isArray(group.edges) && group.edges.length > 0) {
        return group.edges as FacebookEdge[];
      }
    }
  }

  for (const value of Object.values(candidate)) {
    const result = findSearchEdges(value);
    if (result) return result;
  }

  return null;
}
/**
 * Extracts search-result nodes from a Marketplace search page.
 *
 * Bootstrap candidates are tried in document order; the first one that yields
 * a non-empty edge list wins. Logs a warning and returns `null` when no
 * candidate produces edges.
 *
 * @param htmlString - Raw HTML of the fetched search page.
 * @returns One `{ node }` wrapper per edge, or `null` when nothing was found.
 */
export function extractFacebookMarketplaceData(htmlString: HTMLString): FacebookAdNode[] | null {
  for (const bootstrap of extractFacebookBootstrapCandidates(htmlString)) {
    const found = findSearchEdges(bootstrap);
    if (!found || found.length === 0) {
      continue;
    }
    return found.map(({ node }) => ({ node }));
  }

  console.warn("No marketplace data found in HTML response");
  return null;
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
Expected: PASS for the rewritten search fixtures and existing unaffected tests.
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
git commit -m "refactor: rewrite facebook search parser for comet bootstrap"
```
### Task 4: Replace Item Parsing With Candidate Scoring
**Files:**
- Modify: `packages/core/test/facebook-core.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-core.test.ts`
- [ ] **Step 1: Write the failing tests**
Replace one old item fixture with a current-shape item fixture:
```ts
test("extracts item details from Comet permalink bootstrap candidates", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
listing: {
id: "123",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Vintage Chair",
formatted_price: { text: "CA$80" },
listing_price: { amount: "80.00", currency: "CAD", amount_with_offset: "80.00" },
redacted_description: { text: "Solid wood chair" },
location_text: { text: "Toronto, ON" },
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
condition: "USED",
is_live: true,
},
},
})}
</script>
</body></html>
`;
const item = extractFacebookItemData(html);
expect(item?.id).toBe("123");
expect(item?.marketplace_listing_title).toBe("Vintage Chair");
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet permalink bootstrap"`
Expected: FAIL because the current item extractor depends on legacy permalink markers.
- [ ] **Step 3: Write minimal implementation**
Replace the item extraction internals with a semantic candidate finder like this:
```ts
/**
 * Depth-first search of a decoded bootstrap payload for a listing object.
 *
 * An object qualifies when it has a truthy `id`, a `__typename` of
 * `"GroupCommerceProductItem"`, and a truthy `marketplace_listing_title`.
 * An object is tested before its children are visited.
 *
 * @param candidate - Any decoded JSON value (object, array, or primitive).
 * @returns The first qualifying object, or `null` when there is none.
 */
function findMarketplaceItemCandidate(candidate: unknown): FacebookMarketplaceItem | null {
  let children: unknown[];

  if (Array.isArray(candidate)) {
    children = candidate;
  } else if (isRecord(candidate)) {
    const looksLikeListing =
      Boolean(candidate.id) &&
      candidate.__typename === "GroupCommerceProductItem" &&
      Boolean(candidate.marketplace_listing_title);
    if (looksLikeListing) {
      return candidate as FacebookMarketplaceItem;
    }
    children = Object.values(candidate);
  } else {
    // Primitives and null cannot contain a listing.
    return null;
  }

  for (const child of children) {
    const hit = findMarketplaceItemCandidate(child);
    if (hit !== null) {
      return hit;
    }
  }

  return null;
}
/**
 * Extracts the listing object from a Marketplace item page.
 *
 * Bootstrap candidates are searched in document order and the first listing
 * found is returned.
 *
 * @param htmlString - Raw HTML of the fetched item page.
 * @returns The listing object, or `null` when no candidate contains one.
 */
export function extractFacebookItemData(htmlString: HTMLString): FacebookMarketplaceItem | null {
  for (const bootstrap of extractFacebookBootstrapCandidates(htmlString)) {
    const listing = findMarketplaceItemCandidate(bootstrap);
    if (listing) {
      return listing;
    }
  }

  return null;
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-core.test.ts`
Expected: PASS for current-shape item tests and remaining parser tests.
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
git commit -m "refactor: rewrite facebook item parser for comet bootstrap"
```
### Task 5: Add HTML Fallback Extraction
**Files:**
- Modify: `packages/core/test/facebook-core.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-core.test.ts`
- [ ] **Step 1: Write the failing tests**
Add these fallback tests:
```ts
test("falls back to rendered search HTML when bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<a href="https://www.facebook.com/marketplace/item/123/?ref=search">Vintage Lamp</a>
<span>CA$45</span>
<span>Toronto, ON</span>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
const parsed = ads ? parseFacebookAds(ads) : [];
expect(parsed[0].title).toBe("Vintage Lamp");
expect(parsed[0].listingPrice?.amountFormatted).toBe("CA$45");
});
test("falls back to rendered item HTML when bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<h1>Vintage Desk</h1>
<span>CA$120</span>
<span>Condition Used - Good</span>
<div>Description Solid oak desk.</div>
<div>Seller information Jordan</div>
</body></html>
`;
const item = extractFacebookItemData(html);
expect(item?.marketplace_listing_title).toBe("Vintage Desk");
expect(item?.formatted_price?.text).toBe("CA$120");
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
Expected: FAIL because the extractor currently returns `null` without a structured candidate.
- [ ] **Step 3: Write minimal implementation**
Add route-specific HTML fallback helpers in `packages/core/src/scrapers/facebook.ts`:
```ts
/**
 * Last-resort search extraction from rendered HTML, used when no bootstrap
 * payload can be decoded.
 *
 * Every `/marketplace/item/<id>/` anchor whose opening tag is followed directly
 * by text is treated as one listing. Price and city are looked up only in the
 * HTML between that anchor and the next one, so values from different cards
 * are never combined. Anchors with no `CA$` price in their own segment, or
 * with a blank title, are skipped. Each item ID is reported at most once.
 *
 * Results are summary-only: the currency is fixed to CAD and `is_live` is
 * always set to `true`, because the rendered HTML carries neither explicitly.
 *
 * @param htmlString - Raw HTML of the fetched search page.
 * @returns One node per recoverable listing, or `null` when none is found.
 */
function extractSearchFallback(htmlString: HTMLString): FacebookAdNode[] | null {
  const anchorPattern = /marketplace\/item\/(\d+)\/[^>]*>([^<]+)</g;
  const pricePattern = /CA\$\d+(?:,\d{3})*(?:\.\d{2})?/;
  const cityPattern = /([A-Z][a-z]+,\s*[A-Z]{2})/;

  const anchors = Array.from(htmlString.matchAll(anchorPattern));
  const seenIds = new Set<string>();
  const ads: FacebookAdNode[] = [];

  anchors.forEach((anchor, position) => {
    const [, id, rawTitle] = anchor;
    if (id === undefined || rawTitle === undefined) return;

    const title = rawTitle.trim();
    if (!title || seenIds.has(id)) return;

    // Scope the price/city lookup to this card: from this anchor up to the
    // next item anchor, or to the end of the document for the last one.
    const segmentStart = anchor.index ?? 0;
    const segmentEnd = anchors[position + 1]?.index ?? htmlString.length;
    const segment = htmlString.slice(segmentStart, segmentEnd);

    const priceMatch = segment.match(pricePattern);
    if (!priceMatch) return;

    const formattedAmount = priceMatch[0];
    const cityMatch = segment.match(cityPattern);

    seenIds.add(id);
    ads.push({
      node: {
        listing: {
          id,
          marketplace_listing_title: title,
          listing_price: {
            amount: formattedAmount.replace("CA$", "").replace(/,/g, ""),
            formatted_amount: formattedAmount,
            currency: "CAD",
          },
          location: cityMatch
            ? { reverse_geocode: { city_page: { display_name: cityMatch[1].split(",")[0] } } }
            : undefined,
          is_live: true,
        },
      },
    });
  });

  return ads.length > 0 ? ads : null;
}
/**
 * Last-resort item extraction from rendered HTML, used when no bootstrap
 * payload can be decoded.
 *
 * Requires an `<h1>` title and a `CA$` price; returns `null` if either is
 * missing. The description is the text that follows the first occurrence of
 * the word "Description", up to the next `<`. The returned `id` is the fixed
 * placeholder `"fallback-item"`.
 *
 * @param htmlString - Raw HTML of the fetched item page.
 * @returns A partial listing built from the rendered text, or `null`.
 */
function extractItemFallback(htmlString: HTMLString): FacebookMarketplaceItem | null {
  const heading = htmlString.match(/<h1[^>]*>([^<]+)<\/h1>/i);
  const price = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/);
  if (!heading || !price) {
    return null;
  }

  const formattedPrice = price[0];
  // Strip the currency prefix and thousands separators once; both amount
  // fields carry the same normalised string.
  const numericAmount = formattedPrice.replace("CA$", "").replace(/,/g, "");

  let description = "";
  if (htmlString.includes("Description")) {
    description = htmlString.split("Description")[1].split("<")[0].trim();
  }

  return {
    id: "fallback-item",
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading[1].trim(),
    formatted_price: { text: formattedPrice },
    listing_price: {
      amount: numericAmount,
      currency: "CAD",
      amount_with_offset: numericAmount,
    },
    redacted_description: { text: description },
    is_live: true,
  };
}
```
Then call these helpers as the last fallback inside `extractFacebookMarketplaceData()` and `extractFacebookItemData()`.
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
git commit -m "refactor: add facebook html fallbacks"
```
### Task 6: Wire Route-Aware Failures Into Entry Points
**Files:**
- Modify: `packages/core/test/facebook-integration.test.ts`
- Modify: `packages/core/src/scrapers/facebook.ts`
- Test: `packages/core/test/facebook-integration.test.ts`
- [ ] **Step 1: Write the failing tests**
Add these integration tests:
```ts
test("returns empty search results for auth-gated search HTML", async () => {
global.fetch = mock(() =>
Promise.resolve({
ok: true,
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
text: () => Promise.resolve("<html><body>You must log in to Facebook</body></html>"),
headers: { get: () => null },
}),
);
const results = await fetchFacebookItems("bike", 1, "toronto", 25);
expect(results).toEqual([]);
});
test("returns null for unavailable item responses", async () => {
global.fetch = mock(() =>
Promise.resolve({
ok: true,
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
text: () => Promise.resolve("<html><body>Marketplace</body></html>"),
headers: { get: () => null },
}),
);
const item = await fetchFacebookItem("123");
expect(item).toBeNull();
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `bun test packages/core/test/facebook-integration.test.ts --test-name-pattern "auth-gated|unavailable"`
Expected: FAIL because the entrypoints do not yet classify successful HTML responses by route/auth state.
- [ ] **Step 3: Write minimal implementation**
Update both entrypoints to classify successful HTML before parsing:
```ts
const responseClass = classifyFacebookResponse(searchHtml, searchUrl);
if (responseClass.kind === "auth_gated") {
console.warn("Facebook marketplace search is auth-gated. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.");
return [];
}
const itemResponseClass = classifyFacebookResponse(itemHtml, itemUrl);
if (itemResponseClass.kind === "auth_gated") {
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
return null;
}
if (itemResponseClass.kind === "unavailable") {
console.warn(`Item ${itemId} appears to be unavailable in the marketplace.`);
return null;
}
```
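Classify against the final response URL (after redirects), not the URL that was requested: a redirect to `/login/` or to `?unavailable_product=1` is only visible in the final URL, so `searchUrl` and `itemUrl` in the snippet above must hold that final URL. If `fetchHtml` does not yet return both the HTML and the final URL, extend it to do so as part of this task.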
- [ ] **Step 4: Run test to verify it passes**
Run: `bun test packages/core/test/facebook-integration.test.ts`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-integration.test.ts
git commit -m "refactor: handle facebook route-aware failure states"
```
### Task 7: Run Full Verification And Live Probe
**Files:**
- Modify: `packages/core/src/scrapers/facebook.ts` if small cleanup is required
- Modify: `packages/core/test/facebook-core.test.ts` if small cleanup is required
- Modify: `packages/core/test/facebook-integration.test.ts` if small cleanup is required
- [ ] **Step 1: Run focused Facebook tests**
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
Expected: PASS
- [ ] **Step 2: Run broader core tests**
Run: `bun test packages/core/test`
Expected: PASS
- [ ] **Step 3: Run live authenticated Facebook probe**
Run:
```bash
set -a && source .env && set +a && bun --eval 'import { fetchFacebookItems, fetchFacebookItem } from "./packages/core/src/index.ts";
const results = await fetchFacebookItems("iphone", 1, "toronto", 3);
console.log("SEARCH_COUNT=" + results.length);
console.log(JSON.stringify(results[0] ?? null));
if (results[0]?.url) {
const match = results[0].url.match(/\/item\/(\d+)/);
if (match) {
const item = await fetchFacebookItem(match[1]);
console.log(JSON.stringify(item));
}
}'
```
Expected:
- search returns at least one result
- item fetch returns non-null for the first live result when the route is not stale/unavailable
- [ ] **Step 4: Make any minimal cleanup needed to keep tests and live probe green**
If cleanup is needed, keep it limited to naming, dead-code removal caused by the rewrite, or small parser corrections directly exposed by the verification commands.
- [ ] **Step 5: Re-run verification**
Run:
```bash
bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts && bun test packages/core/test
```
Expected: PASS
- [ ] **Step 6: Commit**
```bash
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
git commit -m "refactor: complete facebook comet scraper rewrite"
```
## Self-Review
- Spec coverage: the plan covers classification, route-aware search parsing, route-aware item parsing, HTML fallbacks, explicit failure-state handling, test replacement, and live verification.
- Placeholder scan: no `TODO`, `TBD`, or unspecified “handle appropriately” steps remain.
- Type consistency: all planned functions and types use the same names across tasks: `classifyFacebookResponse`, `extractFacebookBootstrapCandidates`, `extractFacebookMarketplaceData`, and `extractFacebookItemData`.

View File

@@ -0,0 +1,226 @@
# Facebook Comet Rewrite Design
## Summary
Replace the legacy Facebook Marketplace scraper with a route-aware implementation built around current Comet bootstrap markers and route-specific extraction.
The new scraper will keep authenticated direct HTTP fetches as the primary transport, but it will stop treating legacy `require`, `__bbox`, and `marketplace_product_details_page` structures as the main parsing contract.
## Goals
- Replace both Facebook search and item-detail extraction with a current-shape parser.
- Keep authenticated direct HTTP requests as the primary fetch strategy.
- Parse route-specific Comet bootstrap/state payloads before falling back to rendered-HTML extraction.
- Detect auth-gated, unavailable, and unknown responses explicitly.
- Update tests so they model current route markers and failure modes instead of legacy page objects.
## Non-Goals
- Reworking non-Facebook scrapers.
- Converting the scraper to browser-only automation.
- Preserving old parser behavior for `marketplace_product_details_page` or `__bbox`-driven item extraction.
- Reverse-engineering every internal Facebook bootstrap payload shape exhaustively before implementation.
## Current State
The current implementation in `packages/core/src/scrapers/facebook.ts` still uses authenticated HTTP requests, which remains correct.
The search path parses embedded script JSON and looks for `marketplace_search.feed_units.edges`.
The item-detail path is centered on legacy extraction paths such as:
- `parsed.require[0][3].__bbox.result.data.viewer.marketplace_product_details_page.target`
- nested `__bbox.require[...]` variations
- recursive search through `parsed.require`
Live evidence gathered earlier in this session and by the isolated research subagent shows that current Facebook Marketplace pages are Comet route-driven and expose markers such as:
- `XCometMarketplaceSearchController`
- `XCometMarketplacePermalinkController`
- `"routing_namespace":"fb_comet"`
- `"use_ssr_state_manager":true`
- `ServerJS`
- `Bootloader`
- `data-sjs`
- `data-btmanifest`
The same live investigation also showed that authenticated item pages no longer expose the old `marketplace_product_details_page` marker reliably, while live search still returns usable results.
## Chosen Approach
Use a hybrid Comet-bootstrap parser.
The scraper will:
1. Fetch authenticated HTML directly.
2. Classify the response using current route and auth markers.
3. Parse inline bootstrap/state payloads using route-specific probes.
4. Fall back to rendered-HTML extraction only when bootstrap markers are present but the payload cannot be decoded into the expected search or item shape.
This keeps the cheaper direct-HTTP transport while shifting the parser contract from legacy page-object names to current Comet route structure.
## Design
### Route Classification
Add a small response-classification layer before data extraction.
It should identify these states from the fetched response URL and HTML:
- `auth_gated`
- `unavailable`
- `search`
- `item`
- `unknown`
Signals to use:
- final URL containing `/login/` or login-shell text
- final URL containing `unavailable_product=1`
- search controller markers such as `XCometMarketplaceSearchController`
- item controller markers such as `XCometMarketplacePermalinkController`
- shared Comet markers such as `"routing_namespace":"fb_comet"`
This classification layer becomes the top-level contract for both fetch functions.
### Search Extraction
The search path will be rewritten around Comet search-route markers.
Primary behavior:
- fetch the Marketplace search HTML with auth cookies
- confirm the response class is `search`
- extract inline bootstrap/state blobs from script tags and page attributes
- probe for route-specific search payloads associated with `XCometMarketplaceSearchController`
- map decoded search results into summary listing records
Search summary fields should remain aligned with the current public output shape:
- item URL
- title
- formatted price and normalized cents when possible
- city/address summary when present
- seller summary when present in the search payload
- category/status/media fields only when they are present with stable meaning
Fallback behavior:
- if search route markers are present but structured payload decoding fails, extract listing summaries from rendered HTML anchors and text patterns
- use item links matching `/marketplace/item/<id>` as the anchor for fallback extraction
- treat fallback results as summary-only data, not rich detail data
### Item Extraction
The item-detail path will be rewritten around the Comet permalink route.
Primary behavior:
- fetch the item permalink HTML with auth cookies
- confirm the response class is `item`
- extract inline bootstrap/state blobs from script tags and page attributes
- probe for permalink payloads associated with `XCometMarketplacePermalinkController`
- decode the richest recoverable item record and map it into `FacebookListingDetails`
Priority item fields:
- item ID and permalink URL
- title
- formatted price and normalized cents when possible
- condition
- description
- listed age / creation date when derivable
- approximate location
- seller name and seller ID when present
- listing status when the payload makes it explicit
Fallback behavior:
- if permalink route markers are present but no stable payload object is decodable, extract data from rendered HTML text structure
- prioritize title, price, condition, description, location text, and seller module content
- return partial item data when core user-facing fields are present rather than failing solely because deeper commerce metadata is missing
### Bootstrap Parsing Strategy
The parser should stop assuming a single stable JSON path.
Instead, it should work in two phases:
1. Discover candidate bootstrap payloads.
2. Score candidates against the expected route shape.
Candidate discovery inputs:
- raw `<script>` contents
- `data-sjs` and related page attributes
- `ServerJS` / `Bootloader` inline blobs
- route controller names
Candidate scoring for search should favor objects that contain repeated result-card semantics, item IDs, listing links, titles, prices, or location summaries.
Candidate scoring for item pages should favor objects that contain singular listing semantics, title, price, condition, description, location, seller, or permalink context.
The parser should not depend on one hard-coded object name surviving forever.
Instead, it should look for route-specific semantic clusters and choose the strongest candidate.
### Legacy Removal
The old Facebook scraper should be removed as a primary strategy.
Specifically:
- delete old item-detail extraction paths centered on `marketplace_product_details_page`
- delete legacy-first `require` / `__bbox` navigation tables
- delete tests whose only purpose is to preserve those legacy paths
If a minimal legacy compatibility branch remains, it must be a last-resort fallback behind the new route-aware parser and should not shape test fixtures or design decisions.
### Error Handling
Facebook responses should now fail with explicit route-aware outcomes:
1. Missing/invalid auth cookie input.
2. Auth-gated response.
3. Unavailable or stale item response.
4. Search or item route detected, but no decodable data found.
5. Unknown response shape.
Error messages should name the actual class of failure instead of implying that every parse miss is caused by expired cookies.
### Testing Strategy
Follow TDD for the rewrite.
Write failing tests for the new route-aware parser before replacing production code.
Coverage targets:
1. Search responses classify correctly from current Comet controller markers.
2. Item responses classify correctly from current Comet controller markers.
3. Login-gated and unavailable responses are detected before parsing.
4. Search bootstrap parsing produces summary listing results from current-shape fixtures.
5. Item bootstrap parsing produces rich listing details from current-shape fixtures.
6. Search fallback extraction works when route markers exist but structured payload decoding fails.
7. Item fallback extraction works when route markers exist but structured payload decoding fails.
8. Old legacy-only item fixtures are removed or rewritten so they no longer define the contract.
Verification target after implementation:
- `bun test packages/core/test/facebook-core.test.ts`
- `bun test packages/core/test/facebook-integration.test.ts`
- a live authenticated Facebook probe covering search and item routes
## Public API Surface
Keep the current public function names unless the rewrite proves that a signature change is required:
- `fetchFacebookItems(...)`
- `fetchFacebookItem(...)`
- `extractFacebookMarketplaceData(...)`
- `extractFacebookItemData(...)`
The internals should change substantially, but callers should not need a new integration surface for this rewrite.
## Risks
- Facebook may change bootstrap payload naming again, so route/controller markers are more stable than exact nested object paths but still not guaranteed.
- Search and item pages may each contain multiple partial payloads, making candidate ranking important.
- Fallback rendered-HTML extraction may be noisier than bootstrap decoding and needs clear precedence rules.
- Live fixtures can drift from production quickly, so tests must model route semantics rather than exact one-off payloads where possible.
## Rollout Notes
The code, fixtures, and tests should change together.
There should be no mixed state where the implementation is Comet-aware but the tests still encode `marketplace_product_details_page` as the primary contract.

View File

@@ -75,13 +75,6 @@ interface FacebookEdge {
[k: string]: unknown;
}
interface FacebookMarketplaceSearch {
feed_units?: {
edges?: FacebookEdge[];
};
[k: string]: unknown;
}
interface FacebookMarketplaceItem {
// Basic identification
id: string;
@@ -173,6 +166,10 @@ interface FacebookMarketplaceItem {
[k: string]: unknown;
}
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
export interface FacebookListingDetails {
url: string;
title: string;
@@ -286,7 +283,7 @@ async function fetchHtml(
onRateInfo?: (remaining: string | null, reset: string | null) => void;
cookies?: string;
},
): Promise<HTMLString> {
): Promise<{ html: HTMLString; responseUrl: string }> {
const maxRetries = opts?.maxRetries ?? 3;
const retryBaseMs = opts?.retryBaseMs ?? 500;
@@ -357,7 +354,7 @@ async function fetchHtml(
const html = await res.text();
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
await delay(DELAY_MS);
return html;
return { html, responseUrl: res.url || url };
} catch (err) {
if (attempt >= maxRetries) throw err;
await delay((attempt + 1) * retryBaseMs);
@@ -369,223 +366,477 @@ async function fetchHtml(
// ----------------------------- Parsing -----------------------------
/**
 * Route kinds a fetched Facebook Marketplace response can be classified as.
 * "unknown" is the label used when no other kind applies.
 */
export type FacebookResponseKind =
| "search"
| "item"
| "auth_gated"
| "unavailable"
| "unknown";
/**
 * Classify a fetched Facebook response before any payload parsing is attempted.
 *
 * Checks run in priority order: login gating, unavailable listing, item route
 * (by response URL), then the Comet search / permalink controller markers.
 *
 * @param htmlString - Raw response body.
 * @param responseUrl - URL the response was served from, as reported by the fetch layer.
 * @returns The detected `kind` plus `authGated` / `unavailable` convenience flags.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
) {
  // Login redirects are not always `/login/`: `/login.php?next=...`,
  // `/login?next=...` and a bare `/login` must be treated the same way.
  const redirectedToLogin = /\/login(?:\/|\.php|\?|$)/.test(responseUrl);
  const authGated =
    redirectedToLogin ||
    htmlString.includes("You must log in") ||
    htmlString.includes("log in to continue");
  if (authGated) {
    return { kind: "auth_gated" as const, authGated: true, unavailable: false };
  }

  const unavailable =
    responseUrl.includes("unavailable_product=1") ||
    htmlString.includes("This listing is no longer available") ||
    htmlString.includes("listing has been removed");
  if (unavailable) {
    return { kind: "unavailable" as const, authGated: false, unavailable: true };
  }

  // The URL is checked before the HTML markers, so an item URL wins even when
  // the body also mentions the search controller.
  if (responseUrl.includes("/marketplace/item/")) {
    return { kind: "item" as const, authGated: false, unavailable: false };
  }
  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return { kind: "search" as const, authGated: false, unavailable: false };
  }
  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return { kind: "item" as const, authGated: false, unavailable: false };
  }
  return { kind: "unknown" as const, authGated: false, unavailable: false };
}
/**
 * Collect every `<script>` body in the page that parses as JSON and passes
 * `isRecord`, in document order.
 *
 * @param htmlString - Raw response body.
 * @returns The decoded script payloads; scripts that are empty, not JSON, or
 *          rejected by `isRecord` are left out.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  const decodedPayloads: Record<string, unknown>[] = [];
  scriptElements.forEach((element) => {
    const body = element.textContent?.trim();
    if (!body) {
      return;
    }
    try {
      const decoded = JSON.parse(body);
      if (isRecord(decoded)) {
        decodedPayloads.push(decoded as Record<string, unknown>);
      }
    } catch {
      // Not a JSON script body; ignore it.
    }
  });
  return decodedPayloads;
}
/**
 * Type guard: true only for a non-empty array in which every entry has a
 * `node` record that in turn has a `listing` record.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }
  // A single non-listing entry disqualifies the whole array.
  const hasNonListingEntry = value.some(
    (entry) =>
      !(isRecord(entry) && isRecord(entry.node) && isRecord(entry.node.listing)),
  );
  return !hasNonListingEntry;
}
/**
 * Add a size bonus to a base score: one point per edge, capped at three.
 *
 * @param edges - Edge list being ranked.
 * @param score - Base score accumulated so far.
 * @returns `score` plus the capped edge-count bonus.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const sizeBonus = edges.length < 3 ? edges.length : 3;
  return score + sizeBonus;
}
/**
 * Recursively search a decoded bootstrap value for the highest-scoring list of
 * listing edges.
 *
 * Scoring: `feed_units.edges` earns +2, each `resultGroups[].edges` earns +4,
 * descending through a key named "payload" earns +1, and every hit gets the
 * size bonus from `scoreSearchEdges`. Ties keep the match found first.
 *
 * @param candidate - Any decoded JSON value.
 * @param score - Score inherited from the path walked so far.
 * @returns The best edge list with its score, or `null` when none is found.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  type EdgeMatch = { edges: FacebookEdge[]; score: number };

  // Strictly-greater comparison, so an earlier match survives a tie.
  const keepStronger = (
    held: EdgeMatch | null,
    contender: EdgeMatch | null,
  ): EdgeMatch | null =>
    contender && (!held || contender.score > held.score) ? contender : held;

  if (Array.isArray(candidate)) {
    return candidate.reduce<EdgeMatch | null>(
      (held, entry) => keepStronger(held, findSearchEdges(entry, score)),
      null,
    );
  }
  if (!isRecord(candidate)) {
    return null;
  }

  let held: EdgeMatch | null = null;

  const feedUnits = candidate.feed_units;
  if (isRecord(feedUnits) && isFacebookSearchEdgeArray(feedUnits.edges)) {
    held = {
      edges: feedUnits.edges,
      score: scoreSearchEdges(feedUnits.edges, score + 2),
    };
  }

  const resultGroups = candidate.resultGroups;
  if (Array.isArray(resultGroups)) {
    for (const group of resultGroups) {
      if (!isRecord(group) || !isFacebookSearchEdgeArray(group.edges)) {
        continue;
      }
      held = keepStronger(held, {
        edges: group.edges,
        score: scoreSearchEdges(group.edges, score + 4),
      });
    }
  }

  // Descend into every property; only the "payload" key raises the score.
  for (const [key, value] of Object.entries(candidate)) {
    const payloadBonus = key === "payload" ? 1 : 0;
    held = keepStronger(held, findSearchEdges(value, score + payloadBonus));
  }
  return held;
}
/** A bootstrap object that looks like a marketplace item, plus ranking metadata. */
interface FacebookMarketplaceItemMatch {
/** The matched object, treated as a marketplace item. */
item: FacebookMarketplaceItem;
/** Ranking score computed from `path`; a higher score wins when matches are compared. */
score: number;
/** Object keys walked from the candidate root to reach `item` (array indices are not recorded). */
path: string[];
}
/**
 * Rank a key path leading to an item-like object.
 *
 * Known segments add a fixed bonus once each, any segment containing
 * "recommend" or "related" subtracts 10, and every path segment costs one
 * point so shallower matches rank higher.
 *
 * @param path - Object keys walked from the candidate root.
 * @returns The path score; higher is better.
 */
function scoreMarketplaceItemPath(path: string[]): number {
  const segmentBonuses: ReadonlyArray<readonly [string, number]> = [
    ["payload", 2],
    ["viewer", 2],
    ["marketplace_product_details_page", 6],
    ["target", 8],
    ["listing", 6],
  ];

  let total = 0;
  for (const [segment, bonus] of segmentBonuses) {
    if (path.includes(segment)) {
      total += bonus;
    }
  }

  const underSuggestions = path.some(
    (segment) => segment.includes("recommend") || segment.includes("related"),
  );
  if (underSuggestions) {
    total -= 10;
  }

  return total - path.length;
}
/**
 * Walk a decoded bootstrap value depth-first and gather every object that
 * looks like a marketplace item: string `id`, `__typename` of
 * "GroupCommerceProductItem", and string `marketplace_listing_title`.
 *
 * @param candidate - Any decoded JSON value.
 * @param path - Object keys walked so far; array entries reuse the parent path.
 * @returns All matches in pre-order, each scored by its path.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  if (Array.isArray(candidate)) {
    const fromEntries: FacebookMarketplaceItemMatch[] = [];
    for (const entry of candidate) {
      fromEntries.push(...collectMarketplaceItemCandidates(entry, path));
    }
    return fromEntries;
  }
  if (!isRecord(candidate)) {
    return [];
  }

  const looksLikeItem =
    typeof candidate.id === "string" &&
    candidate.__typename === "GroupCommerceProductItem" &&
    typeof candidate.marketplace_listing_title === "string";

  const ownMatch: FacebookMarketplaceItemMatch[] = looksLikeItem
    ? [
        {
          item: candidate as FacebookMarketplaceItem,
          score: scoreMarketplaceItemPath(path),
          path,
        },
      ]
    : [];

  // Keep descending even below a match, so nested items are reported too.
  const nestedMatches = Object.entries(candidate).flatMap(([key, value]) =>
    collectMarketplaceItemCandidates(value, [...path, key]),
  );

  return [...ownMatch, ...nestedMatches];
}
/**
 * Turn a rendered price label into a structured price.
 *
 * An empty label or "FREE" (any casing) yields a zero amount. Otherwise the
 * first run of digits and commas, with an optional two-digit decimal part, is
 * read as the amount. The currency is always reported as "CAD".
 *
 * @param priceText - Price label as rendered on the page.
 * @returns The structured price, or `null` when no usable number is present.
 */
function parseFacebookRenderedPrice(priceText: string) {
  const label = priceText.trim();

  const isFree = label === "" || label.toUpperCase() === "FREE";
  if (isFree) {
    return {
      amount: "0.00",
      formatted_amount: label || "FREE",
      currency: "CAD",
    };
  }

  const digits = /[\d,]+(?:\.\d{2})?/.exec(label)?.[0];
  if (digits === undefined) {
    return null;
  }

  // Drop thousands separators before parsing.
  const value = Number.parseFloat(digits.split(",").join(""));
  if (!Number.isFinite(value)) {
    return null;
  }

  return {
    amount: value.toFixed(2),
    formatted_amount: label,
    currency: "CAD",
  };
}
/**
 * Read the trimmed text of every element under `node` that matches `selector`.
 *
 * @param node - Root to query.
 * @param selector - CSS selector for the elements to read.
 * @returns Non-empty trimmed texts in query order.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];
  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }
  return collected;
}
/**
 * Pull the numeric marketplace item id out of an element's `href` attribute.
 *
 * @param element - Element to inspect; may be `null`.
 * @returns The id captured by `FACEBOOK_ITEM_HREF_RE`, or `null` when the
 *          element is missing, has no `href`, or the `href` does not match.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href");
  if (!href) {
    return null;
  }
  const match = FACEBOOK_ITEM_HREF_RE.exec(href);
  return match?.[1] ?? null;
}
/**
 * Work out which item id an item page is about.
 *
 * Sources are tried in order: the canonical `<link>`, the `og:url` meta tag,
 * an item link whose text contains the `<h1>` title, and finally the last
 * item link on the page. The two link-based sources need an `<h1>` title.
 *
 * @param document - Parsed item page.
 * @returns The item id, or `null` when none of the sources yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogUrl = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOgUrl = ogUrl?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOgUrl) {
    return fromOgUrl;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const itemAnchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  // A link that repeats the title is used as-is; otherwise fall back to the last link.
  const chosenAnchor =
    itemAnchors.find((anchor) => anchor.textContent?.includes(heading)) ??
    itemAnchors.at(-1) ??
    null;
  return extractMarketplaceItemIdFromElement(chosenAnchor);
}
/**
 * Find the listing description in rendered item HTML.
 *
 * Looks for an element whose trimmed text is exactly "Description", then
 * returns the text of the first following sibling that has text other than
 * that label.
 *
 * @param document - Parsed item page.
 * @returns The description text, or `undefined` when no such sibling exists.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const HEADING = "Description";
  const headingElements = Array.from(
    document.querySelectorAll("div, span, h2, h3, p"),
  ).filter((element) => element.textContent?.trim() === HEADING);

  for (const heading of headingElements) {
    for (
      let next = heading.nextElementSibling;
      next;
      next = next.nextElementSibling
    ) {
      const body = next.textContent?.trim();
      if (body && body !== HEADING) {
        return body;
      }
    }
  }
  return undefined;
}
/**
 * Rebuild search results from rendered HTML when no bootstrap payload decodes.
 *
 * Every link to `/marketplace/item/<id>` is treated as one listing card. The
 * price, location and title are read from the text of the `span`/`div`
 * elements inside the link. Cards without both a title and a parseable price
 * are skipped, and each item id is reported once.
 *
 * Leaf elements (no element children) are consulted first. A wrapper's text is
 * the concatenation of everything inside it and wrappers come first in
 * document order, so reading them first would let a whole card's text be
 * mistaken for its location or title. Wrapper text is only used when no leaf
 * matches.
 *
 * @param htmlString - Raw response body.
 * @returns Listing nodes shaped like search edges, or `null` when none are found.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const links = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];
  const seenIds = new Set<string>();
  const results: FacebookAdNode[] = [];
  const readText = (element: Element): string =>
    element.textContent?.trim() ?? "";

  for (const link of links) {
    const href = link.getAttribute("href") || "";
    const id = href.match(FACEBOOK_ITEM_HREF_RE)?.[1];
    if (!id || seenIds.has(id)) {
      continue;
    }

    const elements = Array.from(link.querySelectorAll("span, div"));
    const allTexts = elements.map(readText).filter(Boolean);
    const leafTexts = elements
      .filter((element) => element.children.length === 0)
      .map(readText)
      .filter(Boolean);
    // Prefer leaf text; fall back to wrapper text so nothing that matched before is lost.
    const pickText = (matches: (text: string) => boolean): string | undefined =>
      leafTexts.find(matches) ?? allTexts.find(matches);

    const priceText = pickText((text) => FACEBOOK_PRICE_TEXT_RE.test(text));
    const location = pickText((text) => FACEBOOK_LOCATION_TEXT_RE.test(text));
    const title = pickText(
      (text) => text !== priceText && text !== location && !text.includes("/"),
    );
    if (!title || !priceText) {
      continue;
    }

    const parsedPrice = parseFacebookRenderedPrice(priceText);
    if (!parsedPrice) {
      continue;
    }

    results.push({
      node: {
        listing: {
          id,
          marketplace_listing_title: title,
          listing_price: parsedPrice,
          location: location
            ? {
                reverse_geocode: {
                  city_page: {
                    display_name: location,
                  },
                },
              }
            : undefined,
          is_live: true,
        },
      },
    });
    seenIds.add(id);
  }

  return results.length > 0 ? results : null;
}
/**
 * Rebuild a marketplace item from rendered HTML when no bootstrap payload
 * decodes.
 *
 * The title comes from the `<h1>` and the id from
 * `extractFacebookPermalinkItemId`; both are required. Price, location and
 * description are optional and are left `undefined` when not found.
 *
 * Leaf elements (no element children) are consulted first when looking for the
 * price and location. A wrapper's text is the concatenation of everything
 * inside it and wrappers come first in document order, so a wrapper that
 * merely ends with the location would otherwise be returned as the location.
 * Wrapper text is only used when no leaf matches.
 *
 * @param htmlString - Raw response body.
 * @returns The reconstructed item, or `null` when the id or title is missing.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);
  const title = document.querySelector("h1")?.textContent?.trim();
  const id = extractFacebookPermalinkItemId(document);
  if (!id || !title) {
    return null;
  }

  const readText = (element: Element): string =>
    element.textContent?.trim() ?? "";
  const elements = Array.from(document.querySelectorAll("h1, span, div, p"));
  const allTexts = elements.map(readText).filter(Boolean);
  const leafTexts = elements
    .filter((element) => element.children.length === 0)
    .map(readText)
    .filter(Boolean);
  // Prefer leaf text; fall back to wrapper text so nothing that matched before is lost.
  const pickText = (matches: (text: string) => boolean): string | undefined =>
    leafTexts.find(matches) ?? allTexts.find(matches);

  const priceText = pickText((text) => FACEBOOK_PRICE_TEXT_RE.test(text));
  const parsedPrice = priceText ? parseFacebookRenderedPrice(priceText) : null;
  const location = pickText(
    (text) =>
      text !== title &&
      text !== priceText &&
      FACEBOOK_LOCATION_TEXT_RE.test(text),
  );
  const description = extractFacebookDescriptionText(document);

  return {
    id,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: title,
    formatted_price: priceText ? { text: priceText } : undefined,
    listing_price: parsedPrice
      ? {
          amount: parsedPrice.amount,
          currency: parsedPrice.currency,
          amount_with_offset: parsedPrice.amount,
        }
      : undefined,
    location_text: location ? { text: location } : undefined,
    redacted_description: description ? { text: description } : undefined,
    is_live: true,
  };
}
/**
Extract marketplace search data from Facebook page script tags
*/
export function extractFacebookMarketplaceData(
htmlString: HTMLString,
): FacebookAdNode[] | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
const candidates = extractFacebookBootstrapCandidates(htmlString);
let bestEdges: FacebookEdge[] | null = null;
let bestScore = -1;
let marketplaceData: FacebookMarketplaceSearch | null = null;
for (const candidate of candidates) {
const result = findSearchEdges(candidate);
if (!result?.edges.length) {
continue;
}
// Find the script containing the require data with marketplace_search
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
const scriptText = script.textContent;
if (!scriptText) continue;
try {
const parsed = JSON.parse(scriptText);
// First check if this is the direct data structure (like in examples)
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple navigation paths to find marketplace_search
const paths = [
// Original path from example
() =>
parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data
.marketplace_search,
// Alternative path structure
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
// Another variation
() => parsed.require[0][3][0].__bbox.result.data.marketplace_search,
// Direct access for some responses
() => {
for (const item of parsed.require) {
if (item && item.length >= 4 && item[3]) {
const bbox = item[3]?.__bbox?.result?.data?.marketplace_search;
if (bbox) return bbox;
}
}
return null;
},
];
for (const getData of paths) {
try {
const result = getData();
if (
result &&
isRecord(result) &&
(result as Record<string, unknown>).feed_units?.edges?.length > 0
) {
marketplaceData = result as FacebookMarketplaceSearch;
break;
}
} catch {}
}
if (marketplaceData) break;
}
// Also check for direct marketplace_search in the parsed data
if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) {
const searchData =
parsed.marketplace_search as FacebookMarketplaceSearch;
const feedLength = searchData.feed_units?.edges?.length ?? 0;
if (feedLength > 0) {
marketplaceData = searchData;
break;
}
}
} catch {}
if (result.score > bestScore) {
bestScore = result.score;
bestEdges = result.edges;
}
}
if (!marketplaceData?.feed_units?.edges?.length) {
if (!bestEdges?.length) {
if (htmlString.includes("XCometMarketplaceSearchController")) {
const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
if (htmlFallback?.length) {
console.log(
`Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
);
return htmlFallback;
}
}
console.warn("No marketplace data found in HTML response");
return null;
}
console.log(
`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`,
`Successfully parsed ${bestEdges.length} Facebook marketplace listings`,
);
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
return bestEdges.map((edge) => ({ node: edge.node }));
}
/**
Extract marketplace item details from Facebook item page HTML
Updated for 2026 Facebook Marketplace API structure with multiple extraction paths
Updated for 2026 Facebook Marketplace bootstrap candidates
*/
export function extractFacebookItemData(
htmlString: HTMLString,
): FacebookMarketplaceItem | null {
const { document } = parseHTML(htmlString);
const scripts = document.querySelectorAll("script");
const candidates = extractFacebookBootstrapCandidates(htmlString);
let bestMatch: FacebookMarketplaceItemMatch | null = null;
for (const script of scripts) {
const scriptText = script.textContent;
if (!scriptText) continue;
for (const candidate of candidates) {
const matches = collectMarketplaceItemCandidates(candidate);
try {
const parsed = JSON.parse(scriptText);
// Check for the require structure with marketplace product details
if (parsed.require && Array.isArray(parsed.require)) {
// Try multiple extraction paths discovered from reverse engineering
const extractionPaths = [
// Path 1: Primary path from current API structure
() =>
parsed.require[0][3].__bbox.result.data.viewer
.marketplace_product_details_page.target,
// Path 2: Alternative path with nested require
() =>
parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Path 3: Variation without the [0] index
() =>
parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data
.viewer.marketplace_product_details_page.target,
// Path 4-5: Additional fallback paths for edge cases
() =>
parsed.require[0][3][1]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
() =>
parsed.require[0][3][2]?.__bbox?.result?.data?.viewer
?.marketplace_product_details_page?.target,
];
let pathIndex = 0;
for (const getPath of extractionPaths) {
try {
const targetData = getPath();
if (
targetData &&
typeof targetData === "object" &&
targetData.id &&
targetData.marketplace_listing_title &&
targetData.__typename === "GroupCommerceProductItem"
) {
console.log(
`Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`,
);
return targetData as FacebookMarketplaceItem;
}
} catch {
// Path not found or invalid, try next path
}
pathIndex++;
}
// Fallback: Search recursively for marketplace data in the parsed structure
const findMarketplaceData = (
obj: unknown,
depth = 0,
maxDepth = 10,
): FacebookMarketplaceItem | null => {
if (depth > maxDepth) return null; // Prevent infinite recursion
if (isRecord(obj)) {
// Check if this object matches the expected marketplace item structure
const candidate = obj as Record<string, unknown>;
if (
candidate.marketplace_listing_title &&
candidate.id &&
candidate.__typename === "GroupCommerceProductItem" &&
candidate.redacted_description
) {
return candidate as unknown as FacebookMarketplaceItem;
}
// Recursively search nested objects and arrays
for (const key in obj) {
const value = obj[key];
if (isRecord(value) || Array.isArray(value)) {
const result = findMarketplaceData(value, depth + 1, maxDepth);
if (result) return result;
}
}
} else if (Array.isArray(obj)) {
// Search through arrays
for (const item of obj) {
const result = findMarketplaceData(item, depth + 1, maxDepth);
if (result) return result;
}
}
return null;
};
// Search through the entire require structure
const recursiveResult = findMarketplaceData(parsed.require);
if (recursiveResult) {
console.log(
"Successfully extracted Facebook item data using recursive search",
);
return recursiveResult;
}
// Additional search in other potential locations
if (
parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page
?.target
) {
const bboxData =
parsed.__bbox.result.data.viewer.marketplace_product_details_page
.target;
if (
bboxData &&
typeof bboxData === "object" &&
bboxData.id &&
bboxData.marketplace_listing_title &&
bboxData.__typename === "GroupCommerceProductItem"
) {
console.log(
"Successfully extracted Facebook item data from __bbox structure",
);
return bboxData as FacebookMarketplaceItem;
}
}
for (const match of matches) {
if (
!bestMatch ||
match.score > bestMatch.score ||
(match.score === bestMatch.score && match.path.length < bestMatch.path.length)
) {
bestMatch = match;
}
} catch {}
}
}
if (bestMatch) {
return bestMatch.item;
}
if (htmlString.includes("XCometMarketplacePermalinkController")) {
return extractFacebookItemHtmlFallback(htmlString);
}
return null;
@@ -838,8 +1089,9 @@ export default async function fetchFacebookItems(
console.log(`Using ${cookies.length} cookies for authentication`);
let searchHtml: string;
let searchResponseUrl = searchUrl;
try {
searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
const response = await fetchHtml(searchUrl, DELAY_MS, {
maxRetries: 3,
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
@@ -850,6 +1102,8 @@ export default async function fetchFacebookItems(
},
cookies: cookiesHeader,
});
searchHtml = response.html;
searchResponseUrl = response.responseUrl;
} catch (err) {
if (err instanceof HttpError) {
console.warn(
@@ -865,6 +1119,24 @@ export default async function fetchFacebookItems(
throw err;
}
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
if (classification.authGated) {
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
return [];
}
if (classification.unavailable) {
console.warn("Facebook marketplace search returned an unavailable route.");
return [];
}
if (classification.kind !== "search") {
console.warn(
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
);
return [];
}
const ads = extractFacebookMarketplaceData(searchHtml);
if (!ads || ads.length === 0) {
console.warn("No ads parsed from Facebook marketplace page.");
@@ -916,8 +1188,9 @@ export async function fetchFacebookItem(
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
let itemHtml: string;
let itemResponseUrl = itemUrl;
try {
itemHtml = await fetchHtml(itemUrl, 1000, {
const response = await fetchHtml(itemUrl, 1000, {
onRateInfo: (remaining, reset) => {
if (remaining && reset) {
console.log(
@@ -927,6 +1200,8 @@ export async function fetchFacebookItem(
},
cookies: cookiesHeader,
});
itemHtml = response.html;
itemResponseUrl = response.responseUrl;
} catch (err) {
if (err instanceof HttpError) {
console.warn(
@@ -967,31 +1242,31 @@ export async function fetchFacebookItem(
throw err;
}
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
if (classification.authGated) {
logExtractionMetrics(false, itemId);
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
return null;
}
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
logExtractionMetrics(false, itemId);
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
return null;
}
if (classification.kind !== "item") {
logExtractionMetrics(false, itemId);
console.warn(
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
);
return null;
}
const itemData = extractFacebookItemData(itemHtml);
if (!itemData) {
logExtractionMetrics(false, itemId);
// Enhanced checking for specific failure scenarios
if (
itemHtml.includes("This listing is no longer available") ||
itemHtml.includes("listing has been removed") ||
itemHtml.includes("This item has been sold")
) {
console.warn(
`Item ${itemId} appears to be sold or removed from marketplace.`,
);
return null;
}
if (
itemHtml.includes("log in to Facebook") ||
itemHtml.includes("You must log in") ||
itemHtml.includes("authentication required")
) {
console.warn(
`Authentication failed for item ${itemId}. Cookies may be expired.`,
);
return null;
}
console.warn(
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,

View File

@@ -1,6 +1,8 @@
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import {
classifyFacebookResponse,
ensureFacebookCookies,
extractFacebookBootstrapCandidates,
extractFacebookItemData,
extractFacebookMarketplaceData,
fetchFacebookItem,
@@ -367,43 +369,134 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
describe("Data Extraction", () => {
describe("extractFacebookItemData", () => {
test("should extract item data from standard require structure", () => {
const mockItemData = {
id: "123456",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Test Item",
formatted_price: { text: "$100.00" },
listing_price: { amount: "100.00", currency: "CAD" },
is_live: true,
};
const mockData = {
require: [
[
null,
null,
null,
{
__bbox: {
result: {
data: {
viewer: {
marketplace_product_details_page: {
target: mockItemData,
},
},
test("extracts item details from Comet permalink bootstrap candidates", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
listing: {
id: "123",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Vintage Chair",
formatted_price: { text: "CA$80" },
listing_price: {
amount: "80.00",
currency: "CAD",
amount_with_offset: "80.00",
},
redacted_description: { text: "Solid wood chair" },
location_text: { text: "Toronto, ON" },
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
condition: "USED",
is_live: true,
},
},
},
],
],
};
const html = `<html><body><script>${JSON.stringify(mockData)}</script></body></html>`;
})}
</script>
</body></html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
expect(result?.id).toBe("123456");
expect(result?.marketplace_listing_title).toBe("Test Item");
expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
});
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{invalid: json}</script>
<h1>Vintage Chair</h1>
<span>CA$80</span>
<div>Toronto, ON</div>
<div>Description</div>
<div>Solid wood chair</div>
<a href="/marketplace/item/123/">View listing</a>
</body></html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
expect(result?.formatted_price?.text).toBe("CA$80");
expect(result?.location_text?.text).toBe("Toronto, ON");
expect(result?.redacted_description?.text).toBe("Solid wood chair");
});
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
const html = `
<html>
<head>
<link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
</head>
<body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{invalid: json}</script>
<a href="/marketplace/item/999/">
<span>Related Chair</span>
</a>
<h1>Vintage Chair</h1>
<span>CA$80</span>
<div>Toronto, ON</div>
<div>Message seller</div>
<div>Seller details</div>
<div>Description</div>
<div>Solid wood chair</div>
<a href="/marketplace/item/123/">View listing</a>
</body>
</html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
expect(result?.id).toBe("123");
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
expect(result?.redacted_description?.text).toBe("Solid wood chair");
});
test("prefers the canonical permalink target over earlier decoy items", () => {
const html = `
<html><body>
<script>"XCometMarketplacePermalinkController"</script>
<script>
${JSON.stringify({
payload: {
recommendation_units: [
{
listing: {
id: "decoy-1",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Recommended Chair",
is_live: true,
},
},
],
target: {
id: "real-123",
__typename: "GroupCommerceProductItem",
marketplace_listing_title: "Canonical Chair",
formatted_price: { text: "CA$120" },
listing_price: {
amount: "120.00",
currency: "CAD",
amount_with_offset: "120.00",
},
is_live: true,
},
},
})}
</script>
</body></html>
`;
const result = extractFacebookItemData(html);
expect(result).not.toBeNull();
expect(result?.id).toBe("real-123");
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
});
test("should handle missing item data", () => {
@@ -545,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
);
});
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>{invalid: json}</script>
<a href="/marketplace/item/987654321/">
<span>Vintage Bike</span>
<span>CA$120</span>
<span>Toronto, ON</span>
</a>
</body></html>
`;
const result = extractFacebookMarketplaceData(html);
expect(result).not.toBeNull();
expect(result).toHaveLength(1);
expect(result?.[0].node.listing.id).toBe("987654321");
expect(result?.[0].node.listing.marketplace_listing_title).toBe(
"Vintage Bike",
);
expect(result?.[0].node.listing.listing_price).toEqual({
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
});
});
test("should handle empty search results", () => {
const mockData = {
require: [
@@ -571,6 +691,305 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
const result = extractFacebookMarketplaceData(html);
expect(result).toBeNull();
});
test("classifies Comet search responses", () => {
const html = `
<html>
<head><title>Marketplace</title></head>
<body>
<script>"XCometMarketplaceSearchController"</script>
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
</body>
</html>
`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/marketplace/toronto/search?query=bike",
),
).toEqual({
kind: "search",
authGated: false,
unavailable: false,
});
});
test("classifies Comet item responses", () => {
const html = `
<html>
<body>
<script>"XCometMarketplacePermalinkController"</script>
<script>{"routing_namespace":"fb_comet"}</script>
</body>
</html>
`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/marketplace/item/123/",
),
).toEqual({
kind: "item",
authGated: false,
unavailable: false,
});
});
test("classifies login-gated responses before parsing", () => {
const html = `<html><body>You must log in to Facebook</body></html>`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F",
),
).toEqual({
kind: "auth_gated",
authGated: true,
unavailable: false,
});
});
test("classifies unavailable item responses", () => {
const html = `<html><body>Marketplace</body></html>`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
),
).toEqual({
kind: "unavailable",
authGated: false,
unavailable: true,
});
});
test("classifies unknown responses when no signal is present", () => {
const html = `<html><body>Some random page</body></html>`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/marketplace/toronto/",
),
).toEqual({
kind: "unknown",
authGated: false,
unavailable: false,
});
});
test("does not false-positive on incidental login text", () => {
const html = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/marketplace/toronto/search?query=bike",
),
).toEqual({
kind: "unknown",
authGated: false,
unavailable: false,
});
});
test("detects auth gating from URL redirect", () => {
const html = `<html><body>Redirecting...</body></html>`;
expect(
classifyFacebookResponse(
html,
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F",
),
).toEqual({
kind: "auth_gated",
authGated: true,
unavailable: false,
});
});
});
describe("extractFacebookBootstrapCandidates", () => {
test("extracts Comet bootstrap candidates from script tags", () => {
const html = `
<html><body>
<script>{"routing_namespace":"fb_comet"}</script>
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
<script>not json</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates).toHaveLength(2);
expect(candidates[1]).toEqual({
data: {
marketplace_search_bootstrap: {
edges: [{ node: { listing: { id: "1" } } }],
},
},
});
});
test("keeps candidate order stable for later scoring", () => {
const html = `
<html><body>
<script>{"marker":"first"}</script>
<script>{"marker":"second"}</script>
</body></html>
`;
const candidates = extractFacebookBootstrapCandidates(html);
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
});
test("extracts search results from Comet bootstrap candidates", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "Bike",
listing_price: {
amount: "120.00",
formatted_amount: "CA$120",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Toronto" },
},
},
is_live: true,
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
expect(ads).toHaveLength(1);
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
});
test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => {
const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${JSON.stringify({
incidental: {
feed_units: {
edges: [
{
node: {
listing: {
id: "wrong-1",
marketplace_listing_title: "Wrong Listing",
listing_price: {
amount: "1.00",
formatted_amount: "CA$1",
currency: "CAD",
},
is_live: true,
},
},
},
],
},
},
payload: {
resultGroups: [
{
edges: [
{
node: {
listing: {
id: "right-1",
marketplace_listing_title: "Right Listing",
listing_price: {
amount: "250.00",
formatted_amount: "CA$250",
currency: "CAD",
},
is_live: true,
},
},
},
],
},
],
},
})}
</script>
</body></html>
`;
const ads = extractFacebookMarketplaceData(html);
expect(ads).toHaveLength(1);
expect(ads?.[0].node.listing.id).toBe("right-1");
});
// One result group mixes a real listing edge with a "story" edge that has no
// listing. The test expects the extractor to return null for such a group
// instead of a partial result.
test("rejects mixed edge arrays that contain non-listing entries", () => {
  const listingEdge = {
    node: {
      listing: {
        id: "1",
        marketplace_listing_title: "Bike",
        listing_price: {
          amount: "120.00",
          formatted_amount: "CA$120",
          currency: "CAD",
        },
        is_live: true,
      },
    },
  };
  const storyEdge = {
    node: {
      story: {
        id: "not-a-listing",
      },
    },
  };
  const bootstrapJson = JSON.stringify({
    payload: {
      resultGroups: [{ edges: [listingEdge, storyEdge] }],
    },
  });
  const html = `
<html><body>
<script>"XCometMarketplaceSearchController"</script>
<script>
${bootstrapJson}
</script>
</body></html>
`;
  expect(extractFacebookMarketplaceData(html)).toBeNull();
});
});
});

View File

@@ -1,5 +1,5 @@
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import fetchFacebookItems from "../src/scrapers/facebook";
import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook";
// Mock fetch globally
const originalFetch = global.fetch;
@@ -27,77 +27,40 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
describe("Main Search Function", () => {
test("should successfully fetch search results", async () => {
const mockSearchData = {
require: [
[
null,
null,
null,
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
payload: {
resultGroups: [
{
__bbox: {
result: {
data: {
marketplace_search: {
feed_units: {
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "iPhone 13 Pro",
listing_price: {
amount: "800.00",
formatted_amount: "$800.00",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Toronto" },
},
},
creation_time: 1640995200,
is_live: true,
},
},
},
{
node: {
listing: {
id: "2",
marketplace_listing_title: "Samsung Galaxy",
listing_price: {
amount: "600.00",
formatted_amount: "$600.00",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Mississauga" },
},
},
creation_time: 1640995300,
is_live: true,
},
},
},
],
edges: [
{
node: {
listing: {
id: "1",
marketplace_listing_title: "iPhone 13",
listing_price: {
amount: "500.00",
formatted_amount: "CA$500",
currency: "CAD",
},
location: {
reverse_geocode: {
city_page: { display_name: "Toronto" },
},
},
is_live: true,
},
},
},
},
],
},
],
],
};
},
})}</script></body></html>`;
global.fetch = mock(() =>
Promise.resolve({
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
text: () => Promise.resolve(mockSearchHtml),
headers: {
get: () => null,
},
@@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
);
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
expect(results).toHaveLength(2);
expect(results[0].title).toBe("iPhone 13 Pro");
expect(results[1].title).toBe("Samsung Galaxy");
expect(results).toHaveLength(1);
expect(results[0].title).toBe("iPhone 13");
});
test("should filter out items without price", async () => {
@@ -163,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -218,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -259,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -292,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
expect(results).toEqual([]);
});
// The mocked response reports a final URL on /login/ while its body still
// contains the search-controller marker and a marketplace item anchor. The
// search entrypoint is expected to return no results for it.
test("should return empty array for auth-gated search HTML", async () => {
  const authGatedSearchHtml = `
<html>
<body>
<script>"XCometMarketplaceSearchController"</script>
<a href="/marketplace/item/123456789/">
<span>Vintage Lamp</span>
<span>CA$45</span>
<span>Toronto, ON</span>
</a>
</body>
</html>
`;
  // A fresh response object is built on every call, as a real fetch would do.
  global.fetch = mock(async () => ({
    ok: true,
    url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
    text: async () => authGatedSearchHtml,
    headers: {
      get: () => null,
    },
  }));
  const listings = await fetchFacebookItems("lamp", 1, "toronto", 25);
  expect(listings).toEqual([]);
});
// The body holds a well-formed listing edge but no search-controller marker
// script, and the mocked final URL is /marketplace/toronto/. The search
// entrypoint is expected to return no results rather than surface the listing.
test("should return empty array when search request lands on unknown route", async () => {
  const leakedListing = {
    id: "1",
    marketplace_listing_title: "Leaked Search Result",
    listing_price: {
      amount: "75.00",
      formatted_amount: "CA$75",
      currency: "CAD",
    },
    is_live: true,
  };
  const bootstrapJson = JSON.stringify({
    payload: {
      resultGroups: [{ edges: [{ node: { listing: leakedListing } }] }],
    },
  });
  const wrongRouteHtml = `<html><body><script>${bootstrapJson}</script></body></html>`;
  // A fresh response object is built on every call, as a real fetch would do.
  global.fetch = mock(async () => ({
    ok: true,
    url: "https://www.facebook.com/marketplace/toronto/",
    text: async () => wrongRouteHtml,
    headers: {
      get: () => null,
    },
  }));
  const listings = await fetchFacebookItems("lamp", 1, "toronto", 25);
  expect(listings).toEqual([]);
});
test("should handle network errors", async () => {
global.fetch = mock(() => Promise.reject(new Error("Network error")));
@@ -358,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -431,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -500,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -571,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
ok: true,
text: () =>
Promise.resolve(
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
),
headers: {
get: () => null,
@@ -637,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
expect(results).toEqual([]);
});
});
describe("Item Fetch Function", () => {
  // The mocked final URL carries `unavailable_product=1` and the body holds a
  // listing whose id ("related-123") differs from the requested one ("123").
  // The item entrypoint is expected to return null.
  test("should return null for unavailable item responses", async () => {
    const relatedListing = {
      id: "related-123",
      __typename: "GroupCommerceProductItem",
      marketplace_listing_title: "Related Listing",
      formatted_price: { text: "CA$90" },
      listing_price: {
        amount: "90.00",
        currency: "CAD",
        amount_with_offset: "90.00",
      },
      is_live: true,
    };
    const relatedListingJson = JSON.stringify({
      payload: {
        listing: relatedListing,
      },
    });
    const unavailableItemHtml = `
<html>
<body>
<script>${relatedListingJson}</script>
</body>
</html>
`;
    // A fresh response object is built on every call, as a real fetch would do.
    global.fetch = mock(async () => ({
      ok: true,
      url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
      text: async () => unavailableItemHtml,
      headers: {
        get: () => null,
      },
    }));
    const item = await fetchFacebookItem("123");
    expect(item).toBeNull();
  });
});
});