Compare commits
8 Commits
45cff20377
...
9070f76412
| Author | SHA1 | Date | |
|---|---|---|---|
| 9070f76412 | |||
| 7ddc96dfdf | |||
| 63ca006696 | |||
| c90ee54cc1 | |||
| cfd7619737 | |||
| b072599bc6 | |||
| 2617afc62f | |||
| ba889a1f9d |
772
docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md
Normal file
772
docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md
Normal file
@@ -0,0 +1,772 @@
|
|||||||
|
# Facebook Comet Rewrite Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** Replace the legacy Facebook Marketplace scraper with a route-aware hybrid Comet-bootstrap parser for both search and item routes.
|
||||||
|
|
||||||
|
**Architecture:** Keep authenticated direct HTTP fetches as the transport. Classify each Facebook response first, then parse route-specific Comet bootstrap/state candidates, and fall back to rendered-HTML extraction only when bootstrap decoding cannot produce the expected search or item shape.
|
||||||
|
|
||||||
|
**Tech Stack:** Bun, TypeScript, `bun:test`, `linkedom`, existing shared cookie/http helpers
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Owns Facebook fetch flow, response classification, bootstrap candidate extraction, search parsing, item parsing, and HTML fallbacks.
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Owns unit coverage for response classification, bootstrap parsing, fallback parsing, and route-aware item/search extraction behavior.
|
||||||
|
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||||
|
- Owns higher-level fetch flow tests, auth/degradation behavior, and result shaping for search/item entrypoints.
|
||||||
|
|
||||||
|
### Task 1: Add Route Classification Coverage
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-core.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Add these tests near the Facebook parser tests in `packages/core/test/facebook-core.test.ts`:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("classifies Comet search responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head><title>Marketplace</title></head>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/search?query=bike")).toEqual({
|
||||||
|
kind: "search",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies Comet item responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/item/123/")).toEqual({
|
||||||
|
kind: "item",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies login-gated responses before parsing", () => {
|
||||||
|
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||||
|
|
||||||
|
expect(classifyFacebookResponse(html, "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F")).toEqual({
|
||||||
|
kind: "auth_gated",
|
||||||
|
authGated: true,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies unavailable item responses", () => {
|
||||||
|
const html = `<html><body>Marketplace</body></html>`;
|
||||||
|
|
||||||
|
expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/?unavailable_product=1")).toEqual({
|
||||||
|
kind: "unavailable",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: true,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
|
||||||
|
Expected: FAIL because `classifyFacebookResponse` does not exist yet.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
Add this type and function near the parsing section in `packages/core/src/scrapers/facebook.ts`:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type FacebookResponseKind = "search" | "item" | "auth_gated" | "unavailable" | "unknown";
|
||||||
|
|
||||||
|
export function classifyFacebookResponse(htmlString: HTMLString, responseUrl: string) {
|
||||||
|
const authGated =
|
||||||
|
responseUrl.includes("/login/") ||
|
||||||
|
htmlString.includes("You must log in to Facebook") ||
|
||||||
|
htmlString.includes("log in to Facebook");
|
||||||
|
|
||||||
|
if (authGated) {
|
||||||
|
return { kind: "auth_gated" as const, authGated: true, unavailable: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
const unavailable = responseUrl.includes("unavailable_product=1");
|
||||||
|
if (unavailable) {
|
||||||
|
return { kind: "unavailable" as const, authGated: false, unavailable: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htmlString.includes("XCometMarketplaceSearchController")) {
|
||||||
|
return { kind: "search" as const, authGated: false, unavailable: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||||
|
return { kind: "item" as const, authGated: false, unavailable: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
return { kind: "unknown" as const, authGated: false, unavailable: false };
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||||
|
git commit -m "refactor: add facebook response classification"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 2: Add Bootstrap Candidate Extraction
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-core.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Add these tests:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("extracts Comet bootstrap candidates from script tags", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
|
||||||
|
<script>not json</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
expect(candidates).toHaveLength(2);
|
||||||
|
expect(candidates[1]).toEqual({
|
||||||
|
data: {
|
||||||
|
marketplace_search_bootstrap: {
|
||||||
|
edges: [{ node: { listing: { id: "1" } } }],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("keeps candidate order stable for later scoring", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"marker":"first"}</script>
|
||||||
|
<script>{"marker":"second"}</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
expect(candidates.map((candidate) => candidate.marker)).toEqual(["first", "second"]);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
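Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"` (the pattern is a regex; the second alternative is needed so the "keeps candidate order stable for later scoring" test is also selected)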
|
||||||
|
Expected: FAIL because `extractFacebookBootstrapCandidates` does not exist.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
Add this helper near the parser utilities in `packages/core/src/scrapers/facebook.ts`:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
export function extractFacebookBootstrapCandidates(htmlString: HTMLString): Record<string, unknown>[] {
|
||||||
|
const { document } = parseHTML(htmlString);
|
||||||
|
const scripts = document.querySelectorAll("script");
|
||||||
|
const candidates: Record<string, unknown>[] = [];
|
||||||
|
|
||||||
|
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
|
||||||
|
const scriptText = script.textContent?.trim();
|
||||||
|
if (!scriptText) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const parsed = JSON.parse(scriptText);
|
||||||
|
if (isRecord(parsed)) {
|
||||||
|
candidates.push(parsed);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Ignore non-JSON script bodies.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return candidates;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
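Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"` (the pattern is a regex; the second alternative is needed so the "keeps candidate order stable for later scoring" test is also selected)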
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||||
|
git commit -m "refactor: add facebook bootstrap candidate extraction"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 3: Replace Search Parsing With Candidate Scoring
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Test: `packages/core/test/facebook-integration.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Add a core test for route-aware search extraction:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("extracts search results from Comet bootstrap candidates", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Bike",
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
location: {
|
||||||
|
reverse_geocode: {
|
||||||
|
city_page: { display_name: "Toronto" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toHaveLength(1);
|
||||||
|
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace one integration fixture with a current-shape search fixture:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const mockSearchHtml = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "iPhone 13",
|
||||||
|
listing_price: {
|
||||||
|
amount: "500.00",
|
||||||
|
formatted_amount: "CA$500",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
location: { reverse_geocode: { city_page: { display_name: "Toronto" } } },
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet bootstrap candidates"`
|
||||||
|
Expected: FAIL because the current search extractor only understands legacy `marketplace_search` shapes.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
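Replace the search extraction internals in `extractFacebookMarketplaceData()` with a candidate search like this. Candidates are visited in document order and the first one that yields a non-empty edge list wins, so this is first-match selection rather than true scoring: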
|
||||||
|
|
||||||
|
```ts
|
||||||
|
function findSearchEdges(candidate: unknown): FacebookEdge[] | null {
|
||||||
|
if (Array.isArray(candidate)) {
|
||||||
|
for (const item of candidate) {
|
||||||
|
const result = findSearchEdges(item);
|
||||||
|
if (result) return result;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isRecord(candidate)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const directEdges = candidate.feed_units?.edges;
|
||||||
|
if (Array.isArray(directEdges)) {
|
||||||
|
return directEdges as FacebookEdge[];
|
||||||
|
}
|
||||||
|
|
||||||
|
const resultGroups = candidate.resultGroups;
|
||||||
|
if (Array.isArray(resultGroups)) {
|
||||||
|
for (const group of resultGroups) {
|
||||||
|
if (isRecord(group) && Array.isArray(group.edges)) {
|
||||||
|
return group.edges as FacebookEdge[];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const value of Object.values(candidate)) {
|
||||||
|
const result = findSearchEdges(value);
|
||||||
|
if (result) return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function extractFacebookMarketplaceData(htmlString: HTMLString): FacebookAdNode[] | null {
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||||
|
|
||||||
|
for (const candidate of candidates) {
|
||||||
|
const edges = findSearchEdges(candidate);
|
||||||
|
if (edges?.length) {
|
||||||
|
return edges.map((edge) => ({ node: edge.node }));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.warn("No marketplace data found in HTML response");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
|
||||||
|
Expected: PASS for the rewritten search fixtures and existing unaffected tests.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
|
||||||
|
git commit -m "refactor: rewrite facebook search parser for comet bootstrap"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 4: Replace Item Parsing With Candidate Scoring
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-core.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Replace one old item fixture with a current-shape item fixture:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
listing: {
|
||||||
|
id: "123",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Vintage Chair",
|
||||||
|
formatted_price: { text: "CA$80" },
|
||||||
|
listing_price: { amount: "80.00", currency: "CAD", amount_with_offset: "80.00" },
|
||||||
|
redacted_description: { text: "Solid wood chair" },
|
||||||
|
location_text: { text: "Toronto, ON" },
|
||||||
|
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
|
||||||
|
condition: "USED",
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const item = extractFacebookItemData(html);
|
||||||
|
expect(item?.id).toBe("123");
|
||||||
|
expect(item?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet permalink bootstrap"`
|
||||||
|
Expected: FAIL because the current item extractor depends on legacy permalink markers.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
Replace the item extraction internals with a semantic candidate finder like this:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
function findMarketplaceItemCandidate(candidate: unknown): FacebookMarketplaceItem | null {
|
||||||
|
if (Array.isArray(candidate)) {
|
||||||
|
for (const item of candidate) {
|
||||||
|
const result = findMarketplaceItemCandidate(item);
|
||||||
|
if (result) return result;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isRecord(candidate)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
candidate.id &&
|
||||||
|
candidate.__typename === "GroupCommerceProductItem" &&
|
||||||
|
candidate.marketplace_listing_title
|
||||||
|
) {
|
||||||
|
return candidate as FacebookMarketplaceItem;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const value of Object.values(candidate)) {
|
||||||
|
const result = findMarketplaceItemCandidate(value);
|
||||||
|
if (result) return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function extractFacebookItemData(htmlString: HTMLString): FacebookMarketplaceItem | null {
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||||
|
|
||||||
|
for (const candidate of candidates) {
|
||||||
|
const item = findMarketplaceItemCandidate(candidate);
|
||||||
|
if (item) {
|
||||||
|
return item;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts`
|
||||||
|
Expected: PASS for current-shape item tests and remaining parser tests.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||||
|
git commit -m "refactor: rewrite facebook item parser for comet bootstrap"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 5: Add HTML Fallback Extraction
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-core.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Add these fallback tests:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("falls back to rendered search HTML when bootstrap payloads are undecodable", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<a href="https://www.facebook.com/marketplace/item/123/?ref=search">Vintage Lamp</a>
|
||||||
|
<span>CA$45</span>
|
||||||
|
<span>Toronto, ON</span>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
const parsed = ads ? parseFacebookAds(ads) : [];
|
||||||
|
expect(parsed[0].title).toBe("Vintage Lamp");
|
||||||
|
expect(parsed[0].listingPrice?.amountFormatted).toBe("CA$45");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("falls back to rendered item HTML when bootstrap payloads are undecodable", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<h1>Vintage Desk</h1>
|
||||||
|
<span>CA$120</span>
|
||||||
|
<span>Condition Used - Good</span>
|
||||||
|
<div>Description Solid oak desk.</div>
|
||||||
|
<div>Seller information Jordan</div>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const item = extractFacebookItemData(html);
|
||||||
|
expect(item?.marketplace_listing_title).toBe("Vintage Desk");
|
||||||
|
expect(item?.formatted_price?.text).toBe("CA$120");
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
|
||||||
|
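Expected: FAIL because `extractFacebookMarketplaceData()` and `extractFacebookItemData()` currently return `null` when no structured bootstrap candidate is found, so there is no parsed result to assert on.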
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
Add route-specific HTML fallback helpers in `packages/core/src/scrapers/facebook.ts`:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
function extractSearchFallback(htmlString: HTMLString): FacebookAdNode[] | null {
|
||||||
|
const idMatch = htmlString.match(/marketplace\/item\/(\d+)/);
|
||||||
|
const titleMatch = htmlString.match(/marketplace\/item\/\d+\/[^>]*>([^<]+)</);
|
||||||
|
const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/);
|
||||||
|
const cityMatch = htmlString.match(/([A-Z][a-z]+,\s*[A-Z]{2})/);
|
||||||
|
|
||||||
|
if (!idMatch || !titleMatch || !priceMatch) return null;
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: idMatch[1],
|
||||||
|
marketplace_listing_title: titleMatch[1].trim(),
|
||||||
|
listing_price: {
|
||||||
|
amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||||
|
formatted_amount: priceMatch[0],
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
location: cityMatch
|
||||||
|
? { reverse_geocode: { city_page: { display_name: cityMatch[1].split(",")[0] } } }
|
||||||
|
: undefined,
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
function extractItemFallback(htmlString: HTMLString): FacebookMarketplaceItem | null {
|
||||||
|
const titleMatch = htmlString.match(/<h1[^>]*>([^<]+)<\/h1>/i);
|
||||||
|
const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/);
|
||||||
|
if (!titleMatch || !priceMatch) return null;
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: "fallback-item",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: titleMatch[1].trim(),
|
||||||
|
formatted_price: { text: priceMatch[0] },
|
||||||
|
listing_price: {
|
||||||
|
amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: priceMatch[0].replace("CA$", "").replace(/,/g, ""),
|
||||||
|
},
|
||||||
|
redacted_description: { text: htmlString.includes("Description") ? htmlString.split("Description")[1].split("<")[0].trim() : "" },
|
||||||
|
is_live: true,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
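Then call these helpers as the last fallback inside `extractFacebookMarketplaceData()` and `extractFacebookItemData()`: only after every bootstrap candidate has been tried without producing a result, return the helper's result (which may itself be `null`) instead of returning `null` directly. In `extractFacebookMarketplaceData()`, keep the existing `console.warn` for the case where the fallback also returns `null`.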
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts
|
||||||
|
git commit -m "refactor: add facebook html fallbacks"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 6: Wire Route-Aware Failures Into Entry Points
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/test/facebook-integration.test.ts`
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts`
|
||||||
|
- Test: `packages/core/test/facebook-integration.test.ts`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Add these integration tests:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
test("returns empty search results for auth-gated search HTML", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
|
||||||
|
text: () => Promise.resolve("<html><body>You must log in to Facebook</body></html>"),
|
||||||
|
headers: { get: () => null },
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await fetchFacebookItems("bike", 1, "toronto", 25);
|
||||||
|
expect(results).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("returns null for unavailable item responses", async () => {
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||||
|
text: () => Promise.resolve("<html><body>Marketplace</body></html>"),
|
||||||
|
headers: { get: () => null },
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const item = await fetchFacebookItem("123");
|
||||||
|
expect(item).toBeNull();
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-integration.test.ts --test-name-pattern "auth-gated|unavailable"`
|
||||||
|
Expected: FAIL because the entrypoints do not yet classify successful HTML responses by route/auth state.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Write minimal implementation**
|
||||||
|
|
||||||
|
Update both entrypoints to classify successful HTML before parsing:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const responseClass = classifyFacebookResponse(searchHtml, searchUrl);
|
||||||
|
if (responseClass.kind === "auth_gated") {
|
||||||
|
console.warn("Facebook marketplace search is auth-gated. Update FACEBOOK_COOKIE with a fresh raw Cookie header string.");
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const itemResponseClass = classifyFacebookResponse(itemHtml, itemUrl);
|
||||||
|
if (itemResponseClass.kind === "auth_gated") {
|
||||||
|
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (itemResponseClass.kind === "unavailable") {
|
||||||
|
console.warn(`Item ${itemId} appears to be unavailable in the marketplace.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
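The URL passed to `classifyFacebookResponse` (`searchUrl` / `itemUrl` in the snippet above) must be the final response URL after redirects, not the URL that was requested: the `/login/` and `unavailable_product=1` markers only appear on the redirected URL, which is what the mocked `url` field in the tests above represents. If `fetchHtml` already returns both the HTML and the final URL, use that; otherwise extend the fetch helper to return the final URL as part of this task.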
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-integration.test.ts`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-integration.test.ts
|
||||||
|
git commit -m "refactor: handle facebook route-aware failure states"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 7: Run Full Verification And Live Probe
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `packages/core/src/scrapers/facebook.ts` if small cleanup is required
|
||||||
|
- Modify: `packages/core/test/facebook-core.test.ts` if small cleanup is required
|
||||||
|
- Modify: `packages/core/test/facebook-integration.test.ts` if small cleanup is required
|
||||||
|
|
||||||
|
- [ ] **Step 1: Run focused Facebook tests**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run broader core tests**
|
||||||
|
|
||||||
|
Run: `bun test packages/core/test`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 3: Run live authenticated Facebook probe**
|
||||||
|
|
||||||
|
Run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
set -a && source .env && set +a && bun --eval 'import { fetchFacebookItems, fetchFacebookItem } from "./packages/core/src/index.ts";
|
||||||
|
const results = await fetchFacebookItems("iphone", 1, "toronto", 3);
|
||||||
|
console.log("SEARCH_COUNT=" + results.length);
|
||||||
|
console.log(JSON.stringify(results[0] ?? null));
|
||||||
|
if (results[0]?.url) {
|
||||||
|
const match = results[0].url.match(/\/item\/(\d+)/);
|
||||||
|
if (match) {
|
||||||
|
const item = await fetchFacebookItem(match[1]);
|
||||||
|
console.log(JSON.stringify(item));
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
|
||||||
|
- search returns at least one result
|
||||||
|
- item fetch returns non-null for the first live result when the route is not stale/unavailable
|
||||||
|
|
||||||
|
- [ ] **Step 4: Make any minimal cleanup needed to keep tests and live probe green**
|
||||||
|
|
||||||
|
If cleanup is needed, keep it limited to naming, dead-code removal caused by the rewrite, or small parser corrections directly exposed by the verification commands.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Re-run verification**
|
||||||
|
|
||||||
|
Run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts && bun test packages/core/test
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts
|
||||||
|
git commit -m "refactor: complete facebook comet scraper rewrite"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Self-Review
|
||||||
|
|
||||||
|
- Spec coverage: the plan covers classification, route-aware search parsing, route-aware item parsing, HTML fallbacks, explicit failure-state handling, test replacement, and live verification.
|
||||||
|
- Placeholder scan: no `TODO`, `TBD`, or unspecified “handle appropriately” steps remain.
|
||||||
|
- Type consistency: all planned functions and types use the same names across tasks: `classifyFacebookResponse`, `extractFacebookBootstrapCandidates`, `extractFacebookMarketplaceData`, and `extractFacebookItemData`.
|
||||||
@@ -0,0 +1,226 @@
|
|||||||
|
# Facebook Comet Rewrite Design
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Replace the legacy Facebook Marketplace scraper with a route-aware implementation built around current Comet bootstrap markers and route-specific extraction.
|
||||||
|
The new scraper will keep authenticated direct HTTP fetches as the primary transport, but it will stop treating legacy `require`, `__bbox`, and `marketplace_product_details_page` structures as the main parsing contract.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
- Replace both Facebook search and item-detail extraction with a current-shape parser.
|
||||||
|
- Keep authenticated direct HTTP requests as the primary fetch strategy.
|
||||||
|
- Parse route-specific Comet bootstrap/state payloads before falling back to rendered-HTML extraction.
|
||||||
|
- Detect auth-gated, unavailable, and unknown responses explicitly.
|
||||||
|
- Update tests so they model current route markers and failure modes instead of legacy page objects.
|
||||||
|
|
||||||
|
## Non-Goals
|
||||||
|
|
||||||
|
- Reworking non-Facebook scrapers.
|
||||||
|
- Converting the scraper to browser-only automation.
|
||||||
|
- Preserving old parser behavior for `marketplace_product_details_page` or `__bbox`-driven item extraction.
|
||||||
|
- Reverse-engineering every internal Facebook bootstrap payload shape exhaustively before implementation.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
The current implementation in `packages/core/src/scrapers/facebook.ts` still uses authenticated HTTP requests, which remains correct.
|
||||||
|
The search path parses embedded script JSON and looks for `marketplace_search.feed_units.edges`.
|
||||||
|
The item-detail path is centered on legacy extraction paths such as:
|
||||||
|
|
||||||
|
- `parsed.require[0][3].__bbox.result.data.viewer.marketplace_product_details_page.target`
|
||||||
|
- nested `__bbox.require[...]` variations
|
||||||
|
- recursive search through `parsed.require`
|
||||||
|
|
||||||
|
Live evidence gathered earlier in this session and by the isolated research subagent shows that current Facebook Marketplace pages are Comet route-driven and expose markers such as:
|
||||||
|
|
||||||
|
- `XCometMarketplaceSearchController`
|
||||||
|
- `XCometMarketplacePermalinkController`
|
||||||
|
- `routing_namespace":"fb_comet"`
|
||||||
|
- `use_ssr_state_manager":true`
|
||||||
|
- `ServerJS`
|
||||||
|
- `Bootloader`
|
||||||
|
- `data-sjs`
|
||||||
|
- `data-btmanifest`
|
||||||
|
|
||||||
|
The same live investigation also showed that authenticated item pages no longer expose the old `marketplace_product_details_page` marker reliably, while live search still returns usable results.
|
||||||
|
|
||||||
|
## Chosen Approach
|
||||||
|
|
||||||
|
Use a hybrid Comet-bootstrap parser.
|
||||||
|
|
||||||
|
The scraper will:
|
||||||
|
|
||||||
|
1. Fetch authenticated HTML directly.
|
||||||
|
2. Classify the response using current route and auth markers.
|
||||||
|
3. Parse inline bootstrap/state payloads using route-specific probes.
|
||||||
|
4. Fall back to rendered-HTML extraction only when bootstrap markers are present but the payload cannot be decoded into the expected search or item shape.
|
||||||
|
|
||||||
|
This keeps the cheaper direct-HTTP transport while shifting the parser contract from legacy page-object names to current Comet route structure.
|
||||||
|
|
||||||
|
## Design
|
||||||
|
|
||||||
|
### Route Classification
|
||||||
|
|
||||||
|
Add a small response-classification layer before data extraction.
|
||||||
|
It should identify these states from the fetched response URL and HTML:
|
||||||
|
|
||||||
|
- `auth_gated`
|
||||||
|
- `unavailable`
|
||||||
|
- `search`
|
||||||
|
- `item`
|
||||||
|
- `unknown`
|
||||||
|
|
||||||
|
Signals to use:
|
||||||
|
|
||||||
|
- final URL containing `/login/` or login-shell text
|
||||||
|
- final URL containing `unavailable_product=1`
|
||||||
|
- search controller markers such as `XCometMarketplaceSearchController`
|
||||||
|
- item controller markers such as `XCometMarketplacePermalinkController`
|
||||||
|
- shared Comet markers such as `routing_namespace":"fb_comet"`
|
||||||
|
|
||||||
|
This classification layer becomes the top-level contract for both fetch functions.
|
||||||
|
|
||||||
|
### Search Extraction
|
||||||
|
|
||||||
|
The search path will be rewritten around Comet search-route markers.
|
||||||
|
|
||||||
|
Primary behavior:
|
||||||
|
|
||||||
|
- fetch the Marketplace search HTML with auth cookies
|
||||||
|
- confirm the response class is `search`
|
||||||
|
- extract inline bootstrap/state blobs from script tags and page attributes
|
||||||
|
- probe for route-specific search payloads associated with `XCometMarketplaceSearchController`
|
||||||
|
- map decoded search results into summary listing records
|
||||||
|
|
||||||
|
Search summary fields should remain aligned with the current public output shape:
|
||||||
|
|
||||||
|
- item URL
|
||||||
|
- title
|
||||||
|
- formatted price and normalized cents when possible
|
||||||
|
- city/address summary when present
|
||||||
|
- seller summary when present in the search payload
|
||||||
|
- category/status/media fields only when they are present with stable meaning
|
||||||
|
|
||||||
|
Fallback behavior:
|
||||||
|
|
||||||
|
- if search route markers are present but structured payload decoding fails, extract listing summaries from rendered HTML anchors and text patterns
|
||||||
|
- use item links matching `/marketplace/item/<id>` as the anchor for fallback extraction
|
||||||
|
- treat fallback results as summary-only data, not rich detail data
|
||||||
|
|
||||||
|
### Item Extraction
|
||||||
|
|
||||||
|
The item-detail path will be rewritten around the Comet permalink route.
|
||||||
|
|
||||||
|
Primary behavior:
|
||||||
|
|
||||||
|
- fetch the item permalink HTML with auth cookies
|
||||||
|
- confirm the response class is `item`
|
||||||
|
- extract inline bootstrap/state blobs from script tags and page attributes
|
||||||
|
- probe for permalink payloads associated with `XCometMarketplacePermalinkController`
|
||||||
|
- decode the richest recoverable item record and map it into `FacebookListingDetails`
|
||||||
|
|
||||||
|
Priority item fields:
|
||||||
|
|
||||||
|
- item ID and permalink URL
|
||||||
|
- title
|
||||||
|
- formatted price and normalized cents when possible
|
||||||
|
- condition
|
||||||
|
- description
|
||||||
|
- listed age / creation date when derivable
|
||||||
|
- approximate location
|
||||||
|
- seller name and seller ID when present
|
||||||
|
- listing status when the payload makes it explicit
|
||||||
|
|
||||||
|
Fallback behavior:
|
||||||
|
|
||||||
|
- if permalink route markers are present but no stable payload object is decodable, extract data from rendered HTML text structure
|
||||||
|
- prioritize title, price, condition, description, location text, and seller module content
|
||||||
|
- return partial item data when core user-facing fields are present rather than failing solely because deeper commerce metadata is missing
|
||||||
|
|
||||||
|
### Bootstrap Parsing Strategy
|
||||||
|
|
||||||
|
The parser should stop assuming a single stable JSON path.
|
||||||
|
Instead, it should work in two phases:
|
||||||
|
|
||||||
|
1. Discover candidate bootstrap payloads.
|
||||||
|
2. Score candidates against the expected route shape.
|
||||||
|
|
||||||
|
Candidate discovery inputs:
|
||||||
|
|
||||||
|
- raw `<script>` contents
|
||||||
|
- `data-sjs` and related page attributes
|
||||||
|
- `ServerJS` / `Bootloader` inline blobs
|
||||||
|
- route controller names
|
||||||
|
|
||||||
|
Candidate scoring for search should favor objects that contain repeated result-card semantics, item IDs, listing links, titles, prices, or location summaries.
|
||||||
|
Candidate scoring for item pages should favor objects that contain singular listing semantics, title, price, condition, description, location, seller, or permalink context.
|
||||||
|
|
||||||
|
The parser should not depend on one hard-coded object name surviving forever.
|
||||||
|
Instead, it should look for route-specific semantic clusters and choose the strongest candidate.
|
||||||
|
|
||||||
|
### Legacy Removal
|
||||||
|
|
||||||
|
The old Facebook scraper should be removed as a primary strategy.
|
||||||
|
Specifically:
|
||||||
|
|
||||||
|
- delete old item-detail extraction paths centered on `marketplace_product_details_page`
|
||||||
|
- delete legacy-first `require` / `__bbox` navigation tables
|
||||||
|
- delete tests whose only purpose is to preserve those legacy paths
|
||||||
|
|
||||||
|
If a minimal legacy compatibility branch remains, it must be a last-resort fallback behind the new route-aware parser and should not shape test fixtures or design decisions.
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
|
||||||
|
Facebook responses should now fail with explicit route-aware outcomes:
|
||||||
|
|
||||||
|
1. Missing/invalid auth cookie input.
|
||||||
|
2. Auth-gated response.
|
||||||
|
3. Unavailable or stale item response.
|
||||||
|
4. Search or item route detected, but no decodable data found.
|
||||||
|
5. Unknown response shape.
|
||||||
|
|
||||||
|
Error messages should name the actual class of failure instead of implying that every parse miss is caused by expired cookies.
|
||||||
|
|
||||||
|
### Testing Strategy
|
||||||
|
|
||||||
|
Follow TDD for the rewrite.
|
||||||
|
Write failing tests for the new route-aware parser before replacing production code.
|
||||||
|
|
||||||
|
Coverage targets:
|
||||||
|
|
||||||
|
1. Search responses classify correctly from current Comet controller markers.
|
||||||
|
2. Item responses classify correctly from current Comet controller markers.
|
||||||
|
3. Login-gated and unavailable responses are detected before parsing.
|
||||||
|
4. Search bootstrap parsing produces summary listing results from current-shape fixtures.
|
||||||
|
5. Item bootstrap parsing produces rich listing details from current-shape fixtures.
|
||||||
|
6. Search fallback extraction works when route markers exist but structured payload decoding fails.
|
||||||
|
7. Item fallback extraction works when route markers exist but structured payload decoding fails.
|
||||||
|
8. Old legacy-only item fixtures are removed or rewritten so they no longer define the contract.
|
||||||
|
|
||||||
|
Verification target after implementation:
|
||||||
|
|
||||||
|
- `bun test packages/core/test/facebook-core.test.ts`
|
||||||
|
- `bun test packages/core/test/facebook-integration.test.ts`
|
||||||
|
- a live authenticated Facebook probe covering search and item routes
|
||||||
|
|
||||||
|
## Public API Surface
|
||||||
|
|
||||||
|
Keep the current public function names unless the rewrite proves that a signature change is required:
|
||||||
|
|
||||||
|
- `fetchFacebookItems(...)`
|
||||||
|
- `fetchFacebookItem(...)`
|
||||||
|
- `extractFacebookMarketplaceData(...)`
|
||||||
|
- `extractFacebookItemData(...)`
|
||||||
|
|
||||||
|
The internals should change substantially, but callers should not need a new integration surface for this rewrite.
|
||||||
|
|
||||||
|
## Risks
|
||||||
|
|
||||||
|
- Facebook may change bootstrap payload naming again, so route/controller markers are more stable than exact nested object paths but still not guaranteed.
|
||||||
|
- Search and item pages may each contain multiple partial payloads, making candidate ranking important.
|
||||||
|
- Fallback rendered-HTML extraction may be noisier than bootstrap decoding and needs clear precedence rules.
|
||||||
|
- Live fixtures can drift from production quickly, so tests must model route semantics rather than exact one-off payloads where possible.
|
||||||
|
|
||||||
|
## Rollout Notes
|
||||||
|
|
||||||
|
The code, fixtures, and tests should change together.
|
||||||
|
There should be no mixed state where the implementation is Comet-aware but the tests still encode `marketplace_product_details_page` as the primary contract.
|
||||||
@@ -75,13 +75,6 @@ interface FacebookEdge {
|
|||||||
[k: string]: unknown;
|
[k: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface FacebookMarketplaceSearch {
|
|
||||||
feed_units?: {
|
|
||||||
edges?: FacebookEdge[];
|
|
||||||
};
|
|
||||||
[k: string]: unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface FacebookMarketplaceItem {
|
interface FacebookMarketplaceItem {
|
||||||
// Basic identification
|
// Basic identification
|
||||||
id: string;
|
id: string;
|
||||||
@@ -173,6 +166,10 @@ interface FacebookMarketplaceItem {
|
|||||||
[k: string]: unknown;
|
[k: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Captures the numeric listing ID from a Marketplace item path such as `/marketplace/item/123`. */
const FACEBOOK_ITEM_HREF_RE = /\/marketplace\/item\/(\d+)/;

/**
 * Matches a string that is, in its entirety, either a `$` / `CA$` amount
 * (optional thousands separators, optional two-digit cents) or the word
 * `FREE`. The match is case-insensitive.
 */
const FACEBOOK_PRICE_TEXT_RE = /^(CA\$|\$)\s*\d[\d,]*(?:\.\d{2})?$|^FREE$/i;

/**
 * Matches text that ends with a comma followed by two uppercase letters,
 * e.g. `Toronto, ON`. Only the end of the string is anchored.
 */
const FACEBOOK_LOCATION_TEXT_RE = /,\s*[A-Z]{2}$/;
|
||||||
|
|
||||||
export interface FacebookListingDetails {
|
export interface FacebookListingDetails {
|
||||||
url: string;
|
url: string;
|
||||||
title: string;
|
title: string;
|
||||||
@@ -286,7 +283,7 @@ async function fetchHtml(
|
|||||||
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
onRateInfo?: (remaining: string | null, reset: string | null) => void;
|
||||||
cookies?: string;
|
cookies?: string;
|
||||||
},
|
},
|
||||||
): Promise<HTMLString> {
|
): Promise<{ html: HTMLString; responseUrl: string }> {
|
||||||
const maxRetries = opts?.maxRetries ?? 3;
|
const maxRetries = opts?.maxRetries ?? 3;
|
||||||
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
const retryBaseMs = opts?.retryBaseMs ?? 500;
|
||||||
|
|
||||||
@@ -357,7 +354,7 @@ async function fetchHtml(
|
|||||||
const html = await res.text();
|
const html = await res.text();
|
||||||
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
// Respect per-request delay to keep at or under REQUESTS_PER_SECOND
|
||||||
await delay(DELAY_MS);
|
await delay(DELAY_MS);
|
||||||
return html;
|
return { html, responseUrl: res.url || url };
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (attempt >= maxRetries) throw err;
|
if (attempt >= maxRetries) throw err;
|
||||||
await delay((attempt + 1) * retryBaseMs);
|
await delay((attempt + 1) * retryBaseMs);
|
||||||
@@ -369,223 +366,477 @@ async function fetchHtml(
|
|||||||
|
|
||||||
// ----------------------------- Parsing -----------------------------
|
// ----------------------------- Parsing -----------------------------
|
||||||
|
|
||||||
|
/**
 * Coarse classification of a fetched Facebook response.
 *
 * - `search`: a Marketplace search route.
 * - `item`: a Marketplace item permalink route.
 * - `auth_gated`: the response is a login wall rather than content.
 * - `unavailable`: the requested listing is reported as unavailable or removed.
 * - `unknown`: none of the known markers were found.
 */
export type FacebookResponseKind =
  | "search"
  | "item"
  | "auth_gated"
  | "unavailable"
  | "unknown";
|
||||||
|
|
||||||
|
/**
 * Classify a fetched Facebook response before any data extraction is attempted.
 *
 * Checks run in a fixed order and the first match wins:
 * 1. auth gate: the final URL points at a login or checkpoint route, or the
 *    HTML contains login-wall text;
 * 2. unavailable: the final URL carries `unavailable_product=1`, or the HTML
 *    contains a "no longer available" / "has been removed" notice;
 * 3. item: the final URL contains `/marketplace/item/`;
 * 4. search: the HTML names `XCometMarketplaceSearchController`;
 * 5. item: the HTML names `XCometMarketplacePermalinkController`;
 * 6. otherwise unknown.
 *
 * @param htmlString Raw HTML body of the response.
 * @param responseUrl Final URL of the response.
 * @returns The detected kind plus convenience `authGated` / `unavailable` flags.
 */
export function classifyFacebookResponse(
  htmlString: HTMLString,
  responseUrl: string,
) {
  // Besides `/login/`, login redirects may also land on `/login.php`, and
  // challenged sessions on `/checkpoint/`. None of these carry listing data.
  const authRedirectMarkers = ["/login/", "/login.php", "/checkpoint/"];
  const authGated =
    authRedirectMarkers.some((marker) => responseUrl.includes(marker)) ||
    htmlString.includes("You must log in") ||
    htmlString.includes("log in to continue");

  if (authGated) {
    return { kind: "auth_gated" as const, authGated: true, unavailable: false };
  }

  const unavailable =
    responseUrl.includes("unavailable_product=1") ||
    htmlString.includes("This listing is no longer available") ||
    htmlString.includes("listing has been removed");

  if (unavailable) {
    return { kind: "unavailable" as const, authGated: false, unavailable: true };
  }

  // The URL is checked before the controller markers, so an item URL wins
  // even if the HTML also names the search controller.
  if (responseUrl.includes("/marketplace/item/")) {
    return { kind: "item" as const, authGated: false, unavailable: false };
  }

  if (htmlString.includes("XCometMarketplaceSearchController")) {
    return { kind: "search" as const, authGated: false, unavailable: false };
  }

  if (htmlString.includes("XCometMarketplacePermalinkController")) {
    return { kind: "item" as const, authGated: false, unavailable: false };
  }

  return { kind: "unknown" as const, authGated: false, unavailable: false };
}
|
||||||
|
|
||||||
|
/**
 * Collect every `<script>` body in the page that parses as a JSON object.
 *
 * Script bodies that are empty, are not valid JSON, or decode to something
 * `isRecord` rejects are skipped silently.
 *
 * @param htmlString Raw HTML of the fetched page.
 * @returns Decoded objects in document order; empty when none qualify.
 */
export function extractFacebookBootstrapCandidates(
  htmlString: HTMLString,
): Record<string, unknown>[] {
  const { document } = parseHTML(htmlString);
  const scriptElements = Array.from(
    document.querySelectorAll("script"),
  ) as HTMLScriptElement[];

  return scriptElements.flatMap((scriptElement) => {
    const body = scriptElement.textContent?.trim();
    if (!body) {
      return [];
    }

    let decoded: unknown;
    try {
      decoded = JSON.parse(body);
    } catch {
      // Not a JSON body (ordinary JavaScript, for instance); nothing to collect.
      return [];
    }

    return isRecord(decoded) ? [decoded as Record<string, unknown>] : [];
  });
}
|
||||||
|
|
||||||
|
/**
 * Type guard for a non-empty array in which every element is a record with a
 * record `node` that in turn has a record `listing`.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }

  const hasListingNode = (edge: unknown): boolean =>
    isRecord(edge) && isRecord(edge.node) && isRecord(edge.node.listing);

  return value.every(hasListingNode);
}
|
||||||
|
|
||||||
|
/**
 * Add a size bonus to a base score: one point per edge, capped at three.
 *
 * @param edges Edge array being scored.
 * @param score Base score accumulated so far.
 * @returns `score` plus the capped edge-count bonus.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const sizeBonus = edges.length < 3 ? edges.length : 3;
  return score + sizeBonus;
}
|
||||||
|
|
||||||
|
/**
 * Recursively search a decoded value for the best-scoring array of search edges.
 *
 * On any record, two container shapes are recognised: `feed_units.edges`
 * (base bonus +2) and each `resultGroups[].edges` (base bonus +4). Each
 * recognised array is further scored by `scoreSearchEdges`. Descending through
 * a key named `payload` adds +1 to the score passed down; array elements
 * inherit the score unchanged. On equal scores the match found first is kept.
 *
 * @param candidate Any decoded value (record, array, or primitive).
 * @param score Score accumulated on the way down to `candidate`.
 * @returns The winning edges with their score, or `null` when nothing matched.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  type EdgeMatch = { edges: FacebookEdge[]; score: number };

  // Strictly-greater comparison: an earlier match survives a tie.
  const keepBetter = (
    current: EdgeMatch | null,
    contender: EdgeMatch | null,
  ): EdgeMatch | null =>
    contender !== null && (current === null || contender.score > current.score)
      ? contender
      : current;

  if (Array.isArray(candidate)) {
    return candidate.reduce<EdgeMatch | null>(
      (leader, element) => keepBetter(leader, findSearchEdges(element, score)),
      null,
    );
  }

  if (!isRecord(candidate)) {
    return null;
  }

  let leader: EdgeMatch | null = null;

  const feedUnits = candidate.feed_units;
  if (isRecord(feedUnits) && isFacebookSearchEdgeArray(feedUnits.edges)) {
    leader = {
      edges: feedUnits.edges,
      score: scoreSearchEdges(feedUnits.edges, score + 2),
    };
  }

  const groups = Array.isArray(candidate.resultGroups)
    ? candidate.resultGroups
    : [];
  for (const group of groups) {
    if (!isRecord(group) || !isFacebookSearchEdgeArray(group.edges)) {
      continue;
    }
    leader = keepBetter(leader, {
      edges: group.edges,
      score: scoreSearchEdges(group.edges, score + 4),
    });
  }

  // Recurse into every property, including the ones probed directly above.
  for (const [key, child] of Object.entries(candidate)) {
    const inheritedScore = key === "payload" ? score + 1 : score;
    leader = keepBetter(leader, findSearchEdges(child, inheritedScore));
  }

  return leader;
}
|
||||||
|
|
||||||
|
/** One item-shaped object found while walking a bootstrap candidate. */
interface FacebookMarketplaceItemMatch {
  /** The matched object, treated as a Marketplace item. */
  item: FacebookMarketplaceItem;
  /** Ranking score derived from `path`. */
  score: number;
  /** Object keys traversed from the candidate root down to `item`. */
  path: string[];
}
|
||||||
|
|
||||||
|
/**
 * Score the key path at which an item-shaped object was found.
 *
 * Each of these segments, if present anywhere in the path, adds its weight
 * once: `payload` +2, `viewer` +2, `marketplace_product_details_page` +6,
 * `target` +8, `listing` +6. A path with any segment containing `recommend`
 * or `related` loses 10. Finally the path length is subtracted, so shallower
 * matches score higher.
 *
 * @param path Object keys from the candidate root to the matched item.
 * @returns The path score; may be negative.
 */
function scoreMarketplaceItemPath(path: string[]): number {
  const segmentWeights: ReadonlyArray<readonly [string, number]> = [
    ["payload", 2],
    ["viewer", 2],
    ["marketplace_product_details_page", 6],
    ["target", 8],
    ["listing", 6],
  ];

  const bonus = segmentWeights.reduce(
    (total, [segment, weight]) =>
      path.includes(segment) ? total + weight : total,
    0,
  );

  const underSuggestions = path.some(
    (segment) => segment.includes("recommend") || segment.includes("related"),
  );
  const penalty = underSuggestions ? 10 : 0;

  return bonus - penalty - path.length;
}
|
||||||
|
|
||||||
|
/**
 * Walk a decoded value depth-first and collect every object that looks like a
 * Marketplace item: a string `id`, `__typename` equal to
 * `"GroupCommerceProductItem"`, and a string `marketplace_listing_title`.
 *
 * Array elements share their parent's path; only object keys extend it. A
 * matching object is recorded before its own properties are visited.
 *
 * @param candidate Any decoded value.
 * @param path Keys already traversed to reach `candidate`.
 * @returns All matches in traversal order, each with its path and path score.
 */
function collectMarketplaceItemCandidates(
  candidate: unknown,
  path: string[] = [],
): FacebookMarketplaceItemMatch[] {
  const found: FacebookMarketplaceItemMatch[] = [];

  const visit = (value: unknown, trail: string[]): void => {
    if (Array.isArray(value)) {
      for (const element of value) {
        visit(element, trail);
      }
      return;
    }

    if (!isRecord(value)) {
      return;
    }

    const looksLikeItem =
      typeof value.id === "string" &&
      value.__typename === "GroupCommerceProductItem" &&
      typeof value.marketplace_listing_title === "string";

    if (looksLikeItem) {
      found.push({
        item: value as FacebookMarketplaceItem,
        score: scoreMarketplaceItemPath(trail),
        path: trail,
      });
    }

    for (const [key, child] of Object.entries(value)) {
      visit(child, [...trail, key]);
    }
  };

  visit(candidate, path);
  return found;
}
|
||||||
|
|
||||||
|
/**
 * Convert a rendered price label into a structured price.
 *
 * Blank input and the word "FREE" (any case) yield an amount of `"0.00"`;
 * blank input is reported with the label `"FREE"`. Otherwise the first run of
 * digits and commas (with optional two-digit cents) is parsed after removing
 * the commas. The currency is always reported as `"CAD"`.
 *
 * @param priceText Price label as rendered on the page.
 * @returns The parsed price, or `null` when no finite number can be read.
 */
function parseFacebookRenderedPrice(priceText: string) {
  const label = priceText.trim();

  const isFree = label === "" || label.toUpperCase() === "FREE";
  if (isFree) {
    return {
      amount: "0.00",
      formatted_amount: label || "FREE",
      currency: "CAD",
    };
  }

  const digits = /[\d,]+(?:\.\d{2})?/.exec(label)?.[0];
  if (digits === undefined) {
    return null;
  }

  // Remove thousands separators before parsing.
  const value = Number.parseFloat(digits.split(",").join(""));

  return Number.isFinite(value)
    ? {
        amount: value.toFixed(2),
        formatted_amount: label,
        currency: "CAD",
      }
    : null;
}
|
||||||
|
|
||||||
|
/**
 * Return the trimmed, non-empty text content of every element under `node`
 * that matches `selector`, in the order `querySelectorAll` yields them.
 */
function extractRenderedText(node: ParentNode, selector: string): string[] {
  const collected: string[] = [];

  for (const element of Array.from(node.querySelectorAll(selector))) {
    const text = element.textContent?.trim();
    if (text) {
      collected.push(text);
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
/**
 * Read the listing ID out of an element's `href` attribute.
 *
 * @param element Element carrying the `href`, or `null`.
 * @returns The captured numeric ID, or `null` when the element is missing,
 *   has no `href`, or the `href` does not match `FACEBOOK_ITEM_HREF_RE`.
 */
function extractMarketplaceItemIdFromElement(element: Element | null): string | null {
  const href = element?.getAttribute("href");
  if (!href) {
    return null;
  }

  const match = FACEBOOK_ITEM_HREF_RE.exec(href);
  return match?.[1] ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Determine the listing ID of an item permalink page from its rendered DOM.
 *
 * Sources are tried in order:
 * 1. a `<link rel="canonical">` whose `href` contains `/marketplace/item/`;
 * 2. the `og:url` meta tag;
 * 3. if the page has a non-empty `<h1>`: the first item link whose text
 *    contains that heading, otherwise the last item link on the page.
 *
 * NOTE(review): the last-link fallback in step 3 may pick a link that belongs
 * to a different listing shown on the page. Confirm this is intended.
 *
 * @param document Parsed item page.
 * @returns The listing ID, or `null` when no source yields one.
 */
function extractFacebookPermalinkItemId(document: Document): string | null {
  const canonicalLink = document.querySelector(
    'link[rel="canonical"][href*="/marketplace/item/"]',
  );
  const fromCanonical = extractMarketplaceItemIdFromElement(canonicalLink);
  if (fromCanonical) {
    return fromCanonical;
  }

  const ogUrl = document
    .querySelector('meta[property="og:url"]')
    ?.getAttribute("content");
  const fromOpenGraph = ogUrl?.match(FACEBOOK_ITEM_HREF_RE)?.[1];
  if (fromOpenGraph) {
    return fromOpenGraph;
  }

  const heading = document.querySelector("h1")?.textContent?.trim();
  if (!heading) {
    return null;
  }

  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  );
  const chosen =
    anchors.find((anchor) => anchor.textContent?.includes(heading)) ??
    anchors.at(-1) ??
    null;

  return extractMarketplaceItemIdFromElement(chosen);
}
|
||||||
|
|
||||||
|
/**
 * Find the listing description in rendered HTML.
 *
 * Looks for a `div`, `span`, `h2`, `h3` or `p` whose trimmed text is exactly
 * `Description`, then returns the trimmed text of the first following sibling
 * element that is non-empty and is not itself `Description`. If a label has no
 * such sibling, later labels are tried.
 *
 * @param document Parsed item page.
 * @returns The description text, or `undefined` when none is found.
 */
function extractFacebookDescriptionText(document: Document): string | undefined {
  const LABEL = "Description";
  const nodes = Array.from(document.querySelectorAll("div, span, h2, h3, p"));

  for (const node of nodes) {
    if (node.textContent?.trim() !== LABEL) {
      continue;
    }

    for (
      let next = node.nextElementSibling;
      next;
      next = next.nextElementSibling
    ) {
      const body = next.textContent?.trim();
      if (body && body !== LABEL) {
        return body;
      }
    }
  }

  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Build summary listings from rendered item links when no structured payload
 * could be decoded.
 *
 * For each `<a>` whose `href` contains a Marketplace item ID, the text of its
 * `span` / `div` descendants is scanned for a price, a location, and a title
 * (the first text that is neither of those and contains no `/`). Links with
 * no title, no price, or an unparseable price are skipped. Only the first
 * usable link per listing ID is kept.
 *
 * NOTE(review): a wrapping element's text includes the text of everything
 * nested inside it, so with nested markup a container's combined text may be
 * picked as the location. Verify against real markup.
 *
 * @param htmlString Raw HTML of the search page.
 * @returns Listings in document order, or `null` when none could be built.
 */
function extractFacebookMarketplaceHtmlFallback(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const { document } = parseHTML(htmlString);
  const anchors = Array.from(
    document.querySelectorAll('a[href*="/marketplace/item/"]'),
  ) as HTMLAnchorElement[];

  // Insertion-ordered, so values() reproduces document order of first hits.
  const listingsById = new Map<string, FacebookAdNode>();

  for (const anchor of anchors) {
    const href = anchor.getAttribute("href") || "";
    const listingId = href.match(FACEBOOK_ITEM_HREF_RE)?.[1];
    if (!listingId || listingsById.has(listingId)) {
      continue;
    }

    const fragments = extractRenderedText(anchor, "span, div");
    const priceText = fragments.find((fragment) =>
      FACEBOOK_PRICE_TEXT_RE.test(fragment),
    );
    const locationText = fragments.find((fragment) =>
      FACEBOOK_LOCATION_TEXT_RE.test(fragment),
    );
    const titleText = fragments.find(
      (fragment) =>
        fragment !== priceText &&
        fragment !== locationText &&
        !fragment.includes("/"),
    );
    if (!titleText || !priceText) {
      continue;
    }

    const listingPrice = parseFacebookRenderedPrice(priceText);
    if (!listingPrice) {
      continue;
    }

    listingsById.set(listingId, {
      node: {
        listing: {
          id: listingId,
          marketplace_listing_title: titleText,
          listing_price: listingPrice,
          location: locationText
            ? {
                reverse_geocode: {
                  city_page: {
                    display_name: locationText,
                  },
                },
              }
            : undefined,
          is_live: true,
        },
      },
    });
  }

  return listingsById.size > 0 ? Array.from(listingsById.values()) : null;
}
|
||||||
|
|
||||||
|
/**
 * Build a partial item record from rendered HTML when no structured payload
 * could be decoded.
 *
 * A listing ID and an `<h1>` title are required. Price, location and
 * description are filled in when found and left `undefined` otherwise. The
 * result is always marked `is_live: true`.
 *
 * NOTE(review): `amount_with_offset` is set to the same decimal string as
 * `amount`. Confirm that consumers do not expect a different unit there.
 *
 * @param htmlString Raw HTML of the item page.
 * @returns The partial item, or `null` when the ID or title is missing.
 */
function extractFacebookItemHtmlFallback(
  htmlString: HTMLString,
): FacebookMarketplaceItem | null {
  const { document } = parseHTML(htmlString);
  const heading = document.querySelector("h1")?.textContent?.trim();
  const listingId = extractFacebookPermalinkItemId(document);

  if (!(listingId && heading)) {
    return null;
  }

  const fragments = extractRenderedText(document, "h1, span, div, p");
  const priceText = fragments.find((fragment) =>
    FACEBOOK_PRICE_TEXT_RE.test(fragment),
  );
  const price = priceText ? parseFacebookRenderedPrice(priceText) : null;
  const locationText = fragments.find(
    (fragment) =>
      fragment !== heading &&
      fragment !== priceText &&
      FACEBOOK_LOCATION_TEXT_RE.test(fragment),
  );
  const descriptionText = extractFacebookDescriptionText(document);

  const item: FacebookMarketplaceItem = {
    id: listingId,
    __typename: "GroupCommerceProductItem",
    marketplace_listing_title: heading,
    formatted_price: priceText ? { text: priceText } : undefined,
    listing_price: price
      ? {
          amount: price.amount,
          currency: price.currency,
          amount_with_offset: price.amount,
        }
      : undefined,
    location_text: locationText ? { text: locationText } : undefined,
    redacted_description: descriptionText
      ? { text: descriptionText }
      : undefined,
    is_live: true,
  };

  return item;
}
|
||||||
|
|
||||||
/**
 * Extract marketplace search listings from a Facebook search page.
 *
 * Every bootstrap candidate is searched for edge arrays and the
 * highest-scoring non-empty array wins; on equal scores the earlier candidate
 * is kept. If no candidate yields edges and the page names
 * `XCometMarketplaceSearchController`, rendered item links are parsed instead.
 *
 * @param htmlString Raw HTML of the search page.
 * @returns One `{ node }` entry per listing, or `null` when nothing was found.
 */
export function extractFacebookMarketplaceData(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  let winner: { edges: FacebookEdge[]; score: number } | null = null;

  for (const candidate of extractFacebookBootstrapCandidates(htmlString)) {
    const found = findSearchEdges(candidate);
    if (!found || found.edges.length === 0) {
      continue;
    }

    if (found.score > (winner?.score ?? -1)) {
      winner = found;
    }
  }

  if (winner) {
    console.log(
      `Successfully parsed ${winner.edges.length} Facebook marketplace listings`,
    );
    return winner.edges.map((edge) => ({ node: edge.node }));
  }

  // Only attempt the rendered-HTML fallback on pages that name the search controller.
  if (htmlString.includes("XCometMarketplaceSearchController")) {
    const htmlFallback = extractFacebookMarketplaceHtmlFallback(htmlString);
    if (htmlFallback?.length) {
      console.log(
        `Successfully parsed ${htmlFallback.length} Facebook marketplace listings from rendered HTML fallback`,
      );
      return htmlFallback;
    }
  }

  console.warn("No marketplace data found in HTML response");
  return null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace item details from Facebook item page HTML
|
Extract marketplace item details from Facebook item page HTML
|
||||||
Updated for 2026 Facebook Marketplace API structure with multiple extraction paths
|
Updated for 2026 Facebook Marketplace bootstrap candidates
|
||||||
*/
|
*/
|
||||||
export function extractFacebookItemData(
|
export function extractFacebookItemData(
|
||||||
htmlString: HTMLString,
|
htmlString: HTMLString,
|
||||||
): FacebookMarketplaceItem | null {
|
): FacebookMarketplaceItem | null {
|
||||||
const { document } = parseHTML(htmlString);
|
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||||
const scripts = document.querySelectorAll("script");
|
let bestMatch: FacebookMarketplaceItemMatch | null = null;
|
||||||
|
|
||||||
for (const script of scripts) {
|
for (const candidate of candidates) {
|
||||||
const scriptText = script.textContent;
|
const matches = collectMarketplaceItemCandidates(candidate);
|
||||||
if (!scriptText) continue;
|
|
||||||
|
|
||||||
try {
|
for (const match of matches) {
|
||||||
const parsed = JSON.parse(scriptText);
|
if (
|
||||||
|
!bestMatch ||
|
||||||
// Check for the require structure with marketplace product details
|
match.score > bestMatch.score ||
|
||||||
if (parsed.require && Array.isArray(parsed.require)) {
|
(match.score === bestMatch.score && match.path.length < bestMatch.path.length)
|
||||||
// Try multiple extraction paths discovered from reverse engineering
|
) {
|
||||||
const extractionPaths = [
|
bestMatch = match;
|
||||||
// Path 1: Primary path from current API structure
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3].__bbox.result.data.viewer
|
|
||||||
.marketplace_product_details_page.target,
|
|
||||||
// Path 2: Alternative path with nested require
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3][0].__bbox.require[3][3][1].__bbox.result.data
|
|
||||||
.viewer.marketplace_product_details_page.target,
|
|
||||||
// Path 3: Variation without the [0] index
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3].__bbox.require[3][3][1].__bbox.result.data
|
|
||||||
.viewer.marketplace_product_details_page.target,
|
|
||||||
// Path 4-5: Additional fallback paths for edge cases
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3][1]?.__bbox?.result?.data?.viewer
|
|
||||||
?.marketplace_product_details_page?.target,
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3][2]?.__bbox?.result?.data?.viewer
|
|
||||||
?.marketplace_product_details_page?.target,
|
|
||||||
];
|
|
||||||
|
|
||||||
let pathIndex = 0;
|
|
||||||
for (const getPath of extractionPaths) {
|
|
||||||
try {
|
|
||||||
const targetData = getPath();
|
|
||||||
if (
|
|
||||||
targetData &&
|
|
||||||
typeof targetData === "object" &&
|
|
||||||
targetData.id &&
|
|
||||||
targetData.marketplace_listing_title &&
|
|
||||||
targetData.__typename === "GroupCommerceProductItem"
|
|
||||||
) {
|
|
||||||
console.log(
|
|
||||||
`Successfully extracted Facebook item data using extraction path ${pathIndex + 1}`,
|
|
||||||
);
|
|
||||||
return targetData as FacebookMarketplaceItem;
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Path not found or invalid, try next path
|
|
||||||
}
|
|
||||||
pathIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: Search recursively for marketplace data in the parsed structure
|
|
||||||
const findMarketplaceData = (
|
|
||||||
obj: unknown,
|
|
||||||
depth = 0,
|
|
||||||
maxDepth = 10,
|
|
||||||
): FacebookMarketplaceItem | null => {
|
|
||||||
if (depth > maxDepth) return null; // Prevent infinite recursion
|
|
||||||
if (isRecord(obj)) {
|
|
||||||
// Check if this object matches the expected marketplace item structure
|
|
||||||
const candidate = obj as Record<string, unknown>;
|
|
||||||
if (
|
|
||||||
candidate.marketplace_listing_title &&
|
|
||||||
candidate.id &&
|
|
||||||
candidate.__typename === "GroupCommerceProductItem" &&
|
|
||||||
candidate.redacted_description
|
|
||||||
) {
|
|
||||||
return candidate as unknown as FacebookMarketplaceItem;
|
|
||||||
}
|
|
||||||
// Recursively search nested objects and arrays
|
|
||||||
for (const key in obj) {
|
|
||||||
const value = obj[key];
|
|
||||||
if (isRecord(value) || Array.isArray(value)) {
|
|
||||||
const result = findMarketplaceData(value, depth + 1, maxDepth);
|
|
||||||
if (result) return result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (Array.isArray(obj)) {
|
|
||||||
// Search through arrays
|
|
||||||
for (const item of obj) {
|
|
||||||
const result = findMarketplaceData(item, depth + 1, maxDepth);
|
|
||||||
if (result) return result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Search through the entire require structure
|
|
||||||
const recursiveResult = findMarketplaceData(parsed.require);
|
|
||||||
if (recursiveResult) {
|
|
||||||
console.log(
|
|
||||||
"Successfully extracted Facebook item data using recursive search",
|
|
||||||
);
|
|
||||||
return recursiveResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Additional search in other potential locations
|
|
||||||
if (
|
|
||||||
parsed.__bbox?.result?.data?.viewer?.marketplace_product_details_page
|
|
||||||
?.target
|
|
||||||
) {
|
|
||||||
const bboxData =
|
|
||||||
parsed.__bbox.result.data.viewer.marketplace_product_details_page
|
|
||||||
.target;
|
|
||||||
if (
|
|
||||||
bboxData &&
|
|
||||||
typeof bboxData === "object" &&
|
|
||||||
bboxData.id &&
|
|
||||||
bboxData.marketplace_listing_title &&
|
|
||||||
bboxData.__typename === "GroupCommerceProductItem"
|
|
||||||
) {
|
|
||||||
console.log(
|
|
||||||
"Successfully extracted Facebook item data from __bbox structure",
|
|
||||||
);
|
|
||||||
return bboxData as FacebookMarketplaceItem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} catch {}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bestMatch) {
|
||||||
|
return bestMatch.item;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htmlString.includes("XCometMarketplacePermalinkController")) {
|
||||||
|
return extractFacebookItemHtmlFallback(htmlString);
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
@@ -838,8 +1089,9 @@ export default async function fetchFacebookItems(
|
|||||||
console.log(`Using ${cookies.length} cookies for authentication`);
|
console.log(`Using ${cookies.length} cookies for authentication`);
|
||||||
|
|
||||||
let searchHtml: string;
|
let searchHtml: string;
|
||||||
|
let searchResponseUrl = searchUrl;
|
||||||
try {
|
try {
|
||||||
searchHtml = await fetchHtml(searchUrl, DELAY_MS, {
|
const response = await fetchHtml(searchUrl, DELAY_MS, {
|
||||||
maxRetries: 3,
|
maxRetries: 3,
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
@@ -850,6 +1102,8 @@ export default async function fetchFacebookItems(
|
|||||||
},
|
},
|
||||||
cookies: cookiesHeader,
|
cookies: cookiesHeader,
|
||||||
});
|
});
|
||||||
|
searchHtml = response.html;
|
||||||
|
searchResponseUrl = response.responseUrl;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -865,6 +1119,24 @@ export default async function fetchFacebookItems(
|
|||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const classification = classifyFacebookResponse(searchHtml, searchResponseUrl);
|
||||||
|
if (classification.authGated) {
|
||||||
|
console.warn("Facebook marketplace search redirected to login. Cookies may be expired.");
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.unavailable) {
|
||||||
|
console.warn("Facebook marketplace search returned an unavailable route.");
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.kind !== "search") {
|
||||||
|
console.warn(
|
||||||
|
`Facebook marketplace search returned unexpected route kind: ${classification.kind}.`,
|
||||||
|
);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
const ads = extractFacebookMarketplaceData(searchHtml);
|
const ads = extractFacebookMarketplaceData(searchHtml);
|
||||||
if (!ads || ads.length === 0) {
|
if (!ads || ads.length === 0) {
|
||||||
console.warn("No ads parsed from Facebook marketplace page.");
|
console.warn("No ads parsed from Facebook marketplace page.");
|
||||||
@@ -916,8 +1188,9 @@ export async function fetchFacebookItem(
|
|||||||
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
|
console.log(`Fetching Facebook marketplace item: ${itemUrl}`);
|
||||||
|
|
||||||
let itemHtml: string;
|
let itemHtml: string;
|
||||||
|
let itemResponseUrl = itemUrl;
|
||||||
try {
|
try {
|
||||||
itemHtml = await fetchHtml(itemUrl, 1000, {
|
const response = await fetchHtml(itemUrl, 1000, {
|
||||||
onRateInfo: (remaining, reset) => {
|
onRateInfo: (remaining, reset) => {
|
||||||
if (remaining && reset) {
|
if (remaining && reset) {
|
||||||
console.log(
|
console.log(
|
||||||
@@ -927,6 +1200,8 @@ export async function fetchFacebookItem(
|
|||||||
},
|
},
|
||||||
cookies: cookiesHeader,
|
cookies: cookiesHeader,
|
||||||
});
|
});
|
||||||
|
itemHtml = response.html;
|
||||||
|
itemResponseUrl = response.responseUrl;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof HttpError) {
|
if (err instanceof HttpError) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -967,31 +1242,31 @@ export async function fetchFacebookItem(
|
|||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const classification = classifyFacebookResponse(itemHtml, itemResponseUrl);
|
||||||
|
|
||||||
|
if (classification.authGated) {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.unavailable || itemHtml.includes("This item has been sold")) {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(`Item ${itemId} appears to be sold or removed from marketplace.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (classification.kind !== "item") {
|
||||||
|
logExtractionMetrics(false, itemId);
|
||||||
|
console.warn(
|
||||||
|
`Item ${itemId} returned unexpected route kind: ${classification.kind}.`,
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const itemData = extractFacebookItemData(itemHtml);
|
const itemData = extractFacebookItemData(itemHtml);
|
||||||
if (!itemData) {
|
if (!itemData) {
|
||||||
logExtractionMetrics(false, itemId);
|
logExtractionMetrics(false, itemId);
|
||||||
// Enhanced checking for specific failure scenarios
|
|
||||||
if (
|
|
||||||
itemHtml.includes("This listing is no longer available") ||
|
|
||||||
itemHtml.includes("listing has been removed") ||
|
|
||||||
itemHtml.includes("This item has been sold")
|
|
||||||
) {
|
|
||||||
console.warn(
|
|
||||||
`Item ${itemId} appears to be sold or removed from marketplace.`,
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
itemHtml.includes("log in to Facebook") ||
|
|
||||||
itemHtml.includes("You must log in") ||
|
|
||||||
itemHtml.includes("authentication required")
|
|
||||||
) {
|
|
||||||
console.warn(
|
|
||||||
`Authentication failed for item ${itemId}. Cookies may be expired.`,
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
console.warn(
|
console.warn(
|
||||||
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
`No item data found in Facebook marketplace page for item ${itemId}. This may indicate:`,
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||||
import {
|
import {
|
||||||
|
classifyFacebookResponse,
|
||||||
ensureFacebookCookies,
|
ensureFacebookCookies,
|
||||||
|
extractFacebookBootstrapCandidates,
|
||||||
extractFacebookItemData,
|
extractFacebookItemData,
|
||||||
extractFacebookMarketplaceData,
|
extractFacebookMarketplaceData,
|
||||||
fetchFacebookItem,
|
fetchFacebookItem,
|
||||||
@@ -367,43 +369,134 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
|
|
||||||
describe("Data Extraction", () => {
|
describe("Data Extraction", () => {
|
||||||
describe("extractFacebookItemData", () => {
|
describe("extractFacebookItemData", () => {
|
||||||
test("should extract item data from standard require structure", () => {
|
test("extracts item details from Comet permalink bootstrap candidates", () => {
|
||||||
const mockItemData = {
|
const html = `
|
||||||
id: "123456",
|
<html><body>
|
||||||
__typename: "GroupCommerceProductItem",
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
marketplace_listing_title: "Test Item",
|
<script>
|
||||||
formatted_price: { text: "$100.00" },
|
${JSON.stringify({
|
||||||
listing_price: { amount: "100.00", currency: "CAD" },
|
payload: {
|
||||||
is_live: true,
|
listing: {
|
||||||
};
|
id: "123",
|
||||||
const mockData = {
|
__typename: "GroupCommerceProductItem",
|
||||||
require: [
|
marketplace_listing_title: "Vintage Chair",
|
||||||
[
|
formatted_price: { text: "CA$80" },
|
||||||
null,
|
listing_price: {
|
||||||
null,
|
amount: "80.00",
|
||||||
null,
|
currency: "CAD",
|
||||||
{
|
amount_with_offset: "80.00",
|
||||||
__bbox: {
|
|
||||||
result: {
|
|
||||||
data: {
|
|
||||||
viewer: {
|
|
||||||
marketplace_product_details_page: {
|
|
||||||
target: mockItemData,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
|
redacted_description: { text: "Solid wood chair" },
|
||||||
|
location_text: { text: "Toronto, ON" },
|
||||||
|
marketplace_listing_seller: { id: "seller-1", name: "Alex" },
|
||||||
|
condition: "USED",
|
||||||
|
is_live: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
})}
|
||||||
],
|
</script>
|
||||||
],
|
</body></html>
|
||||||
};
|
`;
|
||||||
const html = `<html><body><script>${JSON.stringify(mockData)}</script></body></html>`;
|
|
||||||
|
|
||||||
const result = extractFacebookItemData(html);
|
const result = extractFacebookItemData(html);
|
||||||
expect(result).not.toBeNull();
|
expect(result).not.toBeNull();
|
||||||
expect(result?.id).toBe("123456");
|
expect(result?.id).toBe("123");
|
||||||
expect(result?.marketplace_listing_title).toBe("Test Item");
|
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("falls back to rendered item HTML when permalink bootstrap payloads are undecodable", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>{invalid: json}</script>
|
||||||
|
<h1>Vintage Chair</h1>
|
||||||
|
<span>CA$80</span>
|
||||||
|
<div>Toronto, ON</div>
|
||||||
|
<div>Description</div>
|
||||||
|
<div>Solid wood chair</div>
|
||||||
|
<a href="/marketplace/item/123/">View listing</a>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = extractFacebookItemData(html);
|
||||||
|
expect(result).not.toBeNull();
|
||||||
|
expect(result?.id).toBe("123");
|
||||||
|
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
|
expect(result?.formatted_price?.text).toBe("CA$80");
|
||||||
|
expect(result?.location_text?.text).toBe("Toronto, ON");
|
||||||
|
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("uses canonical permalink context instead of earlier related links in item HTML fallback", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<link rel="canonical" href="https://www.facebook.com/marketplace/item/123/" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>{invalid: json}</script>
|
||||||
|
<a href="/marketplace/item/999/">
|
||||||
|
<span>Related Chair</span>
|
||||||
|
</a>
|
||||||
|
<h1>Vintage Chair</h1>
|
||||||
|
<span>CA$80</span>
|
||||||
|
<div>Toronto, ON</div>
|
||||||
|
<div>Message seller</div>
|
||||||
|
<div>Seller details</div>
|
||||||
|
<div>Description</div>
|
||||||
|
<div>Solid wood chair</div>
|
||||||
|
<a href="/marketplace/item/123/">View listing</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = extractFacebookItemData(html);
|
||||||
|
expect(result).not.toBeNull();
|
||||||
|
expect(result?.id).toBe("123");
|
||||||
|
expect(result?.marketplace_listing_title).toBe("Vintage Chair");
|
||||||
|
expect(result?.redacted_description?.text).toBe("Solid wood chair");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("prefers the canonical permalink target over earlier decoy items", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
recommendation_units: [
|
||||||
|
{
|
||||||
|
listing: {
|
||||||
|
id: "decoy-1",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Recommended Chair",
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
target: {
|
||||||
|
id: "real-123",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Canonical Chair",
|
||||||
|
formatted_price: { text: "CA$120" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "120.00",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = extractFacebookItemData(html);
|
||||||
|
expect(result).not.toBeNull();
|
||||||
|
expect(result?.id).toBe("real-123");
|
||||||
|
expect(result?.marketplace_listing_title).toBe("Canonical Chair");
|
||||||
});
|
});
|
||||||
|
|
||||||
test("should handle missing item data", () => {
|
test("should handle missing item data", () => {
|
||||||
@@ -545,6 +638,33 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("falls back to rendered search HTML when search bootstrap payloads are undecodable", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>{invalid: json}</script>
|
||||||
|
<a href="/marketplace/item/987654321/">
|
||||||
|
<span>Vintage Bike</span>
|
||||||
|
<span>CA$120</span>
|
||||||
|
<span>Toronto, ON</span>
|
||||||
|
</a>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const result = extractFacebookMarketplaceData(html);
|
||||||
|
expect(result).not.toBeNull();
|
||||||
|
expect(result).toHaveLength(1);
|
||||||
|
expect(result?.[0].node.listing.id).toBe("987654321");
|
||||||
|
expect(result?.[0].node.listing.marketplace_listing_title).toBe(
|
||||||
|
"Vintage Bike",
|
||||||
|
);
|
||||||
|
expect(result?.[0].node.listing.listing_price).toEqual({
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle empty search results", () => {
|
test("should handle empty search results", () => {
|
||||||
const mockData = {
|
const mockData = {
|
||||||
require: [
|
require: [
|
||||||
@@ -571,6 +691,305 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
const result = extractFacebookMarketplaceData(html);
|
const result = extractFacebookMarketplaceData(html);
|
||||||
expect(result).toBeNull();
|
expect(result).toBeNull();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("classifies Comet search responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head><title>Marketplace</title></head>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet","use_ssr_state_manager":true}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "search",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies Comet item responses", () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplacePermalinkController"</script>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/item/123/",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "item",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies login-gated responses before parsing", () => {
|
||||||
|
const html = `<html><body>You must log in to Facebook</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "auth_gated",
|
||||||
|
authGated: true,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies unavailable item responses", () => {
|
||||||
|
const html = `<html><body>Marketplace</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unavailable",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: true,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("classifies unknown responses when no signal is present", () => {
|
||||||
|
const html = `<html><body>Some random page</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unknown",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("does not false-positive on incidental login text", () => {
|
||||||
|
const html = `<html><body><footer>log in to Facebook to see your notifications</footer></body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/marketplace/toronto/search?query=bike",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "unknown",
|
||||||
|
authGated: false,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("detects auth gating from URL redirect", () => {
|
||||||
|
const html = `<html><body>Redirecting...</body></html>`;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
classifyFacebookResponse(
|
||||||
|
html,
|
||||||
|
"https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F456%2F",
|
||||||
|
),
|
||||||
|
).toEqual({
|
||||||
|
kind: "auth_gated",
|
||||||
|
authGated: true,
|
||||||
|
unavailable: false,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("extractFacebookBootstrapCandidates", () => {
|
||||||
|
test("extracts Comet bootstrap candidates from script tags", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"routing_namespace":"fb_comet"}</script>
|
||||||
|
<script>{"data":{"marketplace_search_bootstrap":{"edges":[{"node":{"listing":{"id":"1"}}}]}}}</script>
|
||||||
|
<script>not json</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
expect(candidates).toHaveLength(2);
|
||||||
|
expect(candidates[1]).toEqual({
|
||||||
|
data: {
|
||||||
|
marketplace_search_bootstrap: {
|
||||||
|
edges: [{ node: { listing: { id: "1" } } }],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
test("keeps candidate order stable for later scoring", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>{"marker":"first"}</script>
|
||||||
|
<script>{"marker":"second"}</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
|
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("extracts search results from Comet bootstrap candidates", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Bike",
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
location: {
|
||||||
|
reverse_geocode: {
|
||||||
|
city_page: { display_name: "Toronto" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toHaveLength(1);
|
||||||
|
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
incidental: {
|
||||||
|
feed_units: {
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "wrong-1",
|
||||||
|
marketplace_listing_title: "Wrong Listing",
|
||||||
|
listing_price: {
|
||||||
|
amount: "1.00",
|
||||||
|
formatted_amount: "CA$1",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "right-1",
|
||||||
|
marketplace_listing_title: "Right Listing",
|
||||||
|
listing_price: {
|
||||||
|
amount: "250.00",
|
||||||
|
formatted_amount: "CA$250",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toHaveLength(1);
|
||||||
|
expect(ads?.[0].node.listing.id).toBe("right-1");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("rejects mixed edge arrays that contain non-listing entries", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Bike",
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
story: {
|
||||||
|
id: "not-a-listing",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toBeNull();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
||||||
import fetchFacebookItems from "../src/scrapers/facebook";
|
import fetchFacebookItems, { fetchFacebookItem } from "../src/scrapers/facebook";
|
||||||
|
|
||||||
// Mock fetch globally
|
// Mock fetch globally
|
||||||
const originalFetch = global.fetch;
|
const originalFetch = global.fetch;
|
||||||
@@ -27,77 +27,40 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
|
|
||||||
describe("Main Search Function", () => {
|
describe("Main Search Function", () => {
|
||||||
test("should successfully fetch search results", async () => {
|
test("should successfully fetch search results", async () => {
|
||||||
const mockSearchData = {
|
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||||
require: [
|
payload: {
|
||||||
[
|
resultGroups: [
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
{
|
{
|
||||||
__bbox: {
|
edges: [
|
||||||
result: {
|
{
|
||||||
data: {
|
node: {
|
||||||
marketplace_search: {
|
listing: {
|
||||||
feed_units: {
|
id: "1",
|
||||||
edges: [
|
marketplace_listing_title: "iPhone 13",
|
||||||
{
|
listing_price: {
|
||||||
node: {
|
amount: "500.00",
|
||||||
listing: {
|
formatted_amount: "CA$500",
|
||||||
id: "1",
|
currency: "CAD",
|
||||||
marketplace_listing_title: "iPhone 13 Pro",
|
|
||||||
listing_price: {
|
|
||||||
amount: "800.00",
|
|
||||||
formatted_amount: "$800.00",
|
|
||||||
currency: "CAD",
|
|
||||||
},
|
|
||||||
location: {
|
|
||||||
reverse_geocode: {
|
|
||||||
city_page: { display_name: "Toronto" },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
creation_time: 1640995200,
|
|
||||||
is_live: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
node: {
|
|
||||||
listing: {
|
|
||||||
id: "2",
|
|
||||||
marketplace_listing_title: "Samsung Galaxy",
|
|
||||||
listing_price: {
|
|
||||||
amount: "600.00",
|
|
||||||
formatted_amount: "$600.00",
|
|
||||||
currency: "CAD",
|
|
||||||
},
|
|
||||||
location: {
|
|
||||||
reverse_geocode: {
|
|
||||||
city_page: { display_name: "Mississauga" },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
creation_time: 1640995300,
|
|
||||||
is_live: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
|
location: {
|
||||||
|
reverse_geocode: {
|
||||||
|
city_page: { display_name: "Toronto" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
],
|
},
|
||||||
};
|
})}</script></body></html>`;
|
||||||
|
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () => Promise.resolve(mockSearchHtml),
|
||||||
Promise.resolve(
|
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
|
||||||
),
|
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
},
|
},
|
||||||
@@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
);
|
);
|
||||||
|
|
||||||
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
|
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
|
||||||
expect(results).toHaveLength(2);
|
expect(results).toHaveLength(1);
|
||||||
expect(results[0].title).toBe("iPhone 13 Pro");
|
expect(results[0].title).toBe("iPhone 13");
|
||||||
expect(results[1].title).toBe("Samsung Galaxy");
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test("should filter out items without price", async () => {
|
test("should filter out items without price", async () => {
|
||||||
@@ -163,7 +125,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -218,7 +180,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -259,7 +221,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -292,6 +254,76 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
expect(results).toEqual([]);
|
expect(results).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("should return empty array for auth-gated search HTML", async () => {
|
||||||
|
const authGatedSearchHtml = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<a href="/marketplace/item/123456789/">
|
||||||
|
<span>Vintage Lamp</span>
|
||||||
|
<span>CA$45</span>
|
||||||
|
<span>Toronto, ON</span>
|
||||||
|
</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch",
|
||||||
|
text: () => Promise.resolve(authGatedSearchHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||||
|
expect(results).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("should return empty array when search request lands on unknown route", async () => {
|
||||||
|
const wrongRouteHtml = `<html><body><script>${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Leaked Search Result",
|
||||||
|
listing_price: {
|
||||||
|
amount: "75.00",
|
||||||
|
formatted_amount: "CA$75",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}</script></body></html>`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/marketplace/toronto/",
|
||||||
|
text: () => Promise.resolve(wrongRouteHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await fetchFacebookItems("lamp", 1, "toronto", 25);
|
||||||
|
expect(results).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
test("should handle network errors", async () => {
|
test("should handle network errors", async () => {
|
||||||
global.fetch = mock(() => Promise.reject(new Error("Network error")));
|
global.fetch = mock(() => Promise.reject(new Error("Network error")));
|
||||||
|
|
||||||
@@ -358,7 +390,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -431,7 +463,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -500,7 +532,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -571,7 +603,7 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () =>
|
||||||
Promise.resolve(
|
Promise.resolve(
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
`<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
||||||
),
|
),
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
@@ -637,4 +669,45 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
expect(results).toEqual([]);
|
expect(results).toEqual([]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("Item Fetch Function", () => {
|
||||||
|
test("should return null for unavailable item responses", async () => {
|
||||||
|
const unavailableItemHtml = `
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<script>${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
listing: {
|
||||||
|
id: "related-123",
|
||||||
|
__typename: "GroupCommerceProductItem",
|
||||||
|
marketplace_listing_title: "Related Listing",
|
||||||
|
formatted_price: { text: "CA$90" },
|
||||||
|
listing_price: {
|
||||||
|
amount: "90.00",
|
||||||
|
currency: "CAD",
|
||||||
|
amount_with_offset: "90.00",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})}</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
global.fetch = mock(() =>
|
||||||
|
Promise.resolve({
|
||||||
|
ok: true,
|
||||||
|
url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1",
|
||||||
|
text: () => Promise.resolve(unavailableItemHtml),
|
||||||
|
headers: {
|
||||||
|
get: () => null,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await fetchFacebookItem("123");
|
||||||
|
expect(result).toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user