diff --git a/docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md b/docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md new file mode 100644 index 0000000..e52d3e1 --- /dev/null +++ b/docs/superpowers/plans/2026-04-21-facebook-comet-rewrite.md @@ -0,0 +1,772 @@ +# Facebook Comet Rewrite Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the legacy Facebook Marketplace scraper with a route-aware hybrid Comet-bootstrap parser for both search and item routes. + +**Architecture:** Keep authenticated direct HTTP fetches as the transport. Classify each Facebook response first, then parse route-specific Comet bootstrap/state candidates, and fall back to rendered-HTML extraction only when bootstrap decoding cannot produce the expected search or item shape. + +**Tech Stack:** Bun, TypeScript, `bun:test`, `linkedom`, existing shared cookie/http helpers + +--- + +## File Structure + +- Modify: `packages/core/src/scrapers/facebook.ts` + - Owns Facebook fetch flow, response classification, bootstrap candidate extraction, search parsing, item parsing, and HTML fallbacks. +- Modify: `packages/core/test/facebook-core.test.ts` + - Owns unit coverage for response classification, bootstrap parsing, fallback parsing, and route-aware item/search extraction behavior. +- Modify: `packages/core/test/facebook-integration.test.ts` + - Owns higher-level fetch flow tests, auth/degradation behavior, and result shaping for search/item entrypoints. 
+ +### Task 1: Add Route Classification Coverage + +**Files:** +- Modify: `packages/core/test/facebook-core.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-core.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Add these tests near the Facebook parser tests in `packages/core/test/facebook-core.test.ts`: + +```ts +test("classifies Comet search responses", () => { + const html = ` + + Marketplace + + + + + + `; + + expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/search?query=bike")).toEqual({ + kind: "search", + authGated: false, + unavailable: false, + }); +}); + +test("classifies Comet item responses", () => { + const html = ` + + + + + + + `; + + expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/item/123/")).toEqual({ + kind: "item", + authGated: false, + unavailable: false, + }); +}); + +test("classifies login-gated responses before parsing", () => { + const html = `You must log in to Facebook`; + + expect(classifyFacebookResponse(html, "https://www.facebook.com/login/?next=%2Fmarketplace%2Fitem%2F123%2F")).toEqual({ + kind: "auth_gated", + authGated: true, + unavailable: false, + }); +}); + +test("classifies unavailable item responses", () => { + const html = `Marketplace`; + + expect(classifyFacebookResponse(html, "https://www.facebook.com/marketplace/toronto/?unavailable_product=1")).toEqual({ + kind: "unavailable", + authGated: false, + unavailable: true, + }); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"` +Expected: FAIL because `classifyFacebookResponse` does not exist yet. 
+ +- [ ] **Step 3: Write minimal implementation** + +Add this type and function near the parsing section in `packages/core/src/scrapers/facebook.ts`: + +```ts +type FacebookResponseKind = "search" | "item" | "auth_gated" | "unavailable" | "unknown"; + +export function classifyFacebookResponse(htmlString: HTMLString, responseUrl: string) { + const authGated = + responseUrl.includes("/login/") || + htmlString.includes("You must log in to Facebook") || + htmlString.includes("log in to Facebook"); + + if (authGated) { + return { kind: "auth_gated" as const, authGated: true, unavailable: false }; + } + + const unavailable = responseUrl.includes("unavailable_product=1"); + if (unavailable) { + return { kind: "unavailable" as const, authGated: false, unavailable: true }; + } + + if (htmlString.includes("XCometMarketplaceSearchController")) { + return { kind: "search" as const, authGated: false, unavailable: false }; + } + + if (htmlString.includes("XCometMarketplacePermalinkController")) { + return { kind: "item" as const, authGated: false, unavailable: false }; + } + + return { kind: "unknown" as const, authGated: false, unavailable: false }; +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "classifies"` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts +git commit -m "refactor: add facebook response classification" +``` + +### Task 2: Add Bootstrap Candidate Extraction + +**Files:** +- Modify: `packages/core/test/facebook-core.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-core.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Add these tests: + +```ts +test("extracts Comet bootstrap candidates from script tags", () => { + const html = ` + + + + + + `; + + const candidates = extractFacebookBootstrapCandidates(html); + 
expect(candidates).toHaveLength(2); + expect(candidates[1]).toEqual({ + data: { + marketplace_search_bootstrap: { + edges: [{ node: { listing: { id: "1" } } }], + }, + }, + }); +}); + +test("keeps candidate order stable for later scoring", () => { + const html = ` + + + + + `; + + const candidates = extractFacebookBootstrapCandidates(html); + expect(candidates.map((candidate) => candidate.marker)).toEqual(["first", "second"]); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"` +Expected: FAIL because `extractFacebookBootstrapCandidates` does not exist. + +- [ ] **Step 3: Write minimal implementation** + +Add this helper near the parser utilities in `packages/core/src/scrapers/facebook.ts`: + +```ts +export function extractFacebookBootstrapCandidates(htmlString: HTMLString): Record<string, unknown>[] { + const { document } = parseHTML(htmlString); + const scripts = document.querySelectorAll("script"); + const candidates: Record<string, unknown>[] = []; + + for (const script of Array.from(scripts) as HTMLScriptElement[]) { + const scriptText = script.textContent?.trim(); + if (!scriptText) continue; + + try { + const parsed = JSON.parse(scriptText); + if (isRecord(parsed)) { + candidates.push(parsed); + } + } catch { + // Ignore non-JSON script bodies. 
+ } + } + + return candidates; +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "bootstrap candidates|candidate order"` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts +git commit -m "refactor: add facebook bootstrap candidate extraction" +``` + +### Task 3: Replace Search Parsing With Candidate Scoring + +**Files:** +- Modify: `packages/core/test/facebook-core.test.ts` +- Modify: `packages/core/test/facebook-integration.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-core.test.ts` +- Test: `packages/core/test/facebook-integration.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Add a core test for route-aware search extraction: + +```ts +test("extracts search results from Comet bootstrap candidates", () => { + const html = ` + + + + + `; + + const ads = extractFacebookMarketplaceData(html); + expect(ads).toHaveLength(1); + expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike"); +}); +``` + +Replace one integration fixture with a current-shape search fixture: + +```ts +const mockSearchHtml = ` + + + + +`; +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet bootstrap candidates"` +Expected: FAIL because the current search extractor only understands legacy `marketplace_search` shapes. 
+ +- [ ] **Step 3: Write minimal implementation** + +Replace the search extraction internals in `extractFacebookMarketplaceData()` with candidate scoring like this: + +```ts +function findSearchEdges(candidate: unknown): FacebookEdge[] | null { + if (Array.isArray(candidate)) { + for (const item of candidate) { + const result = findSearchEdges(item); + if (result) return result; + } + return null; + } + + if (!isRecord(candidate)) { + return null; + } + + const directEdges = candidate.feed_units?.edges; + if (Array.isArray(directEdges)) { + return directEdges as FacebookEdge[]; + } + + const resultGroups = candidate.resultGroups; + if (Array.isArray(resultGroups)) { + for (const group of resultGroups) { + if (isRecord(group) && Array.isArray(group.edges)) { + return group.edges as FacebookEdge[]; + } + } + } + + for (const value of Object.values(candidate)) { + const result = findSearchEdges(value); + if (result) return result; + } + + return null; +} + +export function extractFacebookMarketplaceData(htmlString: HTMLString): FacebookAdNode[] | null { + const candidates = extractFacebookBootstrapCandidates(htmlString); + + for (const candidate of candidates) { + const edges = findSearchEdges(candidate); + if (edges?.length) { + return edges.map((edge) => ({ node: edge.node })); + } + } + + console.warn("No marketplace data found in HTML response"); + return null; +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts` +Expected: PASS for the rewritten search fixtures and existing unaffected tests. 
+ +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts +git commit -m "refactor: rewrite facebook search parser for comet bootstrap" +``` + +### Task 4: Replace Item Parsing With Candidate Scoring + +**Files:** +- Modify: `packages/core/test/facebook-core.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-core.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Replace one old item fixture with a current-shape item fixture: + +```ts +test("extracts item details from Comet permalink bootstrap candidates", () => { + const html = ` + + + + + `; + + const item = extractFacebookItemData(html); + expect(item?.id).toBe("123"); + expect(item?.marketplace_listing_title).toBe("Vintage Chair"); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "Comet permalink bootstrap"` +Expected: FAIL because the current item extractor depends on legacy permalink markers. 
+ +- [ ] **Step 3: Write minimal implementation** + +Replace the item extraction internals with a semantic candidate finder like this: + +```ts +function findMarketplaceItemCandidate(candidate: unknown): FacebookMarketplaceItem | null { + if (Array.isArray(candidate)) { + for (const item of candidate) { + const result = findMarketplaceItemCandidate(item); + if (result) return result; + } + return null; + } + + if (!isRecord(candidate)) { + return null; + } + + if ( + candidate.id && + candidate.__typename === "GroupCommerceProductItem" && + candidate.marketplace_listing_title + ) { + return candidate as FacebookMarketplaceItem; + } + + for (const value of Object.values(candidate)) { + const result = findMarketplaceItemCandidate(value); + if (result) return result; + } + + return null; +} + +export function extractFacebookItemData(htmlString: HTMLString): FacebookMarketplaceItem | null { + const candidates = extractFacebookBootstrapCandidates(htmlString); + + for (const candidate of candidates) { + const item = findMarketplaceItemCandidate(candidate); + if (item) { + return item; + } + } + + return null; +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-core.test.ts` +Expected: PASS for current-shape item tests and remaining parser tests. 
+ +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts +git commit -m "refactor: rewrite facebook item parser for comet bootstrap" +``` + +### Task 5: Add HTML Fallback Extraction + +**Files:** +- Modify: `packages/core/test/facebook-core.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-core.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Add these fallback tests: + +```ts +test("falls back to rendered search HTML when bootstrap payloads are undecodable", () => { + const html = ` + + + Vintage Lamp + CA$45 + Toronto, ON + + `; + + const ads = extractFacebookMarketplaceData(html); + const parsed = ads ? parseFacebookAds(ads) : []; + expect(parsed[0].title).toBe("Vintage Lamp"); + expect(parsed[0].listingPrice?.amountFormatted).toBe("CA$45"); +}); + +test("falls back to rendered item HTML when bootstrap payloads are undecodable", () => { + const html = ` + + +

Vintage Desk

+ CA$120 + Condition Used - Good +
Description Solid oak desk.
+
Seller information Jordan
+ + `; + + const item = extractFacebookItemData(html); + expect(item?.marketplace_listing_title).toBe("Vintage Desk"); + expect(item?.formatted_price?.text).toBe("CA$120"); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"` +Expected: FAIL because the extractor currently returns `null` without a structured candidate. + +- [ ] **Step 3: Write minimal implementation** + +Add route-specific HTML fallback helpers in `packages/core/src/scrapers/facebook.ts`: + +```ts +function extractSearchFallback(htmlString: HTMLString): FacebookAdNode[] | null { + const idMatch = htmlString.match(/marketplace\/item\/(\d+)/); + const titleMatch = htmlString.match(/marketplace\/item\/\d+\/[^>]*>([^<]+)</i); + const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/); + if (!idMatch || !titleMatch || !priceMatch) return null; + + // NOTE: the listing field names below must match what `parseFacebookAds` reads; adjust them to the real `FacebookAdNode` shape. + return [ + { + node: { + listing: { + id: idMatch[1], + marketplace_listing_title: titleMatch[1].trim(), + listing_price: { + formatted_amount: priceMatch[0], + amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""), + }, + }, + }, + }, + ]; +} + +function extractItemFallback(htmlString: HTMLString): FacebookMarketplaceItem | null { + const titleMatch = htmlString.match(/<h1[^>]*>([^<]+)<\/h1>/i); + const priceMatch = htmlString.match(/CA\$\d+(?:,\d{3})*(?:\.\d{2})?/); + if (!titleMatch || !priceMatch) return null; + + return { + id: "fallback-item", + __typename: "GroupCommerceProductItem", + marketplace_listing_title: titleMatch[1].trim(), + formatted_price: { text: priceMatch[0] }, + listing_price: { + amount: priceMatch[0].replace("CA$", "").replace(/,/g, ""), + currency: "CAD", + amount_with_offset: priceMatch[0].replace("CA$", "").replace(/,/g, ""), + }, + redacted_description: { text: htmlString.includes("Description") ? htmlString.split("Description")[1].split("<")[0].trim() : "" }, + is_live: true, + }; +} +``` + +Then call these helpers as the last fallback inside `extractFacebookMarketplaceData()` and `extractFacebookItemData()`. 
+ +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-core.test.ts --test-name-pattern "falls back"` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts +git commit -m "refactor: add facebook html fallbacks" +``` + +### Task 6: Wire Route-Aware Failures Into Entry Points + +**Files:** +- Modify: `packages/core/test/facebook-integration.test.ts` +- Modify: `packages/core/src/scrapers/facebook.ts` +- Test: `packages/core/test/facebook-integration.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Add these integration tests: + +```ts +test("returns empty search results for auth-gated search HTML", async () => { + global.fetch = mock(() => + Promise.resolve({ + ok: true, + url: "https://www.facebook.com/login/?next=%2Fmarketplace%2Ftoronto%2Fsearch", + text: () => Promise.resolve("You must log in to Facebook"), + headers: { get: () => null }, + }), + ); + + const results = await fetchFacebookItems("bike", 1, "toronto", 25); + expect(results).toEqual([]); +}); + +test("returns null for unavailable item responses", async () => { + global.fetch = mock(() => + Promise.resolve({ + ok: true, + url: "https://www.facebook.com/marketplace/toronto/?unavailable_product=1", + text: () => Promise.resolve("Marketplace"), + headers: { get: () => null }, + }), + ); + + const item = await fetchFacebookItem("123"); + expect(item).toBeNull(); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test packages/core/test/facebook-integration.test.ts --test-name-pattern "auth-gated|unavailable"` +Expected: FAIL because the entrypoints do not yet classify successful HTML responses by route/auth state. 
+ +- [ ] **Step 3: Write minimal implementation** + +Update both entrypoints to classify successful HTML before parsing: + +```ts +const responseClass = classifyFacebookResponse(searchHtml, searchUrl); +if (responseClass.kind === "auth_gated") { + console.warn("Facebook marketplace search is auth-gated. Update FACEBOOK_COOKIE with a fresh raw Cookie header string."); + return []; +} + +const itemResponseClass = classifyFacebookResponse(itemHtml, itemUrl); +if (itemResponseClass.kind === "auth_gated") { + console.warn(`Authentication failed for item ${itemId}. Cookies may be expired.`); + return null; +} + +if (itemResponseClass.kind === "unavailable") { + console.warn(`Item ${itemId} appears to be unavailable in the marketplace.`); + return null; +} +``` + +In this snippet `searchUrl` and `itemUrl` must hold the final, post-redirect response URL (the `url` of the fetch response), not the URL that was requested: the `/login/` and `unavailable_product=1` signals only appear in the redirected URL, and the tests above rely on them. If the `fetchHtml` helper does not already return the final URL alongside the HTML, extend it to do so as the first change in this task. + +- [ ] **Step 4: Run test to verify it passes** + +Run: `bun test packages/core/test/facebook-integration.test.ts` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-integration.test.ts +git commit -m "refactor: handle facebook route-aware failure states" +``` + +### Task 7: Run Full Verification And Live Probe + +**Files:** +- Modify: `packages/core/src/scrapers/facebook.ts` if small cleanup is required +- Modify: `packages/core/test/facebook-core.test.ts` if small cleanup is required +- Modify: `packages/core/test/facebook-integration.test.ts` if small cleanup is required + +- [ ] **Step 1: Run focused Facebook tests** + +Run: `bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts` +Expected: PASS + +- [ ] **Step 2: Run broader core tests** + +Run: `bun test packages/core/test` +Expected: PASS + +- [ ] **Step 3: Run live authenticated Facebook probe** + +Run: + +```bash +set -a && 
source .env && set +a && bun --eval 'import { fetchFacebookItems, fetchFacebookItem } from "./packages/core/src/index.ts"; +const results = await fetchFacebookItems("iphone", 1, "toronto", 3); +console.log("SEARCH_COUNT=" + results.length); +console.log(JSON.stringify(results[0] ?? null)); +if (results[0]?.url) { + const match = results[0].url.match(/\/item\/(\d+)/); + if (match) { + const item = await fetchFacebookItem(match[1]); + console.log(JSON.stringify(item)); + } +}' +``` + +Expected: + +- search returns at least one result +- item fetch returns non-null for the first live result when the route is not stale/unavailable + +- [ ] **Step 4: Make any minimal cleanup needed to keep tests and live probe green** + +If cleanup is needed, keep it limited to naming, dead-code removal caused by the rewrite, or small parser corrections directly exposed by the verification commands. + +- [ ] **Step 5: Re-run verification** + +Run: + +```bash +bun test packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts && bun test packages/core/test +``` + +Expected: PASS + +- [ ] **Step 6: Commit** + +```bash +git add packages/core/src/scrapers/facebook.ts packages/core/test/facebook-core.test.ts packages/core/test/facebook-integration.test.ts +git commit -m "refactor: complete facebook comet scraper rewrite" +``` + +## Self-Review + +- Spec coverage: the plan covers classification, route-aware search parsing, route-aware item parsing, HTML fallbacks, explicit failure-state handling, test replacement, and live verification. +- Placeholder scan: no `TODO`, `TBD`, or unspecified “handle appropriately” steps remain. +- Type consistency: all planned functions and types use the same names across tasks: `classifyFacebookResponse`, `extractFacebookBootstrapCandidates`, `extractFacebookMarketplaceData`, and `extractFacebookItemData`.