diff --git a/docs/superpowers/specs/2026-05-01-facebook-anti-bot-challenge-solver-design.md b/docs/superpowers/specs/2026-05-01-facebook-anti-bot-challenge-solver-design.md new file mode 100644 index 0000000..0ff624b --- /dev/null +++ b/docs/superpowers/specs/2026-05-01-facebook-anti-bot-challenge-solver-design.md @@ -0,0 +1,173 @@ +# Facebook Marketplace Anti-Bot Challenge Solver Design + +## Summary + +Add a challenge-detection and challenge-solving layer to the Facebook Marketplace +scraper so it can handle anti-bot gates (checkpoint pages, token rotation, cookie +requirements) programmatically. +Build the solver in pure Bun — no browser automation in production. +Use `agent-browser` only for one-time debug reconnaissance. + +## Goals + +- Identify which anti-bot challenge(s) Facebook Marketplace triggers against + programmatic HTTP requests. +- Implement detection + solving for each discovered challenge type. +- Wire the solver into `fetchFacebookItems` and `fetchFacebookItem` so challenges are + handled transparently. +- Follow the same pattern as the existing `ebay-challenge.ts` (detect → solve → retry + with clearance). +- Zero browser automation at runtime. + Pure `fetch` + `Bun` APIs + npm packages only. + +## Non-Goals + +- Solving login/auth-wall challenges (those require fresh cookies — not solvable + programmatically). +- Full account login automation (cookies must be provided by the user). +- Browser-based scraping or Puppeteer/Playwright integration. +- Solving challenges for non-Marketplace Facebook endpoints. + +## Current State + +The Facebook scraper (`packages/core/src/scrapers/facebook.ts`) fetches Marketplace +search and item pages via authenticated `fetch` with cookies from `FACEBOOK_COOKIE` env +var. It: + +- Sends a browser-like header set (`sec-ch-ua`, `user-agent`, etc.) 
+- Parses SSR HTML for embedded JSON in script tags +- Has no challenge detection — if Facebook returns a challenge page, the scraper + silently fails (no listings parsed, classifies as “unknown”) +- Depends entirely on cookie freshness + +The eBay scraper already follows the challenge-solver pattern in this codebase: +`ebay.ts` uses `warmEbaySession()`, `isChallengeRedirect()`, `isChallengeHtml()`, and +`solveEbayChallenge()` from `ebay-challenge.ts`. + +## Chosen Approach + +**Reconnaissance-first development:** + +1. Use `agent-browser` (debug only) to capture a real Facebook Marketplace browsing + session via HAR. +2. Probe programmatic `fetch` to see what Facebook returns without a browser. +3. Diff the two to identify the gap (missing headers? + missing cookies? missing JS execution?). +4. Build a modular solver in `packages/core/src/utils/facebook-challenge.ts` that + detects each challenge type and applies the appropriate fix. +5. Wire it into `facebook.ts` following the eBay pattern. 
+ +## Design + +### File Plan + +| File | Purpose | +| --- | --- | +| `packages/core/src/utils/facebook-challenge.ts` | Challenge detection, solving, and cookie/session utilities | +| `packages/core/src/scrapers/facebook.ts` | Modified: warmup, challenge detection before parsing, retry loop | +| `packages/core/test/facebook-challenge.test.ts` | Unit tests with mock challenge HTML fixtures | + +### Flow + +``` +fetchFacebookItems(searchUrl) + ├── warmFacebookSession() → GET facebook.com/ (collect datr + Akamai cookies) + ├── fetchHtml(searchUrl) → receives response + ├── detectFacebookChallenge(response) + │ ├── checkpoint/challenge HTML → solveCheckpointChallenge() + │ ├── redirect to /login → fail (cookies expired) + │ ├── missing required cookies → regenerate session + │ ├── 429 rate limit → backoff + retry (existing http.ts handles this) + │ └── no challenge → proceed to parsing + ├── if solveCheckpointChallenge succeeds → retry fetchHtml with clearance cookie + └── parse results +``` + +### Challenge Types (to be confirmed by reconnaissance) + +| Type | Expected Signal | Solving Strategy | +| --- | --- | --- | +| Login wall | Redirect to `/login` or HTML `"You must log in"` | Fail — user must provide fresh cookies | +| Checkpoint page | HTML contains `checkpoint` or `challenge` path | Parse hidden form fields, compute proof-of-work if present, submit answer endpoint | +| `datr` cookie missing | No `datr` in cookie jar → request fails | Fetch homepage first to obtain `datr` (session warmup) | +| DTSG token needed | Form submissions fail with CSRF error | Extract `fb_dtsg` from page HTML, include in request body | +| GraphQL header check | Request blocked without internal headers | Extract `x-fb-friendly-name` from browser HAR, replicate | +| Akamai/bot-manager | Redirect loops or blank pages without Akamai cookies | Homepage warmup to collect `bm_sv`, `bm_mi`, etc. 
| + +### Key Modules + +**`facebook-challenge.ts`:** + +``` +// Session warmup — fetch homepage to prime cookies +warmFacebookSession(): Promise<Record<string, string>> + +// Challenge detection +detectFacebookChallenge(html, status, url, headers): ChallengeType | null + +// Checkpoint solver +solveCheckpointChallenge(html, cookies): Promise<ChallengeResult> + +// DTSG token extraction +extractDtsg(html): string | null + +// Cookie jar management (shared with ebay.ts pattern) +mergeCookies(...): Record<string, string> +``` + +**`ChallengeResult` type:** +```ts +interface ChallengeResult { + solved: boolean; + cookies?: Record<string, string>; // clearance cookies to replay + token?: string; // challenge response token + error?: string; // why it failed +} +``` + +### Error Handling + +- Solver failure → return `ChallengeResult { solved: false, error: "..." }`, scraper + logs warning and returns empty results (never throws). +- Unrecognized challenge → log the response URL and HTML snippet for future analysis. +- Rate limits → handled by existing `http.ts` exponential backoff (no change needed). +- Solver timeout → 30s cap on any challenge computation, fall back to `solved: false`. + +### Testing + +| Test | What It Verifies | +| --- | --- | +| `detectFacebookChallenge` with sample checkpoint HTML | Correctly identifies checkpoint challenge | +| `detectFacebookChallenge` with normal search HTML | Returns null (no false positives) | +| `detectFacebookChallenge` with login redirect | Identifies auth-gated | +| `solveCheckpointChallenge` with known PoW params | Produces correct answer | +| `warmFacebookSession` with mocked fetch | Collects expected cookies | +| `extractDtsg` with sample page HTML | Extracts the DTSG token | +| Integration: fetch → challenge → solve → retry → results | End-to-end mock flow | +| Solver throws → scraper returns empty, no crash | Graceful fallback | +| Solver unknown challenge → logs warning, returns empty | No unhandled challenge crashes | + +Test data will use anonymized HTML fixtures (no real user data). 
+ +## Reconnaissance Steps (debug-only, one-time) + +1. **Probe programmatically:** `fetch` Marketplace search with/without cookies, record + status code and HTML. +2. **Browser session:** `agent-browser` → log into Facebook → navigate Marketplace → + record HAR. +3. **Diff analysis:** Compare browser request headers vs. + our programmatic headers. +4. **Cookie inventory:** List all cookies from browser session, identify which are + essential. +5. **Challenge trigger:** Identify what change in request signature triggers a + challenge. +6. **Replay test:** Replay browser’s exact request via `fetch` to confirm + headers/cookies are the differentiator. + +All reconnaissance artifacts saved under `docs/facebook-challenge/`. + +## Decisions Deferred to Post-Reconnaissance + +- Exact challenge types and solving strategies (depends on what Facebook actually uses). +- Whether a PoW solver, CAPTCHA solver, or token-extraction approach is needed. +- npm package dependencies (only add what the reconnaissance proves necessary). 
// diff --git a/packages/core/src/utils/facebook-challenge.ts b/packages/core/src/utils/facebook-challenge.ts
// new file mode 100644, index 0000000..a879632
//
// Facebook Marketplace session & challenge utilities

// ------------------ Types ------------------

/**
 * Classification of a Facebook response as returned by
 * {@link detectFacebookChallenge}. `"none"` means no known gate was recognised.
 */
export type ChallengeType =
  | "login_wall"
  | "checkpoint"
  | "bad_headers"
  | "rate_limited"
  | "none";

// ------------------ Constants ------------------

/**
 * Base header set attached to every request built by this module. The values
 * describe a Chrome 131 on Linux document navigation (see `user-agent`,
 * `sec-ch-ua*` and `sec-fetch-*`).
 */
const FACEBOOK_BROWSER_HEADERS: Record<string, string> = {
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
  "cache-control": "no-cache",
  "upgrade-insecure-requests": "1",
  "sec-fetch-dest": "document",
  "sec-fetch-mode": "navigate",
  "sec-fetch-site": "none",
  "sec-fetch-user": "?1",
  "sec-ch-ua":
    '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
  "sec-ch-ua-mobile": "?0",
  "sec-ch-ua-platform": '"Linux"',
  "user-agent":
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
};

/**
 * Matches a `/login` path segment, optionally spelled `/login.php`, that ends
 * the path or is followed by `/`. Applied to the URL with query and fragment
 * removed, so `/login?next=…` and `/login.php?next=…` are recognised while
 * look-alikes such as `/loginfo` are not.
 */
const LOGIN_PATH_PATTERN = /\/login(?:\.php)?(?:\/|$)/;

// ------------------ Cookie Management ------------------

/**
 * Reduces raw `Set-Cookie` header values to a `name → value` map.
 *
 * Only the first `;`-separated segment of each header is used (attributes such
 * as `Path` or `Secure` are ignored) and it is split at the first `=`, so values
 * that themselves contain `=` are preserved. Headers without `=`, with an empty
 * name, or with an empty value are skipped. A later header with the same name
 * overwrites an earlier one.
 *
 * @param setCookieHeaders Raw `Set-Cookie` header strings.
 * @returns Cookie jar keyed by cookie name.
 */
function parseSetCookies(setCookieHeaders: string[]): Record<string, string> {
  const cookies: Record<string, string> = {};
  for (const header of setCookieHeaders) {
    const parts = header.split(";");
    const firstPart = parts[0]?.trim();
    if (!firstPart) continue;
    const eqIdx = firstPart.indexOf("=");
    if (eqIdx === -1) continue;
    const name = firstPart.slice(0, eqIdx).trim();
    const value = firstPart.slice(eqIdx + 1).trim();
    if (name && value) {
      cookies[name] = value;
    }
  }
  return cookies;
}

/**
 * Serialises a cookie jar into a `Cookie` request-header value
 * (`name=value` pairs joined with `"; "`). An empty jar yields `""`.
 */
function cookiesToHeader(cookies: Record<string, string>): string {
  return Object.entries(cookies)
    .map(([name, value]) => `${name}=${value}`)
    .join("; ");
}

// ------------------ Session Warmup ------------------

/**
 * Fetches the Facebook homepage and returns whatever cookies it sets.
 *
 * The request uses the shared browser headers, does not follow redirects
 * (`redirect: "manual"`) and is aborted after 10 seconds. The response status
 * is not inspected; only its `Set-Cookie` headers are read.
 *
 * Warmup is best-effort: any failure (network error, timeout, abort) is
 * swallowed on purpose and reported as an empty jar, so this function never
 * rejects. An empty jar is also returned when the runtime's `Headers` lacks
 * `getSetCookie`.
 *
 * @returns Cookies set by the homepage response, or `{}` on failure.
 */
export async function warmFacebookSession(): Promise<Record<string, string>> {
  try {
    const res = await fetch("https://www.facebook.com/", {
      method: "GET",
      headers: FACEBOOK_BROWSER_HEADERS,
      redirect: "manual",
      signal: AbortSignal.timeout(10000),
    });

    const setCookies = res.headers.getSetCookie?.() ?? [];
    return parseSetCookies(setCookies);
  } catch {
    return {};
  }
}

// ------------------ Challenge Detection ------------------

/**
 * Classifies a response from Facebook. Checks run in this order and the first
 * match wins:
 *
 * 1. status `400` → `"bad_headers"`
 * 2. status `429` → `"rate_limited"`
 * 3. URL contains `/login/`, or its path has a `/login` or `/login.php`
 *    segment → `"login_wall"`
 * 4. HTML contains `You must log in` or `log in to continue` → `"login_wall"`
 * 5. URL contains `/checkpoint/`, or HTML contains both `checkpoint` and
 *    `challenge` → `"checkpoint"`
 * 6. otherwise → `"none"`
 *
 * @param status HTTP status code of the response.
 * @param html Response body.
 * @param responseUrl URL of the response (or redirect target) to inspect.
 * @returns The detected challenge type, or `"none"`.
 */
export function detectFacebookChallenge(
  status: number,
  html: string,
  responseUrl: string,
): ChallengeType {
  if (status === 400) {
    return "bad_headers";
  }

  if (status === 429) {
    return "rate_limited";
  }

  // Drop query/fragment so a `next=` parameter cannot trigger the path pattern.
  const urlPath = responseUrl.split(/[?#]/, 1)[0] ?? responseUrl;
  if (responseUrl.includes("/login/") || LOGIN_PATH_PATTERN.test(urlPath)) {
    return "login_wall";
  }

  if (html.includes("You must log in") || html.includes("log in to continue")) {
    return "login_wall";
  }

  if (
    responseUrl.includes("/checkpoint/") ||
    (html.includes("checkpoint") && html.includes("challenge"))
  ) {
    return "checkpoint";
  }

  return "none";
}

// ------------------ Header Construction ------------------

/**
 * Builds the request headers for a Facebook fetch.
 *
 * Starts from a copy of the shared browser headers, adds a `cookie` header when
 * the jar is non-empty, then applies `extraHeaders` last so callers can override
 * any default (including `cookie`). Neither argument is mutated.
 *
 * @param cookieJar Cookies to send, keyed by name.
 * @param extraHeaders Optional headers merged over the defaults.
 * @returns A fresh header map.
 */
export function buildFacebookHeaders(
  cookieJar: Record<string, string>,
  extraHeaders?: Record<string, string>,
): Record<string, string> {
  const headers: Record<string, string> = {
    ...FACEBOOK_BROWSER_HEADERS,
  };

  const cookieString = cookiesToHeader(cookieJar);
  if (cookieString) {
    headers.cookie = cookieString;
  }

  if (extraHeaders) {
    Object.assign(headers, extraHeaders);
  }

  return headers;
}