refactor: rewrite facebook search parser for comet bootstrap
This commit is contained in:
@@ -75,13 +75,6 @@ interface FacebookEdge {
|
|||||||
[k: string]: unknown;
|
[k: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface FacebookMarketplaceSearch {
|
|
||||||
feed_units?: {
|
|
||||||
edges?: FacebookEdge[];
|
|
||||||
};
|
|
||||||
[k: string]: unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface FacebookMarketplaceItem {
|
interface FacebookMarketplaceItem {
|
||||||
// Basic identification
|
// Basic identification
|
||||||
id: string;
|
id: string;
|
||||||
@@ -432,89 +425,108 @@ export function extractFacebookBootstrapCandidates(
|
|||||||
return candidates;
|
return candidates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Type guard for a non-empty array made up solely of listing edges.
 *
 * An element counts as a listing edge only when the element itself, its
 * `node`, and its `node.listing` all pass `isRecord`. One element that fails
 * this check rejects the whole array; an empty array is rejected too.
 *
 * @param value - Arbitrary value to test.
 * @returns `true` when `value` is a non-empty array of listing edges.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }

  const lacksListing = (entry: unknown): boolean =>
    !(isRecord(entry) && isRecord(entry.node) && isRecord(entry.node.listing));

  // The array qualifies exactly when no element is missing a listing.
  return !value.some(lacksListing);
}
|
||||||
|
|
||||||
|
/**
 * Adds an edge-count bonus to a base score.
 *
 * The bonus is the number of edges, capped at 3.
 *
 * @param edges - Edge set being scored; only its length is read.
 * @param score - Base score to which the bonus is added.
 * @returns `score` plus the capped edge count.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const sizeBonus = edges.length < 3 ? edges.length : 3;
  return score + sizeBonus;
}
|
||||||
|
|
||||||
|
/**
 * Walks an arbitrarily nested value and returns the highest-scoring array of
 * listing edges found anywhere inside it.
 *
 * Two shapes are recognised at every record visited:
 * - `feed_units.edges`, scored from the inherited score plus 2;
 * - `resultGroups[i].edges`, scored from the inherited score plus 4.
 *
 * Both are validated with `isFacebookSearchEdgeArray` and scored through
 * `scoreSearchEdges`. Every value nested under a key named `payload` inherits
 * one extra point. When scores are equal, the match encountered first in
 * depth-first, document order is kept.
 *
 * The walk uses an explicit stack instead of recursion so that very deeply
 * nested input cannot exhaust the call stack and throw a `RangeError`.
 *
 * @param candidate - Value to search; anything that is neither an array nor a
 *   record yields no match.
 * @param score - Score inherited from the caller; defaults to 0.
 * @returns The best match together with its score, or `null` when nothing
 *   qualifies.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  let bestMatch: { edges: FacebookEdge[]; score: number } | null = null;
  const pending: Array<{ value: unknown; inherited: number }> = [
    { value: candidate, inherited: score },
  ];

  for (let frame = pending.pop(); frame !== undefined; frame = pending.pop()) {
    const { value, inherited } = frame;

    if (Array.isArray(value)) {
      // Push in reverse so items are popped, and therefore visited, in order.
      for (let index = value.length - 1; index >= 0; index -= 1) {
        pending.push({ value: value[index], inherited });
      }
      continue;
    }

    if (!isRecord(value)) {
      continue;
    }

    // Direct matches on this record are judged before any of its children.
    const feedUnits = value.feed_units;
    if (isRecord(feedUnits) && isFacebookSearchEdgeArray(feedUnits.edges)) {
      const matchScore = scoreSearchEdges(feedUnits.edges, inherited + 2);
      if (!bestMatch || matchScore > bestMatch.score) {
        bestMatch = { edges: feedUnits.edges, score: matchScore };
      }
    }

    const resultGroups = value.resultGroups;
    if (Array.isArray(resultGroups)) {
      for (const group of resultGroups) {
        if (isRecord(group) && isFacebookSearchEdgeArray(group.edges)) {
          const matchScore = scoreSearchEdges(group.edges, inherited + 4);
          if (!bestMatch || matchScore > bestMatch.score) {
            bestMatch = { edges: group.edges, score: matchScore };
          }
        }
      }
    }

    // Reverse the entries so the first property ends up on top of the stack.
    for (const [key, child] of Object.entries(value).reverse()) {
      pending.push({
        value: child,
        inherited: inherited + (key === "payload" ? 1 : 0),
      });
    }
  }

  return bestMatch;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Extract marketplace search data from Facebook page script tags
|
Extract marketplace search data from Facebook page script tags
|
||||||
*/
|
*/
|
||||||
export function extractFacebookMarketplaceData(
|
export function extractFacebookMarketplaceData(
|
||||||
htmlString: HTMLString,
|
htmlString: HTMLString,
|
||||||
): FacebookAdNode[] | null {
|
): FacebookAdNode[] | null {
|
||||||
const { document } = parseHTML(htmlString);
|
const candidates = extractFacebookBootstrapCandidates(htmlString);
|
||||||
const scripts = document.querySelectorAll("script");
|
let bestEdges: FacebookEdge[] | null = null;
|
||||||
|
let bestScore = -1;
|
||||||
|
|
||||||
let marketplaceData: FacebookMarketplaceSearch | null = null;
|
for (const candidate of candidates) {
|
||||||
|
const result = findSearchEdges(candidate);
|
||||||
|
if (!result?.edges.length) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Find the script containing the require data with marketplace_search
|
if (result.score > bestScore) {
|
||||||
for (const script of Array.from(scripts) as HTMLScriptElement[]) {
|
bestScore = result.score;
|
||||||
const scriptText = script.textContent;
|
bestEdges = result.edges;
|
||||||
if (!scriptText) continue;
|
}
|
||||||
|
|
||||||
try {
|
|
||||||
const parsed = JSON.parse(scriptText);
|
|
||||||
|
|
||||||
// First check if this is the direct data structure (like in examples)
|
|
||||||
if (parsed.require && Array.isArray(parsed.require)) {
|
|
||||||
// Try multiple navigation paths to find marketplace_search
|
|
||||||
const paths = [
|
|
||||||
// Original path from example
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3][0].__bbox.require[0][3][1].__bbox.result.data
|
|
||||||
.marketplace_search,
|
|
||||||
// Alternative path structure
|
|
||||||
() =>
|
|
||||||
parsed.require[0][3][1]?.__bbox?.result?.data?.marketplace_search,
|
|
||||||
// Another variation
|
|
||||||
() => parsed.require[0][3][0].__bbox.result.data.marketplace_search,
|
|
||||||
// Direct access for some responses
|
|
||||||
() => {
|
|
||||||
for (const item of parsed.require) {
|
|
||||||
if (item && item.length >= 4 && item[3]) {
|
|
||||||
const bbox = item[3]?.__bbox?.result?.data?.marketplace_search;
|
|
||||||
if (bbox) return bbox;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
for (const getData of paths) {
|
|
||||||
try {
|
|
||||||
const result = getData();
|
|
||||||
if (
|
|
||||||
result &&
|
|
||||||
isRecord(result) &&
|
|
||||||
(result as Record<string, unknown>).feed_units?.edges?.length > 0
|
|
||||||
) {
|
|
||||||
marketplaceData = result as FacebookMarketplaceSearch;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (marketplaceData) break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also check for direct marketplace_search in the parsed data
|
|
||||||
if (parsed.marketplace_search && isRecord(parsed.marketplace_search)) {
|
|
||||||
const searchData =
|
|
||||||
parsed.marketplace_search as FacebookMarketplaceSearch;
|
|
||||||
const feedLength = searchData.feed_units?.edges?.length ?? 0;
|
|
||||||
if (feedLength > 0) {
|
|
||||||
marketplaceData = searchData;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!marketplaceData?.feed_units?.edges?.length) {
|
if (!bestEdges?.length) {
|
||||||
console.warn("No marketplace data found in HTML response");
|
console.warn("No marketplace data found in HTML response");
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`Successfully parsed ${marketplaceData.feed_units.edges.length} Facebook marketplace listings`,
|
`Successfully parsed ${bestEdges.length} Facebook marketplace listings`,
|
||||||
);
|
);
|
||||||
return marketplaceData.feed_units.edges.map((edge) => ({ node: edge.node }));
|
return bestEdges.map((edge) => ({ node: edge.node }));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -727,6 +727,151 @@ describe("Facebook Marketplace Scraper Core Tests", () => {
|
|||||||
const candidates = extractFacebookBootstrapCandidates(html);
|
const candidates = extractFacebookBootstrapCandidates(html);
|
||||||
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
|
expect(candidates.map((c) => c.marker)).toEqual(["first", "second"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("extracts search results from Comet bootstrap candidates", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Bike",
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
location: {
|
||||||
|
reverse_geocode: {
|
||||||
|
city_page: { display_name: "Toronto" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toHaveLength(1);
|
||||||
|
expect(ads?.[0].node.listing.marketplace_listing_title).toBe("Bike");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("prefers the strongest marketplace edge set when multiple edges arrays exist", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
incidental: {
|
||||||
|
feed_units: {
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "wrong-1",
|
||||||
|
marketplace_listing_title: "Wrong Listing",
|
||||||
|
listing_price: {
|
||||||
|
amount: "1.00",
|
||||||
|
formatted_amount: "CA$1",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "right-1",
|
||||||
|
marketplace_listing_title: "Right Listing",
|
||||||
|
listing_price: {
|
||||||
|
amount: "250.00",
|
||||||
|
formatted_amount: "CA$250",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toHaveLength(1);
|
||||||
|
expect(ads?.[0].node.listing.id).toBe("right-1");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("rejects mixed edge arrays that contain non-listing entries", () => {
|
||||||
|
const html = `
|
||||||
|
<html><body>
|
||||||
|
<script>"XCometMarketplaceSearchController"</script>
|
||||||
|
<script>
|
||||||
|
${JSON.stringify({
|
||||||
|
payload: {
|
||||||
|
resultGroups: [
|
||||||
|
{
|
||||||
|
edges: [
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
listing: {
|
||||||
|
id: "1",
|
||||||
|
marketplace_listing_title: "Bike",
|
||||||
|
listing_price: {
|
||||||
|
amount: "120.00",
|
||||||
|
formatted_amount: "CA$120",
|
||||||
|
currency: "CAD",
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
node: {
|
||||||
|
story: {
|
||||||
|
id: "not-a-listing",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
})}
|
||||||
|
</script>
|
||||||
|
</body></html>
|
||||||
|
`;
|
||||||
|
|
||||||
|
const ads = extractFacebookMarketplaceData(html);
|
||||||
|
expect(ads).toBeNull();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -27,77 +27,40 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
|
|
||||||
describe("Main Search Function", () => {
|
describe("Main Search Function", () => {
|
||||||
test("should successfully fetch search results", async () => {
|
test("should successfully fetch search results", async () => {
|
||||||
const mockSearchData = {
|
const mockSearchHtml = `<html><body><script>"XCometMarketplaceSearchController"</script><script>${JSON.stringify({
|
||||||
require: [
|
payload: {
|
||||||
[
|
resultGroups: [
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
{
|
{
|
||||||
__bbox: {
|
edges: [
|
||||||
result: {
|
{
|
||||||
data: {
|
node: {
|
||||||
marketplace_search: {
|
listing: {
|
||||||
feed_units: {
|
id: "1",
|
||||||
edges: [
|
marketplace_listing_title: "iPhone 13",
|
||||||
{
|
listing_price: {
|
||||||
node: {
|
amount: "500.00",
|
||||||
listing: {
|
formatted_amount: "CA$500",
|
||||||
id: "1",
|
currency: "CAD",
|
||||||
marketplace_listing_title: "iPhone 13 Pro",
|
|
||||||
listing_price: {
|
|
||||||
amount: "800.00",
|
|
||||||
formatted_amount: "$800.00",
|
|
||||||
currency: "CAD",
|
|
||||||
},
|
|
||||||
location: {
|
|
||||||
reverse_geocode: {
|
|
||||||
city_page: { display_name: "Toronto" },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
creation_time: 1640995200,
|
|
||||||
is_live: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
node: {
|
|
||||||
listing: {
|
|
||||||
id: "2",
|
|
||||||
marketplace_listing_title: "Samsung Galaxy",
|
|
||||||
listing_price: {
|
|
||||||
amount: "600.00",
|
|
||||||
formatted_amount: "$600.00",
|
|
||||||
currency: "CAD",
|
|
||||||
},
|
|
||||||
location: {
|
|
||||||
reverse_geocode: {
|
|
||||||
city_page: { display_name: "Mississauga" },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
creation_time: 1640995300,
|
|
||||||
is_live: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
|
location: {
|
||||||
|
reverse_geocode: {
|
||||||
|
city_page: { display_name: "Toronto" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
is_live: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
],
|
},
|
||||||
};
|
})}</script></body></html>`;
|
||||||
|
|
||||||
global.fetch = mock(() =>
|
global.fetch = mock(() =>
|
||||||
Promise.resolve({
|
Promise.resolve({
|
||||||
ok: true,
|
ok: true,
|
||||||
text: () =>
|
text: () => Promise.resolve(mockSearchHtml),
|
||||||
Promise.resolve(
|
|
||||||
`<html><body><script>${JSON.stringify(mockSearchData)}</script></body></html>`,
|
|
||||||
),
|
|
||||||
headers: {
|
headers: {
|
||||||
get: () => null,
|
get: () => null,
|
||||||
},
|
},
|
||||||
@@ -105,9 +68,8 @@ describe("Facebook Marketplace Scraper Integration Tests", () => {
|
|||||||
);
|
);
|
||||||
|
|
||||||
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
|
const results = await fetchFacebookItems("iPhone", 1, "toronto", 25);
|
||||||
expect(results).toHaveLength(2);
|
expect(results).toHaveLength(1);
|
||||||
expect(results[0].title).toBe("iPhone 13 Pro");
|
expect(results[0].title).toBe("iPhone 13");
|
||||||
expect(results[1].title).toBe("Samsung Galaxy");
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test("should filter out items without price", async () => {
|
test("should filter out items without price", async () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user