refactor: rewrite facebook search parser for comet bootstrap
This commit is contained in:
@@ -75,13 +75,6 @@ interface FacebookEdge {
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
/**
 * Loose shape of a marketplace search payload.
 *
 * Only the optional `feed_units.edges` list is typed; any other key is
 * accepted and left as `unknown`.
 */
interface FacebookMarketplaceSearch {
  feed_units?: { edges?: Array<FacebookEdge> };
  [k: string]: unknown;
}
|
||||
|
||||
interface FacebookMarketplaceItem {
|
||||
// Basic identification
|
||||
id: string;
|
||||
@@ -432,89 +425,108 @@ export function extractFacebookBootstrapCandidates(
|
||||
return candidates;
|
||||
}
|
||||
|
||||
/**
 * Type guard for a non-empty array of search edges.
 *
 * An element qualifies only when it is a record whose `node` is a record
 * that in turn carries a record-valued `listing`.
 *
 * @param value - Arbitrary decoded data to test.
 * @returns `true` when `value` is a non-empty array and every element qualifies.
 */
function isFacebookSearchEdgeArray(value: unknown): value is FacebookEdge[] {
  if (!Array.isArray(value)) {
    return false;
  }

  // An empty array is rejected so callers never select a match with zero edges.
  if (value.length === 0) {
    return false;
  }

  const hasListingNode = (entry: unknown): boolean => {
    if (!isRecord(entry)) {
      return false;
    }
    const node = entry.node;
    return isRecord(node) && isRecord(node.listing);
  };

  return value.every(hasListingNode);
}
|
||||
|
||||
/**
 * Add a size bonus to a base score.
 *
 * The bonus equals the number of edges, capped at 3, so very long lists do
 * not outweigh the structural bonuses added by the caller.
 *
 * @param edges - Edge list being scored; only its length is read.
 * @param score - Base score accumulated so far.
 * @returns `score` plus `min(edges.length, 3)`.
 */
function scoreSearchEdges(edges: FacebookEdge[], score: number): number {
  const maxSizeBonus = 3;
  const sizeBonus = edges.length < maxSizeBonus ? edges.length : maxSizeBonus;
  return score + sizeBonus;
}
|
||||
|
||||
/**
 * Recursively search decoded data for the best-scoring list of search edges.
 *
 * Scoring, relative to the `score` passed in:
 * - `feed_units.edges` on the current record: +2, plus the size bonus;
 * - `edges` on an entry of the current record's `resultGroups` array: +4,
 *   plus the size bonus;
 * - descending through a key named `"payload"`: +1 for everything below it.
 *
 * Comparisons are strictly-greater, so when scores tie the match found first
 * is kept.
 *
 * @param candidate - Arbitrary decoded data (array, record, or anything else).
 * @param score - Score inherited from the enclosing levels; defaults to 0.
 * @returns The best match and its score, or `null` when nothing qualifies.
 */
function findSearchEdges(
  candidate: unknown,
  score = 0,
): { edges: FacebookEdge[]; score: number } | null {
  type EdgeMatch = { edges: FacebookEdge[]; score: number };

  // Keeps `current` unless `contender` exists and scores strictly higher.
  const better = (
    current: EdgeMatch | null,
    contender: EdgeMatch | null,
  ): EdgeMatch | null =>
    contender !== null && (current === null || contender.score > current.score)
      ? contender
      : current;

  // Arrays carry no keys of their own: just take the best result among the items.
  if (Array.isArray(candidate)) {
    let topOfList: EdgeMatch | null = null;
    for (const element of candidate) {
      topOfList = better(topOfList, findSearchEdges(element, score));
    }
    return topOfList;
  }

  if (!isRecord(candidate)) {
    return null;
  }

  let winner: EdgeMatch | null = null;

  // Direct hit: `feed_units.edges` on this record.
  const feed = candidate.feed_units;
  if (isRecord(feed) && isFacebookSearchEdgeArray(feed.edges)) {
    winner = {
      edges: feed.edges,
      score: scoreSearchEdges(feed.edges, score + 2),
    };
  }

  // Grouped hit: `edges` on any entry of `resultGroups` (weighted higher than feed_units).
  const groups = candidate.resultGroups;
  if (Array.isArray(groups)) {
    for (const group of groups) {
      if (isRecord(group) && isFacebookSearchEdgeArray(group.edges)) {
        winner = better(winner, {
          edges: group.edges,
          score: scoreSearchEdges(group.edges, score + 4),
        });
      }
    }
  }

  // Descend into every property; values under a "payload" key get a +1 bonus.
  for (const [key, value] of Object.entries(candidate)) {
    const bonus = key === "payload" ? 1 : 0;
    winner = better(winner, findSearchEdges(value, score + bonus));
  }

  return winner;
}
|
||||
|
||||
/**
 * Extract marketplace search listings from a Facebook HTML response.
 *
 * Every bootstrap candidate returned by `extractFacebookBootstrapCandidates`
 * is searched with `findSearchEdges`; the edge list with the highest score
 * across all candidates is used. On equal scores the earlier candidate wins
 * because the comparison is strictly-greater.
 *
 * @param htmlString - Raw HTML of the marketplace search page.
 * @returns One `{ node }` wrapper per selected edge, or `null` (after logging
 *   a warning) when no candidate yields any edges.
 */
export function extractFacebookMarketplaceData(
  htmlString: HTMLString,
): FacebookAdNode[] | null {
  const candidates = extractFacebookBootstrapCandidates(htmlString);
  let bestEdges: FacebookEdge[] | null = null;
  // Starts below any reachable score so the first usable match is always accepted.
  let bestScore = -1;

  for (const candidate of candidates) {
    const result = findSearchEdges(candidate);
    if (!result?.edges.length) {
      continue;
    }

    if (result.score > bestScore) {
      bestScore = result.score;
      bestEdges = result.edges;
    }
  }

  if (!bestEdges?.length) {
    console.warn("No marketplace data found in HTML response");
    return null;
  }

  console.log(
    `Successfully parsed ${bestEdges.length} Facebook marketplace listings`,
  );
  return bestEdges.map((edge) => ({ node: edge.node }));
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user