Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 218 additions & 37 deletions src/scrapers/twitter/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,27 @@ puppeteer.use(StealthPlugin());
// ============================================================================

const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

/**
 * Human-like delay: a Gaussian (Box–Muller) deviate centered inside
 * [min, max], clamped to that range, plus an occasional long "distraction"
 * spike to mimic a user stepping away from the keyboard.
 *
 * @param {number} [min=2000] - Minimum delay in ms (clamp floor).
 * @param {number} [max=7000] - Maximum base delay in ms (before any spike).
 * @returns {Promise<void>} Resolves after the computed delay.
 */
const randomDelay = (min = 2000, max = 7000) => {
  // Box–Muller transform: two uniform samples -> one standard normal deviate.
  const u1 = Math.random();
  const u2 = Math.random();
  // `u1 || 1e-10` guards against log(0) -> -Infinity.
  const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2);
  // Skew the center 40% of the way into the range (humans lean fast-ish).
  const median = min + (max - min) * 0.4;
  const spread = (max - min) * 0.25;
  const base = median + z * spread;
  // ~8% of the time, add an 8-20s pause on top of the clamped base delay.
  const distraction = Math.random() < 0.08 ? 8000 + Math.random() * 12000 : 0;
  const delay = Math.max(min, Math.min(base, max)) + distraction;
  return sleep(delay);
};

/** Throw if the page redirected to the login flow (expired/invalid cookie). */
function checkAuth(page) {
  const currentUrl = page.url();
  const loginMarkers = ['/login', '/i/flow/login'];
  if (!loginMarkers.some((marker) => currentUrl.includes(marker))) return;
  throw new Error('Authentication failed — cookie may be expired.\n\nRun: xactions login');
}

/**
* Create a browser instance with stealth settings.
Expand Down Expand Up @@ -438,51 +458,212 @@ export async function searchTweets(page, query, options = {}) {
// ============================================================================
// Thread Scraper
// ============================================================================
// TweetDetail GraphQL helpers (shared by scrapeThread and scrapePost)
// ============================================================================

/**
* Scrape a full tweet thread
* Fetch TweetDetail GraphQL API from the page context using session cookies.
* The page must already be on x.com (for cookies to be available).
* Includes a human-like delay before each call.
*/
export async function scrapeThread(page, tweetUrl) {
await page.goto(tweetUrl, { waitUntil: 'networkidle2' });
await randomDelay();
/**
 * Fetch the TweetDetail GraphQL endpoint from inside the page context so the
 * request rides on the page's session cookies. The page must already be on
 * x.com for `document.cookie` (including ct0) to be readable.
 *
 * @param {object} page - Puppeteer page, already navigated to x.com.
 * @param {string} tweetId - Numeric status ID of the focal tweet.
 * @returns {Promise<object|null>} Parsed GraphQL JSON, or null when the ct0
 *   cookie is missing or the request/JSON parse fails (best-effort by design).
 */
async function fetchTweetDetail(page, tweetId) {
  // Human-like pause before each API hit to reduce rate-limit/bot signals.
  await randomDelay(2000, 5000);
  return page.evaluate(async (id) => {
    // CSRF token: X requires the ct0 cookie echoed in the x-csrf-token header.
    const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1];
    if (!ct0) return null;
    // GraphQL `variables` payload — focal tweet plus conversation options.
    const variables = JSON.stringify({
      focalTweetId: id, with_rux_injections: false, rankingMode: 'Relevance',
      includePromotedContent: false, withCommunity: true,
      withQuickPromoteEligibilityTweetFields: true, withBirdwatchNotes: true, withVoice: true,
    });
    // Feature flags the TweetDetail query variant expects to be present.
    const features = JSON.stringify({
      rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true,
      responsive_web_graphql_skip_user_profile_image_extensions_enabled: false,
      creator_subscriptions_tweet_preview_api_enabled: true,
      longform_notetweets_consumption_enabled: true,
      responsive_web_twitter_article_tweet_consumption_enabled: true,
      responsive_web_edit_tweet_api_enabled: true,
      graphql_is_translatable_rweb_tweet_is_translatable_enabled: true,
      view_counts_everywhere_api_enabled: true,
      freedom_of_speech_not_reach_fetch_enabled: true,
      tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true,
      longform_notetweets_rich_text_read_enabled: true,
    });
    // `t66713qxyDI9pc4Jyb6wxQ` is the query ID baked into this TweetDetail
    // variant; it changes when X ships a new web bundle — update if 404s appear.
    const url = `https://x.com/i/api/graphql/t66713qxyDI9pc4Jyb6wxQ/TweetDetail?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`;
    try {
      const resp = await fetch(url, {
        headers: {
          // Public web-app bearer token (same for all users); auth comes from cookies.
          'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
          'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session',
        },
        // Send the session cookies with the request.
        credentials: 'include',
      });
      return await resp.json();
    // Best-effort: network/JSON failures degrade to null rather than throwing.
    } catch { return null; }
  }, tweetId);
}

for (let i = 0; i < 5; i++) {
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await randomDelay(1000, 2000);
/** Flatten timeline entries out of a TweetDetail GraphQL response. */
function extractEntries(graphqlData) {
  const instructions =
    graphqlData?.data?.threaded_conversation_with_injections_v2?.instructions || [];
  return instructions.flatMap((instruction) => instruction.entries || []);
}

const thread = await page.evaluate(() => {
const articles = document.querySelectorAll('article[data-testid="tweet"]');
const mainTweetId = window.location.pathname.match(/status\/(\d+)/)?.[1];

const mainArticle = Array.from(articles).find(a =>
a.querySelector(`a[href*="/status/${mainTweetId}"]`)
);
const mainAuthor = mainArticle?.querySelector('[data-testid="User-Name"] a')?.href?.split('/')[3];
/** Unwrap the TweetWithVisibilityResults wrapper, if present. */
function unwrapResult(result) {
  const isWrapped = result?.__typename === 'TweetWithVisibilityResults';
  return isWrapped ? result.tweet : result;
}

return Array.from(articles)
.map((article) => {
const textEl = article.querySelector('[data-testid="tweetText"]');
const authorLink = article.querySelector('[data-testid="User-Name"] a[href^="/"]');
const timeEl = article.querySelector('time');
const linkEl = article.querySelector('a[href*="/status/"]');

const author = authorLink?.href?.split('/')[3];

return {
id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || null,
text: textEl?.textContent || null,
author,
timestamp: timeEl?.getAttribute('datetime') || null,
url: linkEl?.href || null,
isMainAuthor: author === mainAuthor,
platform: 'twitter',
};
})
.filter(t => t.id && t.isMainAuthor);
/** Read screen_name from a tweet result: new `core` path first, then legacy. */
function getScreenName(result) {
  const userResult = result?.core?.user_results?.result;
  if (userResult?.core?.screen_name) return userResult.core.screen_name;
  return userResult?.legacy?.screen_name || '';
}

/**
 * Parse rich data from a single tweet GraphQL result.
 * Does NOT recurse into quoted tweets — returns quotedTweetId for the caller
 * to resolve with its own TweetDetail fetch.
 *
 * @param {object} result - A `tweet_results.result` node (possibly wrapped
 *   in TweetWithVisibilityResults).
 * @returns {object|null} Normalized tweet object, or null when the node has
 *   no `legacy` payload (tombstones, unavailable tweets).
 */
function parseTweetResult(result) {
  result = unwrapResult(result);
  if (!result?.legacy) return null;

  const legacy = result.legacy;
  const author = getScreenName(result);
  // Long-form tweets carry their full text under note_tweet; fall back to legacy.
  const text = result.note_tweet?.note_tweet_results?.result?.text || legacy.full_text || '';

  // Media: images and videos (pick the highest-bitrate mp4 variant for video).
  const media = (legacy.extended_entities?.media || []).map(m => {
    const item = { type: m.type, url: m.media_url_https };
    if (m.type === 'video' || m.type === 'animated_gif') {
      const best = m.video_info?.variants
        ?.filter(v => v.content_type === 'video/mp4')
        .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0];
      if (best) item.videoUrl = best.url;
    }
    return item;
  });

  // Article (X Articles — long-form posts)
  let article = null;
  if (result.article?.article_results?.result) {
    const a = result.article.article_results.result;
    article = {
      id: a.rest_id || null,
      title: a.title || null,
      coverImage: a.cover_media?.media_info?.original_img_url || null,
      url: `https://x.com/${author}/article/${result.rest_id}`,
    };
  }

  // Card (link previews — external URLs)
  let card = null;
  if (result.card?.legacy?.binding_values) {
    // binding_values is a list of {key, value} pairs with typed value slots.
    const vals = {};
    for (const v of result.card.legacy.binding_values) {
      vals[v.key] = v.value?.string_value || v.value?.scribe_value?.value || v.value?.image_value?.url || '';
    }
    if (vals.title || vals.card_url) {
      card = { title: vals.title || '', description: vals.description || '', url: vals.card_url || '', image: vals.thumbnail_image_original || '' };
    }
  }

  // URLs: external links in tweet text (from both legacy and note_tweet entities);
  // self-links back to x.com/twitter.com are dropped.
  const rawUrls = [
    ...(legacy.entities?.urls || []),
    ...(result.note_tweet?.note_tweet_results?.result?.entity_set?.urls || []),
  ];
  const urls = rawUrls
    .map(u => ({ url: u.expanded_url || u.url || '', display: u.display_url || '' }))
    .filter(u => u.url && !u.url.includes('x.com/') && !u.url.includes('twitter.com/'));

  // Quoted tweet ID (for recursive fetching — not parsed from this response)
  const quotedTweetId = result.quoted_status_result?.result?.rest_id || legacy.quoted_status_id_str || null;

  return {
    id: result.rest_id,
    author,
    text,
    timestamp: legacy.created_at ? new Date(legacy.created_at).toISOString() : null,
    url: `https://x.com/${author}/status/${result.rest_id}`,
    media,
    article,
    card,
    // Omit the key entirely when there are no external links.
    urls: urls.length > 0 ? urls : undefined,
    quotedTweetId,
    inReplyTo: legacy.in_reply_to_status_id_str || null,
    replies: legacy.reply_count || 0,
    retweets: legacy.retweet_count || 0,
    likes: legacy.favorite_count || 0,
    views: result.views?.count || '0',
    platform: 'twitter',
  };
}

/**
 * From a list of timeline entries, collect every tweet authored by
 * `mainAuthor` and reduce it to the self-reply chain: the root tweet plus
 * the author's replies to their own tweets, sorted chronologically.
 */
function parseThreadFromEntries(entries, mainAuthor, mainTweetId) {
  const wantedAuthor = mainAuthor.toLowerCase();
  const byId = new Map();

  // Consider one raw tweet_results.result node; keep it when the author matches.
  const consider = (rawResult) => {
    const result = unwrapResult(rawResult);
    if (!result) return;
    if (getScreenName(result).toLowerCase() !== wantedAuthor) return;
    const parsed = parseTweetResult(result);
    if (parsed) byId.set(parsed.id, parsed);
  };

  for (const entry of entries) {
    consider(entry.content?.itemContent?.tweet_results?.result);
    for (const item of entry.content?.items || []) {
      consider(item.item?.itemContent?.tweet_results?.result);
    }
  }

  const knownIds = new Set(byId.keys());
  const toMillis = (tweet) => (tweet.timestamp ? new Date(tweet.timestamp).getTime() : 0);
  return [...byId.values()]
    .filter((t) => t.id === mainTweetId || (t.inReplyTo && knownIds.has(t.inReplyTo)))
    .sort((a, b) => toMillis(a) - toMillis(b));
}

// ============================================================================
// Thread Scraper
// ============================================================================

/**
 * Scrape a full tweet thread (author's self-reply chain).
 *
 * Uses the TweetDetail GraphQL API directly instead of DOM scraping —
 * X doesn't render self-reply threads as article elements in the DOM,
 * especially for high-engagement tweets.
 *
 * @param {object} page - Authenticated Puppeteer page (session cookies set).
 * @param {string} tweetUrl - Canonical tweet URL, e.g. https://x.com/user/status/123.
 * @returns {Promise<object[]>} Chronological thread tweets; rich fields
 *   (media/article/card/urls) are stripped for backward compatibility.
 */
export async function scrapeThread(page, tweetUrl) {
  // Parse the URL once; the path shape is /<author>/status/<id>.
  const path = new URL(tweetUrl).pathname;
  const mainTweetId = path.match(/status\/(\d+)/)?.[1] || null;
  const mainAuthor = path.split('/').filter(Boolean)[0] || null;
  if (!mainTweetId || !mainAuthor) return [];

  // Visit the tweet first so session cookies are in scope for the API call.
  await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 });
  checkAuth(page);
  await randomDelay(2000, 3000);

  const graphqlData = await fetchTweetDetail(page, mainTweetId);
  if (!graphqlData) return [];

  const entries = extractEntries(graphqlData);
  const thread = parseThreadFromEntries(entries, mainAuthor, mainTweetId);

  // Strip internal fields for backward compatibility with the old DOM-scraper shape.
  return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest);
}

// ============================================================================
Expand Down