// ============================================================================
// Helpers
// ============================================================================

const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

/**
 * Human-like delay: a Box-Muller Gaussian sample centered at 40% of the
 * [min, max] range, clamped to [min, max], plus an occasional "distraction"
 * spike (~8% of calls add a further 8-20s).
 *
 * NOTE(fix): this is a *truncated normal* distribution — the previous doc
 * called it log-normal, but the z-score is used directly (no exp(z)).
 *
 * @param {number} [min=2000] - Lower clamp, in milliseconds.
 * @param {number} [max=7000] - Upper clamp, in milliseconds (before spike).
 * @returns {Promise<void>} Resolves after the sampled delay.
 */
const randomDelay = (min = 2000, max = 7000) => {
  const u1 = Math.random();
  const u2 = Math.random();
  // Box-Muller transform; `u1 || 1e-10` guards Math.log(0) → -Infinity.
  const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2);
  const median = min + (max - min) * 0.4;
  const spread = (max - min) * 0.25;
  const base = median + z * spread;
  const distraction = Math.random() < 0.08 ? 8000 + Math.random() * 12000 : 0;
  const delay = Math.max(min, Math.min(base, max)) + distraction;
  return sleep(delay);
};

/**
 * Throw if the page redirected to login (expired/invalid cookie).
 * @param {import('puppeteer').Page} page - Puppeteer page to inspect.
 * @throws {Error} When the current URL is an X login route.
 */
function checkAuth(page) {
  const url = page.url();
  if (url.includes('/login') || url.includes('/i/flow/login')) {
    throw new Error('Authentication failed — cookie may be expired.\n\nRun: xactions login');
  }
}

/**
 * Fetch TweetDetail GraphQL API from the page context using session cookies.
 * The page must already be on x.com (for cookies to be available).
 * Includes a human-like delay before each call.
 *
 * @param {import('puppeteer').Page} page - Authenticated page on x.com.
 * @param {string} tweetId - Numeric tweet rest_id to focus on.
 * @returns {Promise<object|null>} Parsed GraphQL JSON, or null when the ct0
 *   CSRF cookie is missing or the request fails (best-effort by design —
 *   callers treat null as "no data").
 */
async function fetchTweetDetail(page, tweetId) {
  await randomDelay(2000, 5000);
  return page.evaluate(async (id) => {
    // ct0 doubles as the CSRF token X expects in the x-csrf-token header.
    const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1];
    if (!ct0) return null;
    const variables = JSON.stringify({
      focalTweetId: id, with_rux_injections: false, rankingMode: 'Relevance',
      includePromotedContent: false, withCommunity: true,
      withQuickPromoteEligibilityTweetFields: true, withBirdwatchNotes: true, withVoice: true,
    });
    // Feature flags required by the TweetDetail query version in the URL below.
    const features = JSON.stringify({
      rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true,
      responsive_web_graphql_skip_user_profile_image_extensions_enabled: false,
      creator_subscriptions_tweet_preview_api_enabled: true,
      longform_notetweets_consumption_enabled: true,
      responsive_web_twitter_article_tweet_consumption_enabled: true,
      responsive_web_edit_tweet_api_enabled: true,
      graphql_is_translatable_rweb_tweet_is_translatable_enabled: true,
      view_counts_everywhere_api_enabled: true,
      freedom_of_speech_not_reach_fetch_enabled: true,
      tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true,
      longform_notetweets_rich_text_read_enabled: true,
    });
    const url = `https://x.com/i/api/graphql/t66713qxyDI9pc4Jyb6wxQ/TweetDetail?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`;
    try {
      const resp = await fetch(url, {
        headers: {
          // Public web-app bearer token (same for all logged-in web sessions).
          'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
          'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session',
        },
        credentials: 'include',
      });
      return await resp.json();
    } catch { return null; }
  }, tweetId);
}
/** Flatten the timeline entries out of a TweetDetail GraphQL response. */
function extractEntries(graphqlData) {
  const instructions =
    graphqlData?.data?.threaded_conversation_with_injections_v2?.instructions || [];
  return instructions.flatMap((inst) => inst.entries || []);
}

/** Unwrap the TweetWithVisibilityResults wrapper, if present. */
function unwrapResult(result) {
  return result?.__typename === 'TweetWithVisibilityResults' ? result.tweet : result;
}
/** Get screen_name from a user result (handles both new core and legacy paths). */
function getScreenName(result) {
  const user = result?.core?.user_results?.result;
  return user?.core?.screen_name || user?.legacy?.screen_name || '';
}

/**
 * Parse rich data from a single tweet GraphQL result.
 * Does NOT recurse into quoted tweets — returns quotedTweetId for the caller to handle.
 *
 * @param {object} result - A tweet_results.result node (possibly wrapped in
 *   TweetWithVisibilityResults).
 * @returns {object|null} Parsed tweet, or null when the node has no legacy payload.
 */
function parseTweetResult(result) {
  const tweet = unwrapResult(result);
  if (!tweet?.legacy) return null;

  const legacy = tweet.legacy;
  const author = getScreenName(tweet);
  // note_tweet holds the untruncated text for long-form tweets.
  const text = tweet.note_tweet?.note_tweet_results?.result?.text || legacy.full_text || '';

  // Media: images and videos (highest-bitrate mp4 variant wins for video).
  const media = (legacy.extended_entities?.media || []).map((m) => {
    const item = { type: m.type, url: m.media_url_https };
    if (m.type === 'video' || m.type === 'animated_gif') {
      const best = m.video_info?.variants
        ?.filter((v) => v.content_type === 'video/mp4')
        .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0];
      if (best) item.videoUrl = best.url;
    }
    return item;
  });

  // Article (X Articles — long-form posts)
  let article = null;
  if (tweet.article?.article_results?.result) {
    const a = tweet.article.article_results.result;
    article = {
      id: a.rest_id || null,
      title: a.title || null,
      coverImage: a.cover_media?.media_info?.original_img_url || null,
      url: `https://x.com/${author}/article/${tweet.rest_id}`,
    };
  }

  // Card (link previews — external URLs)
  let card = null;
  if (tweet.card?.legacy?.binding_values) {
    const vals = {};
    for (const v of tweet.card.legacy.binding_values) {
      vals[v.key] = v.value?.string_value || v.value?.scribe_value?.value || v.value?.image_value?.url || '';
    }
    if (vals.title || vals.card_url) {
      card = { title: vals.title || '', description: vals.description || '', url: vals.card_url || '', image: vals.thumbnail_image_original || '' };
    }
  }

  // URLs: external links in tweet text (from both legacy and note_tweet entities),
  // excluding self-links back to x.com/twitter.com (quote/status permalinks).
  const rawUrls = [
    ...(legacy.entities?.urls || []),
    ...(tweet.note_tweet?.note_tweet_results?.result?.entity_set?.urls || []),
  ];
  const urls = rawUrls
    .map((u) => ({ url: u.expanded_url || u.url || '', display: u.display_url || '' }))
    .filter((u) => u.url && !u.url.includes('x.com/') && !u.url.includes('twitter.com/'));

  // Quoted tweet ID (for recursive fetching — not parsed from this response).
  // FIX: unwrap a possible TweetWithVisibilityResults wrapper first; the raw
  // wrapper node has no rest_id, which previously lost the quoted id whenever
  // the legacy quoted_status_id_str was also absent.
  const quotedTweetId =
    unwrapResult(tweet.quoted_status_result?.result)?.rest_id ||
    legacy.quoted_status_id_str || null;

  // FIX: Date#toISOString throws a RangeError on an Invalid Date, so guard
  // against an unparseable created_at instead of crashing the whole parse.
  let timestamp = null;
  if (legacy.created_at) {
    const d = new Date(legacy.created_at);
    if (!Number.isNaN(d.getTime())) timestamp = d.toISOString();
  }

  return {
    id: tweet.rest_id,
    author,
    text,
    timestamp,
    url: `https://x.com/${author}/status/${tweet.rest_id}`,
    media,
    article,
    card,
    urls: urls.length > 0 ? urls : undefined,
    quotedTweetId,
    inReplyTo: legacy.in_reply_to_status_id_str || null,
    replies: legacy.reply_count || 0,
    retweets: legacy.retweet_count || 0,
    likes: legacy.favorite_count || 0,
    views: tweet.views?.count || '0',
    platform: 'twitter',
  };
}
/**
 * From a list of entries, collect all tweets by a given author and filter
 * to the self-reply thread chain (root tweet + author replying to themselves).
 * Results are de-duplicated by tweet id and ordered by timestamp ascending.
 */
function parseThreadFromEntries(entries, mainAuthor, mainTweetId) {
  const wanted = mainAuthor.toLowerCase();
  const byId = new Map();

  // Parse and keep a tweet_results.result node when it belongs to the author.
  const consider = (rawResult) => {
    const result = unwrapResult(rawResult);
    if (!result || getScreenName(result).toLowerCase() !== wanted) return;
    const parsed = parseTweetResult(result);
    if (parsed) byId.set(parsed.id, parsed);
  };

  for (const entry of entries) {
    consider(entry.content?.itemContent?.tweet_results?.result);
    for (const item of entry.content?.items || []) {
      consider(item.item?.itemContent?.tweet_results?.result);
    }
  }

  const threadIds = new Set(byId.keys());
  const epoch = (t) => (t.timestamp ? new Date(t.timestamp).getTime() : 0);
  return [...byId.values()]
    .filter((t) => t.id === mainTweetId || (t.inReplyTo && threadIds.has(t.inReplyTo)))
    .sort((a, b) => epoch(a) - epoch(b));
}
/**
 * Scrape a full tweet thread (author's self-reply chain).
 *
 * Uses the TweetDetail GraphQL API directly instead of DOM scraping —
 * X doesn't render self-reply threads as article elements in the DOM,
 * especially for high-engagement tweets.
 *
 * @param {import('puppeteer').Page} page - Authenticated page (x.com cookies loaded).
 * @param {string} tweetUrl - Tweet permalink, e.g. https://x.com/<user>/status/<id>.
 * @returns {Promise<object[]>} Ordered thread tweets; [] on any failure to
 *   resolve the URL or fetch data.
 * @throws {Error} When the session was redirected to login (via checkAuth).
 */
export async function scrapeThread(page, tweetUrl) {
  // FIX: parse the URL once, and tolerate a malformed tweetUrl — `new URL`
  // throws a TypeError, which previously escaped instead of honoring the
  // function's "return [] on failure" contract.
  let pathname;
  try {
    pathname = new URL(tweetUrl).pathname;
  } catch {
    return [];
  }
  const mainTweetId = pathname.match(/status\/(\d+)/)?.[1] || null;
  const mainAuthor = pathname.split('/').filter(Boolean)[0] || null;
  if (!mainTweetId || !mainAuthor) return [];

  await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 });
  checkAuth(page);
  await randomDelay(2000, 3000);

  const graphqlData = await fetchTweetDetail(page, mainTweetId);
  if (!graphqlData) return [];

  const entries = extractEntries(graphqlData);
  const thread = parseThreadFromEntries(entries, mainAuthor, mainTweetId);

  // Strip internal fields for backward compatibility
  return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest);
}

// ============================================================================