From a4a5f6b9d6c9fb84daa0b01f13a7fa0fa774069c Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:01:58 +0000 Subject: [PATCH 1/3] feat: rewrite scrapeThread to use TweetDetail GraphQL API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace DOM-based thread scraping with direct GraphQL API calls. X doesn't render self-reply threads as article elements in the DOM, causing empty results — especially for high-engagement tweets. The new approach: - Calls TweetDetail GraphQL API from the page context using session cookies - Gets full_text (no truncation, no "Show more" needed) - note_tweet support for long-form posts - Filters to self-reply chain only (author replying to themselves) - Chronological sorting Also introduces shared helpers for future use by scrapePost: - fetchTweetDetail() — GraphQL API caller - parseTweetResult() — rich data extraction (text, media, article, card, external URLs, engagement stats) - parseThreadFromEntries() — thread chain detection - extractEntries(), unwrapResult(), getScreenName() Fixes: - screen_name moved from user.legacy to user.core in X's GraphQL schema - Self-replies missing from API response for viral tweets (2000+ replies) now handled gracefully (returns available tweets) Supersedes #12 which patches the DOM approach — this replaces it entirely. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/scrapers/twitter/index.js | 230 ++++++++++++++++++++++++++++------ 1 file changed, 194 insertions(+), 36 deletions(-) diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index ded588d..141ee3d 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -438,51 +438,209 @@ export async function searchTweets(page, query, options = {}) { // ============================================================================ // Thread Scraper // ============================================================================ +// TweetDetail GraphQL helpers (shared by scrapeThread and scrapePost) +// ============================================================================ /** - * Scrape a full tweet thread + * Fetch TweetDetail GraphQL API from the page context using session cookies. + * The page must already be on x.com (for cookies to be available). */ -export async function scrapeThread(page, tweetUrl) { - await page.goto(tweetUrl, { waitUntil: 'networkidle2' }); - await randomDelay(); +async function fetchTweetDetail(page, tweetId) { + return page.evaluate(async (id) => { + const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; + if (!ct0) return null; + const variables = JSON.stringify({ + focalTweetId: id, with_rux_injections: false, rankingMode: 'Relevance', + includePromotedContent: false, withCommunity: true, + withQuickPromoteEligibilityTweetFields: true, withBirdwatchNotes: true, withVoice: true, + }); + const features = JSON.stringify({ + rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + 
graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + freedom_of_speech_not_reach_fetch_enabled: true, + tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + }); + const url = `https://x.com/i/api/graphql/t66713qxyDI9pc4Jyb6wxQ/TweetDetail?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`; + try { + const resp = await fetch(url, { + headers: { + 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', + 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session', + }, + credentials: 'include', + }); + return await resp.json(); + } catch { return null; } + }, tweetId); +} - for (let i = 0; i < 5; i++) { - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await randomDelay(1000, 2000); +/** Extract timeline entries from a TweetDetail GraphQL response. */ +function extractEntries(graphqlData) { + const instructions = graphqlData?.data?.threaded_conversation_with_injections_v2?.instructions || []; + const entries = []; + for (const inst of instructions) { + if (inst.entries) entries.push(...inst.entries); } + return entries; +} - const thread = await page.evaluate(() => { - const articles = document.querySelectorAll('article[data-testid="tweet"]'); - const mainTweetId = window.location.pathname.match(/status\/(\d+)/)?.[1]; - - const mainArticle = Array.from(articles).find(a => - a.querySelector(`a[href*="/status/${mainTweetId}"]`) - ); - const mainAuthor = mainArticle?.querySelector('[data-testid="User-Name"] a')?.href?.split('/')[3]; +/** Unwrap TweetWithVisibilityResults wrapper. 
*/ +function unwrapResult(result) { + if (result?.__typename === 'TweetWithVisibilityResults') return result.tweet; + return result; +} - return Array.from(articles) - .map((article) => { - const textEl = article.querySelector('[data-testid="tweetText"]'); - const authorLink = article.querySelector('[data-testid="User-Name"] a[href^="/"]'); - const timeEl = article.querySelector('time'); - const linkEl = article.querySelector('a[href*="/status/"]'); - - const author = authorLink?.href?.split('/')[3]; - - return { - id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || null, - text: textEl?.textContent || null, - author, - timestamp: timeEl?.getAttribute('datetime') || null, - url: linkEl?.href || null, - isMainAuthor: author === mainAuthor, - platform: 'twitter', - }; - }) - .filter(t => t.id && t.isMainAuthor); +/** Get screen_name from a user result (handles both new core and legacy paths). */ +function getScreenName(result) { + const user = result?.core?.user_results?.result; + return user?.core?.screen_name || user?.legacy?.screen_name || ''; +} + +/** + * Parse rich data from a single tweet GraphQL result. + * Does NOT recurse into quoted tweets — returns quotedTweetId for the caller to handle. 
+ */ +function parseTweetResult(result) { + result = unwrapResult(result); + if (!result?.legacy) return null; + + const legacy = result.legacy; + const author = getScreenName(result); + const text = result.note_tweet?.note_tweet_results?.result?.text || legacy.full_text || ''; + + // Media: images and videos + const media = (legacy.extended_entities?.media || []).map(m => { + const item = { type: m.type, url: m.media_url_https }; + if (m.type === 'video' || m.type === 'animated_gif') { + const best = m.video_info?.variants + ?.filter(v => v.content_type === 'video/mp4') + .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0]; + if (best) item.videoUrl = best.url; + } + return item; }); - return thread; + // Article (X Articles — long-form posts) + let article = null; + if (result.article?.article_results?.result) { + const a = result.article.article_results.result; + article = { + id: a.rest_id || null, + title: a.title || null, + coverImage: a.cover_media?.media_info?.original_img_url || null, + url: `https://x.com/${author}/article/${result.rest_id}`, + }; + } + + // Card (link previews — external URLs) + let card = null; + if (result.card?.legacy?.binding_values) { + const vals = {}; + for (const v of result.card.legacy.binding_values) { + vals[v.key] = v.value?.string_value || v.value?.scribe_value?.value || v.value?.image_value?.url || ''; + } + if (vals.title || vals.card_url) { + card = { title: vals.title || '', description: vals.description || '', url: vals.card_url || '', image: vals.thumbnail_image_original || '' }; + } + } + + // URLs: external links in tweet text (from both legacy and note_tweet entities) + const rawUrls = [ + ...(legacy.entities?.urls || []), + ...(result.note_tweet?.note_tweet_results?.result?.entity_set?.urls || []), + ]; + const urls = rawUrls + .map(u => ({ url: u.expanded_url || u.url || '', display: u.display_url || '' })) + .filter(u => u.url && !u.url.includes('x.com/') && !u.url.includes('twitter.com/')); + + // Quoted 
tweet ID (for recursive fetching — not parsed from this response) + const quotedTweetId = result.quoted_status_result?.result?.rest_id || legacy.quoted_status_id_str || null; + + return { + id: result.rest_id, + author, + text, + timestamp: legacy.created_at ? new Date(legacy.created_at).toISOString() : null, + url: `https://x.com/${author}/status/${result.rest_id}`, + media, + article, + card, + urls: urls.length > 0 ? urls : undefined, + quotedTweetId, + inReplyTo: legacy.in_reply_to_status_id_str || null, + replies: legacy.reply_count || 0, + retweets: legacy.retweet_count || 0, + likes: legacy.favorite_count || 0, + views: result.views?.count || '0', + platform: 'twitter', + }; +} + +/** + * From a list of entries, collect all tweets by a given author and filter + * to the self-reply thread chain (root tweet + author replying to themselves). + */ +function parseThreadFromEntries(entries, mainAuthor, mainTweetId) { + const candidates = new Map(); + + for (const entry of entries) { + const result = unwrapResult(entry.content?.itemContent?.tweet_results?.result); + if (result && getScreenName(result).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(result); + if (parsed) candidates.set(parsed.id, parsed); + } + for (const item of (entry.content?.items || [])) { + const r = unwrapResult(item.item?.itemContent?.tweet_results?.result); + if (r && getScreenName(r).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(r); + if (parsed) candidates.set(parsed.id, parsed); + } + } + } + + const threadIds = new Set(candidates.keys()); + return Array.from(candidates.values()) + .filter(t => t.id === mainTweetId || (t.inReplyTo && threadIds.has(t.inReplyTo))) + .sort((a, b) => { + const ta = t => t.timestamp ? 
new Date(t.timestamp).getTime() : 0; + return ta(a) - ta(b); + }); +} + +// ============================================================================ +// Thread Scraper +// ============================================================================ + +/** + * Scrape a full tweet thread (author's self-reply chain). + * + * Uses the TweetDetail GraphQL API directly instead of DOM scraping — + * X doesn't render self-reply threads as article elements in the DOM, + * especially for high-engagement tweets. + */ +export async function scrapeThread(page, tweetUrl) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) return []; + + await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + await randomDelay(2000, 3000); + + const graphqlData = await fetchTweetDetail(page, mainTweetId); + if (!graphqlData) return []; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, mainAuthor, mainTweetId); + + // Strip internal fields for backward compatibility + return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest); } // ============================================================================ From 2e58d0d75c3ddeb6a14b10e34e0b739fa99fa83c Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 10:31:44 +0000 Subject: [PATCH 2/3] add human-like delays and auth checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace uniform randomDelay (1-3s) with a clamped Gaussian distribution (2-7s base + 8% distraction spikes of 8-20s) - Add checkAuth() guard after page navigation — fails fast on expired cookies - Add randomDelay before each fetchTweetDetail API call to simulate human browsing between tweet reads --- src/scrapers/twitter/index.js | 25
++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index 141ee3d..77ee6b3 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -27,7 +27,27 @@ puppeteer.use(StealthPlugin()); // ============================================================================ const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); -const randomDelay = (min = 1000, max = 3000) => sleep(min + Math.random() * (max - min)); + +/** Human-like delay using a clamped Gaussian distribution with occasional distraction spikes. */ +const randomDelay = (min = 2000, max = 7000) => { + const u1 = Math.random(); + const u2 = Math.random(); + const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2); + const median = min + (max - min) * 0.4; + const spread = (max - min) * 0.25; + const base = median + z * spread; + const distraction = Math.random() < 0.08 ? 8000 + Math.random() * 12000 : 0; + const delay = Math.max(min, Math.min(base, max)) + distraction; + return sleep(delay); +}; + +/** Throw if the page redirected to login (expired/invalid cookie). */ +function checkAuth(page) { + const url = page.url(); + if (url.includes('/login') || url.includes('/i/flow/login')) { + throw new Error('Authentication failed — cookie may be expired.\n\nRun: xactions login'); + } +} /** * Create a browser instance with stealth settings. @@ -444,8 +464,10 @@ export async function searchTweets(page, query, options = {}) { /** * Fetch TweetDetail GraphQL API from the page context using session cookies. * The page must already be on x.com (for cookies to be available). + * Includes a human-like delay before each call.
*/ async function fetchTweetDetail(page, tweetId) { + await randomDelay(2000, 5000); return page.evaluate(async (id) => { const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; if (!ct0) return null; @@ -631,6 +653,7 @@ export async function scrapeThread(page, tweetUrl) { if (!mainTweetId || !mainAuthor) return []; await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); await randomDelay(2000, 3000); const graphqlData = await fetchTweetDetail(page, mainTweetId); From 80890dd263ba521e91a9e01c3cb95d54196cffe5 Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 10:35:44 +0000 Subject: [PATCH 3/3] feat: add scrapePost and x_read_post for full rich tweet reading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New scrapePost() function and x_read_post MCP tool that reads any tweet URL with full rich data and recursive quote tweet resolution. Features: - Single tweets or threads (auto-detected via self-reply chain) - Rich data per tweet: text, media (images + best-quality video URL), X Articles (title + cover image + URL), cards (link previews), external URLs (Substack, GitHub, etc.), engagement stats - Recursive quote tweet resolution — if a quoted tweet is itself a thread, or contains its own quote tweet, those are fetched too (up to 5 levels deep) - Human-like delays between API calls (inherited from fetchTweetDetail) - Auth check on navigation (inherited from shared helpers) Depends on: #17 (scrapeThread GraphQL rewrite with shared helpers) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/mcp/local-tools.js | 7 ++++ src/mcp/server.js | 11 ++++++ src/scrapers/index.js | 2 ++ src/scrapers/twitter/index.js | 64 +++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+) diff --git a/src/mcp/local-tools.js b/src/mcp/local-tools.js index 7ad47e4..d8e2f69 100644 --- a/src/mcp/local-tools.js +++ b/src/mcp/local-tools.js @@ -20,6 +20,7 
@@ import { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeMedia, scrapeListMembers, @@ -202,6 +203,11 @@ export async function x_get_thread({ url }) { return scrapeThread(pg, url); } +export async function x_read_post({ url }) { + const { page: pg } = await ensureBrowser(); + return scrapePost(pg, url); +} + export async function x_best_time_to_post({ username, limit = 100 }) { const { page: pg } = await ensureBrowser(); const tweets = await scrapeTweets(pg, username, { limit }); @@ -1346,6 +1352,7 @@ export const toolMap = { x_get_tweets, x_search_tweets, x_get_thread, + x_read_post, x_best_time_to_post, // Core actions x_follow, diff --git a/src/mcp/server.js b/src/mcp/server.js index 1f46777..3507af0 100755 --- a/src/mcp/server.js +++ b/src/mcp/server.js @@ -1129,6 +1129,17 @@ const TOOLS = [ required: ['url'], }, }, + { + name: 'x_read_post', + description: 'Read a tweet/post with full rich data. Returns thread if the post is part of one (author self-replies only). Recursively resolves quoted tweets — if a quoted tweet is itself a thread or contains its own quote tweet, those are fetched too. 
Each tweet includes: text, media (images + video URLs), X Articles, cards (link previews), external URLs, and engagement stats.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string', description: 'URL of the tweet/post' }, + }, + required: ['url'], + }, + }, // ====== Posting Analytics ====== { name: 'x_best_time_to_post', diff --git a/src/scrapers/index.js b/src/scrapers/index.js index ff6646f..8c284fa 100644 --- a/src/scrapers/index.js +++ b/src/scrapers/index.js @@ -79,6 +79,7 @@ export const { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia, @@ -308,6 +309,7 @@ export default { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia, diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index 77ee6b3..de38fc7 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -666,6 +666,69 @@ export async function scrapeThread(page, tweetUrl) { return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest); } +// ============================================================================ +// Post Scraper (rich data + recursive quoted tweets) +// ============================================================================ + +/** + * Scrape a single post or thread with full rich data. + * + * Returns the thread (1 tweet if single post, N if thread) with rich data + * per tweet: text, media, article, card, external URLs, engagement, and + * recursively resolved quoted posts (which may themselves be threads). 
+ * + * @param {import('puppeteer').Page} page + * @param {string} tweetUrl + * @param {number} [maxDepth=5] - Max recursion depth for nested quote tweets + */ +export async function scrapePost(page, tweetUrl, maxDepth = 5) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) throw new Error('Invalid tweet URL'); + + // Ensure we're on x.com for cookie access + if (!page.url().includes('x.com')) { + await page.goto('https://x.com', { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); + await randomDelay(2000, 3000); + } + + return _scrapePostRecursive(page, mainTweetId, mainAuthor, maxDepth, 0); +} + +async function _scrapePostRecursive(page, tweetId, author, maxDepth, depth) { + const graphqlData = await fetchTweetDetail(page, tweetId); + if (!graphqlData) return { thread: [] }; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, author, tweetId); + if (thread.length === 0) return { thread: [] }; + + // For each thread tweet, resolve its quoted post recursively + for (const tweet of thread) { + if (tweet.quotedTweetId && depth < maxDepth) { + const qtData = await fetchTweetDetail(page, tweet.quotedTweetId); + if (qtData) { + const qtEntries = extractEntries(qtData); + const focalEntry = qtEntries.find(e => + e.entryId?.includes(tweet.quotedTweetId)); + const focalResult = unwrapResult( + focalEntry?.content?.itemContent?.tweet_results?.result); + const qtAuthor = focalResult ? 
getScreenName(focalResult) : ''; + + if (qtAuthor) { + tweet.quotedPost = await _scrapePostRecursive( + page, tweet.quotedTweetId, qtAuthor, maxDepth, depth + 1); + } + } + } + delete tweet.quotedTweetId; + delete tweet.inReplyTo; + } + + return { thread }; +} + // ============================================================================ // Likes Scraper // ============================================================================ @@ -1119,6 +1182,7 @@ export default { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia,