From a4a5f6b9d6c9fb84daa0b01f13a7fa0fa774069c Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:01:58 +0000 Subject: [PATCH 1/3] feat: rewrite scrapeThread to use TweetDetail GraphQL API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace DOM-based thread scraping with direct GraphQL API calls. X doesn't render self-reply threads as article elements in the DOM, causing empty results — especially for high-engagement tweets. The new approach: - Calls TweetDetail GraphQL API from the page context using session cookies - Gets full_text (no truncation, no "Show more" needed) - note_tweet support for long-form posts - Filters to self-reply chain only (author replying to themselves) - Chronological sorting Also introduces shared helpers for future use by scrapePost: - fetchTweetDetail() — GraphQL API caller - parseTweetResult() — rich data extraction (text, media, article, card, external URLs, engagement stats) - parseThreadFromEntries() — thread chain detection - extractEntries(), unwrapResult(), getScreenName() Fixes: - screen_name moved from user.legacy to user.core in X's GraphQL schema - Self-replies missing from API response for viral tweets (2000+ replies) now handled gracefully (returns available tweets) Supersedes #12 which patches the DOM approach — this replaces it entirely. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/scrapers/twitter/index.js | 230 ++++++++++++++++++++++++++++------ 1 file changed, 194 insertions(+), 36 deletions(-) diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index ded588d..141ee3d 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -438,51 +438,209 @@ export async function searchTweets(page, query, options = {}) { // ============================================================================ // Thread Scraper // ============================================================================ +// TweetDetail GraphQL helpers (shared by scrapeThread and scrapePost) +// ============================================================================ /** - * Scrape a full tweet thread + * Fetch TweetDetail GraphQL API from the page context using session cookies. + * The page must already be on x.com (for cookies to be available). */ -export async function scrapeThread(page, tweetUrl) { - await page.goto(tweetUrl, { waitUntil: 'networkidle2' }); - await randomDelay(); +async function fetchTweetDetail(page, tweetId) { + return page.evaluate(async (id) => { + const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; + if (!ct0) return null; + const variables = JSON.stringify({ + focalTweetId: id, with_rux_injections: false, rankingMode: 'Relevance', + includePromotedContent: false, withCommunity: true, + withQuickPromoteEligibilityTweetFields: true, withBirdwatchNotes: true, withVoice: true, + }); + const features = JSON.stringify({ + rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + 
graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + freedom_of_speech_not_reach_fetch_enabled: true, + tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + }); + const url = `https://x.com/i/api/graphql/t66713qxyDI9pc4Jyb6wxQ/TweetDetail?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`; + try { + const resp = await fetch(url, { + headers: { + 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', + 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session', + }, + credentials: 'include', + }); + return await resp.json(); + } catch { return null; } + }, tweetId); +} - for (let i = 0; i < 5; i++) { - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await randomDelay(1000, 2000); +/** Extract timeline entries from a TweetDetail GraphQL response. */ +function extractEntries(graphqlData) { + const instructions = graphqlData?.data?.threaded_conversation_with_injections_v2?.instructions || []; + const entries = []; + for (const inst of instructions) { + if (inst.entries) entries.push(...inst.entries); } + return entries; +} - const thread = await page.evaluate(() => { - const articles = document.querySelectorAll('article[data-testid="tweet"]'); - const mainTweetId = window.location.pathname.match(/status\/(\d+)/)?.[1]; - - const mainArticle = Array.from(articles).find(a => - a.querySelector(`a[href*="/status/${mainTweetId}"]`) - ); - const mainAuthor = mainArticle?.querySelector('[data-testid="User-Name"] a')?.href?.split('/')[3]; +/** Unwrap TweetWithVisibilityResults wrapper. 
*/ +function unwrapResult(result) { + if (result?.__typename === 'TweetWithVisibilityResults') return result.tweet; + return result; +} - return Array.from(articles) - .map((article) => { - const textEl = article.querySelector('[data-testid="tweetText"]'); - const authorLink = article.querySelector('[data-testid="User-Name"] a[href^="/"]'); - const timeEl = article.querySelector('time'); - const linkEl = article.querySelector('a[href*="/status/"]'); - - const author = authorLink?.href?.split('/')[3]; - - return { - id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || null, - text: textEl?.textContent || null, - author, - timestamp: timeEl?.getAttribute('datetime') || null, - url: linkEl?.href || null, - isMainAuthor: author === mainAuthor, - platform: 'twitter', - }; - }) - .filter(t => t.id && t.isMainAuthor); +/** Get screen_name from a user result (handles both new core and legacy paths). */ +function getScreenName(result) { + const user = result?.core?.user_results?.result; + return user?.core?.screen_name || user?.legacy?.screen_name || ''; +} + +/** + * Parse rich data from a single tweet GraphQL result. + * Does NOT recurse into quoted tweets — returns quotedTweetId for the caller to handle. 
+ */ +function parseTweetResult(result) { + result = unwrapResult(result); + if (!result?.legacy) return null; + + const legacy = result.legacy; + const author = getScreenName(result); + const text = result.note_tweet?.note_tweet_results?.result?.text || legacy.full_text || ''; + + // Media: images and videos + const media = (legacy.extended_entities?.media || []).map(m => { + const item = { type: m.type, url: m.media_url_https }; + if (m.type === 'video' || m.type === 'animated_gif') { + const best = m.video_info?.variants + ?.filter(v => v.content_type === 'video/mp4') + .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0]; + if (best) item.videoUrl = best.url; + } + return item; }); - return thread; + // Article (X Articles — long-form posts) + let article = null; + if (result.article?.article_results?.result) { + const a = result.article.article_results.result; + article = { + id: a.rest_id || null, + title: a.title || null, + coverImage: a.cover_media?.media_info?.original_img_url || null, + url: `https://x.com/${author}/article/${result.rest_id}`, + }; + } + + // Card (link previews — external URLs) + let card = null; + if (result.card?.legacy?.binding_values) { + const vals = {}; + for (const v of result.card.legacy.binding_values) { + vals[v.key] = v.value?.string_value || v.value?.scribe_value?.value || v.value?.image_value?.url || ''; + } + if (vals.title || vals.card_url) { + card = { title: vals.title || '', description: vals.description || '', url: vals.card_url || '', image: vals.thumbnail_image_original || '' }; + } + } + + // URLs: external links in tweet text (from both legacy and note_tweet entities) + const rawUrls = [ + ...(legacy.entities?.urls || []), + ...(result.note_tweet?.note_tweet_results?.result?.entity_set?.urls || []), + ]; + const urls = rawUrls + .map(u => ({ url: u.expanded_url || u.url || '', display: u.display_url || '' })) + .filter(u => u.url && !u.url.includes('x.com/') && !u.url.includes('twitter.com/')); + + // Quoted 
tweet ID (for recursive fetching — not parsed from this response) + const quotedTweetId = result.quoted_status_result?.result?.rest_id || legacy.quoted_status_id_str || null; + + return { + id: result.rest_id, + author, + text, + timestamp: legacy.created_at ? new Date(legacy.created_at).toISOString() : null, + url: `https://x.com/${author}/status/${result.rest_id}`, + media, + article, + card, + urls: urls.length > 0 ? urls : undefined, + quotedTweetId, + inReplyTo: legacy.in_reply_to_status_id_str || null, + replies: legacy.reply_count || 0, + retweets: legacy.retweet_count || 0, + likes: legacy.favorite_count || 0, + views: result.views?.count || '0', + platform: 'twitter', + }; +} + +/** + * From a list of entries, collect all tweets by a given author and filter + * to the self-reply thread chain (root tweet + author replying to themselves). + */ +function parseThreadFromEntries(entries, mainAuthor, mainTweetId) { + const candidates = new Map(); + + for (const entry of entries) { + const result = unwrapResult(entry.content?.itemContent?.tweet_results?.result); + if (result && getScreenName(result).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(result); + if (parsed) candidates.set(parsed.id, parsed); + } + for (const item of (entry.content?.items || [])) { + const r = unwrapResult(item.item?.itemContent?.tweet_results?.result); + if (r && getScreenName(r).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(r); + if (parsed) candidates.set(parsed.id, parsed); + } + } + } + + const threadIds = new Set(candidates.keys()); + return Array.from(candidates.values()) + .filter(t => t.id === mainTweetId || (t.inReplyTo && threadIds.has(t.inReplyTo))) + .sort((a, b) => { + const ta = t => t.timestamp ? 
new Date(t.timestamp).getTime() : 0; + return ta(a) - ta(b); + }); +} + +// ============================================================================ +// Thread Scraper +// ============================================================================ + +/** + * Scrape a full tweet thread (author's self-reply chain). + * + * Uses the TweetDetail GraphQL API directly instead of DOM scraping — + * X doesn't render self-reply threads as article elements in the DOM, + * especially for high-engagement tweets. + */ +export async function scrapeThread(page, tweetUrl) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) return []; + + await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + await randomDelay(2000, 3000); + + const graphqlData = await fetchTweetDetail(page, mainTweetId); + if (!graphqlData) return []; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, mainAuthor, mainTweetId); + + // Strip internal fields for backward compatibility + return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest); } // ============================================================================ From 2e58d0d75c3ddeb6a14b10e34e0b739fa99fa83c Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 10:31:44 +0000 Subject: [PATCH 2/3] add human-like delays and auth checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace uniform randomDelay (1-3s) with a clamped Gaussian distribution (2-7s base + 8% distraction spikes of 8-20s) - Add checkAuth() guard after page navigation — fails fast on expired cookies - Add randomDelay before each fetchTweetDetail API call to simulate human browsing between tweet reads --- src/scrapers/twitter/index.js | 25
++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index 141ee3d..77ee6b3 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -27,7 +27,27 @@ puppeteer.use(StealthPlugin()); // ============================================================================ const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); -const randomDelay = (min = 1000, max = 3000) => sleep(min + Math.random() * (max - min)); + +/** Human-like delay using a clamped Gaussian distribution with occasional distraction spikes. */ +const randomDelay = (min = 2000, max = 7000) => { + const u1 = Math.random(); + const u2 = Math.random(); + const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2); + const median = min + (max - min) * 0.4; + const spread = (max - min) * 0.25; + const base = median + z * spread; + const distraction = Math.random() < 0.08 ? 8000 + Math.random() * 12000 : 0; + const delay = Math.max(min, Math.min(base, max)) + distraction; + return sleep(delay); +}; + +/** Throw if the page redirected to login (expired/invalid cookie). */ +function checkAuth(page) { + const url = page.url(); + if (url.includes('/login') || url.includes('/i/flow/login')) { + throw new Error('Authentication failed — cookie may be expired.\n\nRun: xactions login'); + } +} /** * Create a browser instance with stealth settings. @@ -444,8 +464,10 @@ export async function searchTweets(page, query, options = {}) { /** * Fetch TweetDetail GraphQL API from the page context using session cookies. * The page must already be on x.com (for cookies to be available). + * Includes a human-like delay before each call.
*/ async function fetchTweetDetail(page, tweetId) { + await randomDelay(2000, 5000); return page.evaluate(async (id) => { const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; if (!ct0) return null; @@ -631,6 +653,7 @@ export async function scrapeThread(page, tweetUrl) { if (!mainTweetId || !mainAuthor) return []; await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); await randomDelay(2000, 3000); const graphqlData = await fetchTweetDetail(page, mainTweetId); From 80890dd263ba521e91a9e01c3cb95d54196cffe5 Mon Sep 17 00:00:00 2001 From: nj-io <26359601+nj-io@users.noreply.github.com> Date: Sun, 5 Apr 2026 10:35:44 +0000 Subject: [PATCH 3/3] feat: add scrapePost and x_read_post for full rich tweet reading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New scrapePost() function and x_read_post MCP tool that reads any tweet URL with full rich data and recursive quote tweet resolution. Features: - Single tweets or threads (auto-detected via self-reply chain) - Rich data per tweet: text, media (images + best-quality video URL), X Articles (title + cover image + URL), cards (link previews), external URLs (Substack, GitHub, etc.), engagement stats - Recursive quote tweet resolution — if a quoted tweet is itself a thread, or contains its own quote tweet, those are fetched too (up to 5 levels deep) - Human-like delays between API calls (inherited from fetchTweetDetail) - Auth check on navigation (inherited from shared helpers) Depends on: #17 (scrapeThread GraphQL rewrite with shared helpers) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/mcp/local-tools.js | 7 ++++ src/mcp/server.js | 11 ++++++ src/scrapers/index.js | 2 ++ src/scrapers/twitter/index.js | 64 +++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+) diff --git a/src/mcp/local-tools.js b/src/mcp/local-tools.js index 7ad47e4..d8e2f69 100644 --- a/src/mcp/local-tools.js +++ b/src/mcp/local-tools.js @@ -20,6 +20,7 
@@ import { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeMedia, scrapeListMembers, @@ -202,6 +203,11 @@ export async function x_get_thread({ url }) { return scrapeThread(pg, url); } +export async function x_read_post({ url }) { + const { page: pg } = await ensureBrowser(); + return scrapePost(pg, url); +} + export async function x_best_time_to_post({ username, limit = 100 }) { const { page: pg } = await ensureBrowser(); const tweets = await scrapeTweets(pg, username, { limit }); @@ -1346,6 +1352,7 @@ export const toolMap = { x_get_tweets, x_search_tweets, x_get_thread, + x_read_post, x_best_time_to_post, // Core actions x_follow, diff --git a/src/mcp/server.js b/src/mcp/server.js index 1f46777..3507af0 100755 --- a/src/mcp/server.js +++ b/src/mcp/server.js @@ -1129,6 +1129,17 @@ const TOOLS = [ required: ['url'], }, }, + { + name: 'x_read_post', + description: 'Read a tweet/post with full rich data. Returns thread if the post is part of one (author self-replies only). Recursively resolves quoted tweets — if a quoted tweet is itself a thread or contains its own quote tweet, those are fetched too. 
Each tweet includes: text, media (images + video URLs), X Articles, cards (link previews), external URLs, and engagement stats.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string', description: 'URL of the tweet/post' }, + }, + required: ['url'], + }, + }, // ====== Posting Analytics ====== { name: 'x_best_time_to_post', diff --git a/src/scrapers/index.js b/src/scrapers/index.js index ff6646f..8c284fa 100644 --- a/src/scrapers/index.js +++ b/src/scrapers/index.js @@ -79,6 +79,7 @@ export const { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia, @@ -308,6 +309,7 @@ export default { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia, diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index 77ee6b3..de38fc7 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -666,6 +666,69 @@ export async function scrapeThread(page, tweetUrl) { return thread.map(({ inReplyTo, quotedTweetId, media, article, card, urls, ...rest }) => rest); } +// ============================================================================ +// Post Scraper (rich data + recursive quoted tweets) +// ============================================================================ + +/** + * Scrape a single post or thread with full rich data. + * + * Returns the thread (1 tweet if single post, N if thread) with rich data + * per tweet: text, media, article, card, external URLs, engagement, and + * recursively resolved quoted posts (which may themselves be threads). 
+ * + * @param {import('puppeteer').Page} page + * @param {string} tweetUrl + * @param {number} [maxDepth=5] - Max recursion depth for nested quote tweets + */ +export async function scrapePost(page, tweetUrl, maxDepth = 5) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) throw new Error('Invalid tweet URL'); + + // Ensure we're on x.com for cookie access + if (!page.url().includes('x.com')) { + await page.goto('https://x.com', { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); + await randomDelay(2000, 3000); + } + + return _scrapePostRecursive(page, mainTweetId, mainAuthor, maxDepth, 0); +} + +async function _scrapePostRecursive(page, tweetId, author, maxDepth, depth) { + const graphqlData = await fetchTweetDetail(page, tweetId); + if (!graphqlData) return { thread: [] }; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, author, tweetId); + if (thread.length === 0) return { thread: [] }; + + // For each thread tweet, resolve its quoted post recursively + for (const tweet of thread) { + if (tweet.quotedTweetId && depth < maxDepth) { + const qtData = await fetchTweetDetail(page, tweet.quotedTweetId); + if (qtData) { + const qtEntries = extractEntries(qtData); + const focalEntry = qtEntries.find(e => + e.entryId?.includes(tweet.quotedTweetId)); + const focalResult = unwrapResult( + focalEntry?.content?.itemContent?.tweet_results?.result); + const qtAuthor = focalResult ? 
getScreenName(focalResult) : ''; + + if (qtAuthor) { + tweet.quotedPost = await _scrapePostRecursive( + page, tweet.quotedTweetId, qtAuthor, maxDepth, depth + 1); + } + } + } + delete tweet.quotedTweetId; + delete tweet.inReplyTo; + } + + return { thread }; +} + // ============================================================================ // Likes Scraper // ============================================================================ @@ -1119,6 +1182,7 @@ export default { scrapeTweets, searchTweets, scrapeThread, + scrapePost, scrapeLikes, scrapeHashtag, scrapeMedia,