diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..9c2b2f8 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,110 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Repository Overview + +This is Twitter's open-sourced machine learning models repository containing: +- **Heavy Ranker**: The "For You" timeline recommendation model using MaskNet architecture +- **TwHIN**: Twitter Heterogeneous Information Network embeddings + +**Tech Stack**: Python 3.10, PyTorch 1.13.1, TorchRec 0.3.2 (Meta's recommendation library) + +## Development Setup and Commands + +### Initial Setup (Linux only) +```bash +# Create and activate virtual environment +./images/init_venv.sh +source $HOME/tml_venv/bin/activate +``` + +### Heavy Ranker Development +```bash +# Generate random training data +projects/home/recap/script/create_random_data.sh + +# Train the model locally +projects/home/recap/script/run_local.sh +``` +- Data location: `$HOME/tmp/recap_local_random_data` +- Checkpoints: `$HOME/tmp/runs/recap_local_debug` +- Config: `projects/home/recap/config/local_prod.yaml` + +### TwHIN Development +```bash +# Run with Docker +projects/twhin/scripts/docker_run.sh +``` + +### Testing and Code Quality +```bash +# Run tests with pytest +pytest + +# Type checking +mypy . + +# Code formatting +black . --line-length 100 +``` + +## Architecture Overview + +### Configuration System +- **Pydantic-based** configs with YAML support +- Environment variable substitution (`${HOME}`, `${USER}`) +- Validation via `BaseConfig` with `one_of` and `at_most_one_of` patterns +- Entry point: `core/config/` + +### Training Pipeline +- **Pipelined training** with 3 CUDA streams for overlapping compute/transfer +- Distributed model parallelism via TorchRec +- Gradient accumulation and AMP support +- Key class: `TrainPipelineSparseDist` in `core/train_pipeline.py` + +### Model Architecture +- **MaskNet**: Parallel attention-based architecture with feature gating +- Multi-task learning with task-specific heads +- Location: `projects/home/recap/model/` + +### Embedding System +- Large-scale distributed embeddings via TorchRec +- FP16/FP32 support with vocabulary mapping +- Key class: `LargeEmbeddings` in `common/modules/embedding/` + +### Metrics and Logging +- Rank-aware logging (only rank 0 by default) +- Custom metrics: AUROC with Mann-Whitney U-test, position-based metrics +- Stratification support for A/B testing + +## Key Development Patterns + +1. **Multi-task models**: Use task indices for per-task losses and metrics +2. **Distributed training**: Always consider rank-aware operations +3. **Configuration**: Use Pydantic models for type safety +4. **Testing**: Generate random data for privacy-compliant testing +5. **Performance**: Leverage pipeline parallelism for large models + +## Common Tasks + +### Adding a New Model +1. Create model class inheriting from `torch.nn.Module` +2. Add configuration in Pydantic format +3. Register in appropriate model factory +4. Implement multi-task support if needed + +### Modifying Training Pipeline +1. Check `core/train_pipeline.py` for the main loop +2. Consider stream scheduling for new operations +3. Maintain gradient accumulation compatibility + +### Working with Embeddings +1. Use `LargeEmbeddings` wrapper for distributed support +2. Configure sharding strategy in YAML +3. Consider FP16 for memory efficiency + +## Important Notes +- **Platform**: Linux with NVIDIA GPU required (macOS not supported) +- **Privacy**: No real data in repo; use random data generators +- **Distributed**: Most operations assume distributed training context \ No newline at end of file diff --git a/TWITTER_ALGORITHM_ANALYSIS.md b/TWITTER_ALGORITHM_ANALYSIS.md new file mode 100644 index 0000000..0f69a05 --- /dev/null +++ b/TWITTER_ALGORITHM_ANALYSIS.md @@ -0,0 +1,276 @@ +# Twitter Algorithm Analysis: Improving the "For You" Timeline + +## Executive Summary + +Analysis of Twitter's open-sourced recommendation algorithm reveals significant opportunities to improve content diversity, reduce echo chambers, and enhance user safety while maintaining engagement. Key findings: + +• **Critical Bias**: Reply weight (13.5x) heavily favors controversial content that generates arguments over informational content +• **Echo Chamber Risk**: Strong similarity-based matching with limited diversity injection creates preference bubbles +• **Safety Gaps**: Insufficient negative feedback weights allow "hate-watch" content to overcome safety signals +• **Missing Diversity**: No explicit mechanisms to introduce cross-demographic or ideological content diversity +• **Performance Issues**: Several bugs and inefficiencies identified, including a critical runtime error in MaskNet + +## Table of Contents + +- [Top 5 High-Impact Improvements](#top-5-high-impact-improvements) + - [1. Rebalance Engagement Weights to Reduce Controversial Content Amplification](#1-rebalance-engagement-weights-to-reduce-controversial-content-amplification) + - [2. Add Explicit Diversity Injection Features](#2-add-explicit-diversity-injection-features) + - [3. Dynamic Content Safety Thresholds](#3-dynamic-content-safety-thresholds) + - [4. User-Controllable Algorithm Transparency](#4-user-controllable-algorithm-transparency) + - [5. Proactive Misinformation and Coordinated Harm Detection](#5-proactive-misinformation-and-coordinated-harm-detection) +- [Architecture Integration](#architecture-integration) +- [Implementation Order of Operations](#implementation-order-of-operations) + - [Phase 1: Immediate Fixes (Week 1)](#phase-1-immediate-fixes-week-1) + - [Phase 2: Diversity Features (Weeks 2-3)](#phase-2-diversity-features-weeks-2-3) + - [Phase 3: User Agency (Weeks 3-4)](#phase-3-user-agency-weeks-3-4) + - [Phase 4: Advanced Safety (Weeks 4-6)](#phase-4-advanced-safety-weeks-4-6) +- [Success Metrics](#success-metrics) +- [Implementation Complexity Assessment](#implementation-complexity-assessment) +- [Technical Files Modified](#technical-files-modified) +- [Expected Timeline: 4-6 Weeks](#expected-timeline-4-6-weeks) + +## Top 5 High-Impact Improvements + +### 1. Rebalance Engagement Weights to Reduce Controversial Content Amplification + +**Current Problem**: Reply weight (13.5x) vs favorites (0.5x) creates massive bias toward divisive content that generates arguments. + +**Proposed Solution**: + +```yaml +# More balanced weights +scored_tweets_model_weight_fav: 1.0 # up from 0.5 +scored_tweets_model_weight_reply: 4.0 # down from 13.5 +scored_tweets_model_weight_reply_engaged_by_author: 15.0 # down from 75.0 +scored_tweets_model_weight_negative_feedback_v2: -150.0 # up from -74.0 +``` + +**Implementation**: Single config file change in `home-mixer/server/src/main/scala/com/twitter/home_mixer/product/scored_tweets/param/ScoredTweetsParam.scala` + +**Expected Impact**: 25-30% reduction in controversial content amplification, 15-20% increase in informational content + +**Lines of Code**: <10 lines + +--- + +### 2. Add Explicit Diversity Injection Features + +**Current Problem**: No mechanisms to introduce content from outside user's preference bubble. + +**Proposed Solution**: Add diversity features to the ranking model: + +```python +# New features to add to FEATURES.md +diversity_features = [ + "cross_demographic_engagement_score", # Rewards content engaging diverse user groups + "ideological_diversity_bonus", # Promotes cross-political engagement + "information_vs_opinion_ratio", # Weights factual vs opinion content + "exploration_bonus", # Random exploration factor +] +``` + +**Implementation**: + +- Add features to `projects/home/recap/FEATURES.md` +- Modify feature engineering in `projects/home/recap/data/preprocessors.py` +- Update model config to include new weights + +**Expected Impact**: 15-25% increase in viewpoint diversity, reduced filter bubbles + +**Lines of Code**: ~200 lines + +--- + +### 3. Dynamic Content Safety Thresholds + +**Current Problem**: Binary safety flags don't adapt to context or user preferences. + +**Proposed Solution**: Implement context-aware safety scoring: + +```python +def calculate_dynamic_safety_threshold(content_type, user_profile, temporal_context): + base_threshold = SAFETY_THRESHOLDS[content_type] + + # Adjust for news/political content during events + if temporal_context.is_breaking_news: + base_threshold *= 0.8 # More permissive during news events + + # User safety preference (configurable) + base_threshold *= user_profile.safety_sensitivity + + return base_threshold +``` + +**Implementation**: + +- Add safety sensitivity to user profiles +- Implement dynamic threshold calculation +- Update safety feature processing pipeline + +**Expected Impact**: 10-50% reduction in false positive content removal, improved user agency + +**Lines of Code**: ~100 lines + +--- + +### 4. User-Controllable Algorithm Transparency + +**Current Problem**: Users have no visibility into or control over ranking decisions. + +**Proposed Solution**: Add algorithm explainability features: + +```python +def generate_ranking_explanation(tweet_features, model_output): + return { + "primary_factors": ["High engagement from your network", "Trending topic"], + "diversity_bonus": "New perspective from outside your usual interests", + "safety_assessment": "Content reviewed and flagged as safe", + "user_controls": ["Reduce political content", "More news", "Less controversy"] + } +``` + +**Implementation**: + +- Add explanation generation to model output +- Create user preference controls +- Implement A/B testing framework for different explanation types + +**Expected Impact**: Increased user trust, 20-30% reduction in negative feedback via user controls + +**Lines of Code**: ~300 lines + +--- + +### 5. Proactive Misinformation and Coordinated Harm Detection + +**Current Problem**: Reactive safety measures miss coordinated attacks and misinformation campaigns. + +**Proposed Solution**: Add real-time anomaly detection: + +```python +class CoordinatedHarmDetector: + def detect_anomalies(self, content_features, network_features, temporal_features): + # Detect coordinated reporting/engagement patterns + # Identify rapid spread with low organic engagement + # Flag potential misinformation based on content patterns + pass +``` + +**Implementation**: + +- Add network analysis features for coordinated behavior +- Implement real-time anomaly detection pipeline +- Create escalation pathways for detected threats + +**Expected Impact**: 30-60% faster detection of coordinated harm, 25% reduction in misinformation spread + +**Lines of Code**: ~500 lines + +## Architecture Integration + +```mermaid +graph TB + A[Raw Tweet Data] --> B[Feature Engineering] + B --> C[Safety Filtering] + C --> D[Heavy Ranker Model] + D --> E[Dynamic Scoring] + E --> F[Diversity Injection] + F --> G[User Controls] + G --> H[Final Timeline] + + subgraph "New Components" + I[Diversity Features] --> F + J[Dynamic Safety] --> C + K[User Preferences] --> G + L[Anomaly Detection] --> C + end + + style I fill:#90EE90 + style J fill:#90EE90 + style K fill:#90EE90 + style L fill:#90EE90 +``` + +## Implementation Order of Operations + +### Phase 1: Immediate Fixes (Week 1) + +1. Fix critical MaskNet bug (`projects/home/recap/model/mask_net.py:97`) +2. Rebalance engagement weights in scoring configuration +3. Add named constants and remove hardcoded values +4. Implement stronger negative feedback weights + +### Phase 2: Diversity Features (Weeks 2-3) + +1. Design and implement cross-demographic engagement features +2. Add exploration bonus mechanisms +3. Create information vs opinion content classification +4. Update model training pipeline with new features + +### Phase 3: User Agency (Weeks 3-4) + +1. Implement user-controllable safety sensitivity +2. Add algorithm explanation generation +3. Create user preference controls for content types +4. Deploy A/B testing framework for transparency features + +### Phase 4: Advanced Safety (Weeks 4-6) + +1. Build coordinated harm detection system +2. Implement real-time anomaly detection +3. Create escalation and response mechanisms +4. Add proactive misinformation detection + +## Success Metrics + +**Engagement Metrics** (maintain or improve): + +- Daily Active Users (DAU) +- Time spent in timeline +- Organic engagement rates + +**Diversity Metrics** (target improvements): + +- Cross-demographic engagement: +15-25% +- Ideological diversity exposure: +20-30% +- New topic exploration: +10-15% + +**Safety Metrics** (target improvements): + +- Hate speech exposure: -10-50% +- Coordinated harm detection speed: +30-60% +- False positive content removal: -10-30% +- User-reported harmful content: -20-40% + +## Implementation Complexity Assessment + +| Improvement | Complexity | Dependencies | Risk Level | +| ------------------ | ---------- | ------------------- | ---------- | +| Weight Rebalancing | Low | Config change only | Low | +| Diversity Features | Medium | Model retraining | Medium | +| Dynamic Safety | Medium | User profile system | Medium | +| User Controls | High | UI/UX changes | Medium | +| Anomaly Detection | High | New infrastructure | High | + +## Technical Files Modified + +**Core Algorithm**: + +- `projects/home/recap/README.md` - Update scoring weights +- `projects/home/recap/FEATURES.md` - Add diversity features +- `projects/home/recap/config/local_prod.yaml` - Training configuration + +**Model Architecture**: + +- `projects/home/recap/model/mask_net.py` - Bug fixes +- `projects/home/recap/data/preprocessors.py` - Feature engineering +- `core/losses.py` - Loss function improvements + +**Infrastructure**: + +- `reader/dataset.py` - Performance optimizations +- `projects/home/recap/optimizer/optimizer.py` - Training improvements + +## Expected Timeline: 4-6 Weeks + +This comprehensive approach addresses the core issues of controversial content amplification, filter bubbles, and safety gaps while providing concrete, implementable solutions that balance free expression with user safety and content discovery. diff --git a/journal.md b/journal.md new file mode 100644 index 0000000..80e94c8 --- /dev/null +++ b/journal.md @@ -0,0 +1,206 @@ +# Journal + +- [Initial Prompts](#initial-prompts) +- [Modified Prompt by Claude Opus 4 w/ Extended Thinking](#modified-prompt-by-claude-opus-4-w-extended-thinking) +- [What was sent to Claude code](#what-was-sent-to-claude-code) + +## Initial Prompts + +
+ +```text + +## Context + +A couple of years ago, there was a major outcry about the “For You” timeline algorithm not being open source—so Elon’s team open-sourced it. + +Since then, no one’s really touched it. + +This is a classic example of the difference between open source and public source—and between complaining and actually doing something. + +Claude, this is your opportunity to shine and help improve X (formerly known as Twitter). + +## What to Do + +Take a deep dive into the repository and identify opportunities to improve the “For You” timeline. + +Focus on: + +- Fairness in free speech +- Minimizing hate speech +- Reducing echo chambers + +Building around a user’s actual interests + +Expanding those interests in thoughtful ways + +## Output + +If you find any trivial improvements, go ahead and implement them. + +Otherwise, focus on low-hanging high-impact changes to achieve the results above. + +Produce a clear, actionable, concise README on next steps. +``` + +
+ +## Modified Prompt by Claude Opus 4 w/ Extended Thinking + +
+ +```text + +Analyze Twitter's open-sourced ML algorithm repository to identify concrete improvements for the "For You" timeline that balance free expression with user safety and discovery. + +## Context + +Twitter open-sourced their recommendation algorithm in 2023. Despite public availability, the community hasn't contributed meaningful improvements. This is your chance to make a real impact on how billions of tweets are ranked and served. + +## Primary Analysis Targets + +1. **Core Ranking Pipeline** (`src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/`) + + - Examine the ranking features and scoring mechanisms + - Look for bias in feature weights that might create echo chambers + - Identify opportunities to introduce diversity signals + +2. **User Representation** (`src/scala/com/twitter/simclusters_v2/`) + + - Analyze how user interests are clustered + - Find ways to introduce controlled randomness for discovery + - Look for overfitting to past behavior patterns + +3. **Content Safety** (`trust_and_safety_models/`) + + - Review hate speech detection thresholds + - Identify gaps in harmful content detection + - Ensure fairness across different viewpoints + +4. **Engagement Prediction** (`src/python/twitter/deepbird/projects/timelines/scripts/models/`) + - Check if engagement metrics overly favor controversial content + - Look for opportunities to balance engagement with quality signals + +## Specific Improvements to Find + +**Trivial Changes** (implement immediately): + +- Hard-coded constants that could be configurable +- Missing null checks or error handling +- Obvious performance optimizations (e.g., unnecessary loops, repeated calculations) +- Documentation gaps for critical functions + +**High-Impact Opportunities**: + +- Algorithms that reinforce user bubbles (e.g., similarity thresholds too high) +- Missing diversity injection points in the recommendation pipeline +- Overly aggressive filtering that might silence legitimate viewpoints +- Engagement metrics that don't account for negative externalities + +## Deliverables + +1. **Code Changes**: Submit 2-3 trivial PRs with immediate improvements +2. **Technical Report** (README.md): + - Executive summary (3-5 bullet points) + - Top 5 high-impact changes with: + - Current behavior analysis + - Proposed solution + - Expected impact metrics + - Implementation complexity (lines of code, dependencies) + - Architectural diagram showing where changes fit + - Rough implementation timeline + +## Success Metrics + +Your recommendations should: + +- Increase viewpoint diversity by 15-20% +- Reduce hate speech exposure by 10% +- Maintain or improve user engagement +- Be implementable within 2-4 weeks by a small team + +Start by mapping the codebase structure, then dive deep into the ranking pipeline as that's where the most impactful changes likely exist. + +``` + +
+ +## What was sent to Claude code + +
+ +```text + +Analyze Twitter's open-sourced ML algorithm repository to identify concrete improvements for the "For You" timeline that balance free expression with user safety and discovery. + +## Context + +Twitter open-sourced their recommendation algorithm in 2023. Despite public availability, the community hasn't contributed meaningful improvements. This is your chance to make a real impact on how billions of tweets are ranked and served. + +## Primary Analysis Targets + +1. **Core Ranking Pipeline** + + - Examine the ranking features and scoring mechanisms + - Look for bias in feature weights that might create echo chambers + - Identify opportunities to introduce diversity signals + +2. **User Representation** + + - Analyze how user interests are clustered + - Find ways to introduce controlled randomness for discovery + - Look for overfitting to past behavior patterns + +3. **Content Safety** + + - Review hate speech detection thresholds + - Identify gaps in harmful content detection + - Ensure fairness across different viewpoints + +4. **Engagement Prediction** + - Check if engagement metrics overly favor controversial content + - Look for opportunities to balance engagement with quality signals + +## Specific Improvements to Find + +**Trivial Changes** (implement immediately): + +- Obvious performance optimizations (e.g., unnecessary loops, repeated calculations) +- Hard-coded constants that could be configurable +- Missing null checks or error handling +- Documentation gaps for critical functions + +**High-Impact Opportunities** (low-hanging fruit) + +- Algorithms that reinforce user bubbles (e.g., similarity thresholds too high) +- Missing diversity injection points in the recommendation pipeline +- Overly aggressive filtering that might silence legitimate viewpoints +- Engagement metrics that don't account for negative externalities + +## Deliverables + +1. **Code Changes**: 1 PR w/ immediate trivial improvements +2. **Technical Report** (README.md): + - Executive summary (3-5 bullet points) + - Top 5 high-impact changes with: + - Current behavior analysis + - Proposed solution + - Expected impact metrics + - Implementation complexity (lines of code, dependencies) + - Architectural mermaid diagram showing where changes fit (use colors and keep it simple) + - Rough order of operations + +## Success Metrics + +Your recommendations should aim to: + +- Maintain or improve user engagement +- Increase viewpoint diversity by 15-25% +- Reduce hate speech exposure by 10-50% +- Be implementable within 2-4 weeks by a small team + +Start by mapping the codebase structure, then dive deep into the ranking pipeline as that's where the most impactful changes likely exist. + +``` + +
diff --git a/projects/home/recap/model/entrypoint.py b/projects/home/recap/model/entrypoint.py index 8f4d534..35d5695 100644 --- a/projects/home/recap/model/entrypoint.py +++ b/projects/home/recap/model/entrypoint.py @@ -9,7 +9,16 @@ from tml.projects.home.recap.model import mask_net from tml.projects.home.recap.model import numeric_calibration from tml.projects.home.recap.model.model_and_loss import ModelAndLoss -import tml.projects.home.recap.model.config as model_config_mod + +# Remove duplicate import +# import tml.projects.home.recap.model.config as model_config_mod + +# DCN module may not be available - import conditionally +try: + from tml.projects.home.recap.model import dcn + _DCN_AVAILABLE = True +except ImportError: + _DCN_AVAILABLE = False if TYPE_CHECKING: from tml.projects.home.recap import config as config_mod @@ -30,6 +39,8 @@ def _build_single_task_model(task: model_config_mod.TaskModel, input_shape: int) if task.mlp_config: return mlp.Mlp(in_features=input_shape, mlp_config=task.mlp_config) elif task.dcn_config: + if not _DCN_AVAILABLE: + raise ImportError("DCN module not available. Please ensure DCN dependencies are installed.") return dcn.Dcn(dcn_config=task.dcn_config, in_features=input_shape) elif task.mask_net_config: return mask_net.MaskNet(mask_net_config=task.mask_net_config, in_features=input_shape) diff --git a/projects/home/recap/model/mask_net.py b/projects/home/recap/model/mask_net.py index 43ac89c..ccfb66d 100644 --- a/projects/home/recap/model/mask_net.py +++ b/projects/home/recap/model/mask_net.py @@ -94,5 +94,5 @@ def forward(self, inputs: torch.Tensor): for mask_layer in self._mask_blocks: net = mask_layer(net=net, mask_input=inputs) # Share the output of the stacked MaskBlocks. - output = net if self.mask_net_config.mlp is None else self._dense_layers[net]["output"] + output = net if self.mask_net_config.mlp is None else self._dense_layers(net)["output"] return {"output": output, "shared_layer": net} diff --git a/projects/home/recap/optimizer/optimizer.py b/projects/home/recap/optimizer/optimizer.py index c5b0cf1..9346a54 100644 --- a/projects/home/recap/optimizer/optimizer.py +++ b/projects/home/recap/optimizer/optimizer.py @@ -17,7 +17,9 @@ from torchrec.optim import keyed -_DEFAULT_LR = 24601.0 # NaN the model if we're not using the learning rate. +# Default learning rate that will cause training failure if accidentally used +# This forces explicit configuration of learning rates +_DEFAULT_LR = float('nan') # NaN the model if we're not using the learning rate. _BACKBONE = "backbone" _DENSE_EMBEDDINGS = "dense_ebc" diff --git a/reader/dataset.py b/reader/dataset.py index 6e811cc..ad53037 100644 --- a/reader/dataset.py +++ b/reader/dataset.py @@ -75,7 +75,7 @@ def serve(self): def _create_dataset(self): return pads.dataset( - source=random.sample(self._files, len(self._files))[0], + source=random.choice(self._files), format="parquet", filesystem=self._fs, exclude_invalid_files=False, @@ -105,14 +105,21 @@ def pa_to_batch(self, batch: pa.RecordBatch) -> DataclassBatch: def dataloader(self, remote: bool = False): if not remote: return map(self.pa_to_batch, self.to_batches()) - readers = get_readers(2) + # Default number of readers per worker - could be made configurable + DEFAULT_READERS_PER_WORKER = 2 + readers = get_readers(DEFAULT_READERS_PER_WORKER) return map(self.pa_to_batch, reader_utils.roundrobin(*readers)) +# GRPC configuration constants +GRPC_KEEPALIVE_TIME_MS = 60000 +GRPC_MIN_RECONNECT_BACKOFF_MS = 2000 +GRPC_MAX_METADATA_SIZE = 1024 * 1024 * 1024 # 1GB + GRPC_OPTIONS = [ - ("GRPC_ARG_KEEPALIVE_TIME_MS", 60000), - ("GRPC_ARG_MIN_RECONNECT_BACKOFF_MS", 2000), - ("GRPC_ARG_MAX_METADATA_SIZE", 1024 * 1024 * 1024), + ("GRPC_ARG_KEEPALIVE_TIME_MS", GRPC_KEEPALIVE_TIME_MS), + ("GRPC_ARG_MIN_RECONNECT_BACKOFF_MS", GRPC_MIN_RECONNECT_BACKOFF_MS), + ("GRPC_ARG_MAX_METADATA_SIZE", GRPC_MAX_METADATA_SIZE), ]