diff --git a/.gitignore b/.gitignore
index 7eade253..40ac8ab9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,17 @@
 *.pkl
 *.zip
+*.pth
+*.txt
+*.ckpt
+*.pyc
+*.onnx
+*.data
+*.lock
+*/__pycache__/
+*.DS_Store
+*.idea/
+*.pytest_cache/
+*.ruff_cache/
 data/
 .ipynb_checkpoints
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..9227b116
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "justMyCode": true
+        }
+    ]
+}
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..a6c53728
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,14 @@
+{
+    "python.defaultInterpreterPath": "${workspaceFolder}/pytorch-tutorial/bin/python",
+    "python.terminal.activateEnvironment": true,
+    "python.analysis.typeCheckingMode": "basic",
+    "[python]": {
+        "editor.defaultFormatter": "charliermarsh.ruff",
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+            "source.fixAll": "explicit",
+            "source.organizeImports": "explicit"
+        }
+    }
+}
+
diff --git a/README.md b/README.md
index 59ac3300..6b0b5bb3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
+
+## uv
+```bash
+uv venv --python 3.11      # create a virtual environment with Python 3.11
+source .venv/bin/activate
+uv add xxx                 # add a dependency (replace xxx with the package name)
+uv sync
+```

--------------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..8bf6d2a4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,59 @@
+[project]
+name = "tutorials"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.9"
+authors = [
+    { name = "lihanghang", email = "lihanghang@guazi.com" }  # table format, per the current packaging spec
+]
+dependencies = [
+    "matplotlib>=3.9.4",
+    "onnx>=1.19.1",
+    "onnxruntime>=1.20.1",
+    "onnxscript>=0.5.7",
+    "pandas>=2.3.3",
+    "setuptools>=80.9.0",
+    "tabulate>=0.9.0",
+    "tensordict>=0.10.0",
+    "torchvision>=0.23.0",
+]
+
+[tool.poetry.dependencies]
+aiohttp = "3.12.14"               # async HTTP client/server library
+urllib3 = "2.6.2"                 # HTTP client with connection pooling and thread safety
+orjson = ">=3.9.14,<4.0.0"        # high-performance JSON serialization/deserialization
+uuid = "^1.30"                    # generating and manipulating UUIDs
+torch = "2.8.0"                   # PyTorch core library, the deep learning framework
+contourpy = "1.3.0"               # contour plotting, a matplotlib dependency
+cycler = "0.12.1"                 # cycling style utility, a matplotlib dependency
+fonttools = "4.60.2"              # font file handling, a matplotlib dependency
+kiwisolver = "1.4.7"              # constraint solver, a matplotlib dependency
+pyparsing = "3.3.1"               # string parsing, a matplotlib dependency
+importlib-resources = "6.5.2"     # access resources inside Python packages
+matplotlib = "3.9.4"              # data visualization: charts and figures
+python-dateutil = "2.9.0.post0"   # extended date/time handling
+six = "1.17.0"                    # Python 2/3 compatibility library
+click = "8.1.8"                   # command-line interface toolkit
+joblib = "1.5.3"                  # parallel computing and task scheduling, common in ML
+nltk = "3.9.2"                    # natural language toolkit: corpora and algorithms
+regex = "2025.11.3"               # extended regular expressions beyond the standard re module
+tqdm = "4.67.1"                   # progress bars for loops and tasks
+pycocotools = "2.0.11"            # COCO dataset utilities for detection and segmentation
+argparse = "1.4.0"                # command-line argument parsing
+pandas = "2.3.3"
+pytz = "2025.2"
+tzdata = "2025.3"
+tensordict = "0.10.0"
+cloudpickle = "3.1.2"
+importlib-metadata = "8.7.1"
+pyvers = "0.1.0"
+pillow = "11.3.0"
+setuptools = "80.9.0"
+
+[[tool.poetry.source]]
+name = "huaweicloud"
+url = "https://mirrors.huaweicloud.com/repository/pypi/simple/"
+priority = "primary"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/tutorials/01-basics/feedforward_neural_network/main.py b/tutorials/01-basics/feedforward_neural_network/main.py
index 0c766a7e..0fb48bbf 100644
--- a/tutorials/01-basics/feedforward_neural_network/main.py
+++ b/tutorials/01-basics/feedforward_neural_network/main.py
@@ -2,93 +2,121 @@
 import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
+import ssl

+# Feed-forward neural network
+"""
+A feed-forward network is the network *structure*; backpropagation is the core
+algorithm used to *train* it. The architecture is the network's "skeleton": it
+fixes how neurons are grouped into layers, how layers connect, how many neurons
+each layer has, and which activation functions are used. It determines how
+information flows through the network and underlies the model's ability to fit
+data; backpropagation is the method that shapes that skeleton.
+"""
+ssl._create_default_https_context = ssl._create_unverified_context

-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-# Hyper-parameters
-input_size = 784
-hidden_size = 500
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data',
-                                           train=True,
-                                           transform=transforms.ToTensor(),
-                                           download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data',
-                                          train=False,
-                                          transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=batch_size,
-                                           shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                          batch_size=batch_size,
-                                          shuffle=False)

 # Fully connected neural network with one hidden layer
 class NeuralNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
         super(NeuralNet, self).__init__()
-        self.fc1 = nn.Linear(input_size, hidden_size)
-        self.relu = nn.ReLU()
-        self.fc2 = nn.Linear(hidden_size, num_classes)
-
+        self.fc1 = nn.Linear(input_size, hidden_size)   # input layer -> hidden layer
+        self.relu = nn.ReLU()                           # activation function
+        self.fc2 = nn.Linear(hidden_size, num_classes)  # hidden layer -> output layer
+
     def forward(self, x):
-        out = self.fc1(x)
-        out = self.relu(out)
-        out = self.fc2(out)
+        out = self.fc1(x)     # linear transform
+        out = self.relu(out)  # non-linear activation
+        out = self.fc2(out)   # linear transform
         return out

-model = NeuralNet(input_size, hidden_size, num_classes).to(device)
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
-    for i, (images, labels) in enumerate(train_loader):
-        # Move tensors to the configured device
-        images = images.reshape(-1, 28*28).to(device)
-        labels = labels.to(device)
-
-        # Forward pass
-        outputs = model(images)
-        loss = criterion(outputs, labels)
-
-        # Backward and optimize
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        if (i+1) % 100 == 0:
-            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
-                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
-    correct = 0
-    total = 0
-    for images, labels in test_loader:
-        images = images.reshape(-1, 28*28).to(device)
-        labels = labels.to(device)
-        outputs = model(images)
-        _, predicted = torch.max(outputs.data, 1)
-        total += labels.size(0)
-        correct += (predicted == labels).sum().item()
-
-    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+"""
+FNN compared with the models covered earlier:
+Model                 Structure                      Capacity                            Use cases
+Linear regression     simple (single layer)          linear relationships only           simple regression
+Logistic regression   simple (layer + activation)    linearly separable classification   simple classification
+Feed-forward network  complex (multi-layer + act.)   complex non-linear relationships    complex classification/regression
+"""
+"""
+What an FNN is for: learning a complex mapping from inputs to outputs, in two main kinds of task:
+
+Classification: assigning inputs to discrete classes (the MNIST digits in this script)
+Regression: predicting continuous values (house prices, stock prices)
+Its power comes from stacking layers with non-linear activations, which lets it
+approximate almost any function (the universal approximation theorem).
+"""
+if __name__ == '__main__':
+
+    # Device configuration
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # Hyper-parameters
+    input_size = 784
+    hidden_size = 500
+    num_classes = 10
+    num_epochs = 5
+    batch_size = 100
+    learning_rate = 0.001
+
+    # MNIST dataset
+    train_dataset = torchvision.datasets.MNIST(root='../../data',
+                                               train=True,
+                                               transform=transforms.ToTensor(),
+                                               download=True)
+
+    test_dataset = torchvision.datasets.MNIST(root='../../data',
+                                              train=False,
+                                              transform=transforms.ToTensor())
+
+    # Data loader
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,
+                                               shuffle=True)
+
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False)
+
+    model = NeuralNet(input_size, hidden_size, num_classes).to(device)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    total_step = len(train_loader)
+    for epoch in range(num_epochs):
+        for i, (images, labels) in enumerate(train_loader):
+            # Move tensors to the configured device
+            images = images.reshape(-1, 28 * 28).to(device)
+            labels = labels.to(device)
+
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if (i + 1) % 100 == 0:
+                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+    # Test the model
+    # In test phase, we don't need to compute gradients (for memory efficiency)
+    with torch.no_grad():
+        correct = 0
+        total = 0
+        for images, labels in test_loader:
+            images = images.reshape(-1, 28 * 28).to(device)
+            labels = labels.to(device)
+            outputs = model(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+
+        print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
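Note: a minimal sketch of how the saved checkpoint could be reloaded for inference (assumes the same `NeuralNet` class and the `model.ckpt` file produced by the script above; the dummy input and CPU mapping are illustrative):

```python
import torch

# Rebuild the architecture, then load only the parameters (state dict).
model = NeuralNet(input_size=784, hidden_size=500, num_classes=10)
model.load_state_dict(torch.load('model.ckpt', map_location='cpu'))
model.eval()  # inference mode

with torch.no_grad():
    x = torch.randn(1, 784)      # one flattened 28x28 image (dummy input)
    logits = model(x)
    pred = logits.argmax(dim=1)  # predicted class index
```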
diff --git a/tutorials/01-basics/life/README.md b/tutorials/01-basics/life/README.md
new file mode 100644
index 00000000..3942dda3
--- /dev/null
+++ b/tutorials/01-basics/life/README.md
@@ -0,0 +1,193 @@
+## 1. Core goal of a recommender system
+Match users with content or products they are likely to be interested in, while helping the platform:
+* improve user retention and engagement
+* increase spending and watch time
+* make better use of platform resources
+* maximize business value
+
+## 2. Basic pipeline
+1. Data collection
+   * behavior data: views, clicks, purchases, ratings, favorites, shares
+   * item features: category, price, brand, description; a video's tags, duration, creator
+   * user features: age, gender, region, preferences
+2. Preprocessing
+   * clean dirty data (outliers, missing values)
+   * feature engineering (extraction, encoding, normalization)
+   * build the user-item interaction matrix
+3. Model training
+   * learn user-preference patterns with machine learning or deep learning
+   * common algorithms: collaborative filtering, content-based recommendation, deep recommendation models
+4. Recommendation generation
+   * predict each user's interest in candidate items
+   * rank by predicted interest and produce the recommendation list
+5. Online serving and evaluation
+   * show recommendations to users in real time
+   * collect feedback and keep improving the model
+
+## 3. Main algorithms
+### 3.1 Collaborative filtering (CF)
+The classic approach, built on the assumption that **similar users like similar items**:
+
+(1) User-based CF
+* idea: find users whose tastes resemble the target user's, and recommend what those neighbors like
+* steps: compute user-user similarity (cosine similarity, Pearson correlation); pick the K most similar neighbors; recommend items they like that the target user has not seen
+
+(2) Item-based CF
+* idea: find items similar to those the target user already likes, and recommend them
+* steps: compute item-item similarity (cosine, adjusted cosine); for each liked item, find the K most similar items; recommend those (see the sketch after this section)
+* traits: low computational cost, stable results; common on e-commerce platforms (e.g. Amazon)
+
+### 3.2 Content-based recommendation
+* idea: recommend items whose features resemble those of items the user liked before
+* steps: extract item feature vectors (category, brand, price; a video's tags, style); learn the user's preference weights from their history; rank items by how well they match those preferences
+* traits: independent of other users' data, so it handles cold start; common for video/music (e.g. Spotify)
+
+### 3.3 Hybrid recommendation
+Combine several algorithms to get the strengths of each. Common schemes:
+* weighted: fuse the scores of different algorithms
+* feature combination: feed features from several algorithms into one model
+* cascade: one algorithm generates candidates, another re-ranks them
+* switching: pick the algorithm per scenario or user segment
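To make the item-based CF steps above concrete, here is a minimal sketch (the toy rating matrix and variable names are made up for the example):

```python
import numpy as np

# Toy user-item rating matrix: rows = users, columns = items, 0 = unrated.
R = np.array([[5, 3, 0, 1],
              [4, 0, 0, 1],
              [1, 1, 0, 5],
              [0, 0, 5, 4]], dtype=float)

# Item-item cosine similarity (columns are item rating vectors).
norms = np.linalg.norm(R, axis=0, keepdims=True)
sim = (R.T @ R) / (norms.T @ norms + 1e-9)

# Predict user 0's interest as a similarity-weighted average of their ratings.
user = R[0]
scores = sim @ user / (np.abs(sim).sum(axis=1) + 1e-9)
scores[user > 0] = -np.inf  # mask items the user already rated
print("recommend item:", int(scores.argmax()))
```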
+### 3.4 Deep learning in recommender systems
+Modern recommenders increasingly use deep neural networks:
+
+(1) Matrix factorization (MF, SVD++)
+* map users and items into a low-dimensional latent vector space
+* predict a user's rating of an item via the inner product of the two vectors
+
+(2) Neural recommendation models
+* DNN (deep neural network): learns complex user-item interaction patterns
+* CNN (convolutional network): extracts local features from item text or images
+* RNN/LSTM (recurrent networks): capture the temporal order of user behavior sequences
+* attention mechanisms: identify the key items or features in a user's history
+
+(3) Classic deep recommendation models
+* Wide & Deep: combines memorization (wide part) with generalization (deep part)
+* DeepFM: fuses factorization machines (FM) with a DNN to learn high-order feature interactions automatically
+* DIN (Deep Interest Network): uses attention to capture dynamic user interest
+* BERT4Rec: models user behavior sequences with a Transformer
+
+## 4. E-commerce platforms vs. video sites
+The core principles are the same, but business scenarios and user behavior differ:
+
+| Dimension | E-commerce (products) | Video site (content) |
+| --- | --- | --- |
+| User goal | explicit (buy a specific product) | vague (entertainment, killing time) |
+| Decision cycle | long (comparison, deliberation) | short (seconds to decide whether to watch) |
+| Item traits | many categories, large price spread | uniform format, varied duration |
+| Key features | price, brand, sales, reviews | content quality, creator, freshness, tags |
+| Optimization focus | conversion, order value, repurchase | CTR, watch time, completion rate |
+| Latency requirements | moderate (purchase decisions are slow) | very high (interest shifts fast) |
+| Cold-start challenges | new users, new products | new users, new content, new creators |
+
+## 5. Evaluation metrics
+* accuracy metrics: precision (share of recommended items the user actually likes), recall (share of liked items that get recommended), F1 (harmonic mean of the two)
+* ranking metrics: NDCG (Normalized Discounted Cumulative Gain), MAP (Mean Average Precision)
+* business metrics: CTR, CVR, average watch time, retention, revenue/GMV
+
+## 6. Challenges
+* cold start: new users and new items lack interaction data
+* data sparsity: the user-item matrix is very sparse; most users interact with few items
+* real-time constraints: respond to shifting interests under massive concurrency
+* fairness and diversity: avoid filter bubbles (recommending only what the user already likes); keep results diverse and fair
+* privacy: exploit behavioral data while protecting users and complying with regulations (GDPR, personal-data protection laws)
+
+## 7. Trends
+* multimodal recommendation: fuse text, image, audio and video signals
+* federated learning: train cross-platform models without exposing raw user data
+* reinforcement learning: optimize the recommendation policy through interaction with the environment
+* causal recommendation: move from correlation to causation for better explainability
+* LLMs + recommendation: use pretrained large models for richer semantic understanding
+
+
+
+## Core technology: face recognition
+
+#### 1. Phone unlock and face payment both rely on face recognition. The basic pipeline:
+* image capture: the front camera captures the user's face
+* face detection: locate and crop the face region in the image
+* feature extraction: turn the face image into a feature vector the computer can compare
+* feature matching: compare the extracted features against the enrolled user's features
+* decision: accept or reject based on the match score and a threshold
+
+## 2. Deep learning in face recognition
+Modern phones almost all use deep learning, especially convolutional neural networks (CNNs). These are close relatives of the feed-forward networks covered in this tutorial, just with more structure:
+
+1. Face detection
+* dedicated detection networks (MTCNN, RetinaFace, ...)
+* accurate even under difficult lighting, pose and occlusion
+* usually multi-stage: candidate boxes -> box regression -> facial landmark localization
+2. Feature extraction
+* deep CNNs (FaceNet, ArcFace, SphereFace, ...) trained on large face datasets learn discriminative features
+* key property: embeddings of the same face under different conditions stay close; embeddings of different faces stay far apart
+3. Feature matching and verification
+* metric learning maps faces into a high-dimensional feature space
+* compare embeddings with a similarity measure (Euclidean distance, cosine similarity), as sketched below
+* a threshold decides whether two faces belong to the same person
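A minimal sketch of that matching step, assuming `emb_a` and `emb_b` are embeddings produced by some face encoder; the 0.6 threshold is purely illustrative:

```python
import torch
import torch.nn.functional as F

def same_person(emb_a: torch.Tensor, emb_b: torch.Tensor, threshold: float = 0.6) -> bool:
    """Compare two face embeddings with cosine similarity."""
    sim = F.cosine_similarity(emb_a.unsqueeze(0), emb_b.unsqueeze(0)).item()
    return sim >= threshold

# Dummy 128-d embeddings standing in for encoder outputs.
a, b = torch.randn(128), torch.randn(128)
print(same_person(a, b))
```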
+
+## 3. Smartphone unlock vs. face payment
+Similar principles, very different security requirements:
+
+| Dimension | Phone unlock | Face payment |
+| --- | --- | --- |
+| Security level | medium (prevent casual unauthorized use) | very high (money at stake) |
+| Liveness detection | basic (blink, head turn) | advanced (3D structured light, infrared imaging) |
+| Anti-spoofing | limited (may be fooled by a high-resolution photo) | strong (resists photos, videos, 3D masks) |
+| Hardware | ordinary camera | dedicated sensors (3D structured light, ToF) |
+| Verification threshold | lenient (favors user experience) | strict (favors security) |
+
+## 4. Key security technique: liveness detection
+To defeat photo and video spoofing, face-payment systems use liveness detection:
+
+* 2D liveness: analyze expression changes (blinking, mouth opening) and texture cues
+* 3D structured light: project an infrared dot pattern to recover the face's 3D structure (e.g. Face ID on iPhone)
+* infrared imaging: an IR camera distinguishes a real face from a photo or screen
+* multispectral imaging: combine visible and infrared light for higher accuracy
+
+## 5. Connection to the PyTorch material
+A face recognition system is far more complex than the feed-forward network in this tutorial, but the core principles are the same:
+
+* network structure: layered architectures with non-linear activations (e.g. ReLU)
+* training: large labeled datasets, backpropagation, and an optimizer such as Adam
+* loss functions: face recognition uses specialized losses (Triplet Loss, ArcFace Loss), but the idea is still to minimize the gap between predictions and targets
+* evaluation: accuracy, recall and other metrics measured on a test set
\ No newline at end of file
diff --git a/tutorials/01-basics/linear_regression/main.py b/tutorials/01-basics/linear_regression/main.py
index b3715d99..15d513f2 100644
--- a/tutorials/01-basics/linear_regression/main.py
+++ b/tutorials/01-basics/linear_regression/main.py
@@ -3,53 +3,73 @@
 import numpy as np
 import matplotlib.pyplot as plt

+# Linear regression
-# Hyper-parameters
-input_size = 1
-output_size = 1
-num_epochs = 60
-learning_rate = 0.001
-
-# Toy dataset
-x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
-                    [9.779], [6.182], [7.59], [2.167], [7.042],
-                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
-
-y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
-                    [3.366], [2.596], [2.53], [1.221], [2.827],
-                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
-
-# Linear regression model
-model = nn.Linear(input_size, output_size)
-
-# Loss and optimizer
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-
-# Train the model
-for epoch in range(num_epochs):
-    # Convert numpy arrays to torch tensors
-    inputs = torch.from_numpy(x_train)
-    targets = torch.from_numpy(y_train)
-
-    # Forward pass
-    outputs = model(inputs)
-    loss = criterion(outputs, targets)
-
-    # Backward and optimize
-    optimizer.zero_grad()
-    loss.backward()
-    optimizer.step()
-
-    if (epoch+1) % 5 == 0:
-        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
-
-# Plot the graph
-predicted = model(torch.from_numpy(x_train)).detach().numpy()
-plt.plot(x_train, y_train, 'ro', label='Original data')
-plt.plot(x_train, predicted, label='Fitted line')
-plt.legend()
-plt.show()
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+if __name__ == '__main__':
+    # Hyper-parameters
+    input_size = 1
+    output_size = 1
+    num_epochs = 60
+    learning_rate = 0.001
+
+    # Toy dataset
+    x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
+                        [9.779], [6.182], [7.59], [2.167], [7.042],
+                        [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
+
+    y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
+                        [3.366], [2.596], [2.53], [1.221], [2.827],
+                        [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
+
+    # Linear regression model
+    # nn.Linear is a single linear (fully connected / affine) layer;
+    # PyTorch initializes the weight w and bias b automatically.
+    model = nn.Linear(input_size, output_size)
+
+    # Loss and optimizer
+    # The loss function measures the gap between predictions and targets;
+    # here we use mean squared error (MSE).
+    criterion = nn.MSELoss()
+    # The optimizer updates the model parameters (w and b). A learning rate
+    # that is too large makes training unstable; too small makes it slow.
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    for epoch in range(num_epochs):
+        # Convert numpy arrays to torch tensors
+        inputs = torch.from_numpy(x_train)
+        targets = torch.from_numpy(y_train)
+
+        # Forward pass
+        outputs = model(inputs)
+        loss = criterion(outputs, targets)
+
+        # Backward and optimize
+        optimizer.zero_grad()
+        # Backpropagation: compute the gradient of the loss
+        # with respect to the model parameters.
+        loss.backward()
+        # Update the parameters using the computed gradients.
+        optimizer.step()
+
+        if (epoch + 1) % 5 == 0:
+            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
+    """
+    Training used x_train (inputs) and y_train (targets). After training we
+    still want to see how well the model fits the training data: predict on
+    the same x_train to get `predicted`, then plot it against the true
+    y_train to visualize the learned linear relationship.
+    """
+    # .detach() removes the tensor from the computation graph.
+    # Plot the graph
+    predicted = model(
+        torch.from_numpy(x_train)).detach().numpy()  # convert back to NumPy, since matplotlib expects NumPy arrays
+    plt.plot(x_train, y_train, 'ro', label='Original data')
+    plt.plot(x_train, predicted, label='Fitted line')
+    plt.legend()
+    plt.show()
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
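As a sanity check on what SGD converges toward, the same toy problem can be solved in closed form with the least-squares normal equation (a sketch; `x_train` and `y_train` are the arrays from the script above):

```python
import numpy as np

# Design matrix with a bias column of ones: X @ [w, b]^T ≈ y.
X = np.hstack([x_train, np.ones_like(x_train)])
w, b = np.linalg.lstsq(X, y_train, rcond=None)[0].ravel()
print(f"closed-form fit: y = {w:.4f} * x + {b:.4f}")
```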
diff --git a/tutorials/01-basics/logistic_regression/main.py b/tutorials/01-basics/logistic_regression/main.py
index c7eb378b..ea3c4b29 100644
--- a/tutorials/01-basics/logistic_regression/main.py
+++ b/tutorials/01-basics/logistic_regression/main.py
@@ -2,75 +2,109 @@
 import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
+import ssl

+# Logistic regression
+ssl._create_default_https_context = ssl._create_unverified_context
-# Hyper-parameters
-input_size = 28 * 28    # 784
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset (images and labels)
-train_dataset = torchvision.datasets.MNIST(root='../../data',
-                                           train=True,
-                                           transform=transforms.ToTensor(),
-                                           download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data',
-                                          train=False,
-                                          transform=transforms.ToTensor())
-
-# Data loader (input pipeline)
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=batch_size,
-                                           shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                          batch_size=batch_size,
-                                          shuffle=False)
-
-# Logistic regression model
-model = nn.Linear(input_size, num_classes)
-
-# Loss and optimizer
-# nn.CrossEntropyLoss() computes softmax internally
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
-    for i, (images, labels) in enumerate(train_loader):
-        # Reshape images to (batch_size, input_size)
-        images = images.reshape(-1, input_size)
-
-        # Forward pass
-        outputs = model(images)
-        loss = criterion(outputs, labels)
-
-        # Backward and optimize
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        if (i+1) % 100 == 0:
-            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
-                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
-    correct = 0
-    total = 0
-    for images, labels in test_loader:
-        images = images.reshape(-1, input_size)
-        outputs = model(images)
-        _, predicted = torch.max(outputs.data, 1)
-        total += labels.size(0)
-        correct += (predicted == labels).sum()
-
-    print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
+if __name__ == '__main__':
+    # Hyper-parameters
+    input_size = 28 * 28  # 784
+    num_classes = 10
+    num_epochs = 5
+    batch_size = 100
+    learning_rate = 0.001
+
+    # MNIST dataset (images and labels)
+    train_dataset = torchvision.datasets.MNIST(root='../../data',
+                                               train=True,
+                                               transform=transforms.ToTensor(),
+                                               download=True)
+
+    test_dataset = torchvision.datasets.MNIST(root='../../data',
+                                              train=False,
+                                              transform=transforms.ToTensor())
+
+    # Data loader (input pipeline)
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,
+                                               shuffle=True)
+
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False)
+
+    # Logistic regression model
+    # Logistic regression (multi-class): outputs one score per class.
+    # Logistic regression: output dimension = number of classes (10 digits here).
+    # Linear regression: output dimension = 1 (a single continuous value).
+    model = nn.Linear(input_size, num_classes)
+
+    # Loss and optimizer
+    # nn.CrossEntropyLoss() computes softmax internally
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    total_step = len(train_loader)
+    for epoch in range(num_epochs):
+        for i, (images, labels) in enumerate(train_loader):
+            # Reshape images to (batch_size, input_size)
+            images = images.reshape(-1, input_size)
+            # CrossEntropyLoss already includes the softmax activation.
+            # Forward pass
+            outputs = model(images)  # outputs are raw scores (logits)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if (i + 1) % 100 == 0:
+                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+    """
+    The core differences between linear and logistic regression:
+
+    Task: regression vs. classification
+    Output handling: raw value vs. mapping through an activation function
+    Loss: MSE vs. cross-entropy
+    """
+    # Test the model
+    # In test phase, we don't need to compute gradients (for memory efficiency)
+    with torch.no_grad():
+        correct = 0
+        total = 0
+        for images, labels in test_loader:
+            images = images.reshape(-1, input_size)
+            outputs = model(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum()
+
+        print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
+
+    """
+    MSE vs. cross-entropy at a glance:
+
+    Definition:
+        MSE:            $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
+        CE (binary):    $CE = -\frac{1}{n}\sum_{i=1}^{n}[y_i\log\hat{y}_i + (1-y_i)\log(1-\hat{y}_i)]$
+        CE (multi):     $CE = -\frac{1}{n}\sum_{i=1}^{n}\sum_{c=1}^{C}y_{ic}\log\hat{y}_{ic}$
+    Task:
+        MSE: regression (continuous values: prices, temperatures)
+        CE:  classification (discrete classes: image or text labels)
+    Output range:
+        MSE: predictions can be any real number in (-inf, +inf)
+        CE:  predictions must form a probability distribution in (0, 1)
+    Activation pairing:
+        MSE: usually none (linear output), sometimes sigmoid/tanh to bound the range
+        CE:  sigmoid for binary, softmax for multi-class
+             (PyTorch's CrossEntropyLoss applies softmax internally)
+    Gradient behavior:
+        MSE: gradient proportional to the error (y - y_hat); large errors can destabilize training
+        CE:  gradient tied to the probability gap; more stable, well suited to classification
+
+    Scenario                                   Loss                    Code
+    continuous targets (price, temperature)    MSE                     criterion = nn.MSELoss()
+    binary classification (spam detection)     binary cross-entropy    criterion = nn.BCELoss()
+    multi-class (MNIST digit recognition)      cross-entropy           criterion = nn.CrossEntropyLoss()
+
+    In short: MSE for regression, cross-entropy for classification -- one of the
+    "golden rules" of deep learning.
+    """
\ No newline at end of file
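Since the comments above stress that `nn.CrossEntropyLoss` applies softmax internally, a quick sketch verifying the equivalence on random logits (shapes chosen arbitrarily):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)           # batch of 4, 10 classes
target = torch.randint(0, 10, (4,))

ce = torch.nn.CrossEntropyLoss()(logits, target)
nll = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(ce, nll))        # True: CE == log-softmax + NLL
```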
+ """ \ No newline at end of file diff --git a/tutorials/01-basics/pytorch_basics/main.py b/tutorials/01-basics/pytorch_basics/main.py index 744400c2..16dbf19d 100644 --- a/tutorials/01-basics/pytorch_basics/main.py +++ b/tutorials/01-basics/pytorch_basics/main.py @@ -1,10 +1,27 @@ -import torch -import torchvision -import torch.nn as nn +import ssl + import numpy as np +import torch +import torch.nn as nn +import torchvision import torchvision.transforms as transforms +class CustomDataset(torch.utils.data.Dataset): + def __init__(self): + # 初始化一些示例数据 + self.data = torch.randn(100, 3) # 100个样本,每个样本3个特征 + self.labels = torch.randint(0, 2, (100,)) # 100个标签,0或1 + + def __getitem__(self, index): + # 返回数据对 (特征, 标签) + return self.data[index], self.labels[index] + + def __len__(self): + # 返回数据集大小 + return len(self.data) + + # ================================================================== # # Table of Contents # # ================================================================== # @@ -15,175 +32,264 @@ # 4. Input pipline (Line 104 to 129) # 5. Input pipline for custom dataset (Line 136 to 156) # 6. Pretrained model (Line 163 to 176) -# 7. Save and load model (Line 183 to 189) +# 7. Save and load model (Line 183 to 189) +# (CNN)的基本原理 是什么? # ================================================================== # # 1. Basic autograd example 1 # # ================================================================== # -# Create tensors. -x = torch.tensor(1., requires_grad=True) -w = torch.tensor(2., requires_grad=True) -b = torch.tensor(3., requires_grad=True) - -# Build a computational graph. -y = w * x + b # y = 2 * x + 3 - -# Compute gradients. -y.backward() - -# Print out the gradients. -print(x.grad) # x.grad = 2 -print(w.grad) # w.grad = 1 -print(b.grad) # b.grad = 1 - - -# ================================================================== # -# 2. Basic autograd example 2 # -# ================================================================== # - -# Create tensors of shape (10, 3) and (10, 2). -x = torch.randn(10, 3) -y = torch.randn(10, 2) - -# Build a fully connected layer. -linear = nn.Linear(3, 2) -print ('w: ', linear.weight) -print ('b: ', linear.bias) - -# Build loss function and optimizer. -criterion = nn.MSELoss() -optimizer = torch.optim.SGD(linear.parameters(), lr=0.01) - -# Forward pass. -pred = linear(x) - -# Compute loss. -loss = criterion(pred, y) -print('loss: ', loss.item()) - -# Backward pass. -loss.backward() - -# Print out the gradients. -print ('dL/dw: ', linear.weight.grad) -print ('dL/db: ', linear.bias.grad) - -# 1-step gradient descent. -optimizer.step() - -# You can also perform gradient descent at the low level. -# linear.weight.data.sub_(0.01 * linear.weight.grad.data) -# linear.bias.data.sub_(0.01 * linear.bias.grad.data) - -# Print out the loss after 1-step gradient descent. -pred = linear(x) -loss = criterion(pred, y) -print('loss after 1 step optimization: ', loss.item()) - - -# ================================================================== # -# 3. Loading data from numpy # -# ================================================================== # - -# Create a numpy array. -x = np.array([[1, 2], [3, 4]]) - -# Convert the numpy array to a torch tensor. -y = torch.from_numpy(x) - -# Convert the torch tensor to a numpy array. -z = y.numpy() - - -# ================================================================== # -# 4. Input pipeline # -# ================================================================== # - -# Download and construct CIFAR-10 dataset. 

    # ================================================================== #
    #                     2. Basic autograd example 2                    #
    # ================================================================== #

-# Create tensors of shape (10, 3) and (10, 2).
-x = torch.randn(10, 3)
-y = torch.randn(10, 2)
-
-# Build a fully connected layer.
-linear = nn.Linear(3, 2)
-print ('w: ', linear.weight)
-print ('b: ', linear.bias)
-
-# Build loss function and optimizer.
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
-
-# Forward pass.
-pred = linear(x)
-
-# Compute loss.
-loss = criterion(pred, y)
-print('loss: ', loss.item())
-
-# Backward pass.
-loss.backward()
-
-# Print out the gradients.
-print ('dL/dw: ', linear.weight.grad)
-print ('dL/db: ', linear.bias.grad)
-
-# 1-step gradient descent.
-optimizer.step()
-
-# You can also perform gradient descent at the low level.
-# linear.weight.data.sub_(0.01 * linear.weight.grad.data)
-# linear.bias.data.sub_(0.01 * linear.bias.grad.data)
-
-# Print out the loss after 1-step gradient descent.
-pred = linear(x)
-loss = criterion(pred, y)
-print('loss after 1 step optimization: ', loss.item())
-
+    # Create tensors of shape (10, 3) and (10, 2).
+    """
+    torch.randn(10, 3): a (10, 3) tensor sampled from a standard normal distribution.
+        10 is the batch size (10 samples processed at once); 3 features per sample.
+    torch.randn(10, 2): a (10, 2) tensor used as the model's target output.
+        The same batch size of 10; each sample has 2 expected output values.
+    """
+    x = torch.randn(10, 3)
+    y = torch.randn(10, 2)
+    print('x: ', x)
+    print('y: ', y)
+    # Build a fully connected layer.
+    """
+    The first argument, 3, is the input feature dimension;
+    the second, 2, is the output feature dimension.
+    The linear layer initializes its weight and bias automatically.
+    """
+    linear = nn.Linear(3, 2)
+    print('w: ', linear.weight)
+    print('b: ', linear.bias)
+
+    # Build loss function and optimizer.
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
+
+    # Forward pass.
+    pred = linear(x)
+
+    # Compute loss.
+    loss = criterion(pred, y)
+    print('loss: ', loss.item())
+
+    # Backward pass.
+    loss.backward()
+
+    # Print out the gradients.
+    print('dL/dw: ', linear.weight.grad)
+    print('dL/db: ', linear.bias.grad)
+
+    # 1-step gradient descent.
+    optimizer.step()
+
+    # You can also perform gradient descent at the low level.
+    # linear.weight.data.sub_(0.01 * linear.weight.grad.data)
+    # linear.bias.data.sub_(0.01 * linear.bias.grad.data)
+
+    # Print out the loss after 1-step gradient descent.
+    pred = linear(x)
+    loss = criterion(pred, y)
+    print('loss after 1 step optimization: ', loss.item())
+    """
+    Why this code matters -- these are the basic steps of building a network:
+    prepare input and target data; define the model (here a single linear
+    layer); inspect and understand its parameters.
+    In practice the next additions are: a loss function (MSE, cross-entropy, ...),
+    an optimizer (SGD, Adam, ...), and the loop of forward pass, loss
+    computation, backward pass and parameter update.
+    These are the core PyTorch concepts underlying every larger model.
+    """

    # ================================================================== #
    #                     3. Loading data from numpy                     #
    # ================================================================== #

-# Create a numpy array.
-x = np.array([[1, 2], [3, 4]])
-
-# Convert the numpy array to a torch tensor.
-y = torch.from_numpy(x)
-
-# Convert the torch tensor to a numpy array.
-z = y.numpy()
-
+    # Create a numpy array.
+    x = np.array([[1, 2], [3, 4]])
+
+    # Convert the numpy array to a torch tensor.
+    y = torch.from_numpy(x)
+
+    # Convert the torch tensor to a numpy array.
+    z = y.numpy()
+    # x == z: [[True, True], [True, True]]
+    print(f"x==z: {x == z}")
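A caveat worth knowing here (my addition, easy to verify): `torch.from_numpy` shares memory with the source array, so in-place changes propagate both ways:

```python
import numpy as np
import torch

a = np.array([1.0, 2.0])
t = torch.from_numpy(a)  # t shares a's memory; no copy is made
a[0] = 99.0
print(t)                 # tensor([99., 2.], dtype=torch.float64) -- the tensor sees the change
```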

    # ================================================================== #
    #                         4. Input pipeline                          #
    # ================================================================== #

-# Download and construct CIFAR-10 dataset.
-train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
-                                             train=True,
-                                             transform=transforms.ToTensor(),
-                                             download=True)
-
-# Fetch one data pair (read data from disk).
-image, label = train_dataset[0]
-print (image.size())
-print (label)
-
-# Data loader (this provides queues and threads in a very simple way).
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=64,
-                                           shuffle=True)
-
-# When iteration starts, queue and thread start to load data from files.
-data_iter = iter(train_loader)
-
-# Mini-batch images and labels.
-images, labels = data_iter.next()
-
-# Actual usage of the data loader is as below.
-for images, labels in train_loader:
-    # Training code should be written here.
-    pass
-
+    # Download and construct CIFAR-10 dataset.
+    print("""
+    # CIFAR-10 is a classic image-classification dataset:
+    # 60000 32x32 color images
+    # 10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck
+    # 6000 images per class
+    # training set: 50000 images
+    # test set: 10000 images
+    # Preprocessing: ToTensor scales images to the [0, 1] range;
+    # a further transforms.Normalize step (e.g. mean/std 0.5 per channel)
+    # is common but not applied here.
+    """)
+    train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                 train=True,
+                                                 transform=transforms.ToTensor(),
+                                                 download=True)
+
+    # Fetch one data pair (read data from disk).
+    image, label = train_dataset[0]
+    print(f"image.size(): {image.size()}")
+    print(f"label: {label}")
+
+    # Data loader (this provides queues and threads in a very simple way).
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=64,
+                                               shuffle=True)
+
+    # When iteration starts, queue and thread start to load data from files.
+    data_iter = iter(train_loader)
+
+    # Mini-batch images and labels.
+    images, labels = next(data_iter)
+    print(f"images.size(): {images.size()}")
+    print(f"labels: {labels}")
+    # Actual usage of the data loader is as below.
+    for images, labels in train_loader:
+        # Training code should be written here.
+        pass

    # ================================================================== #
    #                5. Input pipeline for custom dataset                #
    # ================================================================== #

-# You should build your custom dataset as below.
-class CustomDataset(torch.utils.data.Dataset):
-    def __init__(self):
-        # TODO
-        # 1. Initialize file paths or a list of file names.
-        pass
-    def __getitem__(self, index):
-        # TODO
-        # 1. Read one data from file (e.g. using numpy.fromfile, PIL.Image.open).
-        # 2. Preprocess the data (e.g. torchvision.Transform).
-        # 3. Return a data pair (e.g. image and label).
-        pass
-    def __len__(self):
-        # You should change 0 to the total size of your dataset.
-        return 0
-
+    # You should build your custom dataset as below
+    # (see the CustomDataset class at the top of this file).
+
+    # You can then use the prebuilt data loader.
+    custom_dataset = CustomDataset()
+    train_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
+                                               batch_size=64,
+                                               shuffle=True)
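A tiny usage sketch (my addition) showing that the custom dataset actually feeds batches; shapes follow the `CustomDataset` defined at the top of this file:

```python
# Pull one batch from the custom-dataset loader built above.
features, labels = next(iter(train_loader))
print(features.shape)  # torch.Size([64, 3]) -- batch of 64, 3 features each
print(labels.shape)    # torch.Size([64])
```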

    # ================================================================== #
    #                        6. Pretrained model                         #
    # ================================================================== #

-# Download and load the pretrained ResNet-18.
-resnet = torchvision.models.resnet18(pretrained=True)
-
-# If you want to finetune only the top layer of the model, set as below.
-for param in resnet.parameters():
-    param.requires_grad = False
-
-# Replace the top layer for finetuning.
-resnet.fc = nn.Linear(resnet.fc.in_features, 100)  # 100 is an example.
-
-# Forward pass.
-images = torch.randn(64, 3, 224, 224)
-outputs = resnet(images)
-print (outputs.size())     # (64, 100)
-
-
-# ================================================================== #
-#                      7. Save and load the model                    #
-# ================================================================== #
-
-# Save and load the entire model.
-torch.save(resnet, 'model.ckpt')
-model = torch.load('model.ckpt')
-
-# Save and load only the model parameters (recommended).
-torch.save(resnet.state_dict(), 'params.ckpt')
-resnet.load_state_dict(torch.load('params.ckpt'))
+    """
+    torchvision.models: the model module of torchvision, with many classic
+        predefined computer-vision models (ResNet, VGG, AlexNet, ...).
+    resnet18: the 18-layer variant of the ResNet (Residual Network) family.
+    pretrained=True: the key argument -- load weights pretrained on ImageNet.
+    """
+    """
+    A pretrained model is one that has already been trained on a large dataset
+    (here ImageNet: ~14 million images, 1000 classes). Its weights encode
+    general visual features (edges, textures, shapes), so it can be used
+    directly for inference or as the starting point for transfer learning.
+    """
+    # Download and load the pretrained ResNet-18.
+    resnet = torchvision.models.resnet18(pretrained=True)
+    # The classic recipe for model fine-tuning:
+    # If you want to finetune only the top layer of the model, set as below.
+    """
+    Effect: set requires_grad to False on all of ResNet-18's original
+    parameters, so the backward pass computes no gradients for them and they
+    are never updated. Rationale: the lower (convolutional) layers already
+    encode general visual features (edges, textures, shapes) that transfer to
+    most image tasks, so they need not be relearned.
+    """
+    for param in resnet.parameters():
+        param.requires_grad = False
+
+    # Replace the top layer for finetuning.
+    """
+    Effect: replace ResNet-18's fully connected (fc) layer with a new one that
+    outputs 100 classes instead of the original 1000 (ImageNet's class count).
+    Rationale: the fc layer maps features to the class space, so swapping it
+    adapts the model to a different classification task.
+    """
+    resnet.fc = nn.Linear(resnet.fc.in_features, 100)  # 100 is an example.
+
+    # Forward pass.
+    images = torch.randn(64, 3, 224, 224)
+    outputs = resnet(images)
+    print(outputs.size())  # (64, 100)
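A practical follow-on (my sketch, not in the original): with the backbone frozen, the optimizer would typically be handed only the parameters that still require gradients:

```python
# After the freeze, only the new fc layer has requires_grad=True,
# so the optimizer can be restricted to exactly those parameters.
trainable = [p for p in resnet.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(trainable, lr=0.001, momentum=0.9)
```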

    # ================================================================== #
    #                      7. Save and load the model                    #
    # ================================================================== #
+    """
+    torch.save(resnet, 'model.ckpt') saves the complete model object:
+    the whole Python object is serialized via pickle, including
+    - the network structure (ResNet-18's conv, pooling and fully connected layers)
+    - all parameters (weights and biases)
+    - the optimizer state, if the model holds one
+    - other Python objects attached to the model (class definition, imports)
+    """
+    # Save and load the entire model.
+    torch.save(resnet, 'model.ckpt')
+    model = torch.load('model.ckpt', weights_only=False)
+    """
+    state_dict() saves only the parameter state dictionary:
+    - a dict mapping each learnable parameter's name to its value
+    - no network structure information
+    - no Python class definitions or dependencies
+    In essence: the model's "weights" without its "skeleton".
+    """
+    # Save and load only the model parameters (recommended).
+    torch.save(resnet.state_dict(), 'params.ckpt')
+    resnet.load_state_dict(torch.load('params.ckpt'))
diff --git a/tutorials/01-mine/FashionMNIST.py b/tutorials/01-mine/FashionMNIST.py
new file mode 100644
index 00000000..fe897f3e
--- /dev/null
+++ b/tutorials/01-mine/FashionMNIST.py
@@ -0,0 +1,149 @@
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+
+# Device configuration: prefer the GPU, otherwise the CPU;
+# Apple M-series chips can use MPS.
+# if torch.backends.mps.is_available():
+#     device = torch.device('mps')   # Apple Silicon (M-series) acceleration
+# elif torch.cuda.is_available():
+#     device = torch.device('cuda')  # NVIDIA GPU acceleration
+# else:
+#     device = torch.device('cpu')   # CPU training
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28 * 28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10)
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+
+# In a single training loop, the model predicts on the training set (fed in
+# batches) and adjusts its parameters by backpropagating the prediction error.
+def train(dataloader, model, loss_fn, optimizer):
+    size = len(dataloader.dataset)
+    model.train()
+    for batch, (X, y) in enumerate(dataloader):
+        X, y = X.to(device), y.to(device)
+        # Compute prediction error
+        pred = model(X)
+        loss = loss_fn(pred, y)
+
+        # Backpropagation
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        if batch % 100 == 0:
+            loss, current = loss.item(), (batch + 1) * len(X)
+            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
+
+# Check performance on the test set to make sure the model is learning.
+def test(dataloader, model, loss_fn):
+    size = len(dataloader.dataset)
+    num_batches = len(dataloader)
+    model.eval()
+    test_loss, correct = 0, 0
+    with torch.no_grad():
+        for X, y in dataloader:
+            X, y = X.to(device), y.to(device)
+            pred = model(X)
+            test_loss += loss_fn(pred, y).item()
+            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
+    test_loss /= num_batches
+    correct /= size
+    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+
+"""
+Download the data
+Create the model
+Optimize the model parameters
+Save the model
+Load the model
+Predict on new data
+"""
+if __name__ == '__main__':
+    # Download training data from open datasets.
+    training_data = datasets.FashionMNIST(
+        root="data",
+        train=True,
+        download=True,
+        transform=ToTensor(),
+    )
+
+    # Download test data from open datasets.
+    # A dataset object containing images and their corresponding labels.
+    test_data = datasets.FashionMNIST(
+        root="data",
+        train=False,
+        download=True,
+        transform=ToTensor(),
+    )
+
+    batch_size = 64
+    # Create data loaders.
+    train_dataloader = DataLoader(training_data, batch_size=batch_size)
+    test_dataloader = DataLoader(test_data, batch_size=batch_size)
+
+    for X, y in test_dataloader:
+        print(f"Shape of X [N, C, H, W]: {X.shape}")
+        print(f"Shape of y: {y.shape} {y.dtype}")
+        break
+    # Creating Models
+    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+    print(f"Using {device} device")
+    # Define model
+
+    model = NeuralNetwork().to(device)
+    print(f"model: {model}")
+    # To train a model, we need a loss function and an optimizer.
+    loss_fn = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+    epochs = 5
+    for t in range(epochs):
+        print(f"Epoch {t + 1}\n-------------------------------")
+        train(train_dataloader, model, loss_fn, optimizer)
+        test(test_dataloader, model, loss_fn)
+    print("Done!")
+    torch.save(model.state_dict(), "model.pth")
+    print("Saved PyTorch Model State to model.pth")
+    model.load_state_dict(torch.load("model.pth", weights_only=True))
+    classes = [
+        "T-shirt/top",
+        "Trouser",
+        "Pullover",
+        "Dress",
+        "Coat",
+        "Sandal",
+        "Shirt",
+        "Sneaker",
+        "Bag",
+        "Ankle boot",
+    ]
+
+    model.eval()  # put the model into evaluation mode
+    x, y = test_data[0][0], test_data[0][1]
+    print(f"x: {x} ,y: {y}")
+    with torch.no_grad():
+        x = x.to(device)
+        # "Logits" are the raw outputs of the model's final layer (usually a
+        # linear layer, nn.Linear). They are not normalized, so they are not
+        # probabilities; a softmax turns them into a probability distribution.
+        pred = model(x)  # model prediction: logits for the 10 classes
+        # probs = torch.softmax(pred, dim=1)
+        # print(probs)
+
+        print(f"pred: {pred}")
+        # For classification we usually just take the class with the largest
+        # logit; converting to probabilities is unnecessary:
+        predicted, actual = classes[pred[0].argmax(0)], classes[y]
+        print(f'Predicted: "{predicted}", Actual: "{actual}"')
\ No newline at end of file
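A footnote on the commented-out softmax above (a sketch; `pred` is the logits tensor from the script):

```python
probs = torch.softmax(pred, dim=1)         # normalize logits into probabilities
print(probs.sum(dim=1))                    # each row sums to 1
print(probs.argmax(1) == pred.argmax(1))   # softmax is monotonic: same argmax
```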
diff --git a/tutorials/01-mine/autograd.py b/tutorials/01-mine/autograd.py
new file mode 100644
index 00000000..ee0818f8
--- /dev/null
+++ b/tutorials/01-mine/autograd.py
@@ -0,0 +1,91 @@
+# Differentiation splits a whole into infinitesimally small pieces to get an instantaneous rate of
+# change; integration adds those pieces back together to get an accumulated total. They are inverse
+# operations, like taking building blocks apart versus reassembling them.
+# Intuitively: for a moving car, differentiation gives the instantaneous speed at a single second
+# (not the average speed); for a hillside, it gives the slope at a single point.
+"""
+Definite integral: computing an "area / total" (the core application).
+For y = f(x) on the interval [a, b], the definite integral \int_{a}^{b} f(x)\,dx splits [a, b] into
+infinitely many tiny subintervals, each contributing a thin rectangle (height f(x), width dx);
+summing all those rectangle areas gives the integral.
+Indefinite integral: the inverse of differentiation.
+Knowing the slope at every point, recover the curve; knowing the instantaneous speed, recover the
+displacement. Example: given the derivative y' = 2x, the indefinite integral is y = x^2 + C
+(C an arbitrary constant), since x^2, x^2 + 1 and x^2 + 100 all have derivative 2x.
+"""
+
+# %matplotlib inline
+
+import math
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import torch
+
+
+def demo1():
+    """
+    Consider the simplest one-layer neural network,
+    with input x, parameters w and b, and some loss function. It can be defined in PyTorch in the following manner:
+    """
+    x = torch.ones(5)  # input tensor
+    y = torch.zeros(3)  # expected output
+    w = torch.randn(5, 3, requires_grad=True)
+    b = torch.randn(3, requires_grad=True)
+    z = torch.matmul(x, w) + b
+    loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
+    print(f"Gradient function for z = {z.grad_fn}")
+    print(f"Gradient function for loss = {loss.grad_fn}")
+    # Computing Gradients
+    loss.backward()
+    """
+    Only the leaf nodes of the computation graph that have requires_grad=True
+    expose a grad attribute; gradients of all other nodes are unavailable.
+    """
+    print(w.grad)
+    print(b.grad)
+
+
+def sin_demo():
+    a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+    print(a)
+    b = torch.sin(a)
+    plt.plot(a.detach(), b.detach())
+    print(b)
+
+
+BATCH_SIZE = 16
+DIM_IN = 1000
+HIDDEN_SIZE = 100
+DIM_OUT = 10
+
+class TinyModel(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyModel, self).__init__()
+
+        self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE)
+        self.relu = torch.nn.ReLU()
+        self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = self.relu(x)
+        x = self.layer2(x)
+        return x
+
+def sin_demo1():
+    some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
+    ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
+
+    model = TinyModel()
+    print(model.layer2.weight[0][0:10])  # just a small slice
+    print(model.layer2.weight.grad)      # None: no backward pass has run yet
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    prediction = model(some_input)
+    loss = (ideal_output - prediction).pow(2).sum()
+    print(loss)
+    loss.backward()
+    print(model.layer2.weight[0][0:10])
+    print(model.layer2.weight.grad[0][0:10])
+    optimizer.step()
+    print(model.layer2.weight[0][0:10])
+    print(model.layer2.weight.grad[0][0:10])
+
+
+
+if __name__ == '__main__':
+    sin_demo1()
diff --git a/tutorials/01-mine/build_neural_network.py b/tutorials/01-mine/build_neural_network.py
new file mode 100644
index 00000000..03bd0f23
--- /dev/null
+++ b/tutorials/01-mine/build_neural_network.py
@@ -0,0 +1,69 @@
+# Building a neural network
+import os
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28*28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10),
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+
+if __name__ == '__main__':
+    model = NeuralNetwork().to(device)
+    print(model)
+    # To use the model, we pass it the input data. This executes the model's
+    # forward, along with some background operations.
+    # Do not call model.forward() directly!
+    X = torch.rand(1, 28, 28, device=device)
+    logits = model(X)
+    print(logits)
+    pred_probab = nn.Softmax(dim=1)(logits)
+    y_pred = pred_probab.argmax(1)
+    print(f"Predicted class: {y_pred}")
+    input_image = torch.rand(3, 28, 28)
+    print(input_image.size())
+    # nn.Flatten converts each 2D 28x28 image into a contiguous array of 784
+    # pixel values (keeping the minibatch dimension at dim=0).
+    flatten = nn.Flatten()
+    flat_image = flatten(input_image)
+    print(flat_image.size())
+    # A linear layer applies a linear transformation to the input using its
+    # stored weight and bias.
+    layer1 = nn.Linear(in_features=28 * 28, out_features=20)
+    hidden1 = layer1(flat_image)
+    print(hidden1.size())
+    print(f"Before ReLU: {hidden1}\n\n")
+    hidden1 = nn.ReLU()(hidden1)
+    print(f"After ReLU: {hidden1}")
+    seq_modules = nn.Sequential(
+        flatten,
+        layer1,
+        nn.ReLU(),
+        nn.Linear(20, 10)
+    )
+    input_image = torch.rand(3, 28, 28)
+    logits = seq_modules(input_image)
+    softmax = nn.Softmax(dim=1)
+    pred_probab = softmax(logits)
+    print(f"Model structure: {model}\n\n")
+
+    for name, param in model.named_parameters():
+        print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
\ No newline at end of file
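A small companion sketch (my addition) that totals the parameters the loop above prints, using the same `model`:

```python
# Sum the element counts of all registered parameters.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total:,}  trainable: {trainable:,}")
```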
diff --git a/tutorials/01-mine/control_flow_weight_sharing.py b/tutorials/01-mine/control_flow_weight_sharing.py
new file mode 100644
index 00000000..f4dc4732
--- /dev/null
+++ b/tutorials/01-mine/control_flow_weight_sharing.py
@@ -0,0 +1,75 @@
+import random
+import torch
+import math
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+"""
+Module          torch.nn (neural network)                          torch.optim (optimizer)
+Responsibility  define the model structure, run the forward pass   update the parameters, implement optimization algorithms
+What it is      the model's architect and calculator               the model's tuner
+What it manages layers, parameters, the computation graph          the update direction and step size of the parameters
+Key output      the model's predictions                            new parameter values
+Analogy         the car's engine and body blueprint                the driver and navigation system
+"""
+class DynamicNet(torch.nn.Module):
+    def __init__(self):
+        """
+        In the constructor we instantiate five parameters and assign them as members.
+        """
+        super().__init__()
+        self.a = torch.nn.Parameter(torch.randn(()))
+        self.b = torch.nn.Parameter(torch.randn(()))
+        self.c = torch.nn.Parameter(torch.randn(()))
+        self.d = torch.nn.Parameter(torch.randn(()))
+        self.e = torch.nn.Parameter(torch.randn(()))
+
+    def forward(self, x):
+        """
+        For the forward pass of the model, we randomly choose either 4 or 5
+        and reuse the e parameter to compute the contribution of these orders.
+
+        Since each forward pass builds a dynamic computation graph, we can use normal
+        Python control-flow operators like loops or conditional statements when
+        defining the forward pass of the model.
+
+        Here we also see that it is perfectly safe to reuse the same parameter many
+        times when defining a computational graph.
+        """
+        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
+        for exp in range(4, random.randint(4, 6)):
+            y = y + self.e * x ** exp
+        return y
+
+    def string(self):
+        """
+        Just like any class in Python, you can also define custom method on PyTorch modules
+        """
+        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
+
+
+# Create Tensors to hold input and outputs.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device)
+y = torch.sin(x).to(device)
+
+# Construct our model by instantiating the class defined above
+model = DynamicNet().to(device)
+
+# Construct our loss function and an Optimizer. Training this strange model with
+# vanilla stochastic gradient descent is tough, so we use momentum
+criterion = torch.nn.MSELoss(reduction='sum')
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
+for t in range(30000):
+    # Forward pass: Compute predicted y by passing x to the model
+    y_pred = model(x)
+
+    # Compute and print loss
+    loss = criterion(y_pred, y)
+    if t % 2000 == 1999:
+        print(t, loss.item())
+
+    # Zero gradients, perform a backward pass, and update the weights.
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+print(f'Result: {model.string()}')
\ No newline at end of file
diff --git a/tutorials/01-mine/cpu_2_gpu.py b/tutorials/01-mine/cpu_2_gpu.py
new file mode 100644
index 00000000..0413258b
--- /dev/null
+++ b/tutorials/01-mine/cpu_2_gpu.py
@@ -0,0 +1,59 @@
+"""
+Transferring data from the CPU to the GPU is a fundamental operation in many PyTorch applications,
+so it is essential to understand the most effective tools and options for moving data between devices.
+This tutorial explores two key methods for device-to-device transfer in PyTorch:
+pin_memory() and to() with the non_blocking=True option.
+"""
+
+import contextlib
+import torch
+from torch.cuda import Stream
+
+s = Stream()
+
+torch.manual_seed(42)
+t1_cpu_pinned = torch.randn(1024 ** 2 * 5, pin_memory=True)
+t2_cpu_paged = torch.randn(1024 ** 2 * 5, pin_memory=False)
+t3_cuda = torch.randn(1024 ** 2 * 5, device="cuda:0")
+
+assert torch.cuda.is_available()
+device = torch.device("cuda", torch.cuda.current_device())
+
+
+# The function we want to profile
+def inner(pinned: bool, streamed: bool):
+    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
+        if pinned:
+            t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
+        else:
+            t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
+        t_star_cuda_h2d_event = s.record_event()
+    # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
+    # done in the other stream
+    t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
+    t3_cuda_h2d_event = torch.cuda.current_stream().record_event()
+    t_star_cuda_h2d_event.synchronize()
+    t3_cuda_h2d_event.synchronize()
+
+
+# Our profiler: profiles the `inner` function and stores the results in a .json file
+def benchmark_with_profiler(
+        pinned,
+        streamed,
+) -> None:
+    torch._C._profiler._set_cuda_sync_enabled_val(True)
+    wait, warmup, active = 1, 1, 2
+    num_steps = wait + warmup + active
+    rank = 0
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        schedule=torch.profiler.schedule(
+            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
+        ),
+    ) as prof:
+        for step_idx in range(1, num_steps + 1):
+            inner(streamed=streamed, pinned=pinned)
+            if rank is None or rank == 0:
+                prof.step()
+    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")
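The essence of the two options that docstring names, in minimal form (a sketch; the tensor size is arbitrary and a CUDA device is assumed):

```python
import torch

cpu_t = torch.randn(1024, 1024).pin_memory()  # page-locked (pinned) host memory
gpu_t = cpu_t.to("cuda", non_blocking=True)   # asynchronous host-to-device copy
torch.cuda.synchronize()                      # wait for the copy before timing or reading back on the CPU
```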
diff --git a/tutorials/01-mine/custom_autograd.py b/tutorials/01-mine/custom_autograd.py
new file mode 100644
index 00000000..181b434d
--- /dev/null
+++ b/tutorials/01-mine/custom_autograd.py
@@ -0,0 +1,88 @@
+import torch
+import math
+
+
+class LegendrePolynomial3(torch.autograd.Function):
+    """
+    We can implement our own custom autograd Functions by subclassing
+    torch.autograd.Function and implementing the forward and backward passes
+    which operate on Tensors.
+    """
+
+    @staticmethod
+    def forward(ctx, input):
+        """
+        In the forward pass we receive a Tensor containing the input and return
+        a Tensor containing the output. ctx is a context object that can be used
+        to stash information for backward computation. You can cache tensors for
+        use in the backward pass using the ``ctx.save_for_backward`` method. Other
+        objects can be stored directly as attributes on the ctx object, such as
+        ``ctx.my_object = my_object``. Check out the "Extending torch.autograd"
+        notes for further details.
+        """
+        ctx.save_for_backward(input)
+        return 0.5 * (5 * input ** 3 - 3 * input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        In the backward pass we receive a Tensor containing the gradient of the loss
+        with respect to the output, and we need to compute the gradient of the loss
+        with respect to the input.
+        """
+        input, = ctx.saved_tensors
+        return grad_output * 1.5 * (5 * input ** 2 - 1)
+
+
+dtype = torch.float
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# device = torch.device("cuda:0")  # Uncomment this to run on GPU
+
+# Create Tensors to hold input and outputs.
+# By default, requires_grad=False, which indicates that we do not need to
+# compute gradients with respect to these Tensors during the backward pass.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+y = torch.sin(x)
+
+# Create random Tensors for weights. For this example, we need
+# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
+# not too far from the correct result to ensure convergence.
+# Setting requires_grad=True indicates that we want to compute gradients with
+# respect to these Tensors during the backward pass.
+a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
+c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)
+
+learning_rate = 5e-6
+for t in range(2000):
+    # To apply our Function, we use Function.apply method. We alias this as 'P3'.
+    P3 = LegendrePolynomial3.apply
+
+    # Forward pass: compute predicted y using operations; we compute
+    # P3 using our custom autograd operation.
+    y_pred = a + b * P3(c + d * x)
+
+    # Compute and print loss
+    loss = (y_pred - y).pow(2).sum()
+    if t % 100 == 99:
+        print(t, loss.item())
+
+    # Use autograd to compute the backward pass.
+    loss.backward()
+
+    # Update weights using gradient descent
+    with torch.no_grad():
+        a -= learning_rate * a.grad
+        b -= learning_rate * b.grad
+        c -= learning_rate * c.grad
+        d -= learning_rate * d.grad
+
+        # Manually zero the gradients after updating weights
+        a.grad = None
+        b.grad = None
+        c.grad = None
+        d.grad = None
+
+print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')
\ No newline at end of file
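A standard way to validate a custom Function like this (my addition): `torch.autograd.gradcheck` compares the analytic backward against numerical finite differences:

```python
from torch.autograd import gradcheck

# gradcheck needs double precision for reliable comparisons.
test_input = torch.randn(20, dtype=torch.double, requires_grad=True)
print(gradcheck(LegendrePolynomial3.apply, (test_input,)))  # True if they match
```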
diff --git a/tutorials/01-mine/dataset_demo.py b/tutorials/01-mine/dataset_demo.py
new file mode 100644
index 00000000..9e91ff6a
--- /dev/null
+++ b/tutorials/01-mine/dataset_demo.py
@@ -0,0 +1,87 @@
+import os
+import pandas as pd
+from torchvision.io import decode_image
+import torch
+from torch.utils.data import Dataset
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+import matplotlib.pyplot as plt
+from torch.utils.data import DataLoader
+
+
+class CustomImageDataset(Dataset):
+    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
+        self.img_labels = pd.read_csv(annotations_file)
+        self.img_dir = img_dir
+        self.transform = transform
+        self.target_transform = target_transform
+
+    def __len__(self):
+        return len(self.img_labels)
+
+    def __getitem__(self, idx):
+        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
+        image = decode_image(img_path)
+        label = self.img_labels.iloc[idx, 1]
+        if self.transform:
+            image = self.transform(image)
+        if self.target_transform:
+            label = self.target_transform(label)
+        return image, label
+
+
+def plt_show(training_data):
+    labels_map = {
+        0: "T-Shirt",
+        1: "Trouser",
+        2: "Pullover",
+        3: "Dress",
+        4: "Coat",
+        5: "Sandal",
+        6: "Shirt",
+        7: "Sneaker",
+        8: "Bag",
+        9: "Ankle Boot",
+    }
+    figure = plt.figure(figsize=(8, 8))
+    cols, rows = 3, 3
+    for i in range(1, cols * rows + 1):
+        sample_idx = torch.randint(len(training_data), size=(1,)).item()
+        img, label = training_data[sample_idx]
+        figure.add_subplot(rows, cols, i)
+        plt.title(labels_map[label])
+        plt.axis("off")
+        plt.imshow(img.squeeze(), cmap="gray")
+    plt.show()
+
+
+if __name__ == '__main__':
+    training_data = datasets.FashionMNIST(
+        root="data",
+        train=True,
+        download=True,
+        transform=ToTensor()
+    )
+
+    test_data = datasets.FashionMNIST(
+        root="data",
+        train=False,
+        download=True,
+        transform=ToTensor()
+    )
+    # plt_show(training_data)
+
+    train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
+    test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
+    # Display image and label.
+    train_features, train_labels = next(iter(train_dataloader))
+    print(f"Feature batch shape: {train_features.size()}")
+    print(f"Labels batch shape: {train_labels.size()}")
+    img = train_features[0].squeeze()
+    label = train_labels[0]
+    plt.imshow(img, cmap="gray")
+    plt.show()
+    print(f"Label: {label}")
\ No newline at end of file
diff --git a/tutorials/01-mine/neural_network.md b/tutorials/01-mine/neural_network.md
new file mode 100644
index 00000000..31366da2
--- /dev/null
+++ b/tutorials/01-mine/neural_network.md
@@ -0,0 +1,165 @@
+## 1. **What a neural network is: a data-processing "pipeline"**
+
+In PyTorch, a **neural network** is essentially **a computation graph made of trainable parameters (weights and biases)**. Its job is to:
+
+- take an input (usually a tensor: an image's pixel matrix, a text's vector representation)
+- apply a series of **linear transforms** (matrix multiplications) and **non-linear activations** (ReLU, Sigmoid, ...)
+- produce a prediction (class probabilities for classification, a numeric value for regression)
+
+You can think of it as a **"smart function"**:
+
+\(y = f(x; W, b)\)
+
+where:
+
+- x is the input (a tensor)
+- W and b are the network's **trainable parameters** (tensors)
+- f is the network's computation logic (composed of layers)
+- y is the output (a tensor)
+
+------
+
**PyTorch 神经网络的核心组成** + +在 PyTorch 中,神经网络通常由以下几个关键部分构成: + +### (1)`torch.nn.Module`:网络的 “容器” + +- 所有神经网络都必须继承自 `torch.nn.Module` 类 + +- 它是一个 + + + + 参数化的容器 + + ,可以包含: + + - 网络的层(如 `nn.Linear`、`nn.Conv2d`、`nn.ReLU` 等) + - 可训练的参数(`nn.Parameter`) + - 自定义的计算逻辑 + +例如: + +```python +import torch +import torch.nn as nn + +class MyNet(nn.Module): + def __init__(self): + super(MyNet, self).__init__() + # 定义层(包含可训练参数) + self.fc1 = nn.Linear(10, 20) # 输入10维,输出20维 + self.relu = nn.ReLU() # 非线性激活 + self.fc2 = nn.Linear(20, 2) # 输出2类 + + def forward(self, x): + # 定义数据流动的路径(前向传播) + x = self.fc1(x) # 线性变换:x @ W1 + b1 + x = self.relu(x) # 非线性激活 + x = self.fc2(x) # 线性变换:x @ W2 + b2 + return x +``` + +------ + +### (2)**层(Layer):网络的 “基本单元”** + +层是神经网络的核心组件,每个层都是一个 **参数化的函数**,负责对输入张量进行特定的变换。常见的层包括: + +| 层类型 | 作用 | 数学表达(简化) | +| -------------------- | ------------------------------ | ---------------------------------------- | +| `nn.Linear(in, out)` | 线性变换(全连接层) | \(y = xW^T + b\) | +| `nn.Conv2d(in, out)` | 二维卷积(提取空间特征) | \(y = \text{Conv}(x, W) + b\) | +| `nn.ReLU()` | 非线性激活(增加模型表达能力) | \(y = \max(0, x)\) | +| `nn.Softmax(dim)` | 归一化输出为概率分布 | \(y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}\) | + +这些层的本质都是 **对张量的运算**,而层中的 `weight` 和 `bias` 是 **可训练的张量**(`nn.Parameter` 类型),会在训练过程中通过梯度下降更新。 + +------ + +### (3)**前向传播(forward):数据的 “流动路径”** + +- `forward` 方法定义了 **数据如何在网络中流动** +- 输入张量 `x` 依次经过各层的变换,最终得到输出张量 +- 这个过程就是 **计算图的构建过程**(PyTorch 会自动记录运算,用于反向传播) + +例如: + +```python +net = MyNet() +x = torch.randn(3, 10) # 3个样本,每个样本10维特征 +y = net(x) # 前向传播:x → fc1 → relu → fc2 → y +print(y.shape) # 输出: torch.Size([3, 2]) +``` + +------ + +### (4)**参数(Parameter):网络的 “可训练变量”** + +- 网络的参数(权重 W 和偏置 b)是 `nn.Parameter` 类型的张量 +- 它们会被自动注册到网络的 `parameters()` 或 `named_parameters()` 方法中 +- 在训练时,优化器(如 `torch.optim.SGD`)会根据梯度更新这些参数 + +查看网络参数: + +```python +for name, param in net.named_parameters(): + print(name, param.shape) +``` + +输出: + +```plaintext +fc1.weight torch.Size([20, 10]) +fc1.bias torch.Size([20]) +fc2.weight torch.Size([2, 20]) +fc2.bias torch.Size([2]) +``` + +------ + +## 3. **神经网络的 “学习” 过程** + +神经网络之所以能 “智能”,是因为它的参数可以通过 **数据驱动的方式更新**(即训练过程),核心步骤如下: + +### (1)定义损失函数(衡量预测与真实值的差距) + +```python +loss_fn = nn.CrossEntropyLoss() # 分类任务常用损失 +``` + +### (2)定义优化器(负责更新参数) + +```python +optimizer = torch.optim.SGD(net.parameters(), lr=0.01) # 随机梯度下降 +``` + +### (3)训练循环(前向传播 → 计算损失 → 反向传播 → 更新参数) + +```python +for epoch in range(100): + # 前向传播 + y_pred = net(x) + + # 计算损失 + loss = loss_fn(y_pred, y_true) # y_true是真实标签 + + # 反向传播(计算梯度) + optimizer.zero_grad() # 清空上一轮梯度 + loss.backward() # 自动计算参数的梯度 + + # 更新参数(梯度下降) + optimizer.step() +``` + +通过这个过程,网络的参数会逐渐调整,使得预测结果越来越接近真实值。 + +------ + +## 4. 
**Key Properties of PyTorch Neural Networks**
+
+- **Modularity**: layers can be composed like building blocks
+- **Automatic differentiation**: `backward()` computes gradients for you, no manual derivation required
+- **Flexibility**: `forward` may contain arbitrarily complex logic (loops, conditionals)
+- **GPU acceleration**: a single `.to("cuda")` call runs the network on a GPU
\ No newline at end of file
diff --git a/tutorials/01-mine/tensors_demo.py b/tutorials/01-mine/tensors_demo.py
new file mode 100644
index 00000000..3bbede3f
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo.py
@@ -0,0 +1,69 @@
+import torch
+import numpy as np
+
+
+if __name__ == '__main__':
+    # Initializing a Tensor
+    data = [[1, 2], [3, 4]]
+    x_data = torch.tensor(data)
+    np_array = np.array(data)
+    x_np = torch.from_numpy(np_array)
+    print(f"x_np.numpy() == np_array: {x_np.numpy() == np_array}")
+    # The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.
+    x_ones = torch.ones_like(x_data)  # retains the properties of x_data
+    print(f"Ones Tensor: \n {x_ones} \n")
+    x_rand = torch.rand_like(x_data, dtype=torch.float)  # overrides the datatype of x_data
+    print(f"Random Tensor: \n {x_rand} \n")
+    shape = (2, 3,)
+    rand_tensor = torch.rand(shape)
+    ones_tensor = torch.ones(shape)
+    zeros_tensor = torch.zeros(shape)
+
+    print(f"Random Tensor: \n {rand_tensor} \n")
+    print(f"Ones Tensor: \n {ones_tensor} \n")
+    print(f"Zeros Tensor: \n {zeros_tensor}")
+
+    tensor = torch.rand(3, 4)
+
+    print(f"Shape of tensor: {tensor.shape}")
+    print(f"Datatype of tensor: {tensor.dtype}")
+    print(f"Device tensor is stored on: {tensor.device}")
+    # Operations on Tensors
+    # Over 1,200 tensor operations are available: arithmetic, linear algebra, matrix manipulation (transposing, indexing, slicing), sampling, and more.
+    # Tensors are created on the CPU by default; move them to the accelerator explicitly with .to (after checking availability). Keep in mind that copying large tensors across devices can be expensive in time and memory!
+    # We move our tensor to the current accelerator if available
+    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+    print(f"Using {device} device")
+    # tensor = tensor.to(device)
+    tensor = torch.ones(4, 4)
+    print(f"First row: {tensor[0]}")
+    print(f"First column: {tensor[:, 0]}")
+    print(f"Last column: {tensor[..., -1]}")
+    tensor[:, 1] = 0
+    print(tensor)
+    # Joining tensors
+    t1 = torch.cat([tensor, tensor, tensor], dim=1)
+    print(f"t1: \n {t1} \n")
+    # Arithmetic operations
+    # This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value
+    # ``tensor.T`` returns the transpose of a tensor
+    print(f"tensor: \n {tensor} \n")
+    print(f"tensor.T: \n {tensor.T} \n")
+    y1 = tensor @ tensor.T  # matrix multiplication
+    y2 = tensor.matmul(tensor.T)  # matrix multiplication
+
+    y3 = torch.rand_like(y1)
+    print(f"y1: \n {y1} \n, \ny2: \n {y2} \n, \ny3: \n {y3}")
+    torch.matmul(tensor, tensor.T, out=y3)
+
+    # This computes the element-wise product. z1, z2, z3 will have the same value
+    z1 = tensor * tensor
+    z2 = tensor.mul(tensor)
+
+    z3 = torch.rand_like(tensor)
+    torch.mul(tensor, tensor, out=z3)
+    print(f"agg tensor: \n {tensor} \n")
+    # tensor.sum() adds up every element of the tensor and returns a scalar (a one-element tensor).
+    agg = tensor.sum()
+    agg_item = agg.item()
+    print(agg_item, type(agg_item))
diff --git a/tutorials/01-mine/tensors_demo2.py b/tutorials/01-mine/tensors_demo2.py
new file mode 100644
index 00000000..71d383e5
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo2.py
@@ -0,0 +1,191 @@
+import math
+
+import torch  # for all things PyTorch
+import torch.nn as nn  # for torch.nn.Module, the parent object for PyTorch models
+import torch.nn.functional as F  # for the activation function
+
+"""
+    Tensor operations
+"""
+
+
+def tensors_demo():
+    z = torch.zeros(5, 3)
+    print(z)
+    # These zeros are 32-bit floats, PyTorch's default dtype.
+    print(z.dtype)
+    i = torch.ones((5, 3), dtype=torch.int16)
+    print(i)
+    print(i.dtype)
+    torch.manual_seed(1729)
+    r1 = torch.rand(2, 2)
+    print('A random tensor:')
+    print(r1)
+
+    r2 = torch.rand(2, 2)
+    print('\nA different random tensor:')
+    print(r2)  # new values
+
+    torch.manual_seed(1729)
+    r3 = torch.rand(2, 2)
+    print('\nShould match r1:')
+    print(r3)  # repeats values of r1 because of re-seed
+    ones = torch.ones(2, 3)
+    print(ones)
+
+    twos = torch.ones(2, 3) * 2  # every element is multiplied by 2
+    print(twos)
+
+    threes = ones + twos  # addition allowed because the shapes match
+    print(threes)  # tensors are added element-wise
+    print(threes.shape)  # this has the same dimensions as the input tensors
+
+    r1 = torch.rand(2, 3)
+    r2 = torch.rand(3, 2)
+    # uncomment this line to get a runtime error
+    # r3 = r1 + r2
+    r = (torch.rand(2, 2) - 0.5) * 2  # values between -1 and 1
+    print('A random matrix, r:')
+    print(r)
+
+    # Common mathematical operations are supported:
+    print('\nAbsolute value of r:')
+    print(torch.abs(r))
+
+    # ...as are trigonometric functions:
+    print('\nInverse sine of r:')
+    """
+    Takes the arcsine of every element.
+    """
+    print(torch.asin(r))
+
+    # ...and linear algebra operations like determinant and singular value decomposition
+    print('\nDeterminant of r:')
+    """
+    Computes the determinant of a square matrix: a scalar describing how the
+    matrix scales volume and whether it flips orientation. For example:
+    r = torch.tensor([[1.0, 2.0],
+                      [3.0, 4.0]])
+    print(torch.det(r))
+    = 1 * 4 - 2 * 3 = -2
+    """
+    print(torch.det(r))
+    print('\nSingular value decomposition of r:')
+    print(torch.svd(r))
+
+    # ...and statistical and aggregate operations:
+    print('\nAverage and standard deviation of r:')
+    # Computes the standard deviation and the mean of the tensor in one call.
+    print(torch.std_mean(r))
+    print('\nMaximum value of r:')
+    print(torch.max(r))
+
+
+class LeNet(nn.Module):
+
+    def __init__(self):
+        super(LeNet, self).__init__()
+        # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
+        # kernel
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        # an affine operation: y = Wx + b
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    """
+    Forward pass
+    The forward pass feeds input data through the network to produce the model's
+    prediction. Think of it this way:
+    the input (say an image or a piece of text) enters the first layer and flows
+    through each layer's computation in turn (convolutions, matrix multiplies,
+    activation functions, and so on).
+    Each layer transforms the previous layer's output, and the final layer emits
+    the prediction (class probabilities for classification, a value for regression).
+    What is it for?
+    Producing predictions: feed in a photo of a cat and the forward pass outputs the probability the model assigns to "cat".
+    Computing the loss: compare the prediction with the ground-truth label ("cat") and measure the error with a loss function.
+
+    Backward pass
+    The backward pass starts from the loss and computes, from the last layer back to the first, the gradient of every parameter. A gradient measures how strongly a parameter change affects the loss; optimizers such as SGD or Adam use these gradients to update the parameters and shrink the loss.
+    Think of it this way:
+    starting from the loss value, gradients are propagated backwards through every layer via the chain rule.
+    The gradients tell us how much each parameter should be adjusted to make the prediction more accurate.
+    What is it for?
+    Computing gradients: obtain the gradient of every parameter (weights w, biases b). Updating parameters: the optimizer adjusts parameters along those gradients so predictions move ever closer to the targets.
+
+    Forward pass: input data -> model -> prediction -> loss.
+    Backward pass: loss -> gradients -> parameter updates.
+    """
+    def forward(self, x):
+        # Max pooling over a (2, 2) window
+        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+        # If the size is a square you can only specify a single number
+        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+        x = x.view(-1, self.num_flat_features(x))
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+    def num_flat_features(self, x):
+        size = x.size()[1:]  # all dimensions except the batch dimension
+        num_features = 1
+        for s in size:
+            num_features *= s
+        return num_features
+
+
+def pytorch_model_demo():
+    net = LeNet()
+    print(net)
+    input = torch.rand(1, 1, 32, 32)  # stand-in for a 32x32 black & white image
+    print('\nImage batch shape:')
+    print(input.shape)
+
+    output = net(input)  # we don't call forward() directly
+    print('\nRaw output:')
+    print(output)
+    print(output.shape)
+
+
+def tensors_demo1():
+    x = torch.empty(3, 4)
+    print(type(x))
+    print(x)
+    # Random tensors and seeding
+    torch.manual_seed(1729)
+    random1 = torch.rand(2, 3)
+    print(random1)
+
+    random2 = torch.rand(2, 3)
+    print(random2)
+
+    torch.manual_seed(1729)
+    random3 = torch.rand(2, 3)
+    print(random3)
+
+    random4 = torch.rand(2, 3)
+    print(random4)
+    # In Brief: Tensor Broadcasting
+    # Broadcasting lets element-wise operations "virtually expand" tensors of different shapes so the shapes become compatible.
+    """
+    The core idea of broadcasting:
+    no data is actually copied; tensors are expanded only logically, following a
+    fixed set of rules that aligns tensors of different shapes.
+    The rules:
+    Rule 1: dimension alignment.
+    Compare the two tensors' sizes starting from the last dimension:
+    equal sizes are compatible;
+    a size of 1 is compatible (it will be expanded);
+    different sizes where neither is 1 are incompatible and raise an error.
+    Rule 2: dimension expansion.
+    A size-1 dimension is (logically) repeated along that axis until it matches
+    the other tensor's corresponding size.
+    """
+    rand = torch.rand(2, 4)
+    doubled = rand * (torch.ones(1, 4) * 2)
+
+    print(f"rand: {rand}")
+    print(f"doubled: {doubled}")
+
+if __name__ == '__main__':
+    tensors_demo1()
diff --git a/tutorials/01-mine/transforms.py b/tutorials/01-mine/transforms.py
new file mode 100644
index 00000000..c8dd4645
--- /dev/null
+++ b/tutorials/01-mine/transforms.py
@@ -0,0 +1,21 @@
+import torch
+from torchvision import datasets
+from torchvision.transforms import ToTensor, Lambda
+
+
+"""
+Data does not always come in the final processed form required for training machine learning algorithms, so we use transforms to massage it into shape.
+All TorchVision datasets take two parameters -- transform to modify the features and target_transform to modify the labels --
+both of which accept callables containing the transformation logic. The torchvision.transforms module ships several commonly used transforms.
+"""
+if __name__ == '__main__':
+    # ToTensor converts a PIL image or NumPy ndarray into a FloatTensor
+
+This tutorial is part of a three-part series:
+
+* `NLP From Scratch: Classifying Names with a Character-Level RNN `__
+* `NLP From Scratch: Generating Names with a Character-Level RNN `__
+* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__
+
+This is the third and final tutorial on doing **NLP From Scratch**, where we
+write our own classes and functions to preprocess the data to do our NLP
+modeling tasks.
+
+In this project we will be teaching a neural network to translate from
+French to English (the code below is adapted to the ``cmn-eng``
+Chinese-English pair file, but the idea is the same).
+
+.. code-block:: sh
+
+    [KEY: > input, = target, < output]
+
+    > il est en train de peindre un tableau .
+    = he is painting a picture .
+    < he is painting a picture .
+
+    > pourquoi ne pas essayer ce vin delicieux ?
+    = why not try that delicious wine ?
+    < why not try that delicious wine ?
+ + > elle n est pas poete mais romanciere . + = she is not a poet but a novelist . + < she not not a poet but a novelist . + + > vous etes trop maigre . + = you re too skinny . + < you re all alone . + +... to varying degrees of success. + +This is made possible by the simple but powerful idea of the `sequence +to sequence network `__, in which two +recurrent neural networks work together to transform one sequence to +another. An encoder network condenses an input sequence into a vector, +and a decoder network unfolds that vector into a new sequence. + +.. figure:: /_static/img/seq-seq-images/seq2seq.png + :alt: + +To improve upon this model we'll use an `attention +mechanism `__, which lets the decoder +learn to focus over a specific range of the input sequence. + +**Recommended Reading:** + +I assume you have at least installed PyTorch, know Python, and +understand Tensors: + +- https://pytorch.org/ For installation instructions +- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general +- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview +- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user + + +It would also be useful to know about Sequence to Sequence networks and +how they work: + +- `Learning Phrase Representations using RNN Encoder-Decoder for + Statistical Machine Translation `__ +- `Sequence to Sequence Learning with Neural + Networks `__ +- `Neural Machine Translation by Jointly Learning to Align and + Translate `__ +- `A Neural Conversational Model `__ + +You will also find the previous tutorials on +:doc:`/intermediate/char_rnn_classification_tutorial` +and :doc:`/intermediate/char_rnn_generation_tutorial` +helpful as those concepts are very similar to the Encoder and Decoder +models, respectively. 
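+
+In code terms, the encoder/decoder handshake at the heart of this tutorial is
+just the following (a minimal sketch using the ``EncoderRNN`` and
+``AttnDecoderRNN`` modules defined further down in this file):
+
+.. code-block:: python
+
+    encoder_outputs, encoder_hidden = encoder(input_tensor)           # condense the input sequence
+    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)  # unfold it into a new sequence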
+
+**Requirements**
+"""
+from __future__ import unicode_literals, print_function, division
+from io import open
+import unicodedata
+import re
+import random
+
+import torch
+import torch.nn as nn
+from torch import optim
+import torch.nn.functional as F
+
+import time
+import math
+
+import numpy as np
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# Special token indices
+SOS_token = 0
+EOS_token = 1
+
+class Lang:
+    def __init__(self, name):
+        self.name = name
+        self.word2index = {}
+        self.word2count = {}
+        self.index2word = {0: "SOS", 1: "EOS"}
+        self.n_words = 2  # Count SOS and EOS
+
+    def addSentence(self, sentence):
+        """Add a whole sentence to the vocabulary."""
+        # Check whether the sentence contains Chinese characters
+        has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence)
+        if has_chinese:  # Chinese is handled character by character
+            for char in sentence:
+                self.addWord(char)
+        else:  # English is handled word by word
+            for word in sentence.split(' '):
+                self.addWord(word)
+
+    def addWord(self, word):
+        """Add a single word to the vocabulary."""
+        if word not in self.word2index:
+            self.word2index[word] = self.n_words
+            self.word2count[word] = 1
+            self.index2word[self.n_words] = word
+            self.n_words += 1
+        else:
+            self.word2count[word] += 1
+
+def unicodeToAscii(s):
+    return ''.join(
+        c for c in unicodedata.normalize('NFD', s)
+        if unicodedata.category(c) != 'Mn'
+    )
+
+
+def normalizeString(s):
+    s = s.strip()
+    # Treat sentences that start with a Latin letter as English
+    if s and s[0].isalpha() and s[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
+        # English preprocessing
+        s = unicodeToAscii(s.lower())
+        s = re.sub(r"([.!?])", r" \1", s)
+        s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
+    else:
+        # Chinese preprocessing: only collapse extra whitespace
+        # (r'\\s+' would match a literal backslash, not whitespace)
+        s = re.sub(r'\s+', ' ', s)
+    return s.strip()
+
+def readLangs(lang1, lang2, reverse=False):
+    print("Reading lines...")
+
+    # Read the file and split into lines
+    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').
\ + read().strip().split('\n') + + # Split every line into pairs and normalize + pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] + + # Reverse pairs, make Lang instances + if reverse: + pairs = [list(reversed(p)) for p in pairs] + input_lang = Lang(lang2) + output_lang = Lang(lang1) + else: + input_lang = Lang(lang1) + output_lang = Lang(lang2) + + return input_lang, output_lang, pairs + + +MAX_LENGTH = 10 + +# 更宽松的英文前缀列表 +eng_prefixes = ( + "i ", "you ", "he ", "she ", "we ", "they ", + "it ", "this ", "that ", "there ", "the ", + "a ", "an ", "my ", "your ", "his ", "her ", + "our ", "their " +) + + +def filterPair(p): + # 判断输入是英文还是中文,分别计算长度 + if p[0] and p[0][0].isalpha() and p[0][0].lower() in 'abcdefghijklmnopqrstuvwxyz': + input_length = len(p[0].split(' ')) + else: + input_length = len(p[0]) + + output_length = len(p[1].split(' ')) + + # 仅保留长度合适的句子对,移除eng_prefixes限制 + return input_length <= MAX_LENGTH - 1 and output_length <= MAX_LENGTH - 1 + + +def filterPairs(pairs): + return [pair for pair in pairs if filterPair(pair)] + +def prepareData(lang1, lang2, reverse=False): + input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) + print("Read %s sentence pairs" % len(pairs)) + pairs = filterPairs(pairs) + print("前5个句子对示例:") + for i, pair in enumerate(pairs[:5]): + print(f" 中文: {pair[0]}") + print(f" 英文: {pair[1]}") + print() + print("Trimmed to %s sentence pairs" % len(pairs)) + print("Counting words...") + for pair in pairs: + input_lang.addSentence(pair[0]) + output_lang.addSentence(pair[1]) + print("Counted words:") + print(input_lang.name, input_lang.n_words) + print(output_lang.name, output_lang.n_words) + return input_lang, output_lang, pairs + + +input_lang, output_lang, pairs = prepareData('cmn', 'eng', True) +print(random.choice(pairs)) + +class EncoderRNN(nn.Module): + def __init__(self, input_size, hidden_size, dropout_p=0.1): + super(EncoderRNN, self).__init__() + self.hidden_size = hidden_size + + self.embedding = nn.Embedding(input_size, hidden_size) + self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) + self.dropout = nn.Dropout(dropout_p) + + def forward(self, input): + embedded = self.dropout(self.embedding(input)) + output, hidden = self.gru(embedded) + return output, hidden + + +class DecoderRNN(nn.Module): + def __init__(self, hidden_size, output_size): + super(DecoderRNN, self).__init__() + self.embedding = nn.Embedding(output_size, hidden_size) + self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) + self.out = nn.Linear(hidden_size, output_size) + + def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): + batch_size = encoder_outputs.size(0) + decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) + decoder_hidden = encoder_hidden + decoder_outputs = [] + + for i in range(MAX_LENGTH): + decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden) + decoder_outputs.append(decoder_output) + + if target_tensor is not None: + # Teacher forcing: Feed the target as the next input + decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing + else: + # Without teacher forcing: use its own predictions as the next input + _, topi = decoder_output.topk(1) + decoder_input = topi.squeeze(-1).detach() # detach from history as input + + decoder_outputs = torch.cat(decoder_outputs, dim=1) + decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) + return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training 
loop + + def forward_step(self, input, hidden): + output = self.embedding(input) + output = F.relu(output) + output, hidden = self.gru(output, hidden) + output = self.out(output) + return output, hidden + +class BahdanauAttention(nn.Module): + def __init__(self, hidden_size): + super(BahdanauAttention, self).__init__() + self.Wa = nn.Linear(hidden_size, hidden_size) + self.Ua = nn.Linear(hidden_size, hidden_size) + self.Va = nn.Linear(hidden_size, 1) + + def forward(self, query, keys): + scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys))) + scores = scores.squeeze(2).unsqueeze(1) + + weights = F.softmax(scores, dim=-1) + context = torch.bmm(weights, keys) + + return context, weights + + +class AttnDecoderRNN(nn.Module): + def __init__(self, hidden_size, output_size, dropout_p=0.1): + super(AttnDecoderRNN, self).__init__() + self.embedding = nn.Embedding(output_size, hidden_size) + self.attention = BahdanauAttention(hidden_size) + self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True) + self.out = nn.Linear(hidden_size, output_size) + self.dropout = nn.Dropout(dropout_p) + + def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): + batch_size = encoder_outputs.size(0) + decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) + decoder_hidden = encoder_hidden + decoder_outputs = [] + attentions = [] + + for i in range(MAX_LENGTH): + decoder_output, decoder_hidden, attn_weights = self.forward_step( + decoder_input, decoder_hidden, encoder_outputs + ) + decoder_outputs.append(decoder_output) + attentions.append(attn_weights) + + if target_tensor is not None: + # Teacher forcing: Feed the target as the next input + decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing + else: + # Without teacher forcing: use its own predictions as the next input + _, topi = decoder_output.topk(1) + decoder_input = topi.squeeze(-1).detach() # detach from history as input + + decoder_outputs = torch.cat(decoder_outputs, dim=1) + decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) + attentions = torch.cat(attentions, dim=1) + + return decoder_outputs, decoder_hidden, attentions + + def forward_step(self, input, hidden, encoder_outputs): + embedded = self.dropout(self.embedding(input)) + + query = hidden.permute(1, 0, 2) + context, attn_weights = self.attention(query, encoder_outputs) + input_gru = torch.cat((embedded, context), dim=2) + + output, hidden = self.gru(input_gru, hidden) + output = self.out(output) + + return output, hidden, attn_weights + + +def indexesFromSentence(lang, sentence): + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence) + indexes = [] + + if has_chinese: + for char in sentence: + if char in lang.word2index: + indexes.append(lang.word2index[char]) + else: + indexes.append(2) # UNK的索引是2 + else: + for word in sentence.split(' '): + if word in lang.word2index: + indexes.append(lang.word2index[word]) + else: + indexes.append(2) # UNK的索引是2 + return indexes + + +def tensorFromSentence(lang, sentence): + indexes = indexesFromSentence(lang, sentence) + indexes.append(EOS_token) + return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1) + + +def tensorsFromPair(pair): + input_tensor = tensorFromSentence(input_lang, pair[0]) + target_tensor = tensorFromSentence(output_lang, pair[1]) + return (input_tensor, target_tensor) + + +def get_dataloader(batch_size): + # 使用全局的input_lang, output_lang, pairs + n = len(pairs) + input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) + 
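+    # Note: these id buffers are zero-initialized, so slots after the appended
+    # EOS keep index 0 (SOS_token), which effectively acts as padding below.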
target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) + + for idx, (inp, tgt) in enumerate(pairs): + inp_ids = indexesFromSentence(input_lang, inp) + tgt_ids = indexesFromSentence(output_lang, tgt) + + # 确保句子长度不超过MAX_LENGTH-1,然后添加EOS_token + inp_ids = inp_ids[:MAX_LENGTH - 1] + tgt_ids = tgt_ids[:MAX_LENGTH - 1] + + inp_ids.append(EOS_token) + tgt_ids.append(EOS_token) + + input_ids[idx, :len(inp_ids)] = inp_ids + target_ids[idx, :len(tgt_ids)] = tgt_ids + + train_data = TensorDataset(torch.LongTensor(input_ids).to(device), + torch.LongTensor(target_ids).to(device)) + + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) + return input_lang, output_lang, train_dataloader + +def train_epoch(dataloader, encoder, decoder, encoder_optimizer, + decoder_optimizer, criterion): + total_loss = 0 + for data in dataloader: + input_tensor, target_tensor = data + + encoder_optimizer.zero_grad() + decoder_optimizer.zero_grad() + + encoder_outputs, encoder_hidden = encoder(input_tensor) + decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor) + + loss = criterion( + decoder_outputs.view(-1, decoder_outputs.size(-1)), + target_tensor.view(-1) + ) + loss.backward() + + encoder_optimizer.step() + decoder_optimizer.step() + + total_loss += loss.item() + + return total_loss / len(dataloader) + + + +def asMinutes(s): + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + + +def timeSince(since, percent): + now = time.time() + s = now - since + es = s / (percent) + rs = es - s + return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) + + +def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, + print_every=100, plot_every=100): + start = time.time() + plot_losses = [] + print_loss_total = 0 # Reset every print_every + plot_loss_total = 0 # Reset every plot_every + + encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate) + decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate) + criterion = nn.NLLLoss() + + for epoch in range(1, n_epochs + 1): + loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) + print_loss_total += loss + plot_loss_total += loss + + if epoch % print_every == 0: + print_loss_avg = print_loss_total / print_every + print_loss_total = 0 + print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs), + epoch, epoch / n_epochs * 100, print_loss_avg)) + + if epoch % plot_every == 0: + plot_loss_avg = plot_loss_total / plot_every + plot_losses.append(plot_loss_avg) + plot_loss_total = 0 + + showPlot(plot_losses) + + +import matplotlib.pyplot as plt + +plt.switch_backend('agg') +import matplotlib.ticker as ticker +import numpy as np + + +def showPlot(points): + plt.figure() + fig, ax = plt.subplots() + # this locator puts ticks at regular intervals + loc = ticker.MultipleLocator(base=0.2) + ax.yaxis.set_major_locator(loc) + plt.plot(points) + +def evaluate(encoder, decoder, sentence, input_lang, output_lang): + with torch.no_grad(): + input_tensor = tensorFromSentence(input_lang, sentence) + + encoder_outputs, encoder_hidden = encoder(input_tensor) + decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden) + + _, topi = decoder_outputs.topk(1) + decoded_ids = topi.squeeze() + + decoded_words = [] + for idx in decoded_ids: + if idx.item() == EOS_token: + decoded_words.append('') + break + decoded_words.append(output_lang.index2word[idx.item()]) + 
return decoded_words, decoder_attn + + +def evaluateRandomly(encoder, decoder, n=10): + for i in range(n): + pair = random.choice(pairs) + print('>', pair[0]) + print('=', pair[1]) + output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang) + output_sentence = ' '.join(output_words) + print('<', output_sentence) + print('') + + +hidden_size = 128 +batch_size = 32 + +input_lang, output_lang, train_dataloader = get_dataloader(batch_size) + +encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device) +decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device) + +# train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5) +# +# ###################################################################### +# # +# # Set dropout layers to ``eval`` mode +# encoder.eval() +# decoder.eval() +# evaluateRandomly(encoder, decoder) + + +###################################################################### +# Visualizing Attention +# --------------------- +# +# A useful property of the attention mechanism is its highly interpretable +# outputs. Because it is used to weight specific encoder outputs of the +# input sequence, we can imagine looking where the network is focused most +# at each time step. +# +# You could simply run ``plt.matshow(attentions)`` to see attention output +# displayed as a matrix. For a better viewing experience we will do the +# extra work of adding axes and labels: +# + +def showAttention(input_sentence, output_words, attentions): + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(attentions.cpu().numpy(), cmap='bone') + fig.colorbar(cax) + + # Set up axes + ax.set_xticklabels([''] + input_sentence.split(' ') + + [''], rotation=90) + ax.set_yticklabels([''] + output_words) + + # Show label at every tick + ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) + ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) + + plt.show() + + +def evaluateAndShowAttention(input_sentence): + # 根据输入句子的语言类型选择正确的语言对象 + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in input_sentence) + if has_chinese: + # 中文句子,使用output_lang作为输入语言 + output_words, attentions = evaluate(encoder, decoder, input_sentence, output_lang, input_lang) + else: + # 英文句子,使用input_lang作为输入语言 + output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang) + + # 显示结果 + print('input =', input_sentence) + print('output =', ' '.join(output_words)) + showAttention(input_sentence, output_words, attentions[0, :len(output_words), :]) + +# test_sentences = [ +# 'i am anxious', # 从训练数据前缀中选取 +# 'he is happy', +# 'you are welcome' +# ] +# +# for sent in test_sentences: +# evaluateAndShowAttention(sent) +evaluateAndShowAttention('i am anxious') +evaluateAndShowAttention('he is happy') +evaluateAndShowAttention('you are welcome?') diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py new file mode 100644 index 00000000..4af9cc56 --- /dev/null +++ b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py @@ -0,0 +1,111 @@ +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + +# Device configuration +# 支持CUDA、MPS和CPU设备 +# 优化前 +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# 优化后 +device = torch.device('mps' if 
torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu') +print(f"使用设备: {device}") + +# BiRNN是双向循环神经网络(Bidirectional Recurrent Neural Network)的缩写, +# 是一种能够同时利用序列过去和未来信息的神经网络架构。 +# Bidirectional recurrent neural network (many-to-one) +class BiRNN(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, num_classes): + super(BiRNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + self.fc = nn.Linear(hidden_size * 2, num_classes) # 2 for bidirection + + def forward(self, x): + # Set initial states + h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 2 for bidirection + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) + + # Forward propagate LSTM + out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2) + + # Decode the hidden state of the last time step + out = self.fc(out[:, -1, :]) + return out + + +if __name__ == '__main__': + + # Hyper-parameters + sequence_length = 28 + input_size = 28 + hidden_size = 128 + num_layers = 2 + num_classes = 10 + batch_size = 100 + num_epochs = 2 + learning_rate = 0.003 + + # MNIST dataset + train_dataset = torchvision.datasets.MNIST(root='../../data/', + train=True, + transform=transforms.ToTensor(), + download=True) + + test_dataset = torchvision.datasets.MNIST(root='../../data/', + train=False, + transform=transforms.ToTensor()) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=batch_size, + shuffle=True) + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=batch_size, + shuffle=False) + + model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device) + + # Loss and optimizer + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # Train the model + total_step = len(train_loader) + for epoch in range(num_epochs): + for i, (images, labels) in enumerate(train_loader): + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + + # Forward pass + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # Test the model + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # Save the model checkpoint + torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py deleted file mode 100644 index a0ecd773..00000000 --- a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py +++ /dev/null @@ -1,102 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device 
configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -sequence_length = 28 -input_size = 28 -hidden_size = 128 -num_layers = 2 -num_classes = 10 -batch_size = 100 -num_epochs = 2 -learning_rate = 0.003 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Bidirectional recurrent neural network (many-to-one) -class BiRNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - # Set initial states - h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection - c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) - - # Forward propagate LSTM - out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2) - - # Decode the hidden state of the last time step - out = self.fc(out[:, -1, :]) - return out - -model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py new file mode 100644 index 00000000..b200bb68 --- /dev/null +++ b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py @@ -0,0 +1,123 @@ +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + + +# 卷积神经网络 + +# Convolutional neural network (two convolutional layers) +class ConvNet(nn.Module): + # 网络层定义 + def __init__(self, num_classes=10): + 
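+        # Shape walk-through: input [N, 1, 28, 28] -> layer1 -> [N, 16, 14, 14]
+        # -> layer2 -> [N, 32, 7, 7] -> flatten -> fc -> [N, num_classes]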
super(ConvNet, self).__init__() + # 第一层卷积层:输入通道1(MNIST灰度图),输出通道16,卷积核5x5,步长1,填充2 + # 卷积后尺寸保持不变:(28-5+2*2)/1 + 1 = 28 + self.layer1 = nn.Sequential( + nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2), + nn.BatchNorm2d(16), # 批标准化:加速训练,提高稳定性 + nn.ReLU(), # 激活函数:引入非线性 + nn.MaxPool2d(kernel_size=2, stride=2) # 池化层:尺寸减半为14x14 无参数的下采样操作 + ) + # 第二层卷积层:输入通道16,输出通道32,卷积核5x5,步长1,填充2 + # 卷积后尺寸保持不变:(14-5+2*2)/1 + 1 = 14 + self.layer2 = nn.Sequential( + nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), + nn.BatchNorm2d(32), # 批标准化 + nn.ReLU(), # 激活函数 + nn.MaxPool2d(kernel_size=2, stride=2) # 池化层:尺寸减半为7x7 + ) + # 全连接层:输入尺寸7x7x32,输出类别数 + self.fc = nn.Linear(7 * 7 * 32, num_classes) + # 前向传播定义 + def forward(self, x): + # 前向传播路径 + out = self.layer1(x) # 输入x经过第一层卷积层,输出尺寸:[batch_size, 16, 14, 14] + out = self.layer2(out) # 输出经过第二层卷积层,输出尺寸:[batch_size, 32, 7, 7] + out = out.reshape(out.size(0), -1) # 展平:[batch_size, 32*7*7] + out = self.fc(out) # 全连接层分类,输出尺寸:[batch_size, num_classes] + return out + + +if __name__ == '__main__': + + # Device configuration + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + + # Hyper parameters + num_epochs = 5 + num_classes = 10 + batch_size = 100 + learning_rate = 0.001 + + # MNIST dataset + train_dataset = torchvision.datasets.MNIST(root='../../data/', + train=True, + transform=transforms.ToTensor(), + download=True) + + test_dataset = torchvision.datasets.MNIST(root='../../data/', + train=False, + transform=transforms.ToTensor()) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=batch_size, + shuffle=True) + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=batch_size, + shuffle=False) + + model = ConvNet(num_classes).to(device) + + # Loss and optimizer + # 损失函数:衡量模型预测与真实标签的差异程度 + # CrossEntropyLoss适用于分类任务,内部整合了softmax和负对数似然 + # 公式:loss = -sum(y_true * log(y_pred)),其中y_true是one-hot编码的真实标签 + criterion = nn.CrossEntropyLoss() + + # 优化器:根据损失函数的梯度来更新模型参数,最小化损失 + # Adam是一种常用的自适应学习率优化算法,结合了Momentum和RMSProp的优点 + # 参数说明: + # - model.parameters(): 需要优化的模型参数集合 + # - lr: 学习率,控制参数更新的步长大小 + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # Train the model + total_step = len(train_loader) # 每个epoch的总步数 = 训练数据集大小 / 批大小 + for epoch in range(num_epochs): # 训练num_epochs轮 + for i, (images, labels) in enumerate(train_loader): # 遍历每个批次 + images = images.to(device) # 将图像数据移至指定设备 + labels = labels.to(device) # 将标签移至指定设备 + + # Forward pass(前向传播):模型对输入图像进行预测 + outputs = model(images) # outputs形状:[batch_size, num_classes] + loss = criterion(outputs, labels) # 计算损失:模型预测与真实标签的差异 + + # Backward and optimize(反向传播与参数优化) + optimizer.zero_grad() # 清除之前的梯度,避免梯度累积 + loss.backward() # 反向传播:计算所有可训练参数的梯度 + optimizer.step() # 优化器更新参数:根据梯度调整模型权重 + + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # Test the model + model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + images = images.to(device) + labels = labels.to(device) + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # Save the model checkpoint + torch.save(model.state_dict(), 'model.ckpt') \ No newline 
at end of file diff --git a/tutorials/02-intermediate/convolutional_neural_network/main.py b/tutorials/02-intermediate/convolutional_neural_network/main.py deleted file mode 100644 index ec904f1f..00000000 --- a/tutorials/02-intermediate/convolutional_neural_network/main.py +++ /dev/null @@ -1,100 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - -# Hyper parameters -num_epochs = 5 -num_classes = 10 -batch_size = 100 -learning_rate = 0.001 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Convolutional neural network (two convolutional layers) -class ConvNet(nn.Module): - def __init__(self, num_classes=10): - super(ConvNet, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2), - nn.BatchNorm2d(16), - nn.ReLU(), - nn.MaxPool2d(kernel_size=2, stride=2)) - self.layer2 = nn.Sequential( - nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), - nn.BatchNorm2d(32), - nn.ReLU(), - nn.MaxPool2d(kernel_size=2, stride=2)) - self.fc = nn.Linear(7*7*32, num_classes) - - def forward(self, x): - out = self.layer1(x) - out = self.layer2(out) - out = out.reshape(out.size(0), -1) - out = self.fc(out) - return out - -model = ConvNet(num_classes).to(device) - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py new file mode 100644 index 00000000..d9aa9fe4 --- /dev/null +++ b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py @@ -0,0 +1,274 @@ +# ---------------------------------------------------------------------------- # +# ResNet模型实现:基于论文https://arxiv.org/pdf/1512.03385.pdf # +# 
采用CIFAR-10数据集的模型架构(见论文4.2节) # +# 部分代码参考PyTorch官方实现: # +# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py # +# ---------------------------------------------------------------------------- # + +# 导入必要的库 +import torch # PyTorch核心库 +import torch.nn as nn # 神经网络模块 +import torchvision # 计算机视觉库,提供数据集和模型 +import torchvision.transforms as transforms # 图像预处理工具 +from torch.cuda.amp import autocast, GradScaler # 混合精度训练 + +# 设备配置:优先使用GPU,否则使用CPU,M系列芯片可使用MPS +if torch.backends.mps.is_available(): + device = torch.device('mps') # Apple Silicon M系列芯片加速 +elif torch.cuda.is_available(): + device = torch.device('cuda') # NVIDIA GPU加速 +else: + device = torch.device('cpu') # CPU训练 +print(f"使用设备: {device}") + + +# 3x3卷积层封装函数 +# ResNet大量使用3x3卷积,此函数简化代码复用 +def conv3x3(in_channels, out_channels, stride=1): + # 定义3x3卷积: + # - in_channels: 输入通道数 + # - out_channels: 输出通道数 + # - kernel_size: 卷积核大小 + # - stride: 步长,默认1 + # - padding: 填充,设置为1保持尺寸不变 + # - bias: 不使用偏置,因为后续会接批标准化层 + return nn.Conv2d(in_channels, out_channels, kernel_size=3, + stride=stride, padding=1, bias=False) + + +# 残差块(Residual Block)定义 +# ResNet的核心组件,通过残差连接解决深度网络训练问题 +class ResidualBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, downsample=None): + super(ResidualBlock, self).__init__() + # 第一个3x3卷积层:可能改变通道数和尺寸 + self.conv1 = conv3x3(in_channels, out_channels, stride) + self.bn1 = nn.BatchNorm2d(out_channels) # 批标准化层 + self.relu = nn.ReLU(inplace=True) # ReLU激活函数,inplace=True节省内存 + # 第二个3x3卷积层:保持通道数和尺寸不变 + self.conv2 = conv3x3(out_channels, out_channels) + self.bn2 = nn.BatchNorm2d(out_channels) # 批标准化层 + # 下采样模块:当输入输出通道数或尺寸不匹配时使用 + self.downsample = downsample + + def forward(self, x): + # 保存输入作为残差连接 + residual = x + + # 主路径:两次卷积+激活 + out = self.conv1(x) # 第一次卷积 + out = self.bn1(out) # 批标准化 + out = self.relu(out) # 激活函数 + + out = self.conv2(out) # 第二次卷积 + out = self.bn2(out) # 批标准化 + + # 残差路径:如果需要下采样则调整尺寸和通道数 + if self.downsample: + residual = self.downsample(x) + + # 残差连接:主路径输出 + 残差路径输出 + out += residual + out = self.relu(out) # 最终激活 + + return out + + +# ResNet主网络定义 +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes=10): + super(ResNet, self).__init__() + # 初始输入通道数 + self.in_channels = 16 + + # 第一层:3x3卷积 + 批标准化 + ReLU + # CIFAR-10输入为3通道,输出16通道 + self.conv = conv3x3(3, 16) + self.bn = nn.BatchNorm2d(16) + self.relu = nn.ReLU(inplace=True) + + # 构建残差层: + # layer1: 16通道,layers[0]个残差块,步长1 + # layer2: 32通道,layers[1]个残差块,步长2(尺寸减半) + # layer3: 64通道,layers[2]个残差块,步长2(尺寸减半) + self.layer1 = self.make_layer(block, 16, layers[0]) + self.layer2 = self.make_layer(block, 32, layers[1], 2) + self.layer3 = self.make_layer(block, 64, layers[2], 2) + + # 全局平均池化:将64x8x8特征图转为64x1x1 + self.avg_pool = nn.AvgPool2d(8) + + # 全连接层:将64维特征映射到10个类别 + self.fc = nn.Linear(64, num_classes) + + # 创建残差层的辅助函数 + def make_layer(self, block, out_channels, blocks, stride=1): + downsample = None + + # 当步长不为1或输入输出通道数不匹配时,需要下采样 + if (stride != 1) or (self.in_channels != out_channels): + downsample = nn.Sequential( + conv3x3(self.in_channels, out_channels, stride=stride), + nn.BatchNorm2d(out_channels) + ) + + layers = [] + # 添加第一个残差块(可能包含下采样) + layers.append(block(self.in_channels, out_channels, stride, downsample)) + self.in_channels = out_channels # 更新输入通道数 + + # 添加剩余的残差块(不需要下采样) + for i in range(1, blocks): + layers.append(block(out_channels, out_channels)) + + return nn.Sequential(*layers) + + def forward(self, x): + # 输入x: [batch_size, 3, 32, 32] + + # 初始卷积层 + out = self.conv(x) # [batch_size, 16, 32, 32] + 
out = self.bn(out)  # batch normalization
+        out = self.relu(out)  # activation
+
+        # Residual stage 1: spatial size unchanged
+        out = self.layer1(out)  # [batch_size, 16, 32, 32]
+
+        # Residual stage 2: spatial size halved
+        out = self.layer2(out)  # [batch_size, 32, 16, 16]
+
+        # Residual stage 3: spatial size halved again
+        out = self.layer3(out)  # [batch_size, 64, 8, 8]
+
+        # Global average pooling
+        out = self.avg_pool(out)  # [batch_size, 64, 1, 1]
+
+        # Flatten the features
+        out = out.view(out.size(0), -1)  # [batch_size, 64]
+
+        # Final linear classifier
+        out = self.fc(out)  # [batch_size, 10]
+
+        return out
+# Learning-rate update helper
+def update_lr(optimizer, lr):
+    # Walk every parameter group in the optimizer and set the new learning rate
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+if __name__ == '__main__':
+    # Entry point
+
+    # Hyper-parameters, tuned down for a Mac
+    num_epochs = 20  # fewer epochs (originally 80; adjust as needed)
+    batch_size = 32  # smaller batches to fit Mac memory (originally 100)
+    learning_rate = 0.001  # initial learning rate
+
+    # Image preprocessing: data augmentation improves generalization
+    transform = transforms.Compose([
+        transforms.Pad(4),  # pad by 4 pixels, 32x32 -> 40x40
+        transforms.RandomHorizontalFlip(),  # random horizontal flip
+        transforms.RandomCrop(32),  # random crop back to 32x32
+        transforms.ToTensor()  # convert to a Tensor
+    ])
+
+    # CIFAR-10 dataset
+    # Training set: with data augmentation
+    train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                 train=True,  # training split
+                                                 transform=transform,  # apply augmentation
+                                                 download=True)  # download if missing
+
+    # Test set: only converted to a Tensor, no augmentation
+    test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                train=False,  # test split
+                                                transform=transforms.ToTensor())
+
+    # Data loaders, tuned
+    # Training loader: multi-worker loading + pinned memory
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,  # batch size
+                                               shuffle=True,  # shuffle the data
+                                               num_workers=4,  # worker processes (match your CPU cores)
+                                               pin_memory=True,  # pinned memory speeds up host-to-device copies
+                                               persistent_workers=True)  # keep workers alive between epochs
+
+    # Test loader: multi-worker loading + pinned memory
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False,
+                                              num_workers=2,
+                                              pin_memory=True)
+
+    # Instantiate the ResNet
+    # ResidualBlock is the basic unit; [2, 2, 2] puts two residual blocks in each of the three stages.
+    # Layer count: stem conv (1) + 3 stages * 2 blocks * 2 convs (12) + final FC (1) = 14 weighted
+    # layers, i.e. the paper's 6n+2 CIFAR-10 ResNet with n = 2 (not a ResNet-18).
+    model = ResNet(ResidualBlock, [2, 2, 2]).to(device)
+    # Loss function and optimizer
+    # Cross-entropy loss: standard for classification, softmax included
+    criterion = nn.CrossEntropyLoss()
+
+    # Adam: adaptive learning-rate optimizer that converges quickly
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Mixed-precision setup
+    # torch.cuda.amp only supports CUDA, so AMP must stay disabled on MPS and CPU
+    scaler = GradScaler(enabled=(device.type == 'cuda'))
+
+    # Train the model
+    total_step = len(train_loader)  # steps per epoch
+    curr_lr = learning_rate  # current learning rate
+
+    for epoch in range(num_epochs):  # loop over epochs
+        model.train()  # make sure the model is in training mode
+        for i, (images, labels) in enumerate(train_loader):  # loop over batches
+            # Move the batch to the target device
+            images = images.to(device, non_blocking=True)  # non-blocking copy
+            labels = labels.to(device, non_blocking=True)  # non-blocking copy
+
+            # Forward pass: model prediction (mixed precision on CUDA only)
+            with autocast(enabled=(device.type == 'cuda')):
+                outputs = model(images)
+                loss = criterion(outputs, labels)
+
+            # Backward pass and optimization (mixed precision on CUDA only)
+            optimizer.zero_grad(set_to_none=True)  # cheaper way to clear gradients
+            scaler.scale(loss).backward()  # scale the loss and backpropagate
+            scaler.step(optimizer)  # update the parameters
+            scaler.update()  # update the scale factor
+
+            # Print the loss every 100 steps
+            if (i + 1) % 100 == 0:
+                print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+        # Decay the learning rate every 20 epochs
+        if (epoch + 1) % 20 == 0:
+            curr_lr /= 3  # divide the learning rate by 3
+            update_lr(optimizer, curr_lr)  # push it into the optimizer
+    # Test the model
+    model.eval()  # evaluation mode
+
+    with torch.no_grad():  # no gradients: saves memory and compute
+        correct = 0  # number of correct predictions
+        total = 0  # number of samples
+
+        for images, labels in test_loader:  # loop over the test set
+            images = images.to(device)
+            labels =
labels.to(device) + + outputs = model(images) # 模型预测 + _, predicted = torch.max(outputs.data, 1) # 获取预测类别 + + total += labels.size(0) # 更新总样本数 + correct += (predicted == labels).sum().item() # 更新正确预测数 + + # 计算并打印准确率 + print('Accuracy of the model on the test images: {} %'.format(100 * correct / total)) + + # 保存模型参数到文件 + torch.save(model.state_dict(), 'resnet.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/deep_residual_network/main.py b/tutorials/02-intermediate/deep_residual_network/main.py deleted file mode 100644 index 69dbe5fb..00000000 --- a/tutorials/02-intermediate/deep_residual_network/main.py +++ /dev/null @@ -1,170 +0,0 @@ -# ---------------------------------------------------------------------------- # -# An implementation of https://arxiv.org/pdf/1512.03385.pdf # -# See section 4.2 for the model architecture on CIFAR-10 # -# Some part of the code was referenced from below # -# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py # -# ---------------------------------------------------------------------------- # - -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -num_epochs = 80 -batch_size = 100 -learning_rate = 0.001 - -# Image preprocessing modules -transform = transforms.Compose([ - transforms.Pad(4), - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32), - transforms.ToTensor()]) - -# CIFAR-10 dataset -train_dataset = torchvision.datasets.CIFAR10(root='../../data/', - train=True, - transform=transform, - download=True) - -test_dataset = torchvision.datasets.CIFAR10(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# 3x3 convolution -def conv3x3(in_channels, out_channels, stride=1): - return nn.Conv2d(in_channels, out_channels, kernel_size=3, - stride=stride, padding=1, bias=False) - -# Residual block -class ResidualBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = conv3x3(in_channels, out_channels, stride) - self.bn1 = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(out_channels, out_channels) - self.bn2 = nn.BatchNorm2d(out_channels) - self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - -# ResNet -class ResNet(nn.Module): - def __init__(self, block, layers, num_classes=10): - super(ResNet, self).__init__() - self.in_channels = 16 - self.conv = conv3x3(3, 16) - self.bn = nn.BatchNorm2d(16) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, 16, layers[0]) - self.layer2 = self.make_layer(block, 32, layers[1], 2) - self.layer3 = self.make_layer(block, 64, layers[2], 2) - self.avg_pool = nn.AvgPool2d(8) - self.fc = nn.Linear(64, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = 
nn.Sequential( - conv3x3(self.in_channels, out_channels, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(self.in_channels, out_channels, stride, downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels)) - return nn.Sequential(*layers) - - def forward(self, x): - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out - -model = ResNet(ResidualBlock, [2, 2, 2]).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# For updating learning rate -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - -# Train the model -total_step = len(train_loader) -curr_lr = learning_rate -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}" - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - - # Decay learning rate - if (epoch+1) % 20 == 0: - curr_lr /= 3 - update_lr(optimizer, curr_lr) - -# Test the model -model.eval() -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Accuracy of the model on the test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'resnet.ckpt') diff --git a/tutorials/02-intermediate/language_model/main.py b/tutorials/02-intermediate/language_model/language_model.py similarity index 79% rename from tutorials/02-intermediate/language_model/main.py rename to tutorials/02-intermediate/language_model/language_model.py index ef135bb7..e534a71d 100644 --- a/tutorials/02-intermediate/language_model/main.py +++ b/tutorials/02-intermediate/language_model/language_model.py @@ -8,7 +8,9 @@ # Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +# 支持CUDA、MPS和CPU设备 +device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu') +print(f"使用设备: {device}") # Hyper-parameters embed_size = 128 @@ -31,22 +33,22 @@ class RNNLM(nn.Module): def __init__(self, vocab_size, embed_size, hidden_size, num_layers): super(RNNLM, self).__init__() - self.embed = nn.Embedding(vocab_size, embed_size) - self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) - self.linear = nn.Linear(hidden_size, vocab_size) - + self.embed = nn.Embedding(vocab_size, embed_size) # 词嵌入层 + self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) # LSTM层 + self.linear = nn.Linear(hidden_size, vocab_size) # 输出层 + def forward(self, x, h): - # Embed word ids to vectors - x = self.embed(x) - - # Forward propagate LSTM - out, (h, c) = self.lstm(x, h) - - # Reshape output to (batch_size*sequence_length, hidden_size) - out = out.reshape(out.size(0)*out.size(1), 
out.size(2)) - - # Decode hidden states of all time steps - out = self.linear(out) + # 词嵌入:将单词ID转换为向量表示 + x = self.embed(x) # [batch_size, seq_length, embed_size] + + # LSTM前向传播 + out, (h, c) = self.lstm(x, h) # out: [batch_size, seq_length, hidden_size] + + # 重塑输出以匹配全连接层输入格式 + out = out.reshape(out.size(0) * out.size(1), out.size(2)) # [batch_size*seq_length, hidden_size] + + # 预测下一个单词的概率分布 + out = self.linear(out) # [batch_size*seq_length, vocab_size] return out, (h, c) model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device) diff --git a/tutorials/02-intermediate/recurrent_neural_network/main.py b/tutorials/02-intermediate/recurrent_neural_network/main.py deleted file mode 100644 index c138c5ad..00000000 --- a/tutorials/02-intermediate/recurrent_neural_network/main.py +++ /dev/null @@ -1,103 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -sequence_length = 28 -input_size = 28 -hidden_size = 128 -num_layers = 2 -num_classes = 10 -batch_size = 100 -num_epochs = 2 -learning_rate = 0.01 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Recurrent neural network (many-to-one) -class RNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - # Set initial hidden and cell states - h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) - c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) - - # Forward propagate LSTM - out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size) - - # Decode the hidden state of the last time step - out = self.fc(out[:, -1, :]) - return out - -model = RNN(input_size, hidden_size, num_layers, num_classes).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -model.eval() -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += 
labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py new file mode 100644 index 00000000..7f0300fc --- /dev/null +++ b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py @@ -0,0 +1,186 @@ +# 导入PyTorch核心库 +import torch +import torch.nn as nn +# 导入TorchVision用于数据加载和预处理 +import torchvision +import torchvision.transforms as transforms + +# 设备配置:自动选择GPU(如果可用)否则使用CPU +# MPS是Apple Silicon GPU支持,这里也可以加上对MPS的支持 +device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')) + +""" +RNN(循环神经网络)的用途与应用场景 +RNN(Recurrent Neural Network,循环神经网络)是一类专门用于处理序列数据的深度学习模型, +其核心特点是能够记忆之前的信息并用于当前决策,这使得它在各种需要处理时序依赖关系的任务中表现出色。 +RNN的核心特性 +RNN通过在网络中引入循环连接,使模型能够: + +处理任意长度的序列数据 +捕捉序列中的时间依赖关系 +保留序列的上下文信息 + +RNN的主要应用场景 +1. 自然语言处理(NLP) +文本分类:情感分析、垃圾邮件检测、新闻分类 +语言建模:预测下一个词的概率分布 +机器翻译:将一种语言翻译成另一种语言 +命名实体识别:识别文本中的人名、地名、组织名等 +文本生成:自动生成文章、诗歌、对话等 +2. 时间序列预测 +股票价格预测:基于历史价格预测未来走势 +天气预报:基于气象数据预测未来天气 +电力负荷预测:预测未来电力需求 +销售预测:预测产品未来销量 +3. 语音处理 +语音识别:将语音转换为文本 +语音合成:将文本转换为语音 +说话人识别:识别说话人的身份 +4. 图像与视频分析 +图像描述生成:为图像生成文字描述 +视频分析:行为识别、动作检测 +手写体识别:如代码示例中的MNIST数字分类 +""" +# 定义RNN模型类,继承自nn.Module +class RNN(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, num_classes): + """ + RNN模型初始化函数 + :param input_size: 输入特征维度 (MNIST图像的每行像素数:28) + :param hidden_size: 隐藏层维度 (LSTM单元的隐藏状态大小:128) + :param num_layers: LSTM层数 (2层) + :param num_classes: 分类数量 (MNIST有10个数字类别) + """ + super(RNN, self).__init__() + self.hidden_size = hidden_size # 隐藏层大小 + self.num_layers = num_layers # LSTM层数 + + # 定义LSTM层: + # - input_size: 输入特征维度 + # - hidden_size: 隐藏层维度 + # - num_layers: LSTM层数 + # - batch_first=True: 输入输出形状为(batch_size, seq_length, feature_size) + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) + + # 全连接层:将LSTM输出映射到分类结果 + self.fc = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + """ + 前向传播函数 + :param x: 输入张量,形状为(batch_size, sequence_length, input_size) + :return: 输出张量,形状为(batch_size, num_classes) + """ + # 初始化LSTM的隐藏状态h0和细胞状态c0 + # 形状:(num_layers, batch_size, hidden_size) + h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) + c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) + + # 前向传播LSTM + # out: LSTM的输出,形状为(batch_size, seq_length, hidden_size) + # _: 包含最终隐藏状态和细胞状态的元组(这里未使用) + out, _ = self.lstm(x, (h0, c0)) + + # 解码最后一个时间步的隐藏状态用于分类 + # out[:, -1, :] 表示取所有样本的最后一个时间步的隐藏状态 + out = self.fc(out[:, -1, :]) + return out + + +if __name__ == '__main__': + # 超参数设置 + sequence_length = 28 # 序列长度 (MNIST图像的行数:28) + input_size = 28 # 输入特征维度 (MNIST图像的列数:28) + hidden_size = 128 # 隐藏层维度 + num_layers = 2 # LSTM层数 + num_classes = 10 # 分类数量 (0-9数字) + batch_size = 100 # 批次大小 + num_epochs = 2 # 训练轮数 + learning_rate = 0.01 # 学习率 + + # MNIST数据集加载 + # 训练集 + train_dataset = torchvision.datasets.MNIST( + root='../../data/', # 数据集保存路径 + train=True, # 训练集 + transform=transforms.ToTensor(), # 转换为Tensor并归一化到[0,1] + download=True # 自动下载(如果本地没有) + ) + + # 测试集 + test_dataset = torchvision.datasets.MNIST( + root='../../data/', + train=False, # 测试集 + transform=transforms.ToTensor() + ) + + # 数据加载器 + train_loader = 
torch.utils.data.DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True # 训练时打乱数据 + ) + + test_loader = torch.utils.data.DataLoader( + dataset=test_dataset, + batch_size=batch_size, + shuffle=False # 测试时不打乱数据 + ) + + # 实例化RNN模型(many-to-one架构:多个时间步输入,一个输出) + model = RNN(input_size, hidden_size, num_layers, num_classes).to(device) + + # 损失函数和优化器 + # 交叉熵损失:适用于多分类任务 + criterion = nn.CrossEntropyLoss() + # Adam优化器:自适应学习率优化算法 + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # 训练模型 + total_step = len(train_loader) # 每个epoch的总步数 + for epoch in range(num_epochs): # 遍历每个epoch + for i, (images, labels) in enumerate(train_loader): # 遍历每个批次 + # 将图像重塑为序列数据: + # MNIST图像原始形状:(batch_size, 1, 28, 28) + # 重塑后形状:(batch_size, sequence_length=28, input_size=28) + # 即把28x28的图像看作28个时间步,每个时间步输入28个像素 + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) # 标签移至设备 + + # 前向传播 + outputs = model(images) # 模型预测 + loss = criterion(outputs, labels) # 计算损失 + + # 反向传播和优化 + optimizer.zero_grad() # 清除梯度 + loss.backward() # 反向传播计算梯度 + optimizer.step() # 更新参数 + + # 打印训练信息 + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # 测试模型 + model.eval() # 设置模型为评估模式(关闭dropout等) + with torch.no_grad(): # 关闭梯度计算,节省内存和计算 + correct = 0 # 正确预测数 + total = 0 # 总样本数 + for images, labels in test_loader: # 遍历测试集 + # 重塑图像并移至设备 + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + outputs = model(images) # 模型预测 + + # 获取预测结果:torch.max返回最大值和索引,索引即为预测类别 + _, predicted = torch.max(outputs.data, 1) + + total += labels.size(0) # 更新总样本数 + correct += (predicted == labels).sum().item() # 更新正确预测数 + + # 打印测试准确率 + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # 保存模型权重 + torch.save(model.state_dict(), 'model.ckpt') + print("Model weights saved to 'model.ckpt'") \ No newline at end of file diff --git a/tutorials/03-advanced/generative_adversarial_network/main.py b/tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py similarity index 100% rename from tutorials/03-advanced/generative_adversarial_network/main.py rename to tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py diff --git a/tutorials/compilers/Inductor_demo.py b/tutorials/compilers/Inductor_demo.py new file mode 100644 index 00000000..1fbff4a1 --- /dev/null +++ b/tutorials/compilers/Inductor_demo.py @@ -0,0 +1,15 @@ +import torch + +def foo1(x1, x2): + a = torch.neg(x1) + b = torch.maximum(x2, a) + y = torch.cat([b], dim=0) + return y + + +# TORCH_COMPILE_DEBUG=1 python xx.py +if __name__ == '__main__': + x1 = torch.randint(256, (1, 8), dtype=torch.uint8) + x2 = torch.randint(256, (8390, 8), dtype=torch.uint8) + compiled_foo1 = torch.compile(foo1) + result = compiled_foo1(x1, x2) diff --git a/tutorials/compilers/basic_demo.py b/tutorials/compilers/basic_demo.py new file mode 100644 index 00000000..e9fd7d2b --- /dev/null +++ b/tutorials/compilers/basic_demo.py @@ -0,0 +1,203 @@ +import torch + +# def foo(x, y): +# a = torch.sin(x) +# b = torch.cos(y) +# return a + b +# +# +# opt_foo1 = torch.compile(foo) +# print(opt_foo1(torch.randn(3, 3), torch.randn(3, 3))) +# +# +# @torch.compile +# def opt_foo2(x, y): +# a = torch.sin(x) +# b = torch.cos(y) +# return a + b +# +# +# print(opt_foo2(torch.randn(3, 3), torch.randn(3, 3))) +# +# def inner(x): +# return 
torch.sin(x)
+#
+#
+# @torch.compile
+# def outer(x, y):
+#     a = inner(x)
+#     b = torch.cos(y)
+#     return a + b
+#
+#
+# print(outer(torch.randn(3, 3), torch.randn(3, 3)))
+#
+
+# t = torch.randn(10, 100)
+#
+#
+# class MyModule(torch.nn.Module):
+#     def __init__(self):
+#         super().__init__()
+#         self.lin = torch.nn.Linear(3, 3)
+#
+#     def forward(self, x):
+#         return torch.nn.functional.relu(self.lin(x))
+#
+#
+# mod1 = MyModule()
+# mod1.compile()
+# print(mod1(torch.randn(3, 3)))
+#
+# mod2 = MyModule()
+# mod2 = torch.compile(mod2)
+# print(mod2(torch.randn(3, 3)))
+
+
+# Demonstrating Speedups
+
+def foo3(x):
+    y = x + 1
+    z = torch.nn.functional.relu(y)
+    u = z * 2
+    return u
+
+
+# Returns the result of running `fn()` and the time it took for `fn()` to run,
+# in seconds. We use CUDA events and synchronization for the most accurate
+# measurements.
+def timed(fn):
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    result = fn()
+    end.record()
+    torch.cuda.synchronize()
+    return result, start.elapsed_time(end) / 1000
+
+
+opt_foo3 = torch.compile(foo3)
+inp = torch.randn(4096, 4096).cuda()
+
+
+def first_run():
+    """
+    Note that torch.compile appears to take much longer to finish than eager
+    execution. That is because torch.compile needs extra time on the first few
+    runs to compile the model. torch.compile reuses compiled code whenever it
+    can, so running the optimized model a few more times should show a clear
+    speedup over eager execution.
+    """
+    torch._logging.set_logs(graph_code=True)
+    print("compile:", timed(lambda: opt_foo3(inp))[1])
+    print("eager:", timed(lambda: foo3(inp))[1])
+
+
+"""
+eager time 0: 0.027955583572387695
+eager time 1: 0.0004986880123615265
+eager time 2: 0.00045683199167251585
+eager time 3: 0.00045158401131629945
+eager time 4: 0.00045363199710845946
+eager time 5: 0.00045363199710845946
+eager time 6: 0.0004556800127029419
+eager time 7: 0.0004505600035190582
+eager time 8: 0.00045043200254440307
+eager time 9: 0.0004546560049057007
+~~~~~~~~~~
+compile time 0: 0.434231201171875
+compile time 1: 0.00026624000072479246
+compile time 2: 0.00023552000522613525
+compile time 3: 0.0002234240025281906
+compile time 4: 0.00021913599967956544
+compile time 5: 0.00022220799326896668
+compile time 6: 0.0002181120067834854
+compile time 7: 0.0002242559939622879
+compile time 8: 0.0002181120067834854
+compile time 9: 0.00022118400037288665
+~~~~~~~~~~
+(eval) eager median: 0.0004541440010070801, compile median: 0.00022281599789857864, speedup: 2.038201948200314x
+"""
+
+
+def many_runs():
+    # turn off logging for now to prevent spam
+    torch._logging.set_logs(graph_code=False)
+    eager_times = []
+    for i in range(10):
+        _, eager_time = timed(lambda: foo3(inp))
+        eager_times.append(eager_time)
+        print(f"eager time {i}: {eager_time}")
+    print("~" * 10)
+
+    compile_times = []
+    for i in range(10):
+        _, compile_time = timed(lambda: opt_foo3(inp))
+        compile_times.append(compile_time)
+        print(f"compile time {i}: {compile_time}")
+    print("~" * 10)
+
+    import numpy as np
+
+    eager_med = np.median(eager_times)
+    compile_med = np.median(compile_times)
+    speedup = eager_med / compile_med
+    assert speedup > 1
+    print(
+        f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+    )
+    print("~" * 10)
+
+
+def bar1(a, b):
+    x = a / (torch.abs(a) + 1)
+    if b.sum() < 0:
+        b = b * -1
+    return x * b
+
+
+def bar(a, b):
+    x = a / (torch.abs(a) + 1)
+    b = torch.where(b.sum() < 0, -b, b)
+    return x * b
+
+
+# Graph Breaks
+"""The term "graph break" comes from the fact that torch.compile tries to capture
+and optimize a graph of PyTorch operations. When unsupported Python code is
+encountered, that graph has to be "broken". Graph breaks cost optimization
+opportunities, which may still be undesirable, but it beats silent errors or
+hard crashes."""
+
+
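+# Not in the original tutorial: a quick way to locate breaks is
+# ``torch._dynamo.explain``, which reruns a function under Dynamo and reports
+# each graph break with its reason. ``bar1`` above breaks on the
+# data-dependent ``if b.sum() < 0``, so it makes a good test subject.
+def explain_graph_breaks_demo():
+    explanation = torch._dynamo.explain(bar1)(torch.ones(10), torch.ones(10))
+    print(f"graph breaks: {explanation.graph_break_count}")
+    print(explanation.break_reasons)
+
+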
+@torch.compile(fullgraph=True)
+def bar_fixed(a, b):
+    x = a / (torch.abs(a) + 1)
+
+    def true_branch(y):
+        return y * -1
+
+    def false_branch(y):
+        # NOTE: torch.cond doesn't allow aliased outputs
+        return y.clone()
+
+    # Assign the cond result to ``b`` (the original code overwrote ``x`` here,
+    # which dropped the ``a / (|a| + 1)`` term from the product below).
+    b = torch.cond(b.sum() < 0, true_branch, false_branch, (b,))
+    return x * b
+
+
+def graph_breaks_fixed_demo():
+    torch._logging.set_logs(graph_code=True)
+    inp1 = torch.ones(10)
+    inp2 = torch.ones(10)
+    fixed = bar_fixed(inp1, inp2)
+    fixed1 = bar_fixed(inp1, -inp2)
+    print(f"fixed: {fixed}")
+    print(f"fixed1: {fixed1}")
+
+
+def graph_breaks_demo():
+    torch._logging.set_logs(graph_code=True)
+    opt_bar = torch.compile(bar)
+    inp1 = torch.ones(10)
+    inp2 = torch.ones(10)
+    opt_bar(inp1, inp2)
+    opt_bar(inp1, -inp2)
+
+
+if __name__ == '__main__':
+    graph_breaks_fixed_demo()
diff --git a/tutorials/compilers/benchmark_time.py b/tutorials/compilers/benchmark_time.py
new file mode 100644
index 00000000..53e9a642
--- /dev/null
+++ b/tutorials/compilers/benchmark_time.py
@@ -0,0 +1,68 @@
+import torch
+import torch.utils.benchmark as benchmark
+from torch.utils.benchmark import Language
+from torch.utils.benchmark import Timer
+import pickle
+import re
+from torch.utils.benchmark import CallgrindStats, FunctionCounts
+
+
+def cudaclu_timer():
+    cpp_timer = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({128});
+        """,
+        language=Language.CPP,
+    )
+
+    print(cpp_timer.blocked_autorange(min_run_time=1))
+
+
+# A/B testing with Callgrind
+"""
+Instruction counts are most useful because they allow fine-grained comparisons
+between computations, which is essential when analyzing performance.
+To demonstrate this in practice, let's compare multiplying two size-128 Tensors
+against a {128} x {1} multiplication, where the second Tensor is broadcast.
+"""
+
+
+def call_grind_timer():
+    broadcasting_stats = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({1});
+        """,
+        language=Language.CPP,
+    ).collect_callgrind().as_standardized().stats(inclusive=False)
+    # Let's round trip `broadcasting_stats` just to show that we can.
+    broadcasting_stats = pickle.loads(pickle.dumps(broadcasting_stats))
+
+    cpp_timer = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({128});
+        """,
+        language=Language.CPP,
+    )
+
+    print(cpp_timer.blocked_autorange(min_run_time=1))
+    stats: CallgrindStats = cpp_timer.collect_callgrind()
+    inclusive_stats = stats.as_standardized().stats(inclusive=False)
+    print(inclusive_stats[:10])
+    # And now to diff the two tasks:
+    delta = broadcasting_stats - inclusive_stats
+
+    def extract_fn_name(fn: str):
+        """Trim everything except the function name."""
+        fn = ":".join(fn.split(":")[1:])
+        return re.sub(r"\(.+\)", "(...)", fn)
+
+    # We use `.transform` to make the diff readable:
+    print(delta.transform(extract_fn_name))
+
+
+if __name__ == '__main__':
+    cudaclu_timer()
diff --git a/tutorials/compilers/compiled_autograd_demo.py b/tutorials/compilers/compiled_autograd_demo.py
new file mode 100644
index 00000000..49e90b36
--- /dev/null
+++ b/tutorials/compilers/compiled_autograd_demo.py
@@ -0,0 +1,58 @@
+import torch
+
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(10, 10)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
+"""
+The Python interpreter calls Dynamo, because this call was decorated with @torch.compile.
+Dynamo intercepts the Python bytecode, simulates its execution, and records the
+operations into a graph.
+AOTDispatcher disables hooks and calls the autograd engine to compute gradients for
+model.linear.weight and model.linear.bias, recording the operations into a graph.
+Using torch.autograd.Function, AOTDispatcher rewrites the forward and backward
+implementation of train.
+Inductor generates a function corresponding to an optimized implementation of the
+AOTDispatcher forward and backward.
+Dynamo sets the optimized function to be evaluated next by the Python interpreter.
+The Python interpreter executes the optimized function, which runs loss = model(x).sum().
+The Python interpreter executes loss.backward(), calling into the autograd engine,
+which routes to the compiled autograd engine because we set
+torch._dynamo.config.compiled_autograd = True.
+Compiled autograd computes the gradients for model.linear.weight and model.linear.bias,
+recording the operations into a graph, including any hooks it encounters. In doing so
+it records the backward previously rewritten by AOTDispatcher. Compiled autograd then
+generates a new function corresponding to a fully-traced implementation of
+loss.backward(), and executes it with torch.compile in inference mode.
+The same steps apply recursively to the compiled autograd graph, but this time
+AOTDispatcher will not need to partition the graph.
+"""
+
+
+def train_demo():
+    model = Model()
+    x = torch.randn(10)
+    torch._dynamo.config.compiled_autograd = True
+    @torch.compile
+    def train(model, x):
+        loss = model(x).sum()
+        loss.backward()
+    train(model, x)
+
+
+def train_demo1():
+    # Inline variant: compile the forward and the backward separately instead
+    # of wrapping the whole training step in @torch.compile (the original mixed
+    # both styles, which compiled the already-compiled step).
+    model = Model()
+    x = torch.randn(10)
+    torch._dynamo.config.compiled_autograd = True
+    model = torch.compile(model)
+    loss = model(x).sum()
+    torch.compile(lambda: loss.backward(), fullgraph=True)()
+
+
+# TORCH_LOGS="compiled_autograd" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+# TORCH_LOGS="compiled_autograd_verbose" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+if __name__ == '__main__':
+    train_demo1()
diff --git a/tutorials/compilers/custom_onnxscript_demo.py b/tutorials/compilers/custom_onnxscript_demo.py
new file mode 100644
index 00000000..d73705e6
--- /dev/null
+++ b/tutorials/compilers/custom_onnxscript_demo.py
@@ -0,0 +1,43 @@
+import torch
+import onnxscript
+
+# Opset 18 is the standard supported version as of PyTorch 2.6
+from onnxscript import opset18 as op
+
+
+class GeluModel(torch.nn.Module):
+    def forward(self, input_x):
+        return torch.ops.aten.gelu(input_x)
+
+
+# Create a namespace for the custom operator using ONNX Script
+# ``com.microsoft`` is an official ONNX Runtime namespace
+microsoft_op = onnxscript.values.Opset(domain="com.microsoft", version=1)
+
+# NOTE: The function signature (including 
parameter names) must match the signature of the unsupported PyTorch operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# NOTE: All attributes must be annotated with type hints. +# The function must be scripted using the ``@onnxscript.script()`` decorator when +# using operators from custom domains. This may be improved in future versions. +from onnxscript import FLOAT + + +@onnxscript.script(microsoft_op) +def custom_aten_gelu(self: FLOAT, approximate: str = "none") -> FLOAT: + return microsoft_op.Gelu(self) +x = torch.tensor([1.0]) + +onnx_program = torch.onnx.export( + GeluModel().eval(), + (x,), + dynamo=True, + custom_translation_table={ + torch.ops.aten.gelu.default: custom_aten_gelu, + }, +) + +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() +print(onnx_program.model) + +result = onnx_program(x)[0] +print(f"Result: {result}") +torch.testing.assert_close(result, torch.ops.aten.gelu(x)) \ No newline at end of file diff --git a/tutorials/compilers/fx_demo.py b/tutorials/compilers/fx_demo.py new file mode 100644 index 00000000..60194dae --- /dev/null +++ b/tutorials/compilers/fx_demo.py @@ -0,0 +1,119 @@ +import torch +import torch.fx +import torchvision.models as models +import statistics, tabulate, time +from typing import Any, Dict, List +from torch.fx import Interpreter + + +# 创建性能分析解释器 +class ProfilingInterpreter(Interpreter): + def __init__(self, mod: torch.nn.Module): + # Rather than have the user symbolically trace their model, + # we're going to do it in the constructor. As a result, the + # user can pass in any ``Module`` without having to worry about + # symbolic tracing APIs + gm = torch.fx.symbolic_trace(mod) + super().__init__(gm) + + # We are going to store away two things here: + # + # 1. A list of total runtimes for ``mod``. In other words, we are + # storing away the time ``mod(...)`` took each time this + # interpreter is called. + self.total_runtime_sec: List[float] = [] + # 2. A map from ``Node`` to a list of times (in seconds) that + # node took to run. This can be seen as similar to (1) but + # for specific sub-parts of the model. + self.runtimes_sec: Dict[torch.fx.Node, List[float]] = {} + + ###################################################################### + # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` + # method is the top-level entry point for execution of the model. We will + # want to intercept this so that we can record the total runtime of the + # model. + + def run(self, *args) -> Any: + # Record the time we started running the model + t_start = time.time() + # Run the model by delegating back into Interpreter.run() + return_val = super().run(*args) + # Record the time we finished running the model + t_end = time.time() + # Store the total elapsed time this model execution took in the + # ``ProfilingInterpreter`` + self.total_runtime_sec.append(t_end - t_start) + return return_val + + ###################################################################### + # Now, let's override ``run_node``. ``Interpreter`` calls ``run_node`` each + # time it executes a single node. We will intercept this so that we + # can measure and record the time taken for each individual call in + # the model. 
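+    # (Side note, not from the original: ``time.time()`` has fairly coarse
+    # resolution; ``time.perf_counter()`` is the usual choice when
+    # finer-grained per-op timings matter.)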
+ + def run_node(self, n: torch.fx.Node) -> Any: + # Record the time we started running the op + t_start = time.time() + # Run the op by delegating back into Interpreter.run_node() + return_val = super().run_node(n) + # Record the time we finished running the op + t_end = time.time() + # If we don't have an entry for this node in our runtimes_sec + # data structure, add one with an empty list value. + self.runtimes_sec.setdefault(n, []) + # Record the total elapsed time for this single invocation + # in the runtimes_sec data structure + self.runtimes_sec[n].append(t_end - t_start) + return return_val + + ###################################################################### + # Finally, we are going to define a method (one which doesn't override + # any ``Interpreter`` method) that provides us a nice, organized view of + # the data we have collected. + + def summary(self, should_sort: bool = False) -> str: + # Build up a list of summary information for each node + node_summaries: List[List[Any]] = [] + # Calculate the mean runtime for the whole network. Because the + # network may have been called multiple times during profiling, + # we need to summarize the runtimes. We choose to use the + # arithmetic mean for this. + mean_total_runtime = statistics.mean(self.total_runtime_sec) + + # For each node, record summary statistics + for node, runtimes in self.runtimes_sec.items(): + # Similarly, compute the mean runtime for ``node`` + mean_runtime = statistics.mean(runtimes) + # For easier understanding, we also compute the percentage + # time each node took with respect to the whole network. + pct_total = mean_runtime / mean_total_runtime * 100 + # Record the node's type, name of the node, mean runtime, and + # percent runtime. + node_summaries.append( + [node.op, str(node), mean_runtime, pct_total]) + + # One of the most important questions to answer when doing performance + # profiling is "Which op(s) took the longest?". We can make this easy + # to see by providing sorting functionality in our summary view + if should_sort: + node_summaries.sort(key=lambda s: s[2], reverse=True) + + # Use the ``tabulate`` library to create a well-formatted table + # presenting our summary information + headers: List[str] = [ + 'Op type', 'Op', 'Average runtime (s)', 'Pct total runtime' + ] + return tabulate.tabulate(node_summaries, headers=headers) + + +if __name__ == '__main__': + rn18 = models.resnet18() + rn18.eval() + input = torch.randn(5, 3, 224, 224) + # output = rn18(input) + # traced_rn18 = torch.fx.symbolic_trace(rn18) + # print(traced_rn18.graph) + + interp = ProfilingInterpreter(rn18) + interp.run(input) + print(interp.summary(True)) diff --git a/tutorials/compilers/nvidia_demo.py b/tutorials/compilers/nvidia_demo.py new file mode 100644 index 00000000..1c496595 --- /dev/null +++ b/tutorials/compilers/nvidia_demo.py @@ -0,0 +1,191 @@ +# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in +# order to reproduce the speedup numbers shown below and documented elsewhere. + +import torch +import warnings +from torchvision.models import densenet121 +import numpy as np + +gpu_ok = False +if torch.cuda.is_available(): + device_cap = torch.cuda.get_device_capability() + if device_cap in ((7, 0), (8, 0), (9, 0)): + gpu_ok = True + +if not gpu_ok: + warnings.warn( + "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " + "than expected." + ) + + +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. 
We use CUDA events and synchronization for the most accurate +# measurements. +def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +# Generates random input and targets data for the model, where `b` is +# batch size. +def generate_data(b): + return ( + torch.randn(b, 3, 128, 128).to().cuda(), + torch.randint(1000, (b,)).cuda(), + ) + + +N_ITERS = 10 + + +def init_model(): + return densenet121().cuda() + + +model = init_model() + +# Note that we generally recommend directly compiling a torch.nn.Module by calling +# its .compile() method. +model_opt = init_model() +model_opt.compile(mode="reduce-overhead") + + +def first_demo(): + inp = generate_data(16)[0] + with torch.no_grad(): + print("eager:", timed(lambda: model(inp))[1]) + print("compile:", timed(lambda: model_opt(inp))[1]) + + +""" +(eval) eager median: 0.01525604772567749, compile median: 0.003931119918823242, speedup: 3.8808400762916184x +eager eval time 0: 0.2951763916015625 +eager eval time 1: 0.01678335952758789 +eager eval time 2: 0.015734944343566894 +eager eval time 3: 0.015243231773376465 +eager eval time 4: 0.015268863677978516 +eager eval time 5: 0.01522979164123535 +eager eval time 6: 0.015177727699279785 +eager eval time 7: 0.015617024421691895 +eager eval time 8: 0.015202367782592773 +eager eval time 9: 0.015126527786254883 +~~~~~~~~~~ +compile eval time 0: 5.565470703125 +compile eval time 1: 0.24912281799316408 第二次还是慢了,尽管比第一次运行快得多。这是因为 "reduce-overhead" 模式会为 CUDA 图运行几次预热迭代。 +compile eval time 2: 0.00450867223739624 +compile eval time 3: 0.004577280044555664 +compile eval time 4: 0.003706687927246094 +compile eval time 5: 0.0037672960758209227 +compile eval time 6: 0.003935231924057007 +compile eval time 7: 0.003768320083618164 +compile eval time 8: 0.003927007913589477 +compile eval time 9: 0.0038635520935058594 +""" + + +def predict_many_demo(): + eager_times = [] + for i in range(N_ITERS): + inp = generate_data(16)[0] + with torch.no_grad(): + _, eager_time = timed(lambda: model(inp)) + eager_times.append(eager_time) + print(f"eager eval time {i}: {eager_time}") + + print("~" * 10) + + compile_times = [] + for i in range(N_ITERS): + inp = generate_data(16)[0] + with torch.no_grad(): + _, compile_time = timed(lambda: model_opt(inp)) + compile_times.append(compile_time) + print(f"compile eval time {i}: {compile_time}") + print("~" * 10) + + import numpy as np + + eager_med = np.median(eager_times) + compile_med = np.median(compile_times) + speedup = eager_med / compile_med + assert speedup > 1 + print( + f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x" + ) + print("~" * 10) + + +opt = torch.optim.Adam(model.parameters()) + + +def train(mod, data): + opt.zero_grad(True) + pred = mod(data[0]) + loss = torch.nn.CrossEntropyLoss()(pred, data[1]) + loss.backward() + opt.step() + +""" +eager train time 0: 0.6821947631835937 +eager train time 1: 0.0516577262878418 +eager train time 2: 0.048728256225585936 +eager train time 3: 0.047841407775878905 +eager train time 4: 0.04823257446289062 +eager train time 5: 0.048595008850097654 +eager train time 6: 0.057622528076171874 +eager train time 7: 0.05626262283325195 +eager train time 8: 0.057923583984375 +eager train time 9: 0.058123264312744144 +~~~~~~~~~~ + +compile train time 0: 141.419421875 +compile train time 1: 8.4247080078125 +compile train time 2: 
0.018790399551391602 +compile train time 3: 0.010836992263793945 +compile train time 4: 0.010805248260498047 +compile train time 5: 0.010437631607055664 +compile train time 6: 0.010218496322631837 +compile train time 7: 0.012146688461303711 +compile train time 8: 0.012992511749267579 +compile train time 9: 0.012563455581665038 +~~~~~~~~~~ +(train) eager median: 0.05396017456054687, compile median: 0.012355072021484375, speedup: 4.3674512351457695x +""" +def train_many_demo(): + eager_times = [] + for i in range(N_ITERS): + inp = generate_data(16) + _, eager_time = timed(lambda: train(model, inp)) + eager_times.append(eager_time) + print(f"eager train time {i}: {eager_time}") + print("~" * 10) + # Note that because we are compiling a regular Python function, we do not + # call any .compile() method. + train_opt = torch.compile(train, mode="reduce-overhead") + + compile_times = [] + for i in range(N_ITERS): + inp = generate_data(16) + _, compile_time = timed(lambda: train_opt(model, inp)) + compile_times.append(compile_time) + print(f"compile train time {i}: {compile_time}") + print("~" * 10) + + eager_med = np.median(eager_times) + compile_med = np.median(compile_times) + speedup = eager_med / compile_med + assert speedup > 1 + print( + f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x" + ) + print("~" * 10) + + +if __name__ == '__main__': + train_many_demo() diff --git a/tutorials/compilers/onnx_demo.py b/tutorials/compilers/onnx_demo.py new file mode 100644 index 00000000..b3fba746 --- /dev/null +++ b/tutorials/compilers/onnx_demo.py @@ -0,0 +1,109 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import onnx +import onnxruntime +import onnxscript +import os + + +def get_version(): + print(torch.__version__) + print(onnxscript.__version__) + print(onnxruntime.__version__) + + +"""简单的图像分类器模型""" + + +class ImageClassifierModel(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x: torch.Tensor): + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def onnx_demo(): + # 创建示例输入(固定随机种子以确保可重复性) + torch.manual_seed(42) + example_inputs = (torch.randn(1, 1, 32, 32),) + onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs] + print(f"Input length: {len(onnx_inputs)}") + print(f"Sample input shape: {onnx_inputs[0].shape}") + + # 创建一个PyTorch模型实例,用于导出和比较 + torch_model = ImageClassifierModel() + + # 生成ONNX模型(传入同一个模型实例) + model_2_onnx(torch_model, example_inputs) + + # 加载ONNX模型并运行推理 + ort_session = onnxruntime.InferenceSession( + "./image_classifier_model.onnx", providers=["CPUExecutionProvider"] + ) + + onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in + zip(ort_session.get_inputs(), onnx_inputs)} + + # ONNX Runtime returns a list of outputs + onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0] + + # 使用同一个PyTorch模型进行推理 + torch_outputs = torch_model(*example_inputs) + + print(f"PyTorch output shape: {torch_outputs.shape}") + print(f"ONNX Runtime output shape: {onnxruntime_outputs.shape}") + + # 直接比较整个张量而不是逐个元素比较 + try: + torch.testing.assert_close(torch_outputs, torch.tensor(onnxruntime_outputs), rtol=1e-3, atol=1e-3) + print("PyTorch and 
ONNX Runtime output matched!") + except AssertionError as e: + print(f"Outputs didn't match: {e}") + # 输出详细的差异信息 + print("\nPyTorch output:") + print(torch_outputs) + print("\nONNX Runtime output:") + print(onnxruntime_outputs) + print("\nDifference:") + print(torch_outputs - torch.tensor(onnxruntime_outputs)) + + print(f"Output length: {onnxruntime_outputs.shape[1]}") + print(f"Sample output: {onnxruntime_outputs[0][:5]}...") + + +def model_2_onnx(torch_model, example_inputs): + # 导出模型为ONNX格式,使用默认的opset版本 + torch.onnx.export( + torch_model, + example_inputs, + "image_classifier_model.onnx", + export_params=True, + # 不指定opset_version,让PyTorch自动选择合适的版本 + do_constant_folding=True, + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + ) + + # 验证导出的模型 + onnx_model = onnx.load("image_classifier_model.onnx") + onnx.checker.check_model(onnx_model) + print("ONNX model generated and validated successfully!") + print(f"ONNX model opset version: {onnx_model.opset_import[0].version}") + + +if __name__ == '__main__': + onnx_demo() \ No newline at end of file diff --git a/tutorials/compilers/onnxscript_demo.py b/tutorials/compilers/onnxscript_demo.py new file mode 100644 index 00000000..80ac8aee --- /dev/null +++ b/tutorials/compilers/onnxscript_demo.py @@ -0,0 +1,43 @@ +import torch +import onnxscript + +# Opset 18 is the standard supported version as of PyTorch 2.6 +from onnxscript import opset18 as op + + +# Create a model that uses the operator torch.ops.aten.add.Tensor +class Model(torch.nn.Module): + def forward(self, input_x, input_y): + return torch.ops.aten.add.Tensor(input_x, input_y) + + +# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# All attributes must be annotated with type hints. +def custom_aten_add(self, other, alpha: float = 1.0): + if alpha != 1.0: + alpha = op.CastLike(alpha, other) + other = op.Mul(other, alpha) + # To distinguish the custom implementation from the builtin one, we switch the order of the inputs + return op.Add(other, self) + + +x = torch.tensor([1.0]) +y = torch.tensor([2.0]) + +# Then we provide the custom implementation to the ONNX exporter as a ``custom_translation_table``. 
+onnx_program = torch.onnx.export( + Model().eval(), + (x, y), + dynamo=True, + custom_translation_table={ + torch.ops.aten.add.Tensor: custom_aten_add, + }, +) +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() +print(onnx_program.model) + +result = onnx_program(x, y)[0] +print(f"Result: {result}") +torch.testing.assert_close(result, torch.tensor([3.0])) \ No newline at end of file diff --git a/tutorials/compilers/set_stance_demo.py b/tutorials/compilers/set_stance_demo.py new file mode 100644 index 00000000..255bb49f --- /dev/null +++ b/tutorials/compilers/set_stance_demo.py @@ -0,0 +1,100 @@ +import torch + + +@torch.compile +def my_big_model(x): + return torch.relu(x) + + +# fail_on_recompile 防止重新编译 +def fail_on_recompile(): + # first compilation + my_big_model(torch.randn(3)) + + with torch.compiler.set_stance("fail_on_recompile"): + my_big_model(torch.randn(3)) # no recompilation - OK + try: + # 这里 shape 改变了,会触发 recompilation + my_big_model(torch.randn(4)) # recompilation - error + except Exception as e: + print(e) + + +@torch.compile +def my_huge_model(x): + if torch.compiler.is_compiling(): + return x + 1 + else: + return x - 1 + + +""" +报错过于 disruptive,我们可以改用 "eager_on_recompile",它将导致 torch.compile 回退到立即执行模式而不是报错。 +如果预计重新编译不会频繁发生,但一旦需要,我们宁愿承担立即执行的成本而不是重新编译的成本,那么这可能很有用。 +""" + + +def eager_on_recompile(): + # first compilation + print(my_huge_model(torch.zeros(3))) # 1 + with torch.compiler.set_stance("eager_on_recompile"): + print(my_huge_model(torch.zeros(3))) # 1 + print(my_huge_model(torch.zeros(4))) # -1 + print(my_huge_model(torch.zeros(3))) # 1 + + +# 衡量性能提升 +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. We use CUDA events and synchronization for the most accurate +# measurements. 
+def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +@torch.compile +def my_gigantic_model(x, y): + x = x @ y + x = x @ y + x = x @ y + return x + + +""" +eager: 0.0004822399914264679 +compiled: 0.00010444799810647964 +""" + + +def force_eager_demo(): + inps = torch.randn(5, 5), torch.randn(5, 5) + with torch.compiler.set_stance("force_eager"): + print("eager:", timed(lambda: my_gigantic_model(*inps))[1]) + # warmups + for _ in range(3): + my_gigantic_model(*inps) + print("compiled:", timed(lambda: my_gigantic_model(*inps))[1]) + + +@torch.compile +def my_humongous_model(x): + return torch.sin(x, x) + +def fast_find_error(): + try: + # sin() takes 1 positional argument but 2 were given + with torch.compiler.set_stance("force_eager"): + print(my_humongous_model(torch.randn(3))) + # this call to the compiled model won't run + print(my_humongous_model(torch.randn(3))) + except Exception as e: + print(e) + +if __name__ == '__main__': + fast_find_error() + diff --git a/tutorials/compilers/torch_compile.py b/tutorials/compilers/torch_compile.py new file mode 100644 index 00000000..77ffce45 --- /dev/null +++ b/tutorials/compilers/torch_compile.py @@ -0,0 +1,45 @@ +import torch +import torch.utils.benchmark as benchmark +from torch.utils.benchmark import Language +from torch.utils.benchmark import Timer +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") +output = model(input) +output.sum().backward() +opt = torch.optim.Adam(model.parameters(), lr=0.01) + + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + + +def benchmark_torch_function_in_microseconds(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return t0.blocked_autorange().mean * 1e6 + + +def warmup(): + for _ in range(5): + fn() + + +def diff(): + eager_runtime = benchmark_torch_function_in_microseconds(opt.step) + compiled_runtime = benchmark_torch_function_in_microseconds(fn) + + assert eager_runtime > compiled_runtime + + print(f"eager runtime: {eager_runtime}us") + print(f"compiled runtime: {compiled_runtime}us") + + + + +if __name__ == '__main__': + diff() + diff --git a/tutorials/ddp/basic_ddp_demo.py b/tutorials/ddp/basic_ddp_demo.py new file mode 100644 index 00000000..3058607d --- /dev/null +++ b/tutorials/ddp/basic_ddp_demo.py @@ -0,0 +1,137 @@ +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.optim as optim +import os +from torch.nn.parallel import DistributedDataParallel as DDP + +import os +import sys +import tempfile +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp + +from torch.nn.parallel import DistributedDataParallel as DDP + + +# On Windows platform, the torch.distributed package only +# supports Gloo backend, FileStore and TcpStore. +# For FileStore, set init_method parameter in init_process_group +# to a local file. Example as follow: +# init_method="file:///f:/libtmp/some_file" +# dist.init_process_group( +# "gloo", +# rank=rank, +# init_method=init_method, +# world_size=world_size) +# For TcpStore, same way as on Linux. 
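+
+# A minimal sketch of the FileStore variant described above. This helper is
+# hypothetical (it is not used by the demos below), and the shared file path
+# is an assumption -- any path visible to every process works.
+def setup_file_store(rank, world_size, init_file="/tmp/ddp_init_file"):
+    # Gloo plus a shared init file is the portable choice on Windows.
+    dist.init_process_group(
+        "gloo",
+        init_method=f"file://{init_file}",
+        rank=rank,
+        world_size=world_size,
+    )
+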
+ +def setup(rank, world_size): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + + # We want to be able to train our model on an `accelerator `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() + print('Accelerator:', acc) + backend = torch.distributed.get_default_backend_for_device(acc) + # initialize the process group + dist.init_process_group(backend, rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +def demo_basic(rank, world_size): + print(f"Running basic DDP example on rank {rank}.") + setup(rank, world_size) + + # create model and move it to GPU with id rank + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + cleanup() + print(f"Finished running basic DDP example on rank {rank}.") + + +def run_demo(demo_fn, world_size): + mp.spawn(demo_fn, + args=(world_size,), + nprocs=world_size, + join=True) + + +def demo_checkpoint(rank, world_size): + print(f"Running DDP checkpoint example on rank {rank}.") + setup(rank, world_size) + + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint" + print(f"Checkpoint path: {CHECKPOINT_PATH}") + if rank == 0: + # All processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes. + # Therefore, saving it in one process is sufficient. + torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) + + # Use a barrier() to make sure that process 1 loads the model after process + # 0 saves it. + dist.barrier() + # We want to be able to train our model on an `accelerator `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() + # configure map_location properly + map_location = {f'{acc}:0': f'{acc}:{rank}'} + ddp_model.load_state_dict( + torch.load(CHECKPOINT_PATH, map_location=map_location, weights_only=True)) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + + loss_fn(outputs, labels).backward() + optimizer.step() + + # Not necessary to use a dist.barrier() to guard the file deletion below + # as the AllReduce ops in the backward pass of DDP already served as + # a synchronization. + + if rank == 0: + os.remove(CHECKPOINT_PATH) + + cleanup() + print(f"Finished running DDP checkpoint example on rank {rank}.") + + +if __name__ == '__main__': + run_demo(demo_checkpoint, world_size=2) diff --git a/tutorials/ddp/distributed_demo.py b/tutorials/ddp/distributed_demo.py new file mode 100644 index 00000000..272a27ed --- /dev/null +++ b/tutorials/ddp/distributed_demo.py @@ -0,0 +1,79 @@ +"""run.py:""" +# !/usr/bin/env python +import os +import sys +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +# def run(rank, size): +# """ Distributed function to be implemented later. 
""" +# """ 分布式函数示例 """ +# print(f"Rank {rank} of {size} is running.") +# # 简单的分布式通信示例 +# tensor = torch.tensor([rank], dtype=torch.float32) +# dist.all_reduce(tensor, op=dist.ReduceOp.SUM) +# print(f"Rank {rank} reduced tensor: {tensor.item()}") +# +# """Blocking point-to-point communication.""" +# +# +# def run(rank, size): +# tensor = torch.zeros(1) +# if rank == 0: +# tensor += 1 +# # Send the tensor to process 1 +# dist.send(tensor=tensor, dst=1) +# else: +# # Receive tensor from process 0 +# dist.recv(tensor=tensor, src=0) +# print('Rank ', rank, ' has data ', tensor[0]) +"""Non-blocking point-to-point communication.""" + + +# def run(rank, size): +# tensor = torch.zeros(1) +# req = None +# if rank == 0: +# tensor += 1 +# # Send the tensor to process 1 +# req = dist.isend(tensor=tensor, dst=1) +# print('Rank 0 started sending') +# else: +# # Receive tensor from process 0 +# req = dist.irecv(tensor=tensor, src=0) +# print('Rank 1 started receiving') +# req.wait() +# print('Rank ', rank, ' has data ', tensor[0]) +def run(rank, size): + """ Simple collective communication. """ + group = dist.new_group([0, 1]) + tensor = torch.ones(1) + print(f"tensor: {tensor}") + dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group) + print('Rank ', rank, ' has data ', tensor[0]) + + +def init_process(rank, size, fn, backend='gloo'): + """ Initialize the distributed environment. """ + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29500' + dist.init_process_group(backend, rank=rank, world_size=size) + fn(rank, size) + + +if __name__ == "__main__": + world_size = 2 + processes = [] + if "google.colab" in sys.modules: + print("Running in Google Colab") + mp.get_context("spawn") + else: + mp.set_start_method("spawn") + for rank in range(world_size): + p = mp.Process(target=init_process, args=(rank, world_size, run)) + p.start() + processes.append(p) + + for p in processes: + p.join() diff --git a/tutorials/ddp/pipelining_tutorial.py b/tutorials/ddp/pipelining_tutorial.py new file mode 100644 index 00000000..f0337ab6 --- /dev/null +++ b/tutorials/ddp/pipelining_tutorial.py @@ -0,0 +1,120 @@ +import os +import torch.distributed as dist +import torch +from torch.distributed.pipelining import pipeline, SplitPoint, PipelineStage, ScheduleGPipe +import torch +import torch.nn as nn +from dataclasses import dataclass + +global rank, device, pp_group, stage_index, num_stages + + +@dataclass +class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = 10000 + + +class Transformer(nn.Module): + def __init__(self, model_args: ModelArgs): + super().__init__() + + self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) + + # Using a ModuleDict lets us delete layers witout affecting names, + # ensuring checkpoints will correctly save and load. 
+ self.layers = torch.nn.ModuleDict() + for layer_id in range(model_args.n_layers): + self.layers[str(layer_id)] = nn.TransformerDecoderLayer(model_args.dim, model_args.n_heads) + + self.norm = nn.LayerNorm(model_args.dim) + self.output = nn.Linear(model_args.dim, model_args.vocab_size) + + def forward(self, tokens: torch.Tensor): + # Handling layers being 'None' at runtime enables easy pipeline splitting + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, h) + + h = self.norm(h) if self.norm else h + output = self.output(h).clone() if self.output else h + return output + + +def init_distributed(): + global rank, device, pp_group, stage_index, num_stages + rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu") + dist.init_process_group() + + # This group can be a sub-group in the N-D parallel case + pp_group = dist.new_group() + stage_index = rank + num_stages = world_size + + +def manual_model_split(model) -> PipelineStage: + if stage_index == 0: + # prepare the first stage model + for i in range(4, 8): + del model.layers[str(i)] + model.norm = None + model.output = None + + elif stage_index == 1: + # prepare the second stage model + for i in range(4): + del model.layers[str(i)] + model.tok_embeddings = None + + stage = PipelineStage( + model, + stage_index, + num_stages, + device, + ) + return stage + + +if __name__ == "__main__": + init_distributed() + num_microbatches = 4 + model_args = ModelArgs() + model = Transformer(model_args) + + # Dummy data + x = torch.ones(32, 500, dtype=torch.long) + y = torch.randint(0, model_args.vocab_size, (32, 500), dtype=torch.long) + example_input_microbatch = x.chunk(num_microbatches)[0] + + # Option 1: Manual model splitting + stage = manual_model_split(model) + + # Option 2: Tracer model splitting + # stage = tracer_model_split(model, example_input_microbatch) + + model.to(device) + x = x.to(device) + y = y.to(device) + + + def tokenwise_loss_fn(outputs, targets): + loss_fn = nn.CrossEntropyLoss() + outputs = outputs.reshape(-1, model_args.vocab_size) + targets = targets.reshape(-1) + return loss_fn(outputs, targets) + + + schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn) + + if rank == 0: + schedule.step(x) + elif rank == 1: + losses = [] + output = schedule.step(target=y, losses=losses) + print(f"losses: {losses}") + dist.destroy_process_group() diff --git a/tutorials/ddp/pytorch_elastic.py b/tutorials/ddp/pytorch_elastic.py new file mode 100644 index 00000000..01a44ff6 --- /dev/null +++ b/tutorials/ddp/pytorch_elastic.py @@ -0,0 +1,47 @@ +import os +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim + +from torch.nn.parallel import DistributedDataParallel as DDP + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +def demo_basic(): + torch.accelerator.set_device_index(int(os.environ["LOCAL_RANK"])) + acc = torch.accelerator.current_accelerator() + print('Accelerator:', acc) + backend = torch.distributed.get_default_backend_for_device(acc) + dist.init_process_group(backend) + rank = dist.get_rank() + print(f"Start running basic DDP example on rank {rank}.") + # create model and 
move it to GPU with id rank + device_id = rank % torch.accelerator.device_count() + print(f"Device ID: {device_id}") + model = ToyModel().to(device_id) + ddp_model = DDP(model, device_ids=[device_id]) + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(device_id) + loss_fn(outputs, labels).backward() + optimizer.step() + dist.destroy_process_group() + print(f"Finished running basic DDP example on rank {rank}.") +""" +torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py +""" +if __name__ == "__main__": + demo_basic() diff --git a/tutorials/tcp/libuv_demo.py b/tutorials/tcp/libuv_demo.py new file mode 100644 index 00000000..1c9d5ac8 --- /dev/null +++ b/tutorials/tcp/libuv_demo.py @@ -0,0 +1,30 @@ +import logging +import os + +from time import perf_counter + +import torch +import torch.distributed as dist + +logging.basicConfig(level=logging.INFO) +logger: logging.Logger = logging.getLogger(__name__) + +if __name__ == '__main__': + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + start = perf_counter() + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) diff --git a/tutorials/torch-distributed/init-process-group.py b/tutorials/torch-distributed/init-process-group.py new file mode 100644 index 00000000..c034fe29 --- /dev/null +++ b/tutorials/torch-distributed/init-process-group.py @@ -0,0 +1,25 @@ +import os +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + + + +def init_process(rank, world_size): + print(f"进程已启动: 此进程的 rank 是 {rank}") + + +def main(): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + world_size = torch.cuda.device_count() + print(f"准备启动 {world_size} 个进程...") + mp.spawn( + init_process, + args=(world_size,), + nprocs=world_size, + join=True + ) + +if __name__ == "__main__": + main() diff --git a/tutorials/torch-distributed/readme.md b/tutorials/torch-distributed/readme.md new file mode 100644 index 00000000..0373b825 --- /dev/null +++ b/tutorials/torch-distributed/readme.md @@ -0,0 +1,94 @@ +
+init_process_group + +```python +import os +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + + +def init_process(rank, world_size): + print(f"进程已启动: 此进程的 rank 是 {rank}") + + # 设置当前进程使用的 GPU + torch.cuda.set_device(rank) + + try: + # 加入进程组 + print(f"进程 {rank} 正在加入进程组...") + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + print(f"进程 {rank} 已成功加入进程组") + + # 验证身份 + assert rank == dist.get_rank() + assert world_size == dist.get_world_size() + + # 准备当前进程的信息 + process_info = ( + f"\n进程 {rank} 信息:\n" + f"- Device: {torch.cuda.current_device()}\n" + f"- GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}\n" + ) + + # 将字符串转换为固定长度的张量 + max_len = 100 # 确保足够长以容纳信息 + process_info_tensor = torch.zeros(max_len, dtype=torch.int32, device='cuda') + process_info_bytes = process_info.encode('utf-8') + process_info_tensor[:len(process_info_bytes)] = torch.tensor([b for b in process_info_bytes], dtype=torch.int32) + + # 创建用于收集所有进程信息的张量列表 + gathered_tensors = [torch.zeros(max_len, dtype=torch.int32, device='cuda') for _ in range(world_size)] + + # 使用 all_gather 收集所有进程的信息 + dist.all_gather(gathered_tensors, process_info_tensor) + + + if rank == 0: + print("=============== 所有进程信息 ===============") + for tensor in gathered_tensors: + info_bytes = tensor.cpu().numpy().astype('uint8').tobytes() + info_str = info_bytes.decode('utf-8', 'ignore').strip('\x00') + print(info_str) + + # 创建张量并进行通信 + tensor = torch.ones(1).cuda() * rank + print(f"进程 {rank} 的原始张量值: {tensor.item()}") + + # 所有进程同步点 + dist.all_reduce(tensor, op=dist.ReduceOp.SUM) + print(f"进程 {rank} 的最终张量值: {tensor.item()}") + + finally: + dist.destroy_process_group() + +def main(): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + + world_size = torch.cuda.device_count() + print(f"准备启动 {world_size} 个进程...") + + mp.spawn( + init_process, + args=(world_size,), + nprocs=world_size, + join=True + ) + + #! 等价于通过以下代码启动进程 + # processes = [] + # for rank in range(world_size): + # p = mp.Process(target=init_process, args=(rank, world_size)) + # p.start() + # processes.append(p) + + # # 相当于 join=True 的效果 + # for p in processes: + # p.join() + +if __name__ == "__main__": + main() +``` + +
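+
+No extra launcher is needed here: `mp.spawn` starts one worker process per
+GPU itself, so (assuming a machine with at least two CUDA GPUs) running
+`python init-process-group.py` is enough.
+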
\ No newline at end of file diff --git a/tutorials/torch-layout/demo1.py b/tutorials/torch-layout/demo1.py new file mode 100644 index 00000000..72f49858 --- /dev/null +++ b/tutorials/torch-layout/demo1.py @@ -0,0 +1,144 @@ +import torch + + +def stride_demo(): + """ + 连续和非连续布局 + 1.按照张量的逻辑形状(比如 2x3、3x2),以C 风格(行优先) 遍历每一个元素时,访问的内存地址是否是连续递增的。 + 为什么内存布局重要? + 性能影响:连续布局的张量访问内存时,CPU/GPU 的缓存命中率更高,运算速度更快;非连续张量可能因为内存跳跃访问导致性能下降。 + 操作限制:部分 PyTorch 操作(如view、resize_)仅支持连续张量,非连续张量会抛出RuntimeError。 + 内存效率:像转置这样的操作通过修改布局而非复制数据,能节省大量内存。 + """ + # 从0到12 分成三行四列 + x = torch.arange(12).view(3, 4) + print(f"view: {x}") + print(f"shape: {x.shape}") + # 在指定维度上从一个元素调到下一个元素所需的距离 dim 是两个数字,为了访问下一行需要往前移动4步,下一列应该向前移动1步 + print(f"stride: {x.stride()}") + print(f"is_contiguous: {x.is_contiguous()}") + print("\n***********************************\n") + y = x.t() + + # 这里没有真正的转置,只是改变了视图的 stride “如何从内存中读取数据” + print(f"v_t is_contiguous: {y.is_contiguous()}") + print(f"v_t: {y}") + print(f"v_t shape: {y.shape}") + print(f"v_t stride: {y.stride()}") + print("\n***********************************\n") + z = y.contiguous() + print(f"v_t is_contiguous: {z.is_contiguous()}") + print(f"v_t: {z}") + print(f"v_t shape: {z.shape}") + print(f"v_t stride: {z.stride()}") + + +def storage_demo(): + print(f"PyTorch版本: {torch.__version__}") + print( + f"操作系统: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}" + ) + + x = torch.tensor([[0, 1, 2], [3, 4, 5]], dtype=torch.float32) + storage_x = x.storage() + # 1. storage永远是一维的,不管Tensor是几维 + print(f"storage:\n{storage_x}\n") + print("storage的类型:", type(storage_x)) # torch.storage._TypedStorage + print("storage的长度:", len(storage_x)) # 6(元素总数) + print("storage_x的id:", id(storage_x)) + + # 2. 转置后的Tensor共享同一个storage(物理内存没复制) + y = x.t() + storage_y = y.storage() + print("x的数据指针:", x.data_ptr()) + print("y的数据指针:", y.data_ptr()) + print("x和y的数据指针是否相同:", x.data_ptr() == y.data_ptr()) + + print("storage_y的id:", id(storage_y)) + + +def shared_storage_demo(): + """ + 验证多个tensor共享storage时,修改一个会影响其他所有tensor + """ + print("=" * 60) + print("验证多个tensor共享storage的行为") + print("=" * 60) + + # 创建原始tensor + x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=torch.float32) + + # 创建多个共享storage的tensor + y = x.t() # 转置 + z = x.view(-1) # 展平成一维 + w = x[1:, 2:] # 切片 + v = x.reshape(2, 6) # reshape(如果可能,会共享storage) + + print("\n【初始状态】") + print(f"x (原始) =\n{x}") + print(f"y (转置) =\n{y}") + print(f"z (展平) = {z}") + print(f"w (切片) =\n{w}") + print(f"v (reshape) =\n{v}") + + # 验证它们是否共享存储(通过data_ptr比较) + print("\n【内存地址验证】") + print(f"x.data_ptr() = {x.data_ptr()}") + print(f"y.data_ptr() = {y.data_ptr()}") + print(f"z.data_ptr() = {z.data_ptr()}") + print(f"w.data_ptr() = {w.data_ptr()} (切片会有偏移)") + print(f"v.data_ptr() = {v.data_ptr()}") + print(f"\nx和y是否共享底层内存: {x.data_ptr() == y.data_ptr()}") + print(f"x和z是否共享底层内存: {x.data_ptr() == z.data_ptr()}") + print(f"x和v是否共享底层内存: {x.data_ptr() == v.data_ptr()}") + + # 修改x的一个元素 + print("\n【修改x[0, 0] = 999】") + x[0, 0] = 999 + + print(f"x =\n{x}") + print(f"y =\n{y}") # y[0, 0]应该也变成999 + print(f"z = {z}") # z[0]应该也变成999 + print(f"w =\n{w}") # w不受影响,因为它是切片[1:, 2:] + print(f"v =\n{v}") # v[0, 0]应该也变成999 + + # 修改y的一个元素 + print("\n【修改y[1, 1] = 888】") + y[1, 1] = 888 + + print(f"x =\n{x}") # x[1, 1]应该也变成888 + print(f"y =\n{y}") + print(f"z = {z}") # z[5]应该也变成888 + print(f"v =\n{v}") # v相应位置也会变化 + + # 修改z的一个元素 + print("\n【修改z[10] = 777】") + z[10] = 777 + + print(f"x =\n{x}") # x[2, 2]应该也变成777 + print(f"y =\n{y}") + print(f"z = {z}") + print(f"w 
=\n{w}") # w[1, 0]应该也变成777(因为w是x[1:, 2:]) + + # 对比:使用clone()创建真正的副本 + print("\n【对比:使用clone()创建独立副本】") + x_copy = x.clone() + print(f"x_copy.data_ptr() = {x_copy.data_ptr()}") + print(f"x_copy与x是否共享内存: {x_copy.data_ptr() == x.data_ptr()}") + + x[0, 1] = 666 + print("\n修改x[0, 1] = 666后:") + print(f"x =\n{x}") + print(f"x_copy =\n{x_copy} (不受影响)") + + print("\n" + "=" * 60) + + +if __name__ == "__main__": + # shared_storage_demo() + x = torch.randn(2, 3, 4) + print(f"x: {x}") + print(f"x shape: {x.shape}") + print(f"x stride: {x.stride()}") + # import json + # print(json.dumps(x.tolist(), indent=4))
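+    # For a contiguous tensor of shape (2, 3, 4) the stride is computed
+    # right-to-left: (3 * 4, 4, 1) == (12, 4, 1), i.e. stepping along dim 0
+    # skips one whole 3x4 block of 12 elements in storage.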