diff --git a/.gitignore b/.gitignore
index 7eade253..40ac8ab9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,17 @@
*.pkl
*.zip
+*.pth
+*.txt
+*.ckpt
+*.pyc
+*.onnx
+*.data
+*.lock
+__pycache__/
+.DS_Store
+.idea/
+.pytest_cache/
+.ruff_cache/
data/
.ipynb_checkpoints
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..9227b116
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python: 当前文件",
+ "type": "python",
+ "request": "launch",
+ "program": "${file}",
+ "console": "integratedTerminal",
+ "cwd": "${workspaceFolder}",
+ "justMyCode": true
+ }
+ ]
+}
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..a6c53728
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,14 @@
+{
+ "python.defaultInterpreterPath": "${workspaceFolder}/pytorch-tutorial/bin/python",
+ "python.terminal.activateEnvironment": true,
+ "python.analysis.typeCheckingMode": "basic",
+ "[python]": {
+ "editor.defaultFormatter": "charliermarsh.ruff",
+ "editor.formatOnSave": true,
+ "editor.codeActionsOnSave": {
+ "source.fixAll": "explicit",
+ "source.organizeImports": "explicit"
+ }
+ }
+}
+
diff --git a/README.md b/README.md
index 59ac3300..6b0b5bb3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
+
+## uv
+```bash
+# create a virtual environment pinned to Python 3.11 (plain `uv venv` uses the default interpreter)
+uv venv --python 3.11
+source .venv/bin/activate
+# add a dependency to pyproject.toml and install it
+uv add <package>
+# sync the environment with pyproject.toml / uv.lock
+uv sync
+```

--------------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..8bf6d2a4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,59 @@
+[project]
+name = "tutorials"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.9"
+authors = [
+ { name = "lihanghang", email = "lihanghang@guazi.com" } # inline-table format, per the current packaging spec
+]
+dependencies = [
+ "matplotlib>=3.9.4",
+ "onnx>=1.19.1",
+ "onnxruntime>=1.20.1",
+ "onnxscript>=0.5.7",
+ "pandas>=2.3.3",
+ "setuptools>=80.9.0",
+ "tabulate>=0.9.0",
+ "tensordict>=0.10.0",
+ "torchvision>=0.23.0",
+]
+
+[tool.poetry.dependencies]
+aiohttp = "3.12.14" # async HTTP client/server library
+urllib3 = "2.6.2" # HTTP client with connection pooling and thread safety
+orjson = ">=3.9.14,<4.0.0" # high-performance JSON serialization/deserialization
+uuid = "^1.30" # UUID generation/manipulation (the stdlib uuid module usually suffices)
+torch = "2.8.0" # PyTorch core library, the deep learning framework
+contourpy = "1.3.0" # contour plotting, matplotlib dependency
+cycler = "0.12.1" # composable style cycles, matplotlib dependency
+fonttools = "4.60.2" # font file handling, matplotlib dependency
+kiwisolver = "1.4.7" # constraint solver, matplotlib dependency
+pyparsing = "3.3.1" # string/grammar parsing, matplotlib dependency
+importlib-resources = "6.5.2" # access to resources bundled inside Python packages
+matplotlib = "3.9.4" # data visualization library for charts and figures
+python-dateutil = "2.9.0.post0" # extended date/time handling beyond the stdlib
+six = "1.17.0" # Python 2/3 compatibility shims
+click = "8.1.8" # library for building command-line tools
+joblib = "1.5.3" # parallel computing and task scheduling, common in ML pipelines
+nltk = "3.9.2" # natural language toolkit with corpora and algorithms
+regex = "2025.11.3" # extended regular expressions, more capable than the stdlib re
+tqdm = "4.67.1" # progress bars for loops and long-running tasks
+pycocotools = "2.0.11" # COCO dataset tools for detection/segmentation tasks
+argparse = "1.4.0" # command-line argument parsing (stdlib since Python 3.2; pin likely unnecessary)
+pandas = "2.3.3"
+pytz = "2025.2"
+tzdata = "2025.3"
+tensordict = "0.10.0"
+cloudpickle = "3.1.2"
+importlib-metadata = "8.7.1"
+pyvers = "0.1.0"
+pillow = "11.3.0"
+setuptools = "80.9.0"
+[[tool.poetry.source]]
+name = "aliyun"
+url = "https://mirrors.huaweicloud.com/repository/pypi/simple/"
+priority = "primary"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/tutorials/01-basics/feedforward_neural_network/main.py b/tutorials/01-basics/feedforward_neural_network/main.py
index 0c766a7e..0fb48bbf 100644
--- a/tutorials/01-basics/feedforward_neural_network/main.py
+++ b/tutorials/01-basics/feedforward_neural_network/main.py
@@ -2,93 +2,121 @@
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
+import ssl
+# Feedforward neural network (FNN)
+"""
+A feedforward network is the network *structure*; backpropagation is the core algorithm that trains it.
+The architecture is the network's "skeleton": it defines how neurons are grouped into layers, how the
+layers connect, how many neurons each layer has, and which activation functions are used. It determines
+how information flows through the network and underlies the model's ability to fit data; backpropagation
+(the training algorithm) is the method that shapes this skeleton.
-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+"""
+# Workaround for MNIST download failures on machines with broken certificate chains;
+# note this disables TLS certificate verification.
+ssl._create_default_https_context = ssl._create_unverified_context
-# Hyper-parameters
-input_size = 784
-hidden_size = 500
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data',
- train=False,
- transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(NeuralNet, self).__init__()
- self.fc1 = nn.Linear(input_size, hidden_size)
- self.relu = nn.ReLU()
- self.fc2 = nn.Linear(hidden_size, num_classes)
-
+        self.fc1 = nn.Linear(input_size, hidden_size)   # input layer -> hidden layer
+        self.relu = nn.ReLU()                           # activation function
+        self.fc2 = nn.Linear(hidden_size, num_classes)  # hidden layer -> output layer
+
def forward(self, x):
- out = self.fc1(x)
- out = self.relu(out)
- out = self.fc2(out)
+        out = self.fc1(x)     # linear transform
+        out = self.relu(out)  # non-linear activation
+        out = self.fc2(out)   # linear transform
return out
-model = NeuralNet(input_size, hidden_size, num_classes).to(device)
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- # Move tensors to the configured device
- images = images.reshape(-1, 28*28).to(device)
- labels = labels.to(device)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.reshape(-1, 28*28).to(device)
- labels = labels.to(device)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
- print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+"""
+FNN与之前学习的模型对比
+模型 结构复杂度 学习能力 应用场景
+线性回归 简单(单层) 只能学习线性关系 简单回归任务
+逻辑回归 简单(单层+激活) 只能学习线性可分的分类 简单分类任务
+前馈神经网络 复杂(多层+激活) 可学习复杂非线性关系 复杂分类/回归任务
+
+"""
+"""
+前馈神经网络的作用
+FNN的核心作用是学习输入与输出之间的复杂映射关系,主要用于两类任务:
+
+分类任务:将输入数据分为不同类别(如代码中的MNIST数字分类)
+回归任务:预测连续数值(如房价预测、股票价格预测)
+其强大之处在于:通过多层结构和非线性激活,能够拟合几乎任何复杂的函数关系(这是神经网络的"万能近似定理")。
+"""
+if __name__ == '__main__':
+
+ # Device configuration
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ # Hyper-parameters
+ input_size = 784
+ hidden_size = 500
+ num_classes = 10
+ num_epochs = 5
+ batch_size = 100
+ learning_rate = 0.001
+
+ # MNIST dataset
+ train_dataset = torchvision.datasets.MNIST(root='../../data',
+ train=True,
+ transform=transforms.ToTensor(),
+ download=True)
+
+ test_dataset = torchvision.datasets.MNIST(root='../../data',
+ train=False,
+ transform=transforms.ToTensor())
+
+ # Data loader
+ train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+ batch_size=batch_size,
+ shuffle=True)
+
+ test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+ batch_size=batch_size,
+ shuffle=False)
+
+ model = NeuralNet(input_size, hidden_size, num_classes).to(device)
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss()
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+ # Train the model
+ total_step = len(train_loader)
+ for epoch in range(num_epochs):
+ for i, (images, labels) in enumerate(train_loader):
+ # Move tensors to the configured device
+ images = images.reshape(-1, 28 * 28).to(device)
+ labels = labels.to(device)
+
+ # Forward pass
+ outputs = model(images)
+ loss = criterion(outputs, labels)
+
+ # Backward and optimize
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ if (i + 1) % 100 == 0:
+ print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+ .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+ # Test the model
+ # In test phase, we don't need to compute gradients (for memory efficiency)
+ with torch.no_grad():
+ correct = 0
+ total = 0
+ for images, labels in test_loader:
+ images = images.reshape(-1, 28 * 28).to(device)
+ labels = labels.to(device)
+ outputs = model(images)
+ _, predicted = torch.max(outputs.data, 1)
+ total += labels.size(0)
+ correct += (predicted == labels).sum().item()
+
+ print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
+
+ # Save the model checkpoint
+ torch.save(model.state_dict(), 'model.ckpt')
diff --git a/tutorials/01-basics/life/README.md b/tutorials/01-basics/life/README.md
new file mode 100644
index 00000000..3942dda3
--- /dev/null
+++ b/tutorials/01-basics/life/README.md
@@ -0,0 +1,193 @@
+## 1. Core goal of a recommender system
+Precisely match users with content/items they are likely to be interested in, while helping the platform:
+
+- improve user retention and engagement
+- increase user spending and watch time
+- make better use of platform resources
+- maximize commercial value
+## 2. Basic pipeline
+1. Data collection:
+   - user behavior data: browsing history, clicks, purchases, ratings, favorites, shares, etc.
+   - item feature data: product category, price, brand, description; a video's tags, duration, creator, etc.
+   - user profile data: age, gender, region, preferences, etc.
+2. Data preprocessing:
+   - clean dirty data (outliers, missing values)
+   - feature engineering (key-feature extraction, encoding, normalization, etc.)
+   - build the user-item interaction matrix
+3. Model training:
+   - use machine learning or deep learning to learn user-preference patterns from the data
+   - common algorithms: collaborative filtering, content-based recommendation, deep-learning recommenders
+4. Recommendation generation:
+   - predict the user's interest in each candidate item
+   - rank candidates by predicted interest to produce the recommendation list
+5. Online serving and evaluation:
+   - show recommendations to users in real time
+   - collect feedback and keep improving the model
+## 3. Main algorithms
+### 3.1 Collaborative filtering
+Collaborative filtering (CF) is the classic recommender algorithm, built on the assumption that **similar users like similar items**:
+
+#### (1) User-based CF
+- Idea: find users whose tastes resemble the target user's, then recommend items those similar users liked
+- Steps:
+  1. compute user-user similarity (e.g. cosine similarity, Pearson correlation)
+  2. find the K most similar neighbors of the target user
+  3. recommend items the neighbors liked that the target user has not interacted with yet
+#### (2) Item-based CF
+- Idea: find items similar to those the target user already likes, and recommend them
+- Steps:
+  1. compute item-item similarity (e.g. cosine similarity, adjusted cosine similarity)
+  2. for each item the user likes, find the K most similar items
+  3. recommend those similar items to the user
+- Traits: low computational cost and stable results; common on e-commerce platforms (e.g. Amazon). A minimal similarity sketch follows below.
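+
+A toy item-based CF sketch (illustrative only; `ratings` is a made-up dense user-item matrix, whereas real systems work with sparse data and approximate nearest-neighbor search):
+
+```python
+import torch
+
+# rows = users, cols = items; 0 means "no interaction"
+ratings = torch.tensor([[5., 3., 0., 1.],
+                        [4., 0., 0., 1.],
+                        [1., 1., 0., 5.],
+                        [0., 1., 5., 4.]])
+
+# cosine similarity between item columns
+item_vecs = ratings.t()                              # (num_items, num_users)
+normed = item_vecs / item_vecs.norm(dim=1, keepdim=True).clamp_min(1e-8)
+item_sim = normed @ normed.t()                       # (num_items, num_items)
+
+# score items for user 0 as a similarity-weighted sum of that user's ratings
+user = ratings[0]
+scores = item_sim @ user
+scores[user > 0] = float("-inf")                     # mask items already rated
+print(scores.argmax().item())                        # top recommendation: item 2
+```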
+
+### 3.2 Content-based recommendation
+- Idea: recommend items whose features resemble those of items the user liked in the past
+- Steps:
+  1. extract item feature vectors (e.g. a product's category, brand, price; a video's tags and style)
+  2. learn the user's preference weights over those features from their history
+  3. rank items by how well their features match the user's preference profile
+- Traits: needs no data from other users, so it handles cold-start scenarios well; common in video/music recommendation (e.g. Spotify)
+
+### 3.3 Hybrid recommendation
+- Idea: combine several algorithms to get the best of each
+- Common combination schemes:
+  - weighted: blend the scores produced by different algorithms
+  - feature combination: feed features from several algorithms into a single model
+  - cascade: one algorithm generates candidates, another re-ranks them
+  - switching: choose the algorithm per scenario or user segment
+### 3.4 Deep learning for recommendation
+As deep learning matured, more and more recommenders adopted deep neural networks:
+
+#### (1) Matrix factorization models (MF, SVD++)
+- map users and items into a low-dimensional latent vector space
+- predict a user's rating of an item as the inner product of their latent vectors (see the sketch below)
+#### (2) Neural recommendation models
+- DNN (deep neural network): learns complex user-item interaction patterns
+- CNN (convolutional neural network): extracts local features from item text or images
+- RNN/LSTM (recurrent networks): capture the temporal order in user behavior sequences
+- attention mechanisms: identify the key items or features within a user's history
+#### (3) Classic deep recommendation models
+- Wide & Deep: combines memorization (the wide part) with generalization (the deep part)
+- DeepFM: fuses factorization machines (FM) with a DNN to learn high-order feature interactions automatically
+- DIN (Deep Interest Network): uses attention to capture a user's dynamic interests
+- BERT4Rec: Transformer-based modeling of user behavior sequences
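+
+A minimal matrix-factorization sketch in PyTorch (illustrative; the sizes, the fake ratings, and the single gradient step are assumptions, not part of the original text):
+
+```python
+import torch
+import torch.nn as nn
+
+num_users, num_items, dim = 100, 500, 16
+
+class MF(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.user_emb = nn.Embedding(num_users, dim)  # latent user vectors
+        self.item_emb = nn.Embedding(num_items, dim)  # latent item vectors
+
+    def forward(self, user_ids, item_ids):
+        # predicted rating = inner product of the user and item vectors
+        return (self.user_emb(user_ids) * self.item_emb(item_ids)).sum(dim=1)
+
+model = MF()
+users = torch.randint(0, num_users, (32,))
+items = torch.randint(0, num_items, (32,))
+targets = torch.rand(32) * 5                          # fake ratings in [0, 5)
+loss = nn.functional.mse_loss(model(users, items), targets)
+loss.backward()                                       # gradients for an SGD/Adam update
+```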
+## 4. E-commerce vs. video platforms: strategy differences
+The core principles are the same, but business context and user behavior differ, so the strategies diverge:
+
+| Dimension | E-commerce (product recs) | Video site (content recs) |
+| --- | --- | --- |
+| User goal | explicit (shopping, buying a specific product) | diffuse (entertainment, passing time) |
+| Decision cycle | long (comparison, deliberation) | short (decide to watch within seconds) |
+| Item traits | huge variety, wide price range | fairly uniform format, varied duration |
+| Key features | price, brand, sales, reviews | content quality, creator, freshness, tags |
+| Optimization target | conversion rate, basket size, repurchase rate | CTR, watch time, completion rate |
+| Latency requirement | moderate (purchase decisions are slow) | very high (interests shift quickly) |
+| Cold-start challenge | new users, new products | new users, new content, new creators |
+## 5. Evaluation metrics
+Recommender quality is usually measured with the following metrics (a small Precision@K / Recall@K sketch follows this list):
+
+- Accuracy metrics:
+  - Precision: fraction of the recommended items the user actually cares about
+  - Recall: fraction of the items the user cares about that were recommended
+  - F1: harmonic mean of precision and recall
+- Ranking metrics:
+  - NDCG (Normalized Discounted Cumulative Gain): quality of the ranking order
+  - MAP (Mean Average Precision)
+- Business metrics:
+  - click-through rate (CTR)
+  - conversion rate (CVR)
+  - average watch time
+  - user retention
+  - revenue / GMV
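+
+A small Precision@K / Recall@K sketch (illustrative; the recommended list and ground-truth set are made up):
+
+```python
+def precision_recall_at_k(recommended, relevant, k):
+    """recommended: ranked list of item ids; relevant: set of ground-truth item ids."""
+    top_k = recommended[:k]
+    hits = sum(1 for item in top_k if item in relevant)
+    precision = hits / k
+    recall = hits / len(relevant) if relevant else 0.0
+    return precision, recall
+
+# the user actually liked items {1, 3, 7}; we recommended [3, 9, 1, 4, 5]
+print(precision_recall_at_k([3, 9, 1, 4, 5], {1, 3, 7}, k=5))  # (0.4, 0.666...)
+```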
+## 6. Challenges
+- Cold start:
+  - new users: no behavioral history yet
+  - new items: no interaction data yet
+- Data sparsity:
+  - the user-item interaction matrix is usually extremely sparse
+  - most users interact with only a handful of items
+- Real-time requirements:
+  - must respond to interest shifts in real time
+  - must handle massive concurrent traffic
+- Fairness and diversity:
+  - avoid "filter bubbles" (recommending only what the user already likes)
+  - keep results diverse and fair
+- Privacy:
+  - make use of user data while protecting privacy
+  - comply with data-protection law (e.g. GDPR, China's Personal Information Protection Law)
+## 7. Trends
+- Multimodal recommendation: fuse text, image, audio, and video signals
+- Federated learning: train cross-platform models without centralizing private data
+- Reinforcement learning: optimize the recommendation policy by interacting with the environment
+- Causal recommendation: move from correlation to causal reasoning for better explainability
+- LLMs + recommendation: use pretrained large models to strengthen semantic understanding
+
+
+
+## Core technology: face recognition systems
+
+#### 1. Phone unlock and face payment both rely on face recognition; the basic pipeline is:
+* Image capture: the front camera captures the user's face
+* Face detection: locate and crop the face region in the image
+* Feature extraction: convert the face image into a feature vector the computer can compare
+* Feature matching: compare the extracted features against the user templates stored in the database
+* Decision: accept or reject based on the match score and a threshold
+## 2. Deep learning in face recognition
+Modern smartphone face recognition is almost always built on deep learning, especially convolutional neural networks (CNNs). It is closely related to the feedforward networks you have been studying, just structurally more complex:
+
+1. Face detection
+- dedicated detection networks (MTCNN, RetinaFace, etc.)
+- these locate faces reliably in hard conditions (lighting, pose, occlusion)
+- typically multi-stage: candidate-box generation → box regression → facial-landmark localization
+2. Feature extraction
+- deep CNNs (FaceNet, ArcFace, SphereFace, etc.)
+- trained on huge face datasets, they learn discriminative facial features
+- key property: embeddings of the same face under different conditions stay close together, while embeddings of different faces stay far apart
+3. Feature matching and verification
+- metric learning maps faces into a high-dimensional embedding space
+- similarity measures (Euclidean distance, cosine similarity) compare embeddings; a small verification sketch follows below
+- a threshold decides whether two images show the same person
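+
+A minimal verification sketch (illustrative; the embeddings stand in for the output of a trained face-embedding network such as those named above, and the threshold value is made up):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def same_person(emb_a: torch.Tensor, emb_b: torch.Tensor, threshold: float = 0.6) -> bool:
+    # cosine similarity in the embedding space; higher means more alike
+    sim = F.cosine_similarity(emb_a, emb_b, dim=0)
+    return sim.item() >= threshold
+
+# stand-ins for embeddings produced by a trained model
+emb_enrolled = torch.randn(512)   # stored template from enrollment
+emb_probe = torch.randn(512)      # embedding of the face at the camera
+print(same_person(emb_enrolled, emb_probe))
+```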
+
+
+## 3. Phone unlock vs. face payment: key differences
+The principles are similar, but the security requirements and implementation details differ greatly:
+
+| Dimension | Phone unlock | Face payment |
+| --- | --- | --- |
+| Required security level | medium (prevents casual unauthorized use) | very high (money is at stake) |
+| Liveness detection | basic (simple blink / head-turn checks) | advanced (3D structured light, infrared imaging) |
+| Attack resistance | moderate (may be fooled by a high-resolution photo) | strong (defends against photos, videos, 3D masks) |
+| Hardware | an ordinary camera suffices | dedicated sensors (3D structured light, ToF) |
+| Decision threshold | relatively lenient (user experience first) | very strict (security first) |
+
+## 4. Key security technology: liveness detection
+To defeat spoofing with photos or videos, face-payment systems usually add liveness detection:
+
+- 2D liveness: analyze facial motion (blinking, mouth opening) and texture cues
+- 3D structured light: project an infrared dot pattern to recover the 3D facial structure (e.g. the iPhone's Face ID)
+- Infrared imaging: an IR camera distinguishes a real face from photos and videos
+- Multispectral imaging: combine visible and infrared light to raise detection accuracy
+
+## 5. How this connects to the PyTorch material
+A face recognition system is far more complex than the feedforward network you are currently studying, but the core principles are the same:
+
+- Network structure: layered architectures that extract features through non-linear activations (e.g. ReLU)
+- Training: both need lots of labeled data and update parameters through backpropagation and an optimizer (e.g. Adam)
+- Loss functions: face recognition uses specialized losses (Triplet Loss, ArcFace Loss), but the basic idea is still minimizing the gap between prediction and ground truth
+- Evaluation: both measure accuracy, recall, and similar metrics on a test set
\ No newline at end of file
diff --git a/tutorials/01-basics/linear_regression/main.py b/tutorials/01-basics/linear_regression/main.py
index b3715d99..15d513f2 100644
--- a/tutorials/01-basics/linear_regression/main.py
+++ b/tutorials/01-basics/linear_regression/main.py
@@ -3,53 +3,73 @@
import numpy as np
import matplotlib.pyplot as plt
+# Linear regression
-# Hyper-parameters
-input_size = 1
-output_size = 1
-num_epochs = 60
-learning_rate = 0.001
-
-# Toy dataset
-x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
- [9.779], [6.182], [7.59], [2.167], [7.042],
- [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
-
-y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
- [3.366], [2.596], [2.53], [1.221], [2.827],
- [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
-
-# Linear regression model
-model = nn.Linear(input_size, output_size)
-
-# Loss and optimizer
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-
-# Train the model
-for epoch in range(num_epochs):
- # Convert numpy arrays to torch tensors
- inputs = torch.from_numpy(x_train)
- targets = torch.from_numpy(y_train)
-
- # Forward pass
- outputs = model(inputs)
- loss = criterion(outputs, targets)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (epoch+1) % 5 == 0:
- print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
-
-# Plot the graph
-predicted = model(torch.from_numpy(x_train)).detach().numpy()
-plt.plot(x_train, y_train, 'ro', label='Original data')
-plt.plot(x_train, predicted, label='Fitted line')
-plt.legend()
-plt.show()
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+if __name__ == '__main__':
+ # Hyper-parameters
+ input_size = 1
+ output_size = 1
+ num_epochs = 60
+ learning_rate = 0.001
+
+ # Toy dataset
+ x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
+ [9.779], [6.182], [7.59], [2.167], [7.042],
+ [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
+
+ y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
+ [3.366], [2.596], [2.53], [1.221], [2.827],
+ [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
+
+ # Linear regression model
+    # Create a linear regression model (also called a fully connected or affine layer).
+    # In PyTorch, nn.Linear initializes the weight w and bias b automatically.
+ model = nn.Linear(input_size, output_size)
+
+ # Loss and optimizer
+    # Define the loss function, which measures the gap between predictions and targets.
+    # Here: mean squared error (MSE) loss.
+ criterion = nn.MSELoss()
+    # Create the optimizer, which updates the model's parameters (weight w and bias b).
+    # Learning rate: too large makes training unstable, too small makes it slow.
+ optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+ # Train the model
+ for epoch in range(num_epochs):
+ # Convert numpy arrays to torch tensors
+ inputs = torch.from_numpy(x_train)
+ targets = torch.from_numpy(y_train)
+
+ # Forward pass
+ outputs = model(inputs)
+ loss = criterion(outputs, targets)
+
+ # Backward and optimize
+ optimizer.zero_grad()
+        # Backward pass:
+        # compute the gradients of the loss w.r.t. the model parameters.
+ loss.backward()
+        # Update the model parameters:
+        # the optimizer applies the gradients it just computed.
+ optimizer.step()
+
+ if (epoch + 1) % 5 == 0:
+ print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
+ """
+ 模型训练时确实使用了 x_train(输入)和 y_train(目标),但训练完成后:
+
+ 我们需要验证模型在训练数据上的拟合效果
+ 通过对相同的 x_train 进行预测,得到 predicted(模型输出)
+ 然后将 predicted 与真实的 y_train 对比绘图,直观展示模型学习的线性关系
+ """
+ # .detach() # 从计算图中分离张量
+ # Plot the graph
+ predicted = model(
+ torch.from_numpy(x_train)).detach().numpy() # 将PyTorch张量转换回NumPy数组 因为matplotlib绘图库需要NumPy数组格式 方便后续的可视化操作
+ plt.plot(x_train, y_train, 'ro', label='Original data')
+ plt.plot(x_train, predicted, label='Fitted line')
+ plt.legend()
+ plt.show()
+
+ # Save the model checkpoint
+ torch.save(model.state_dict(), 'model.ckpt')
diff --git a/tutorials/01-basics/logistic_regression/main.py b/tutorials/01-basics/logistic_regression/main.py
index c7eb378b..ea3c4b29 100644
--- a/tutorials/01-basics/logistic_regression/main.py
+++ b/tutorials/01-basics/logistic_regression/main.py
@@ -2,75 +2,109 @@
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
+import ssl
+# Logistic regression
+ssl._create_default_https_context = ssl._create_unverified_context
+if __name__ == '__main__':
+ # Hyper-parameters
+ input_size = 28 * 28 # 784
+ num_classes = 10
+ num_epochs = 5
+ batch_size = 100
+ learning_rate = 0.001
+ # MNIST dataset (images and labels)
+ train_dataset = torchvision.datasets.MNIST(root='../../data',
+ train=True,
+ transform=transforms.ToTensor(),
+ download=True)
-# Hyper-parameters
-input_size = 28 * 28 # 784
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
+ test_dataset = torchvision.datasets.MNIST(root='../../data',
+ train=False,
+ transform=transforms.ToTensor())
-# MNIST dataset (images and labels)
-train_dataset = torchvision.datasets.MNIST(root='../../data',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
+ # Data loader (input pipeline)
+ train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+ batch_size=batch_size,
+ shuffle=True)
-test_dataset = torchvision.datasets.MNIST(root='../../data',
- train=False,
- transform=transforms.ToTensor())
+ test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+ batch_size=batch_size,
+ shuffle=False)
-# Data loader (input pipeline)
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
+ # Logistic regression model
+    # Multi-class logistic regression: outputs one score per class.
+    # Logistic regression: output dimension = number of classes (here 10 digits).
+    # Linear regression: output dimension = 1 (a single continuous value).
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
+ model = nn.Linear(input_size, num_classes)
-# Logistic regression model
-model = nn.Linear(input_size, num_classes)
+ # Loss and optimizer
+ # nn.CrossEntropyLoss() computes softmax internally
+    # Logistic regression uses cross-entropy loss (softmax is applied internally).
+ criterion = nn.CrossEntropyLoss()
+ optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-# Loss and optimizer
-# nn.CrossEntropyLoss() computes softmax internally
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+ # Train the model
+ total_step = len(train_loader)
+ for epoch in range(num_epochs):
+ for i, (images, labels) in enumerate(train_loader):
+ # Reshape images to (batch_size, input_size)
+ images = images.reshape(-1, input_size)
+            # CrossEntropyLoss already applies the softmax activation internally.
+ # Forward pass
+            outputs = model(images)  # raw scores (logits)
+ loss = criterion(outputs, labels)
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- # Reshape images to (batch_size, input_size)
- images = images.reshape(-1, input_size)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
+ # Backward and optimize
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.reshape(-1, input_size)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum()
+ if (i+1) % 100 == 0:
+ print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+ .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
+ """
+    The core differences between linear regression and logistic regression:
- print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+    task goal: regression vs. classification;
+    output handling: raw value vs. activation-mapped probability;
+    loss function: MSE vs. cross-entropy.
+    """
+ # Test the model
+ # In test phase, we don't need to compute gradients (for memory efficiency)
+ with torch.no_grad():
+ correct = 0
+ total = 0
+ for images, labels in test_loader:
+ images = images.reshape(-1, input_size)
+ outputs = model(images)
+ _, predicted = torch.max(outputs.data, 1)
+ total += labels.size(0)
+ correct += (predicted == labels).sum()
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
+ print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+ # Save the model checkpoint
+ torch.save(model.state_dict(), 'model.ckpt')
+
+ """
+ 一、核心区别对比表
+ 对比维度 均方误差损失 (MSE) 交叉熵损失 (Cross-Entropy)
+ 数学定义 $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$ 二分类:$CE = -\frac{1}{n}\sum_{i=1}^{n}[y_i\log\hat{y}i + (1-y_i)\log(1-\hat{y}i)]$
+ 多分类:$CE = -\frac{1}{n}\sum{i=1}^{n}\sum{c=1}^{C}y_{ic}\log\hat{y}_{ic}$
+ 适用任务 回归任务(预测连续值,如房价、温度) 分类任务(预测离散类别,如图片分类、文本分类)
+ 输出范围假设 预测值$\hat{y}$可任意实数($-\infty, +\infty$) 预测值需转换为概率分布($(0, 1)$区间)
+ 激活函数配合 通常不需要特定激活函数(线性输出)
+ 或配合sigmoid/tanh(约束输出范围) 二分类:配合sigmoid激活
+ 多分类:配合softmax激活(PyTorch中CrossEntropyLoss内部自动计算)
+ 梯度特性 梯度与预测偏差$(y-\hat{y})$成正比
+ 预测远离真实值时梯度大,易不稳定 梯度与概率分布的差异相关
+ 训练更稳定,尤其适合分类任务
+
+ 场景 推荐损失函数 代码示例
+ 预测连续值(如房价、温度) MSE criterion = nn.MSELoss()
+ 二分类(如垃圾邮件检测) Binary Cross-Entropy criterion = nn.BCELoss()
+ 多分类(如MNIST数字识别) Cross-Entropy criterion = nn.CrossEntropyLoss()
+ 简单来说:回归用MSE,分类用CrossEntropy,这是深度学习中的"黄金法则"之一!
+ """
\ No newline at end of file
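+    # Minimal check of the "softmax inside CrossEntropyLoss" claim above (an illustrative
+    # sketch with made-up logits/targets, not part of the original tutorial):
+    logits = torch.randn(4, 10)
+    target = torch.randint(0, 10, (4,))
+    ce = nn.CrossEntropyLoss()(logits, target)
+    nll = nn.NLLLoss()(nn.functional.log_softmax(logits, dim=1), target)
+    assert torch.allclose(ce, nll)  # cross-entropy == log-softmax + negative log-likelihood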
diff --git a/tutorials/01-basics/pytorch_basics/main.py b/tutorials/01-basics/pytorch_basics/main.py
index 744400c2..16dbf19d 100644
--- a/tutorials/01-basics/pytorch_basics/main.py
+++ b/tutorials/01-basics/pytorch_basics/main.py
@@ -1,10 +1,27 @@
-import torch
-import torchvision
-import torch.nn as nn
+import ssl
+
import numpy as np
+import torch
+import torch.nn as nn
+import torchvision
import torchvision.transforms as transforms
+class CustomDataset(torch.utils.data.Dataset):
+ def __init__(self):
+ # 初始化一些示例数据
+ self.data = torch.randn(100, 3) # 100个样本,每个样本3个特征
+ self.labels = torch.randint(0, 2, (100,)) # 100个标签,0或1
+
+ def __getitem__(self, index):
+ # 返回数据对 (特征, 标签)
+ return self.data[index], self.labels[index]
+
+ def __len__(self):
+ # 返回数据集大小
+ return len(self.data)
+
+
# ================================================================== #
# Table of Contents #
# ================================================================== #
@@ -15,175 +32,264 @@
# 4. Input pipline (Line 104 to 129)
# 5. Input pipline for custom dataset (Line 136 to 156)
# 6. Pretrained model (Line 163 to 176)
-# 7. Save and load model (Line 183 to 189)
+# 7. Save and load model (Line 183 to 189)
+# TODO: what are the basic principles of convolutional neural networks (CNNs)?
# ================================================================== #
# 1. Basic autograd example 1 #
# ================================================================== #
-# Create tensors.
-x = torch.tensor(1., requires_grad=True)
-w = torch.tensor(2., requires_grad=True)
-b = torch.tensor(3., requires_grad=True)
-
-# Build a computational graph.
-y = w * x + b # y = 2 * x + 3
-
-# Compute gradients.
-y.backward()
-
-# Print out the gradients.
-print(x.grad) # x.grad = 2
-print(w.grad) # w.grad = 1
-print(b.grad) # b.grad = 1
-
-
-# ================================================================== #
-# 2. Basic autograd example 2 #
-# ================================================================== #
-
-# Create tensors of shape (10, 3) and (10, 2).
-x = torch.randn(10, 3)
-y = torch.randn(10, 2)
-
-# Build a fully connected layer.
-linear = nn.Linear(3, 2)
-print ('w: ', linear.weight)
-print ('b: ', linear.bias)
-
-# Build loss function and optimizer.
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
-
-# Forward pass.
-pred = linear(x)
-
-# Compute loss.
-loss = criterion(pred, y)
-print('loss: ', loss.item())
-
-# Backward pass.
-loss.backward()
-
-# Print out the gradients.
-print ('dL/dw: ', linear.weight.grad)
-print ('dL/db: ', linear.bias.grad)
-
-# 1-step gradient descent.
-optimizer.step()
-
-# You can also perform gradient descent at the low level.
-# linear.weight.data.sub_(0.01 * linear.weight.grad.data)
-# linear.bias.data.sub_(0.01 * linear.bias.grad.data)
-
-# Print out the loss after 1-step gradient descent.
-pred = linear(x)
-loss = criterion(pred, y)
-print('loss after 1 step optimization: ', loss.item())
-
-
-# ================================================================== #
-# 3. Loading data from numpy #
-# ================================================================== #
-
-# Create a numpy array.
-x = np.array([[1, 2], [3, 4]])
-
-# Convert the numpy array to a torch tensor.
-y = torch.from_numpy(x)
-
-# Convert the torch tensor to a numpy array.
-z = y.numpy()
-
-
-# ================================================================== #
-# 4. Input pipeline #
-# ================================================================== #
-
-# Download and construct CIFAR-10 dataset.
-train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
-
-# Fetch one data pair (read data from disk).
-image, label = train_dataset[0]
-print (image.size())
-print (label)
-
-# Data loader (this provides queues and threads in a very simple way).
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=64,
- shuffle=True)
-
-# When iteration starts, queue and thread start to load data from files.
-data_iter = iter(train_loader)
-
-# Mini-batch images and labels.
-images, labels = data_iter.next()
-
-# Actual usage of the data loader is as below.
-for images, labels in train_loader:
- # Training code should be written here.
- pass
-
-
-# ================================================================== #
-# 5. Input pipeline for custom dataset #
-# ================================================================== #
-
-# You should build your custom dataset as below.
-class CustomDataset(torch.utils.data.Dataset):
- def __init__(self):
- # TODO
- # 1. Initialize file paths or a list of file names.
+"""
+神经网络中的延伸
+在神经网络中,损失函数 ( L ) 是关于所有模型参数(权重 ( w_1, w_2, ... ) 和偏置 ( b_1, b_2, ... ))的高维多元函数。因此:
+
+参数数量 = 偏导数数量
+每个参数的偏导数 ( \frac{\partial L}{\partial \theta} ) 代表损失函数在该参数方向上的变化率
+所有偏导数组成的向量称为梯度,用于指导参数更新方向
+"""
+if __name__ == '__main__':
+ """
+ 偏导数的大小:变化率的强度
+ 偏导数的绝对值大小代表了函数在该参数方向上的变化率强度:
+
+ 偏导数绝对值越大 → 函数在该方向上变化越快 → 该参数对损失函数的影响越敏感
+ 偏导数绝对值越小 → 函数在该方向上变化越慢 → 该参数对损失函数的影响较不敏感
+ """
+ ssl._create_default_https_context = ssl._create_unverified_context
+ # Create tensors.
+ x = torch.tensor(1., requires_grad=True)
+ print(f"x Python 类型: {type(x)}")
+ print(f"tensor 类名: {x.__class__.__name__}")
+ print(f"tensor 模块: {x.__class__.__module__}")
+ print(f"tensor 是否是 torch.Tensor: {isinstance(x, torch.Tensor)}")
+ print("在 PyTorch 中,当你创建一个张量并设置 requires_grad=True 时,PyTorch 会自动构建一个计算图"
+ "计算图记录了所有对该张量的操作,以便后续计算梯度"
+ "y.backward() 会从 y 开始,沿着计算图反向传播,计算 y 相对于所有具有 requires_grad=True 的输入张量(这里是 x、w、b)的偏导数")
+ w = torch.tensor(2., requires_grad=True)
+ b = torch.tensor(3., requires_grad=True)
+
+ # Build a computational graph.
+ y = w * x + b # y = 2 * x + 3
+    print("""You cannot print y.grad directly, because:
+    only leaf tensors can store gradients;
+    a leaf tensor is one created directly rather than computed from other tensors;
+    here x, w, b are leaf tensors while y is an intermediate result;
+    by default PyTorch keeps gradients only for leaf tensors, to save memory.""")
+ # Compute gradients.
+    # y.retain_grad()  # uncomment to keep y's gradient as well
+    # Run backpropagation.
+ y.backward()
+
+ # Print out the gradients.
+    # (These were computed by the y.backward() call above.)
+ print(x.grad) # x.grad = 2
+ print(w.grad) # w.grad = 1
+ print(b.grad) # b.grad = 1
+
+ # ================================================================== #
+ # 2. Basic autograd example 2 #
+ # ================================================================== #
+
+ # Create tensors of shape (10, 3) and (10, 2).
+ """
+ torch.randn(10, 3): 创建一个形状为 (10, 3) 的张量(矩阵),其中包含从标准正态分布中随机采样的值
+
+ 10 表示批次大小(batch size),即一次处理10个样本
+ 3 表示每个样本有3个特征
+ torch.randn(10, 2): 创建一个形状为 (10, 2) 的张量,作为模型的目标输出
+
+ 10 同样是批次大小,与输入数据对应
+ 2 表示每个样本期望输出2个值
+ """
+ x = torch.randn(10, 3)
+ y = torch.randn(10, 2)
+ print('x: ', x)
+ print('y: ', y)
+ # Build a fully connected layer.
+    """
+    The first argument (3) is the input feature dimension;
+    the second argument (2) is the output feature dimension.
+    The linear layer initializes its weight and bias parameters automatically.
+    """
+ linear = nn.Linear(3, 2)
+ print('w: ', linear.weight)
+ print('b: ', linear.bias)
+
+ # Build loss function and optimizer.
+ criterion = nn.MSELoss()
+ optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
+
+ # Forward pass.
+ pred = linear(x)
+
+ # Compute loss.
+ loss = criterion(pred, y)
+ print('loss: ', loss.item())
+
+ # Backward pass.
+ loss.backward()
+
+ # Print out the gradients.
+ print('dL/dw: ', linear.weight.grad)
+ print('dL/db: ', linear.bias.grad)
+
+ # 1-step gradient descent.
+ optimizer.step()
+
+ # You can also perform gradient descent at the low level.
+ # linear.weight.data.sub_(0.01 * linear.weight.grad.data)
+ # linear.bias.data.sub_(0.01 * linear.bias.grad.data)
+
+ # Print out the loss after 1-step gradient descent.
+ pred = linear(x)
+ loss = criterion(pred, y)
+ print('loss after 1 step optimization: ', loss.item())
+ """
+ 这段代码的意义
+ 这是深度学习中构建神经网络的基础步骤:
+ 准备输入数据和目标数据
+ 定义模型结构(这里是一个简单的线性层)
+ 查看和理解模型参数
+ 在实际应用中,这段代码之后通常会添加:
+ 损失函数定义(如 MSE、交叉熵等)
+ 优化器选择(如 SGD、Adam 等)
+ 前向传播、损失计算、反向传播和参数更新的循环
+ 这段代码展示了PyTorch构建神经网络的核心概念,是理解更复杂模型的基础。
+ """
+
+ # ================================================================== #
+ # 3. Loading data from numpy #
+ # ================================================================== #
+
+ # Create a numpy array.
+ x = np.array([[1, 2], [3, 4]])
+
+ # Convert the numpy array to a torch tensor.
+ y = torch.from_numpy(x)
+
+ # Convert the torch tensor to a numpy array.
+ z = y.numpy()
+    # Round-trip check: every element survives numpy -> torch -> numpy unchanged.
+    print(f"x==z: {x == z}")  # [[ True  True], [ True  True]]
+
+ # ================================================================== #
+ # 4. Input pipeline #
+ # ================================================================== #
+
+ # Download and construct CIFAR-10 dataset.
+ print("""
+ # CIFAR - 10
+ # 是一个经典的图像分类数据集,包含:
+ # 60000 张 32x32 彩色图像
+ # 10 个类别:飞机、汽车、鸟、猫、鹿、狗、青蛙、马、船、卡车
+ # 每个类别有 6000 张图像
+ # 训练集:50000 张
+ # 测试集:10000 张
+ # 数据预处理:
+ # 图像被归一化到 [0, 1] 范围
+ # 每个通道的均值和标准差分别为 [0.5, 0.5, 0.5] 和 [0.5, 0.5, 0.5]
+ """)
+ train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+ train=True,
+ transform=transforms.ToTensor(),
+ download=True)
+
+ # Fetch one data pair (read data from disk).
+ image, label = train_dataset[0]
+ print(f"image.size(): {image.size()}")
+ print(f"label: {label}")
+
+ # Data loader (this provides queues and threads in a very simple way).
+ train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+ batch_size=64,
+ shuffle=True)
+
+ # When iteration starts, queue and thread start to load data from files.
+ data_iter = iter(train_loader)
+
+ # Mini-batch images and labels.
+    images, labels = next(data_iter)
+ print(f"images.size(): {images.size()}")
+ print(f"labels: {labels}")
+ # Actual usage of the data loader is as below.
+ for images, labels in train_loader:
+ # Training code should be written here.
pass
- def __getitem__(self, index):
- # TODO
- # 1. Read one data from file (e.g. using numpy.fromfile, PIL.Image.open).
- # 2. Preprocess the data (e.g. torchvision.Transform).
- # 3. Return a data pair (e.g. image and label).
- pass
- def __len__(self):
- # You should change 0 to the total size of your dataset.
- return 0
-
-# You can then use the prebuilt data loader.
-custom_dataset = CustomDataset()
-train_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
- batch_size=64,
- shuffle=True)
-
-
-# ================================================================== #
-# 6. Pretrained model #
-# ================================================================== #
-
-# Download and load the pretrained ResNet-18.
-resnet = torchvision.models.resnet18(pretrained=True)
-
-# If you want to finetune only the top layer of the model, set as below.
-for param in resnet.parameters():
- param.requires_grad = False
-
-# Replace the top layer for finetuning.
-resnet.fc = nn.Linear(resnet.fc.in_features, 100) # 100 is an example.
-
-# Forward pass.
-images = torch.randn(64, 3, 224, 224)
-outputs = resnet(images)
-print (outputs.size()) # (64, 100)
-
-
-# ================================================================== #
-# 7. Save and load the model #
-# ================================================================== #
-
-# Save and load the entire model.
-torch.save(resnet, 'model.ckpt')
-model = torch.load('model.ckpt')
-# Save and load only the model parameters (recommended).
-torch.save(resnet.state_dict(), 'params.ckpt')
-resnet.load_state_dict(torch.load('params.ckpt'))
+ # ================================================================== #
+ # 5. Input pipeline for custom dataset #
+ # ================================================================== #
+
+ # You should build your custom dataset as below.
+
+ # You can then use the prebuilt data loader.
+ custom_dataset = CustomDataset()
+ train_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
+ batch_size=64,
+ shuffle=True)
+
+ # ================================================================== #
+ # 6. Pretrained model #
+ # ================================================================== #
+ """
+ torchvision.models:PyTorch视觉库(torchvision)中的模型模块,包含了多种经典的计算机视觉预定义模型(如ResNet、VGG、AlexNet等)
+ resnet18:ResNet(Residual Network,残差网络)模型家族中的一个变体,指具有18层网络结构的ResNet模型
+ pretrained=True:关键参数,指定加载在ImageNet数据集上预训练好的模型权重
+ """
+ """预训练模型是指:
+ 已经在大型数据集(这里是ImageNet,包含1400万张图像,1000
+ 个类别)上训练完成的模型
+ 模型权重已经学习到了通用的图像特征(如边缘、纹理、形状等高级视觉特征)
+ 可以直接用于推理,或作为迁移学习的起点"""
+    # Download and load the pretrained ResNet-18 (the weights= API supersedes the deprecated pretrained=True).
+    resnet = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
+    # The classic fine-tuning recipe:
+ # If you want to finetune only the top layer of the model, set as below.
+ """
+ 作用:将ResNet-18模型中所有原有参数的requires_grad设置为False
+ 效果:在反向传播时,这些参数不会计算梯度,也就不会被更新
+ 原理:预训练模型的底层(卷积层)已经学习到了通用的视觉特征(如边缘、纹理、形状),这些特征对大多数图像任务都有效,无需重新学习
+ """
+ for param in resnet.parameters():
+ param.requires_grad = False
+
+ # Replace the top layer for finetuning.
+ """
+ 作用:将ResNet-18模型的全连接层(fc层)替换为一个新的全连接层,输出维度为100
+ 效果:模型的输出层从原来的1000个类别(ImageNet数据集的类别数)减少到100个类别
+ 原理:全连接层是模型的最后一层,负责将特征映射到类别空间。通过替换fc层,我们可以将模型用于不同的分类任务(如CIFAR - 10)
+ """
+ resnet.fc = nn.Linear(resnet.fc.in_features, 100) # 100 is an example.
+
+ # Forward pass.
+ images = torch.randn(64, 3, 224, 224)
+ outputs = resnet(images)
+ print(outputs.size()) # (64, 100)
+
+ # ================================================================== #
+ # 7. Save and load the model #
+ # ================================================================== #
+ """
+ torch.save(resnet, 'model.ckpt'):保存完整模型对象
+ 保存的是整个模型的Python对象,包括:
+ 模型的网络结构(如ResNet-18的卷积层、池化层、全连接层等)
+ 模型的所有参数(权重和偏置)
+ 模型的优化器状态(如果模型包含的话)
+ 其他与模型相关的Python对象(如类定义、导入依赖等)
+ 本质:使用Python的pickle序列化机制保存整个对象
+ """
+ # Save and load the entire model.
+ torch.save(resnet, 'model.ckpt')
+ model = torch.load('model.ckpt', weights_only=False)
+ """
+ 只保存模型的参数状态字典(State Dictionary):
+ 以字典形式保存所有可学习参数的名称和值
+ 不包含模型的网络结构信息
+ 不包含任何Python类定义或依赖
+ 本质:只保存模型的"权重",不保存"骨架"
+ """
+ # Save and load only the model parameters (recommended).
+ torch.save(resnet.state_dict(), 'params.ckpt')
+ resnet.load_state_dict(torch.load('params.ckpt'))
diff --git a/tutorials/01-mine/FashionMNIST.py b/tutorials/01-mine/FashionMNIST.py
new file mode 100644
index 00000000..fe897f3e
--- /dev/null
+++ b/tutorials/01-mine/FashionMNIST.py
@@ -0,0 +1,149 @@
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+
+# Device selection: prefer a GPU, otherwise fall back to the CPU; Apple M-series chips can use MPS.
+# if torch.backends.mps.is_available():
+#     device = torch.device('mps')   # Apple Silicon acceleration
+# elif torch.cuda.is_available():
+#     device = torch.device('cuda')  # NVIDIA GPU acceleration
+# else:
+#     device = torch.device('cpu')   # CPU training
+class NeuralNetwork(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.flatten = nn.Flatten()
+ self.linear_relu_stack = nn.Sequential(
+ nn.Linear(28 * 28, 512),
+ nn.ReLU(),
+ nn.Linear(512, 512),
+ nn.ReLU(),
+ nn.Linear(512, 10)
+ )
+
+ def forward(self, x):
+ x = self.flatten(x)
+ logits = self.linear_relu_stack(x)
+ return logits
+
+
+# In one training loop, the model predicts on the training dataset (fed in batches) and backpropagates the prediction error to adjust its parameters.
+def train(dataloader, model, loss_fn, optimizer):
+ size = len(dataloader.dataset)
+ model.train()
+ for batch, (X, y) in enumerate(dataloader):
+ X, y = X.to(device), y.to(device)
+ # Compute prediction error
+ pred = model(X)
+ loss = loss_fn(pred, y)
+
+ # Backpropagation
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+
+ if batch % 100 == 0:
+ loss, current = loss.item(), (batch + 1) * len(X)
+ print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
+
+# Evaluate on the test dataset to confirm the model is actually learning.
+def test(dataloader, model, loss_fn):
+ size = len(dataloader.dataset)
+ num_batches = len(dataloader)
+ model.eval()
+ test_loss, correct = 0, 0
+ with torch.no_grad():
+ for X, y in dataloader:
+ X, y = X.to(device), y.to(device)
+ pred = model(X)
+ test_loss += loss_fn(pred, y).item()
+ correct += (pred.argmax(1) == y).type(torch.float).sum().item()
+ test_loss /= num_batches
+ correct /= size
+ print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+
+"""
+下载模型数据
+创建模型
+优化模型参数
+保存模型
+加载模型
+预测数据
+"""
+if __name__ == '__main__':
+ # Download training data from open datasets.
+ training_data = datasets.FashionMNIST(
+ root="data",
+ train=True,
+ download=True,
+ transform=ToTensor(),
+ )
+
+ # Download test data from open datasets.
+    # A dataset object containing images and their corresponding labels.
+ test_data = datasets.FashionMNIST(
+ root="data",
+ train=False,
+ download=True,
+ transform=ToTensor(),
+ )
+
+ batch_size = 64
+ # Create data loaders.
+ train_dataloader = DataLoader(training_data, batch_size=batch_size)
+ test_dataloader = DataLoader(test_data, batch_size=batch_size)
+
+ for X, y in test_dataloader:
+ print(f"Shape of X [N, C, H, W]: {X.shape}")
+ print(f"Shape of y: {y.shape} {y.dtype}")
+ break
+ # Creating Models
+ device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+ print(f"Using {device} device")
+ # Define model
+
+ model = NeuralNetwork().to(device)
+ print(f"model: {model}")
+ # To train a model, we need a loss function and an optimizer.
+ loss_fn = nn.CrossEntropyLoss()
+ optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+ epochs = 5
+ for t in range(epochs):
+ print(f"Epoch {t + 1}\n-------------------------------")
+ train(train_dataloader, model, loss_fn, optimizer)
+ test(test_dataloader, model, loss_fn)
+ print("Done!")
+ torch.save(model.state_dict(), "model.pth")
+ print("Saved PyTorch Model State to model.pth")
+ model.load_state_dict(torch.load("model.pth", weights_only=True))
+ classes = [
+ "T-shirt/top",
+ "Trouser",
+ "Pullover",
+ "Dress",
+ "Coat",
+ "Sandal",
+ "Shirt",
+ "Sneaker",
+ "Bag",
+ "Ankle boot",
+ ]
+
+    model.eval()  # switch the model to evaluation mode
+ x, y = test_data[0][0], test_data[0][1]
+ print(f"x: {x} ,y: {y}")
+ with torch.no_grad():
+ x = x.to(device)
+        # "logits" are the raw outputs of the model's last (linear) layer; they are not
+        # normalized and therefore not probabilities. Softmax would turn them into a distribution:
+ pred = model(x) # 模型预测,输出10个类别的logits
+ # probs = torch.softmax(pred, dim=1)
+ # print(probs)
+
+ print(f"pred: {pred}")
+    # For classification we usually just take the class with the largest logit; converting to probabilities is unnecessary:
+ predicted, actual = classes[pred[0].argmax(0)], classes[y]
+ print(f'Predicted: "{predicted}", Actual: "{actual}"')
\ No newline at end of file
diff --git a/tutorials/01-mine/autograd.py b/tutorials/01-mine/autograd.py
new file mode 100644
index 00000000..ee0818f8
--- /dev/null
+++ b/tutorials/01-mine/autograd.py
@@ -0,0 +1,91 @@
+# Differentiation splits a whole into infinitely small pieces to get an instantaneous rate of change;
+# integration reassembles those pieces to get an accumulated total. They are inverse operations,
+# like taking apart and rebuilding a pile of bricks.
+# Intuitively: for a moving car, differentiation gives the instantaneous speed at one second (not the
+# average speed); for a hillside, differentiation gives the slope at a single point.
+"""
+Definite integral: computing an "area / total" (the core application).
+For a function \(y=f(x)\), the definite integral \(\int_{a}^{b}f(x)dx\) over \([a,b]\) means: split \([a,b]\)
+into infinitely many tiny intervals, form a thin rectangle on each (height = f(x), width = dx), and sum all
+the rectangle areas.
+Indefinite integral: the inverse of differentiation.
+Intuitively: knowing the slope at every point, recover the original curve; knowing the instantaneous speed,
+recover the displacement function. Example: given the derivative \(y'=2x\), the indefinite integral is
+\(y=x^2 + C\) (C an arbitrary constant), because \(x^2\), \(x^2+1\), and \(x^2+100\) all have derivative 2x.
+"""
+
+# %matplotlib inline
+
+import math
+
+import matplotlib.pyplot as plt
+import torch
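+
+
+def derivative_check():
+    """A minimal autograd sketch (not from the original tutorial): verify that d/dx sin(x) = cos(x)."""
+    xs = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+    torch.sin(xs).sum().backward()  # sum() gives a scalar to backpropagate from
+    print(torch.allclose(xs.grad, torch.cos(xs.detach()), atol=1e-6))  # True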
+
+
+def demo1():
+ """
+ Consider the simplest one-layer neural network,
+ with input x, parameters w and b, and some loss function. It can be defined in PyTorch in the following manner:
+ """
+ x = torch.ones(5) # input tensor
+ y = torch.zeros(3) # expected output
+ w = torch.randn(5, 3, requires_grad=True)
+ b = torch.randn(3, requires_grad=True)
+ z = torch.matmul(x, w) + b
+ loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
+ print(f"Gradient function for z = {z.grad_fn}")
+ print(f"Gradient function for loss = {loss.grad_fn}")
+ # Computing Gradients
+ loss.backward()
+ """
+    We can only read the .grad attribute of the graph's leaf nodes whose requires_grad is set
+    to True; for all other nodes in the graph, gradients are not retained.
+ """
+ print(w.grad)
+ print(b.grad)
+
+
+def sin_demo():
+ a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+ print(a)
+ b = torch.sin(a)
+    plt.plot(a.detach(), b.detach())
+    plt.show()
+    print(b)
+
+
+BATCH_SIZE = 16
+DIM_IN = 1000
+HIDDEN_SIZE = 100
+DIM_OUT = 10
+
+class TinyModel(torch.nn.Module):
+
+ def __init__(self):
+ super(TinyModel, self).__init__()
+
+ self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE)
+ self.relu = torch.nn.ReLU()
+ self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT)
+
+ def forward(self, x):
+ x = self.layer1(x)
+ x = self.relu(x)
+ x = self.layer2(x)
+ return x
+
+def sin_demo1():
+ some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
+ ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
+
+ model = TinyModel()
+ print(model.layer2.weight[0][0:10]) # just a small slice
+ print(model.layer2.weight.grad)
+ optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+ prediction = model(some_input)
+ loss = (ideal_output - prediction).pow(2).sum()
+ print(loss)
+ loss.backward()
+ print(model.layer2.weight[0][0:10])
+ print(model.layer2.weight.grad[0][0:10])
+ optimizer.step()
+ print(model.layer2.weight[0][0:10])
+ print(model.layer2.weight.grad[0][0:10])
+
+
+
+if __name__ == '__main__':
+ sin_demo1()
diff --git a/tutorials/01-mine/build_neural_network.py b/tutorials/01-mine/build_neural_network.py
new file mode 100644
index 00000000..03bd0f23
--- /dev/null
+++ b/tutorials/01-mine/build_neural_network.py
@@ -0,0 +1,69 @@
+# Building a neural network
+import os
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+
+class NeuralNetwork(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.flatten = nn.Flatten()
+ self.linear_relu_stack = nn.Sequential(
+ nn.Linear(28*28, 512),
+ nn.ReLU(),
+ nn.Linear(512, 512),
+ nn.ReLU(),
+ nn.Linear(512, 10),
+ )
+
+ def forward(self, x):
+ x = self.flatten(x)
+ logits = self.linear_relu_stack(x)
+ return logits
+
+
+if __name__ == '__main__':
+ model = NeuralNetwork().to(device)
+ print(model)
+    # To use the model, pass it the input data. This executes the model's forward() together with
+    # some background operations. Do not call model.forward() directly!
+ X = torch.rand(1, 28, 28, device=device)
+ logits = model(X)
+ print(logits)
+ pred_probab = nn.Softmax(dim=1)(logits)
+ y_pred = pred_probab.argmax(1)
+ print(f"Predicted class: {y_pred}")
+ input_image = torch.rand(3, 28, 28)
+ print(input_image.size())
+    # nn.Flatten converts each 2D 28x28 image into a contiguous array of 784 pixel values (keeping the minibatch dimension at dim=0).
+ flatten = nn.Flatten()
+ flat_image = flatten(input_image)
+ print(flat_image.size())
+    # A linear layer is a module that applies a linear transformation to its input using its stored weights and biases.
+ layer1 = nn.Linear(in_features=28 * 28, out_features=20)
+ hidden1 = layer1(flat_image)
+ print(hidden1.size())
+ print(f"Before ReLU: {hidden1}\n\n")
+ hidden1 = nn.ReLU()(hidden1)
+ print(f"After ReLU: {hidden1}")
+ seq_modules = nn.Sequential(
+ flatten,
+ layer1,
+ nn.ReLU(),
+ nn.Linear(20, 10)
+ )
+ input_image = torch.rand(3, 28, 28)
+ logits = seq_modules(input_image)
+ softmax = nn.Softmax(dim=1)
+ pred_probab = softmax(logits)
+ print(f"Model structure: {model}\n\n")
+
+ for name, param in model.named_parameters():
+ print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
\ No newline at end of file
diff --git a/tutorials/01-mine/control_flow_weight_sharing.py b/tutorials/01-mine/control_flow_weight_sharing.py
new file mode 100644
index 00000000..f4dc4732
--- /dev/null
+++ b/tutorials/01-mine/control_flow_weight_sharing.py
@@ -0,0 +1,75 @@
+import random
+import torch
+import math
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+"""
+模块 torch.nn (Neural Network) torch.optim (Optimizer)
+核心职责 定义模型结构,执行前向计算 更新模型参数,实现优化算法
+是什么 模型架构师与计算器 模型调参优化师
+管什么 管理模型的层、参数、计算图 管理参数的更新方向和步长
+关键输出 模型的预测输出 模型参数的新数值
+类比 汽车的发动机与车身设计图 汽车的驾驶员与导航系统
+"""
+class DynamicNet(torch.nn.Module):
+ def __init__(self):
+ """
+ In the constructor we instantiate five parameters and assign them as members.
+ """
+ super().__init__()
+ self.a = torch.nn.Parameter(torch.randn(()))
+ self.b = torch.nn.Parameter(torch.randn(()))
+ self.c = torch.nn.Parameter(torch.randn(()))
+ self.d = torch.nn.Parameter(torch.randn(()))
+ self.e = torch.nn.Parameter(torch.randn(()))
+
+ def forward(self, x):
+ """
+ For the forward pass of the model, we randomly choose either 4, 5
+ and reuse the e parameter to compute the contribution of these orders.
+
+ Since each forward pass builds a dynamic computation graph, we can use normal
+ Python control-flow operators like loops or conditional statements when
+ defining the forward pass of the model.
+
+ Here we also see that it is perfectly safe to reuse the same parameter many
+ times when defining a computational graph.
+ """
+ y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
+ for exp in range(4, random.randint(4, 6)):
+ y = y + self.e * x ** exp
+ return y
+
+ def string(self):
+ """
+ Just like any class in Python, you can also define custom method on PyTorch modules
+ """
+ return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
+
+
+# Create Tensors to hold input and outputs.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device)
+y = torch.sin(x).to(device)
+
+# Construct our model by instantiating the class defined above
+model = DynamicNet().to(device)
+
+# Construct our loss function and an Optimizer. Training this strange model with
+# vanilla stochastic gradient descent is tough, so we use momentum
+criterion = torch.nn.MSELoss(reduction='sum')
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
+for t in range(30000):
+ # Forward pass: Compute predicted y by passing x to the model
+ y_pred = model(x)
+
+ # Compute and print loss
+ loss = criterion(y_pred, y)
+ if t % 2000 == 1999:
+ print(t, loss.item())
+
+ # Zero gradients, perform a backward pass, and update the weights.
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+print(f'Result: {model.string()}')
\ No newline at end of file
diff --git a/tutorials/01-mine/cpu_2_gpu.py b/tutorials/01-mine/cpu_2_gpu.py
new file mode 100644
index 00000000..0413258b
--- /dev/null
+++ b/tutorials/01-mine/cpu_2_gpu.py
@@ -0,0 +1,59 @@
+"""
+在许多PyTorch应用中,将数据从CPU传输到GPU是基本操作。用户理解在设备之间移动数据的最有效工具和选项至关重要。
+本教程探讨了PyTorch中设备到设备数据传输的两种关键方法:pin_memory()和带有non_blocking=True选项的to()。
+"""
+
+import contextlib
+import torch
+from torch.cuda import Stream
+
+s = Stream()
+
+torch.manual_seed(42)
+t1_cpu_pinned = torch.randn(1024 ** 2 * 5, pin_memory=True)
+t2_cpu_paged = torch.randn(1024 ** 2 * 5, pin_memory=False)
+t3_cuda = torch.randn(1024 ** 2 * 5, device="cuda:0")
+
+assert torch.cuda.is_available()
+device = torch.device("cuda", torch.cuda.current_device())
+
+
+# The function we want to profile
+def inner(pinned: bool, streamed: bool):
+ with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
+ if pinned:
+ t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
+ else:
+ t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
+ t_star_cuda_h2d_event = s.record_event()
+ # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
+ # done in the other stream
+ t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
+ t3_cuda_h2d_event = torch.cuda.current_stream().record_event()
+ t_star_cuda_h2d_event.synchronize()
+ t3_cuda_h2d_event.synchronize()
+
+
+# Our profiler: profiles the `inner` function and stores the results in a .json file
+def benchmark_with_profiler(
+ pinned,
+ streamed,
+) -> None:
+ torch._C._profiler._set_cuda_sync_enabled_val(True)
+ wait, warmup, active = 1, 1, 2
+ num_steps = wait + warmup + active
+ rank = 0
+ with torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ schedule=torch.profiler.schedule(
+ wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
+ ),
+ ) as prof:
+ for step_idx in range(1, num_steps + 1):
+ inner(streamed=streamed, pinned=pinned)
+ if rank is None or rank == 0:
+ prof.step()
+ prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")
diff --git a/tutorials/01-mine/custom_autograd.py b/tutorials/01-mine/custom_autograd.py
new file mode 100644
index 00000000..181b434d
--- /dev/null
+++ b/tutorials/01-mine/custom_autograd.py
@@ -0,0 +1,88 @@
+import torch
+import math
+
+
+class LegendrePolynomial3(torch.autograd.Function):
+ """
+ We can implement our own custom autograd Functions by subclassing
+ torch.autograd.Function and implementing the forward and backward passes
+ which operate on Tensors.
+ """
+
+ @staticmethod
+ def forward(ctx, input):
+ """
+ In the forward pass we receive a Tensor containing the input and return
+ a Tensor containing the output. ctx is a context object that can be used
+ to stash information for backward computation. You can cache tensors for
+ use in the backward pass using the ``ctx.save_for_backward`` method. Other
+ objects can be stored directly as attributes on the ctx object, such as
+        ``ctx.my_object = my_object``. See the "Extending torch.autograd"
+        documentation for further details.
+ """
+ ctx.save_for_backward(input)
+ return 0.5 * (5 * input ** 3 - 3 * input)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ """
+ In the backward pass we receive a Tensor containing the gradient of the loss
+ with respect to the output, and we need to compute the gradient of the loss
+ with respect to the input.
+ """
+ input, = ctx.saved_tensors
+ return grad_output * 1.5 * (5 * input ** 2 - 1)
+
+
+dtype = torch.float
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+
+# Create Tensors to hold input and outputs.
+# By default, requires_grad=False, which indicates that we do not need to
+# compute gradients with respect to these Tensors during the backward pass.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+y = torch.sin(x)
+
+# Create random Tensors for weights. For this example, we need
+# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
+# not too far from the correct result to ensure convergence.
+# Setting requires_grad=True indicates that we want to compute gradients with
+# respect to these Tensors during the backward pass.
+a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
+c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)
+
+learning_rate = 5e-6
+for t in range(2000):
+ # To apply our Function, we use Function.apply method. We alias this as 'P3'.
+ P3 = LegendrePolynomial3.apply
+
+ # Forward pass: compute predicted y using operations; we compute
+ # P3 using our custom autograd operation.
+ y_pred = a + b * P3(c + d * x)
+
+ # Compute and print loss
+ loss = (y_pred - y).pow(2).sum()
+ if t % 100 == 99:
+ print(t, loss.item())
+
+ # Use autograd to compute the backward pass.
+ loss.backward()
+
+ # Update weights using gradient descent
+ with torch.no_grad():
+ a -= learning_rate * a.grad
+ b -= learning_rate * b.grad
+ c -= learning_rate * c.grad
+ d -= learning_rate * d.grad
+
+ # Manually zero the gradients after updating weights
+ a.grad = None
+ b.grad = None
+ c.grad = None
+ d.grad = None
+
+print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')
\ No newline at end of file
diff --git a/tutorials/01-mine/dataset_demo.py b/tutorials/01-mine/dataset_demo.py
new file mode 100644
index 00000000..9e91ff6a
--- /dev/null
+++ b/tutorials/01-mine/dataset_demo.py
@@ -0,0 +1,87 @@
+import os
+import pandas as pd
+from torchvision.io import decode_image
+import torch
+from torch.utils.data import Dataset
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+import matplotlib.pyplot as plt
+from torch.utils.data import DataLoader
+
+
+
+class CustomImageDataset(Dataset):
+ def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
+ self.img_labels = pd.read_csv(annotations_file)
+ self.img_dir = img_dir
+ self.transform = transform
+ self.target_transform = target_transform
+
+ def __len__(self):
+ return len(self.img_labels)
+
+ def __getitem__(self, idx):
+ img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
+ image = decode_image(img_path)
+ label = self.img_labels.iloc[idx, 1]
+ if self.transform:
+ image = self.transform(image)
+ if self.target_transform:
+ label = self.target_transform(label)
+ return image, label
+
+
+def plt_show(training_data):
+ labels_map = {
+ 0: "T-Shirt",
+ 1: "Trouser",
+ 2: "Pullover",
+ 3: "Dress",
+ 4: "Coat",
+ 5: "Sandal",
+ 6: "Shirt",
+ 7: "Sneaker",
+ 8: "Bag",
+ 9: "Ankle Boot",
+ }
+ figure = plt.figure(figsize=(8, 8))
+ cols, rows = 3, 3
+ for i in range(1, cols * rows + 1):
+ sample_idx = torch.randint(len(training_data), size=(1,)).item()
+ img, label = training_data[sample_idx]
+ figure.add_subplot(rows, cols, i)
+ plt.title(labels_map[label])
+ plt.axis("off")
+ plt.imshow(img.squeeze(), cmap="gray")
+ plt.show()
+
+
+if __name__ == '__main__':
+ training_data = datasets.FashionMNIST(
+ root="data",
+ train=True,
+ download=True,
+ transform=ToTensor()
+ )
+
+ test_data = datasets.FashionMNIST(
+ root="data",
+ train=False,
+ download=True,
+ transform=ToTensor()
+ )
+ # plt_show(training_data)
+
+ train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
+ test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
+ # Display image and label.
+ train_features, train_labels = next(iter(train_dataloader))
+ print(f"Feature batch shape: {train_features.size()}")
+ print(f"Labels batch shape: {train_labels.size()}")
+ img = train_features[0].squeeze()
+ label = train_labels[0]
+ plt.imshow(img, cmap="gray")
+ plt.show()
+ print(f"Label: {label}")
\ No newline at end of file
diff --git a/tutorials/01-mine/neural_network.md b/tutorials/01-mine/neural_network.md
new file mode 100644
index 00000000..31366da2
--- /dev/null
+++ b/tutorials/01-mine/neural_network.md
@@ -0,0 +1,165 @@
+## 1. **The essence of a neural network: a data-processing "pipeline"**
+
+In PyTorch, a **neural network** is essentially a **computation graph made of trainable parameters (weights and biases)**. Its job is to:
+
+- take an input (usually a tensor, e.g. an image's pixel matrix or a text's vector representation)
+- apply a series of **linear transformations** (matrix multiplications) and **non-linear activations** (e.g. ReLU, Sigmoid)
+- produce a prediction (e.g. class probabilities for classification, a numeric value for regression)
+
+You can picture it as a **"smart function"**:
+
+\(y = f(x; W, b)\)
+
+where:
+
+- x is the input (a tensor)
+- W and b are the network's **trainable parameters** (tensors)
+- f is the network's computation logic (composed of multiple layers)
+- y is the output (a tensor)
+
+------
+
+## 2. **Core components of a PyTorch neural network**
+
+In PyTorch, a neural network is usually built from the following key parts:
+
+### (1) `torch.nn.Module`: the network's "container"
+
+- Every neural network must inherit from the `torch.nn.Module` class
+
+- It is a **parameterized container** that can hold:
+
+  - the network's layers (e.g. `nn.Linear`, `nn.Conv2d`, `nn.ReLU`)
+  - trainable parameters (`nn.Parameter`)
+  - custom computation logic
+
+For example:
+
+```python
+import torch
+import torch.nn as nn
+
+class MyNet(nn.Module):
+ def __init__(self):
+ super(MyNet, self).__init__()
+        # Define the layers (these hold the trainable parameters)
+        self.fc1 = nn.Linear(10, 20)  # 10-dim input, 20-dim output
+        self.relu = nn.ReLU()  # nonlinear activation
+        self.fc2 = nn.Linear(20, 2)  # 2 output classes
+
+ def forward(self, x):
+        # Define the path the data flows along (the forward pass)
+        x = self.fc1(x)  # linear transform: x @ W1 + b1
+        x = self.relu(x)  # nonlinear activation
+        x = self.fc2(x)  # linear transform: x @ W2 + b2
+ return x
+```
+
+------
+
+### (2) **Layers: the network's "basic units"**
+
+Layers are the core components of a neural network. Each layer is a **parameterized function** that applies a specific transformation to the input tensor. Common layers include:
+
+| Layer type           | Purpose                                               | Math (simplified)                        |
+| -------------------- | ----------------------------------------------------- | ---------------------------------------- |
+| `nn.Linear(in, out)` | Linear transform (fully connected layer)              | \(y = xW^T + b\)                          |
+| `nn.Conv2d(in, out)` | 2-D convolution (extracts spatial features)           | \(y = \text{Conv}(x, W) + b\)             |
+| `nn.ReLU()`          | Nonlinear activation (adds expressive power)          | \(y = \max(0, x)\)                        |
+| `nn.Softmax(dim)`    | Normalizes the output into a probability distribution | \(y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}\)  |
+
+Each of these layers is, at bottom, **an operation on tensors**, and a layer's `weight` and `bias` are **trainable tensors** (of type `nn.Parameter`) that gradient descent updates during training.
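+
+A quick check of this (a minimal sketch, reusing the imports above):
+
+```python
+fc = nn.Linear(10, 20)
+print(type(fc.weight))           # <class 'torch.nn.parameter.Parameter'>
+print(fc.weight.requires_grad)   # True: it will be updated by gradient descent
+print(fc.weight.shape)           # torch.Size([20, 10])
+```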
+
+------
+
+### (3) **Forward pass (`forward`): the path the data flows along**
+
+- The `forward` method defines **how data moves through the network**
+- The input tensor `x` passes through each layer's transformation in turn, yielding the output tensor
+- This is exactly **how the computation graph is built** (PyTorch records the operations automatically for the backward pass)
+
+For example:
+
+```python
+net = MyNet()
+x = torch.randn(3, 10)  # 3 samples, 10 features each
+y = net(x)  # forward pass: x → fc1 → relu → fc2 → y
+print(y.shape)  # torch.Size([3, 2])
+```
+
+------
+
+### (4) **Parameters: the network's "trainable variables"**
+
+- The network's parameters (weights W and biases b) are tensors of type `nn.Parameter`
+- They are registered automatically and exposed through the network's `parameters()` and `named_parameters()` methods
+- During training, an optimizer (e.g. `torch.optim.SGD`) updates them according to their gradients
+
+To inspect the network's parameters:
+
+```python
+for name, param in net.named_parameters():
+ print(name, param.shape)
+```
+
+Output:
+
+```plaintext
+fc1.weight torch.Size([20, 10])
+fc1.bias torch.Size([20])
+fc2.weight torch.Size([2, 20])
+fc2.bias torch.Size([2])
+```
+
+------
+
+## 3. **How a neural network "learns"**
+
+A neural network gets its "intelligence" from the fact that its parameters are updated in a **data-driven way** (the training process). The core steps are:
+
+### (1) Define a loss function (measures the gap between predictions and ground truth)
+
+```python
+loss_fn = nn.CrossEntropyLoss()  # a common loss for classification
+```
+
+### (2) Define an optimizer (responsible for updating the parameters)
+
+```python
+optimizer = torch.optim.SGD(net.parameters(), lr=0.01)  # stochastic gradient descent
+```
+
+### (3) Training loop (forward pass → compute loss → backward pass → update parameters)
+
+```python
+for epoch in range(100):
+    # Forward pass
+    y_pred = net(x)
+
+    # Compute the loss
+    loss = loss_fn(y_pred, y_true)  # y_true is the ground-truth label
+
+    # Backward pass (compute the gradients)
+    optimizer.zero_grad()  # clear gradients from the previous iteration
+    loss.backward()  # automatically compute gradients for the parameters
+
+    # Update the parameters (gradient descent)
+    optimizer.step()
+```
+
+Through this process the parameters are adjusted step by step, and the predictions get closer and closer to the ground truth.
+
+------
+
+## 4. **Strengths of PyTorch neural networks**
+
+- **Modular**: layers compose like building blocks
+- **Automatic differentiation**: `backward()` computes gradients automatically, no manual derivation needed
+- **Flexible**: `forward` can contain arbitrarily complex logic (loops, conditionals)
+- **GPU acceleration**: a single `.to("cuda")` call runs the "smart function" on the GPU, as sketched below
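+
+A minimal sketch (assuming a CUDA device is available; it falls back to CPU otherwise):
+
+```python
+device = "cuda" if torch.cuda.is_available() else "cpu"
+net = MyNet().to(device)            # move the parameters to the device
+x = torch.randn(3, 10).to(device)   # inputs must live on the same device
+y = net(x)
+```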
\ No newline at end of file
diff --git a/tutorials/01-mine/tensors_demo.py b/tutorials/01-mine/tensors_demo.py
new file mode 100644
index 00000000..3bbede3f
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo.py
@@ -0,0 +1,69 @@
+import torch
+import numpy as np
+
+
+if __name__ == '__main__':
+ # Initializing a Tensor
+ data = [[1, 2], [3, 4]]
+ x_data = torch.tensor(data)
+ np_array = np.array(data)
+ x_np = torch.from_numpy(np_array)
+ print(f"x_np.numpy() == np_array: {x_np.numpy() == np_array}")
+ # The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.
+ x_ones = torch.ones_like(x_data) # retains the properties of x_data
+ print(f"Ones Tensor: \n {x_ones} \n")
+ x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data
+ print(f"Random Tensor: \n {x_rand} \n")
+ shape = (2, 3,)
+ rand_tensor = torch.rand(shape)
+ ones_tensor = torch.ones(shape)
+ zeros_tensor = torch.zeros(shape)
+
+ print(f"Random Tensor: \n {rand_tensor} \n")
+ print(f"Ones Tensor: \n {ones_tensor} \n")
+ print(f"Zeros Tensor: \n {zeros_tensor}")
+
+ tensor = torch.rand(3, 4)
+
+ print(f"Shape of tensor: {tensor.shape}")
+ print(f"Datatype of tensor: {tensor.dtype}")
+ print(f"Device tensor is stored on: {tensor.device}")
+ # Operations on Tensors
+    # Over 1200 tensor operations are available: arithmetic, linear algebra, matrix manipulation (transposing, indexing, slicing), sampling, and more
+    # By default, tensors are created on the CPU. They must be moved to an accelerator explicitly with .to (after checking accelerator availability). Keep in mind that copying large tensors across devices can be expensive in both time and memory!
+ # We move our tensor to the current accelerator if available
+ device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+ print(f"Using {device} device")
+ # tensor = tensor.to(device)
+ tensor = torch.ones(4, 4)
+ print(f"First row: {tensor[0]}")
+ print(f"First column: {tensor[:, 0]}")
+ print(f"Last column: {tensor[..., -1]}")
+ tensor[:, 1] = 0
+ print(tensor)
+    # Joining tensors
+ t1 = torch.cat([tensor, tensor, tensor], dim=1)
+ print(f"t1: \n {t1} \n")
+    # Arithmetic operations
+ # This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value
+ # ``tensor.T`` returns the transpose of a tensor
+ print(f"tensor: \n {tensor} \n")
+ print(f"tensor.T: \n {tensor.T} \n")
+    y1 = tensor @ tensor.T  # matrix multiplication
+    y2 = tensor.matmul(tensor.T)  # matrix multiplication
+
+ y3 = torch.rand_like(y1)
+ print(f"y1: \n {y1} \n, \ny2: \n {y2} \n, \ny3: \n {y3}")
+ torch.matmul(tensor, tensor.T, out=y3)
+
+ # This computes the element-wise product. z1, z2, z3 will have the same value
+ z1 = tensor * tensor
+ z2 = tensor.mul(tensor)
+
+ z3 = torch.rand_like(tensor)
+ torch.mul(tensor, tensor, out=z3)
+ print(f"agg tensor: \n {tensor} \n")
+    # tensor.sum() adds up all elements of the tensor and returns a scalar (a single-element tensor).
+ agg = tensor.sum()
+ agg_item = agg.item()
+ print(agg_item, type(agg_item))
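+    # A small extra sketch (not from the original tutorial): operations with a
+    # trailing underscore mutate the tensor in place.
+    print(f"before add_: \n {tensor} \n")
+    tensor.add_(5)  # in-place add; no new tensor is allocated
+    print(f"after add_: \n {tensor}")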
diff --git a/tutorials/01-mine/tensors_demo2.py b/tutorials/01-mine/tensors_demo2.py
new file mode 100644
index 00000000..71d383e5
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo2.py
@@ -0,0 +1,191 @@
+import math
+
+import torch  # for all things PyTorch
+import torch.nn as nn  # for torch.nn.Module, the parent object for PyTorch models
+import torch.nn.functional as F  # for the activation functions
+
+"""
+    Tensor operations
+"""
+
+
+def tensors_demo():
+ z = torch.zeros(5, 3)
+ print(z)
+    # These zeros are 32-bit floats, PyTorch's default dtype.
+ print(z.dtype)
+ i = torch.ones((5, 3), dtype=torch.int16)
+ print(i)
+ print(i.dtype)
+ torch.manual_seed(1729)
+ r1 = torch.rand(2, 2)
+ print('A random tensor:')
+ print(r1)
+
+ r2 = torch.rand(2, 2)
+ print('\nA different random tensor:')
+ print(r2) # new values
+
+ torch.manual_seed(1729)
+ r3 = torch.rand(2, 2)
+ print('\nShould match r1:')
+ print(r3) # repeats values of r1 because of re-seed
+ ones = torch.ones(2, 3)
+ print(ones)
+
+ twos = torch.ones(2, 3) * 2 # every element is multiplied by 2
+ print(twos)
+
+    threes = ones + twos  # addition works because the shapes match
+ print(threes) # tensors are added element-wise
+ print(threes.shape) # this has the same dimensions as input tensors
+
+ r1 = torch.rand(2, 3)
+ r2 = torch.rand(3, 2)
+ # uncomment this line to get a runtime error
+ # r3 = r1 + r2
+ r = (torch.rand(2, 2) - 0.5) * 2 # values between -1 and 1
+ print('A random matrix, r:')
+ print(r)
+
+ # Common mathematical operations are supported:
+ print('\nAbsolute value of r:')
+ print(torch.abs(r))
+
+ # ...as are trigonometric functions:
+ print('\nInverse sine of r:')
+ """
+    Take the arcsine of each element
+ """
+ print(torch.asin(r))
+
+ # ...and linear algebra operations like determinant and singular value decomposition
+ print('\nDeterminant of r:')
+ """
+    Computes the determinant of a square matrix. The determinant is a scalar
+    describing the scaling factor and orientation change of the linear map, e.g.
+    r = torch.tensor([[1.0, 2.0],
+                      [3.0, 4.0]])
+    torch.det(r)  # = 1*4 - 2*3 = -2
+ """
+ print(torch.det(r))
+ print('\nSingular value decomposition of r:')
+ print(torch.svd(r))
+
+ # ...and statistical and aggregate operations:
+ print('\nAverage and standard deviation of r:')
+    # Computes the standard deviation and the mean of the tensor in one call.
+ print(torch.std_mean(r))
+ print('\nMaximum value of r:')
+ print(torch.max(r))
+
+
+class LeNet(nn.Module):
+
+ def __init__(self):
+ super(LeNet, self).__init__()
+ # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
+ # kernel
+ self.conv1 = nn.Conv2d(1, 6, 5)
+ self.conv2 = nn.Conv2d(6, 16, 5)
+ # an affine operation: y = Wx + b
+ self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
+ self.fc2 = nn.Linear(120, 84)
+ self.fc3 = nn.Linear(84, 10)
+    # Forward pass
+    """
+    Forward pass
+    The forward pass runs input data through the network to produce the model's
+    prediction. Picture it like this:
+    the input (say an image or a piece of text) enters the first layer and then
+    passes through each layer's operation in turn (convolution, matrix
+    multiplication, activation functions, ...).
+    Each layer transforms the previous layer's output, and the final layer
+    yields the prediction (class probabilities for classification, a numeric
+    value for regression).
+    What is it for?
+    Producing predictions: given a cat photo, the forward pass outputs the
+    probability the model assigns to "cat".
+    Computing the loss: the prediction is compared with the true label ("cat")
+    and a loss function measures the prediction error.
+
+    Backward pass
+    The backward pass starts from the loss and works back through the layers,
+    computing each parameter's gradient via the chain rule. A gradient measures
+    how much a parameter change affects the loss; an optimizer (SGD, Adam, ...)
+    uses these gradients to update the parameters and shrink the loss.
+    What is it for?
+    Computing gradients for every parameter (weights w, biases b) and updating
+    them so the model's predictions approach the ground truth.
+
+    Forward pass: input -> model -> prediction -> loss.
+    Backward pass: loss -> gradients -> parameter updates.
+    """
+ def forward(self, x):
+ # Max pooling over a (2, 2) window
+ x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+ # If the size is a square you can only specify a single number
+ x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+ x = x.view(-1, self.num_flat_features(x))
+ x = F.relu(self.fc1(x))
+ x = F.relu(self.fc2(x))
+ x = self.fc3(x)
+ return x
+
+ def num_flat_features(self, x):
+ size = x.size()[1:] # all dimensions except the batch dimension
+ num_features = 1
+ for s in size:
+ num_features *= s
+ return num_features
+
+
+def pytorch_model_demo():
+ net = LeNet()
+ print(net)
+ input = torch.rand(1, 1, 32, 32) # stand-in for a 32x32 black & white image
+ print('\nImage batch shape:')
+ print(input.shape)
+
+ output = net(input) # we don't call forward() directly
+ print('\nRaw output:')
+ print(output)
+ print(output.shape)
+
+
+def tensors_demo1():
+ x = torch.empty(3, 4)
+ print(type(x))
+ print(x)
+    # Random tensors and seeds
+ torch.manual_seed(1729)
+ random1 = torch.rand(2, 3)
+ print(random1)
+
+ random2 = torch.rand(2, 3)
+ print(random2)
+
+ torch.manual_seed(1729)
+ random3 = torch.rand(2, 3)
+ print(random3)
+
+ random4 = torch.rand(2, 3)
+ print(random4)
+ # In Brief: Tensor Broadcasting
+    # Broadcasting lets element-wise operations work on tensors of different
+    # shapes by "virtually expanding" them until the shapes are compatible.
+    """
+    The core idea of broadcasting:
+    no data is actually copied; tensors are expanded logically along the
+    required dimensions, following a fixed set of rules that aligns tensors
+    of different shapes automatically.
+    The two broadcasting rules:
+    Rule 1: dimension alignment
+    Compare the two tensors' dimensions starting from the last one:
+    equal sizes -> compatible;
+    one size is 1 -> compatible (that dimension gets expanded);
+    different sizes, neither 1 -> incompatible, an error is raised.
+    Rule 2: dimension expansion
+    A size-1 dimension is (logically) replicated along that axis until it
+    matches the other tensor's corresponding size.
+    """
+ rand = torch.rand(2, 4)
+ doubled = rand * (torch.ones(1, 4) * 2)
+
+ print(f"rand: {rand}")
+ print(f"doubled: {doubled}")
+
+if __name__ == '__main__':
+ tensors_demo1()
diff --git a/tutorials/01-mine/transforms.py b/tutorials/01-mine/transforms.py
new file mode 100644
index 00000000..c8dd4645
--- /dev/null
+++ b/tutorials/01-mine/transforms.py
@@ -0,0 +1,21 @@
+import torch
+from torchvision import datasets
+from torchvision.transforms import ToTensor, Lambda
+
+
+"""
+Data does not always come in the final processed form required for training
+machine learning algorithms. We use transforms to perform some manipulation
+of the data and make it suitable for training.
+All TorchVision datasets have two parameters: transform to modify the features
+and target_transform to modify the labels, both accepting callables that hold
+the transformation logic. The torchvision.transforms module offers several
+commonly used transforms out of the box.
+"""
+if __name__ == '__main__':
+    # ToTensor converts a PIL image or NumPy ndarray into a FloatTensor and
+    # scales the pixel values into the range [0., 1.].
+    # A minimal sketch of the standard FashionMNIST example that the imports
+    # (ToTensor, Lambda) suggest; Lambda here one-hot encodes the integer label.
+    ds = datasets.FashionMNIST(
+        root="data", train=True, download=True,
+        transform=ToTensor(),
+        target_transform=Lambda(lambda y: torch.zeros(10, dtype=torch.float).scatter_(0, torch.tensor(y), value=1)),
+    )
+This tutorial is part of a three-part series:
+
+* `NLP From Scratch: Classifying Names with a Character-Level RNN `__
+* `NLP From Scratch: Generating Names with a Character-Level RNN `__
+* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__
+
+This is the third and final tutorial on doing **NLP From Scratch**, where we
+write our own classes and functions to preprocess the data to do our NLP
+modeling tasks.
+
+In this project we will be teaching a neural network to translate from
+French to English.
+
+.. code-block:: sh
+
+ [KEY: > input, = target, < output]
+
+ > il est en train de peindre un tableau .
+ = he is painting a picture .
+ < he is painting a picture .
+
+ > pourquoi ne pas essayer ce vin delicieux ?
+ = why not try that delicious wine ?
+ < why not try that delicious wine ?
+
+ > elle n est pas poete mais romanciere .
+ = she is not a poet but a novelist .
+ < she not not a poet but a novelist .
+
+ > vous etes trop maigre .
+ = you re too skinny .
+ < you re all alone .
+
+... to varying degrees of success.
+
+This is made possible by the simple but powerful idea of the `sequence
+to sequence network `__, in which two
+recurrent neural networks work together to transform one sequence to
+another. An encoder network condenses an input sequence into a vector,
+and a decoder network unfolds that vector into a new sequence.
+
+.. figure:: /_static/img/seq-seq-images/seq2seq.png
+ :alt:
+
+To improve upon this model we'll use an `attention
+mechanism `__, which lets the decoder
+learn to focus over a specific range of the input sequence.
+
+**Recommended Reading:**
+
+I assume you have at least installed PyTorch, know Python, and
+understand Tensors:
+
+- https://pytorch.org/ For installation instructions
+- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general
+- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview
+- :doc:`/beginner/former_torchies_tutorial` if you are a former Lua Torch user
+
+
+It would also be useful to know about Sequence to Sequence networks and
+how they work:
+
+- `Learning Phrase Representations using RNN Encoder-Decoder for
+ Statistical Machine Translation `__
+- `Sequence to Sequence Learning with Neural
+ Networks `__
+- `Neural Machine Translation by Jointly Learning to Align and
+ Translate `__
+- `A Neural Conversational Model `__
+
+You will also find the previous tutorials on
+:doc:`/intermediate/char_rnn_classification_tutorial`
+and :doc:`/intermediate/char_rnn_generation_tutorial`
+helpful as those concepts are very similar to the Encoder and Decoder
+models, respectively.
+
+**Requirements**
+"""
+from __future__ import unicode_literals, print_function, division
+from io import open
+import unicodedata
+import re
+import random
+
+import torch
+import torch.nn as nn
+from torch import optim
+import torch.nn.functional as F
+
+import time
+import math
+
+import numpy as np
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# Special token indices
+SOS_token = 0
+EOS_token = 1
+
+class Lang:
+ def __init__(self, name):
+ self.name = name
+ self.word2index = {}
+ self.word2count = {}
+ self.index2word = {0: "SOS", 1: "EOS"}
+ self.n_words = 2 # Count SOS and EOS
+
+ def addSentence(self, sentence):
+ """添加整个句子到词汇表"""
+ # 检查句子是否包含中文字符
+ has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence)
+ if has_chinese: # 中文按字符处理
+ for char in sentence:
+ self.addWord(char)
+ else: # 英文按单词处理
+ for word in sentence.split(' '):
+ self.addWord(word)
+
+ def addWord(self, word):
+ """添加单个词到词汇表"""
+ if word not in self.word2index:
+ self.word2index[word] = self.n_words
+ self.word2count[word] = 1
+ self.index2word[self.n_words] = word
+ self.n_words += 1
+ else:
+ self.word2count[word] += 1
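+
+# A tiny usage sketch: a fresh Lang starts at n_words == 2 (SOS, EOS), so after
+# Lang('eng').addSentence('hello world') the count n_words is 4.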
+
+def unicodeToAscii(s):
+ return ''.join(
+ c for c in unicodedata.normalize('NFD', s)
+ if unicodedata.category(c) != 'Mn'
+ )
+
+
+def normalizeString(s):
+ s = s.strip()
+    # Treat a sentence that starts with an ASCII letter as English
+    if s and s[0].isalpha() and s[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
+        # English: strip accents, lowercase, and space out punctuation
+        s = unicodeToAscii(s.lower())
+        s = re.sub(r"([.!?])", r" \1", s)
+        s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
+    else:
+        # Chinese: only collapse runs of whitespace
+        s = re.sub(r'\s+', ' ', s)
+ return s.strip()
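+
+# Example of the intended behaviour (an illustrative sketch, not from the
+# original file): normalizeString("Je suis fatigué !") -> "je suis fatigue !"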
+
+def readLangs(lang1, lang2, reverse=False):
+ print("Reading lines...")
+
+ # Read the file and split into lines
+ lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). \
+ read().strip().split('\n')
+
+ # Split every line into pairs and normalize
+ pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
+
+ # Reverse pairs, make Lang instances
+ if reverse:
+ pairs = [list(reversed(p)) for p in pairs]
+ input_lang = Lang(lang2)
+ output_lang = Lang(lang1)
+ else:
+ input_lang = Lang(lang1)
+ output_lang = Lang(lang2)
+
+ return input_lang, output_lang, pairs
+
+
+MAX_LENGTH = 10
+
+# A more permissive list of English prefixes
+eng_prefixes = (
+ "i ", "you ", "he ", "she ", "we ", "they ",
+ "it ", "this ", "that ", "there ", "the ",
+ "a ", "an ", "my ", "your ", "his ", "her ",
+ "our ", "their "
+)
+
+
+def filterPair(p):
+    # Decide whether the input is English or Chinese and measure length accordingly
+ if p[0] and p[0][0].isalpha() and p[0][0].lower() in 'abcdefghijklmnopqrstuvwxyz':
+ input_length = len(p[0].split(' '))
+ else:
+ input_length = len(p[0])
+
+ output_length = len(p[1].split(' '))
+
+    # Keep only pairs of suitable length; the eng_prefixes restriction is dropped
+ return input_length <= MAX_LENGTH - 1 and output_length <= MAX_LENGTH - 1
+
+
+def filterPairs(pairs):
+ return [pair for pair in pairs if filterPair(pair)]
+
+def prepareData(lang1, lang2, reverse=False):
+ input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
+ print("Read %s sentence pairs" % len(pairs))
+ pairs = filterPairs(pairs)
+ print("前5个句子对示例:")
+ for i, pair in enumerate(pairs[:5]):
+ print(f" 中文: {pair[0]}")
+ print(f" 英文: {pair[1]}")
+ print()
+ print("Trimmed to %s sentence pairs" % len(pairs))
+ print("Counting words...")
+ for pair in pairs:
+ input_lang.addSentence(pair[0])
+ output_lang.addSentence(pair[1])
+ print("Counted words:")
+ print(input_lang.name, input_lang.n_words)
+ print(output_lang.name, output_lang.n_words)
+ return input_lang, output_lang, pairs
+
+
+input_lang, output_lang, pairs = prepareData('cmn', 'eng', True)
+print(random.choice(pairs))
+
+class EncoderRNN(nn.Module):
+ def __init__(self, input_size, hidden_size, dropout_p=0.1):
+ super(EncoderRNN, self).__init__()
+ self.hidden_size = hidden_size
+
+ self.embedding = nn.Embedding(input_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
+ self.dropout = nn.Dropout(dropout_p)
+
+ def forward(self, input):
+ embedded = self.dropout(self.embedding(input))
+ output, hidden = self.gru(embedded)
+ return output, hidden
+
+
+class DecoderRNN(nn.Module):
+ def __init__(self, hidden_size, output_size):
+ super(DecoderRNN, self).__init__()
+ self.embedding = nn.Embedding(output_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
+ self.out = nn.Linear(hidden_size, output_size)
+
+ def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
+ batch_size = encoder_outputs.size(0)
+ decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
+ decoder_hidden = encoder_hidden
+ decoder_outputs = []
+
+ for i in range(MAX_LENGTH):
+ decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
+ decoder_outputs.append(decoder_output)
+
+ if target_tensor is not None:
+ # Teacher forcing: Feed the target as the next input
+ decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
+ else:
+ # Without teacher forcing: use its own predictions as the next input
+ _, topi = decoder_output.topk(1)
+ decoder_input = topi.squeeze(-1).detach() # detach from history as input
+
+ decoder_outputs = torch.cat(decoder_outputs, dim=1)
+ decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
+ return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop
+
+ def forward_step(self, input, hidden):
+ output = self.embedding(input)
+ output = F.relu(output)
+ output, hidden = self.gru(output, hidden)
+ output = self.out(output)
+ return output, hidden
+
+class BahdanauAttention(nn.Module):
+ def __init__(self, hidden_size):
+ super(BahdanauAttention, self).__init__()
+ self.Wa = nn.Linear(hidden_size, hidden_size)
+ self.Ua = nn.Linear(hidden_size, hidden_size)
+ self.Va = nn.Linear(hidden_size, 1)
+
+ def forward(self, query, keys):
+ scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
+ scores = scores.squeeze(2).unsqueeze(1)
+
+ weights = F.softmax(scores, dim=-1)
+ context = torch.bmm(weights, keys)
+
+ return context, weights
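+
+# Shape sketch for the attention above (assuming batch_first tensors):
+#   query:   (batch, 1, hidden)        - current decoder hidden state
+#   keys:    (batch, seq_len, hidden)  - encoder outputs
+#   scores:  (batch, 1, seq_len) after the squeeze/unsqueeze pair
+#   context: (batch, 1, hidden) = weights @ keys (via torch.bmm)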
+
+
+class AttnDecoderRNN(nn.Module):
+ def __init__(self, hidden_size, output_size, dropout_p=0.1):
+ super(AttnDecoderRNN, self).__init__()
+ self.embedding = nn.Embedding(output_size, hidden_size)
+ self.attention = BahdanauAttention(hidden_size)
+ self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
+ self.out = nn.Linear(hidden_size, output_size)
+ self.dropout = nn.Dropout(dropout_p)
+
+ def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
+ batch_size = encoder_outputs.size(0)
+ decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
+ decoder_hidden = encoder_hidden
+ decoder_outputs = []
+ attentions = []
+
+ for i in range(MAX_LENGTH):
+ decoder_output, decoder_hidden, attn_weights = self.forward_step(
+ decoder_input, decoder_hidden, encoder_outputs
+ )
+ decoder_outputs.append(decoder_output)
+ attentions.append(attn_weights)
+
+ if target_tensor is not None:
+ # Teacher forcing: Feed the target as the next input
+ decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
+ else:
+ # Without teacher forcing: use its own predictions as the next input
+ _, topi = decoder_output.topk(1)
+ decoder_input = topi.squeeze(-1).detach() # detach from history as input
+
+ decoder_outputs = torch.cat(decoder_outputs, dim=1)
+ decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
+ attentions = torch.cat(attentions, dim=1)
+
+ return decoder_outputs, decoder_hidden, attentions
+
+ def forward_step(self, input, hidden, encoder_outputs):
+ embedded = self.dropout(self.embedding(input))
+
+ query = hidden.permute(1, 0, 2)
+ context, attn_weights = self.attention(query, encoder_outputs)
+ input_gru = torch.cat((embedded, context), dim=2)
+
+ output, hidden = self.gru(input_gru, hidden)
+ output = self.out(output)
+
+ return output, hidden, attn_weights
+
+
+def indexesFromSentence(lang, sentence):
+ has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence)
+ indexes = []
+
+ if has_chinese:
+ for char in sentence:
+ if char in lang.word2index:
+ indexes.append(lang.word2index[char])
+ else:
+                indexes.append(2)  # index 2 is assumed to be UNK
+ else:
+ for word in sentence.split(' '):
+ if word in lang.word2index:
+ indexes.append(lang.word2index[word])
+ else:
+                indexes.append(2)  # index 2 is assumed to be UNK
+ return indexes
+
+
+def tensorFromSentence(lang, sentence):
+ indexes = indexesFromSentence(lang, sentence)
+ indexes.append(EOS_token)
+ return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)
+
+
+def tensorsFromPair(pair):
+ input_tensor = tensorFromSentence(input_lang, pair[0])
+ target_tensor = tensorFromSentence(output_lang, pair[1])
+ return (input_tensor, target_tensor)
+
+
+def get_dataloader(batch_size):
+    # Uses the module-level input_lang, output_lang, and pairs
+ n = len(pairs)
+ input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
+ target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
+
+ for idx, (inp, tgt) in enumerate(pairs):
+ inp_ids = indexesFromSentence(input_lang, inp)
+ tgt_ids = indexesFromSentence(output_lang, tgt)
+
+        # Clip sentences to MAX_LENGTH-1 tokens, then append EOS_token
+ inp_ids = inp_ids[:MAX_LENGTH - 1]
+ tgt_ids = tgt_ids[:MAX_LENGTH - 1]
+
+ inp_ids.append(EOS_token)
+ tgt_ids.append(EOS_token)
+
+ input_ids[idx, :len(inp_ids)] = inp_ids
+ target_ids[idx, :len(tgt_ids)] = tgt_ids
+
+ train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
+ torch.LongTensor(target_ids).to(device))
+
+ train_sampler = RandomSampler(train_data)
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+ return input_lang, output_lang, train_dataloader
+
+def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
+ decoder_optimizer, criterion):
+ total_loss = 0
+ for data in dataloader:
+ input_tensor, target_tensor = data
+
+ encoder_optimizer.zero_grad()
+ decoder_optimizer.zero_grad()
+
+ encoder_outputs, encoder_hidden = encoder(input_tensor)
+ decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)
+
+ loss = criterion(
+ decoder_outputs.view(-1, decoder_outputs.size(-1)),
+ target_tensor.view(-1)
+ )
+ loss.backward()
+
+ encoder_optimizer.step()
+ decoder_optimizer.step()
+
+ total_loss += loss.item()
+
+ return total_loss / len(dataloader)
+
+
+
+def asMinutes(s):
+ m = math.floor(s / 60)
+ s -= m * 60
+ return '%dm %ds' % (m, s)
+
+
+def timeSince(since, percent):
+ now = time.time()
+ s = now - since
+ es = s / (percent)
+ rs = es - s
+ return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
+
+
+def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
+ print_every=100, plot_every=100):
+ start = time.time()
+ plot_losses = []
+ print_loss_total = 0 # Reset every print_every
+ plot_loss_total = 0 # Reset every plot_every
+
+ encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
+ decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
+ criterion = nn.NLLLoss()
+
+ for epoch in range(1, n_epochs + 1):
+ loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
+ print_loss_total += loss
+ plot_loss_total += loss
+
+ if epoch % print_every == 0:
+ print_loss_avg = print_loss_total / print_every
+ print_loss_total = 0
+ print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
+ epoch, epoch / n_epochs * 100, print_loss_avg))
+
+ if epoch % plot_every == 0:
+ plot_loss_avg = plot_loss_total / plot_every
+ plot_losses.append(plot_loss_avg)
+ plot_loss_total = 0
+
+ showPlot(plot_losses)
+
+
+import matplotlib.pyplot as plt
+
+plt.switch_backend('agg')
+import matplotlib.ticker as ticker
+import numpy as np
+
+
+def showPlot(points):
+ plt.figure()
+ fig, ax = plt.subplots()
+ # this locator puts ticks at regular intervals
+ loc = ticker.MultipleLocator(base=0.2)
+ ax.yaxis.set_major_locator(loc)
+ plt.plot(points)
+
+def evaluate(encoder, decoder, sentence, input_lang, output_lang):
+ with torch.no_grad():
+ input_tensor = tensorFromSentence(input_lang, sentence)
+
+ encoder_outputs, encoder_hidden = encoder(input_tensor)
+ decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)
+
+ _, topi = decoder_outputs.topk(1)
+ decoded_ids = topi.squeeze()
+
+ decoded_words = []
+ for idx in decoded_ids:
+ if idx.item() == EOS_token:
+                decoded_words.append('<EOS>')
+ break
+ decoded_words.append(output_lang.index2word[idx.item()])
+ return decoded_words, decoder_attn
+
+
+def evaluateRandomly(encoder, decoder, n=10):
+ for i in range(n):
+ pair = random.choice(pairs)
+ print('>', pair[0])
+ print('=', pair[1])
+ output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
+ output_sentence = ' '.join(output_words)
+ print('<', output_sentence)
+ print('')
+
+
+hidden_size = 128
+batch_size = 32
+
+input_lang, output_lang, train_dataloader = get_dataloader(batch_size)
+
+encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
+decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
+
+# train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)
+#
+# ######################################################################
+# #
+# # Set dropout layers to ``eval`` mode
+# encoder.eval()
+# decoder.eval()
+# evaluateRandomly(encoder, decoder)
+
+
+######################################################################
+# Visualizing Attention
+# ---------------------
+#
+# A useful property of the attention mechanism is its highly interpretable
+# outputs. Because it is used to weight specific encoder outputs of the
+# input sequence, we can imagine looking where the network is focused most
+# at each time step.
+#
+# You could simply run ``plt.matshow(attentions)`` to see attention output
+# displayed as a matrix. For a better viewing experience we will do the
+# extra work of adding axes and labels:
+#
+
+def showAttention(input_sentence, output_words, attentions):
+ fig = plt.figure()
+ ax = fig.add_subplot(111)
+ cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
+ fig.colorbar(cax)
+
+ # Set up axes
+    ax.set_xticklabels([''] + input_sentence.split(' ') +
+                       ['<EOS>'], rotation=90)
+ ax.set_yticklabels([''] + output_words)
+
+ # Show label at every tick
+ ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
+ ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
+
+ plt.show()
+
+
+def evaluateAndShowAttention(input_sentence):
+    # Pick the language objects according to the input sentence's language
+    has_chinese = any('\u4e00' <= char <= '\u9fff' for char in input_sentence)
+    if has_chinese:
+        # Chinese sentence: use output_lang as the input language
+        output_words, attentions = evaluate(encoder, decoder, input_sentence, output_lang, input_lang)
+    else:
+        # English sentence: use input_lang as the input language
+        output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
+
+    # Show the result
+ print('input =', input_sentence)
+ print('output =', ' '.join(output_words))
+ showAttention(input_sentence, output_words, attentions[0, :len(output_words), :])
+
+# test_sentences = [
+#     'i am anxious',  # drawn from the training-data prefixes
+# 'he is happy',
+# 'you are welcome'
+# ]
+#
+# for sent in test_sentences:
+# evaluateAndShowAttention(sent)
+evaluateAndShowAttention('i am anxious')
+evaluateAndShowAttention('he is happy')
+evaluateAndShowAttention('you are welcome?')
diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py
new file mode 100644
index 00000000..4af9cc56
--- /dev/null
+++ b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+
+# Device configuration
+# Supports CUDA, MPS, and CPU devices: prefer MPS on Apple Silicon, then CUDA, else CPU
+device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+
+# BiRNN is short for Bidirectional Recurrent Neural Network,
+# an architecture that can use both the past and the future context of a sequence.
+# Bidirectional recurrent neural network (many-to-one)
+class BiRNN(nn.Module):
+ def __init__(self, input_size, hidden_size, num_layers, num_classes):
+ super(BiRNN, self).__init__()
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+ self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
+ self.fc = nn.Linear(hidden_size * 2, num_classes) # 2 for bidirection
+
+ def forward(self, x):
+ # Set initial states
+ h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 2 for bidirection
+ c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
+
+ # Forward propagate LSTM
+ out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2)
+
+ # Decode the hidden state of the last time step
+ out = self.fc(out[:, -1, :])
+ return out
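+
+# Quick shape check (an illustrative sketch, safe to uncomment):
+# _m = BiRNN(28, 128, 2, 10).to(device)
+# print(_m(torch.randn(4, 28, 28).to(device)).shape)  # torch.Size([4, 10])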
+
+
+if __name__ == '__main__':
+
+ # Hyper-parameters
+ sequence_length = 28
+ input_size = 28
+ hidden_size = 128
+ num_layers = 2
+ num_classes = 10
+ batch_size = 100
+ num_epochs = 2
+ learning_rate = 0.003
+
+ # MNIST dataset
+ train_dataset = torchvision.datasets.MNIST(root='../../data/',
+ train=True,
+ transform=transforms.ToTensor(),
+ download=True)
+
+ test_dataset = torchvision.datasets.MNIST(root='../../data/',
+ train=False,
+ transform=transforms.ToTensor())
+
+ # Data loader
+ train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+ batch_size=batch_size,
+ shuffle=True)
+
+ test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+ batch_size=batch_size,
+ shuffle=False)
+
+ model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device)
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss()
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+ # Train the model
+ total_step = len(train_loader)
+ for epoch in range(num_epochs):
+ for i, (images, labels) in enumerate(train_loader):
+ images = images.reshape(-1, sequence_length, input_size).to(device)
+ labels = labels.to(device)
+
+ # Forward pass
+ outputs = model(images)
+ loss = criterion(outputs, labels)
+
+ # Backward and optimize
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ if (i + 1) % 100 == 0:
+ print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+ .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+ # Test the model
+ with torch.no_grad():
+ correct = 0
+ total = 0
+ for images, labels in test_loader:
+ images = images.reshape(-1, sequence_length, input_size).to(device)
+ labels = labels.to(device)
+ outputs = model(images)
+ _, predicted = torch.max(outputs.data, 1)
+ total += labels.size(0)
+ correct += (predicted == labels).sum().item()
+
+ print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+ # Save the model checkpoint
+ torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py
deleted file mode 100644
index a0ecd773..00000000
--- a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import torch
-import torch.nn as nn
-import torchvision
-import torchvision.transforms as transforms
-
-
-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-# Hyper-parameters
-sequence_length = 28
-input_size = 28
-hidden_size = 128
-num_layers = 2
-num_classes = 10
-batch_size = 100
-num_epochs = 2
-learning_rate = 0.003
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=False,
- transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
-
-# Bidirectional recurrent neural network (many-to-one)
-class BiRNN(nn.Module):
- def __init__(self, input_size, hidden_size, num_layers, num_classes):
- super(BiRNN, self).__init__()
- self.hidden_size = hidden_size
- self.num_layers = num_layers
- self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
- self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection
-
- def forward(self, x):
- # Set initial states
- h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection
- c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
-
- # Forward propagate LSTM
- out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2)
-
- # Decode the hidden state of the last time step
- out = self.fc(out[:, -1, :])
- return out
-
-model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device)
-
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- images = images.reshape(-1, sequence_length, input_size).to(device)
- labels = labels.to(device)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.reshape(-1, sequence_length, input_size).to(device)
- labels = labels.to(device)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
- print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py
new file mode 100644
index 00000000..b200bb68
--- /dev/null
+++ b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py
@@ -0,0 +1,123 @@
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+
+
+# Convolutional neural network (two convolutional layers)
+class ConvNet(nn.Module):
+    # Layer definitions
+    def __init__(self, num_classes=10):
+        super(ConvNet, self).__init__()
+        # First conv block: 1 input channel (MNIST grayscale), 16 output channels, 5x5 kernel, stride 1, padding 2
+        # The spatial size is preserved: (28-5+2*2)/1 + 1 = 28
+        self.layer1 = nn.Sequential(
+            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
+            nn.BatchNorm2d(16),  # batch norm: speeds up and stabilizes training
+            nn.ReLU(),  # activation: introduces nonlinearity
+            nn.MaxPool2d(kernel_size=2, stride=2)  # pooling: halves the size to 14x14; a parameter-free downsampling
+        )
+        # Second conv block: 16 input channels, 32 output channels, 5x5 kernel, stride 1, padding 2
+        # The spatial size is preserved: (14-5+2*2)/1 + 1 = 14
+        self.layer2 = nn.Sequential(
+            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
+            nn.BatchNorm2d(32),  # batch norm
+            nn.ReLU(),  # activation
+            nn.MaxPool2d(kernel_size=2, stride=2)  # pooling: halves the size to 7x7
+        )
+        # Fully connected layer: 7x7x32 inputs, num_classes outputs
+        self.fc = nn.Linear(7 * 7 * 32, num_classes)
+
+    # Forward pass definition
+    def forward(self, x):
+        out = self.layer1(x)  # after layer1: [batch_size, 16, 14, 14]
+        out = self.layer2(out)  # after layer2: [batch_size, 32, 7, 7]
+        out = out.reshape(out.size(0), -1)  # flatten: [batch_size, 32*7*7]
+        out = self.fc(out)  # fully connected classifier: [batch_size, num_classes]
+ return out
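+
+# Sanity-check sketch (illustrative, safe to uncomment):
+# print(ConvNet()(torch.randn(2, 1, 28, 28)).shape)  # torch.Size([2, 10])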
+
+
+if __name__ == '__main__':
+
+ # Device configuration
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+ # Hyper parameters
+ num_epochs = 5
+ num_classes = 10
+ batch_size = 100
+ learning_rate = 0.001
+
+ # MNIST dataset
+ train_dataset = torchvision.datasets.MNIST(root='../../data/',
+ train=True,
+ transform=transforms.ToTensor(),
+ download=True)
+
+ test_dataset = torchvision.datasets.MNIST(root='../../data/',
+ train=False,
+ transform=transforms.ToTensor())
+
+ # Data loader
+ train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+ batch_size=batch_size,
+ shuffle=True)
+
+ test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+ batch_size=batch_size,
+ shuffle=False)
+
+ model = ConvNet(num_classes).to(device)
+
+ # Loss and optimizer
+    # The loss function measures how far predictions are from the true labels.
+    # CrossEntropyLoss suits classification: it combines softmax with negative log-likelihood.
+    # Formula: loss = -sum(y_true * log(y_pred)), where y_true is the one-hot true label
+    criterion = nn.CrossEntropyLoss()
+
+    # The optimizer updates the parameters along the loss gradients to minimize the loss.
+    # Adam is an adaptive-learning-rate method that combines the strengths of Momentum and RMSProp.
+    # Arguments:
+    # - model.parameters(): the set of parameters to optimize
+    # - lr: learning rate, controlling the step size of each update
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+ # Train the model
+    total_step = len(train_loader)  # steps per epoch = dataset size / batch size
+    for epoch in range(num_epochs):  # train for num_epochs rounds
+        for i, (images, labels) in enumerate(train_loader):  # iterate over batches
+            images = images.to(device)  # move images to the target device
+            labels = labels.to(device)  # move labels to the target device
+
+            # Forward pass: predict on the input images
+            outputs = model(images)  # outputs shape: [batch_size, num_classes]
+            loss = criterion(outputs, labels)  # loss between predictions and labels
+
+            # Backward pass and optimization
+            optimizer.zero_grad()  # clear stale gradients so they don't accumulate
+            loss.backward()  # compute gradients of all trainable parameters
+            optimizer.step()  # update the weights according to the gradients
+
+ if (i + 1) % 100 == 0:
+ print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+ .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+ # Test the model
+ model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
+ with torch.no_grad():
+ correct = 0
+ total = 0
+ for images, labels in test_loader:
+ images = images.to(device)
+ labels = labels.to(device)
+ outputs = model(images)
+ _, predicted = torch.max(outputs.data, 1)
+ total += labels.size(0)
+ correct += (predicted == labels).sum().item()
+
+ print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+ # Save the model checkpoint
+ torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/convolutional_neural_network/main.py b/tutorials/02-intermediate/convolutional_neural_network/main.py
deleted file mode 100644
index ec904f1f..00000000
--- a/tutorials/02-intermediate/convolutional_neural_network/main.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import torch
-import torch.nn as nn
-import torchvision
-import torchvision.transforms as transforms
-
-
-# Device configuration
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
-# Hyper parameters
-num_epochs = 5
-num_classes = 10
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=False,
- transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
-
-# Convolutional neural network (two convolutional layers)
-class ConvNet(nn.Module):
- def __init__(self, num_classes=10):
- super(ConvNet, self).__init__()
- self.layer1 = nn.Sequential(
- nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
- nn.BatchNorm2d(16),
- nn.ReLU(),
- nn.MaxPool2d(kernel_size=2, stride=2))
- self.layer2 = nn.Sequential(
- nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
- nn.BatchNorm2d(32),
- nn.ReLU(),
- nn.MaxPool2d(kernel_size=2, stride=2))
- self.fc = nn.Linear(7*7*32, num_classes)
-
- def forward(self, x):
- out = self.layer1(x)
- out = self.layer2(out)
- out = out.reshape(out.size(0), -1)
- out = self.fc(out)
- return out
-
-model = ConvNet(num_classes).to(device)
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- images = images.to(device)
- labels = labels.to(device)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.to(device)
- labels = labels.to(device)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
- print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py
new file mode 100644
index 00000000..d9aa9fe4
--- /dev/null
+++ b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py
@@ -0,0 +1,274 @@
+# ---------------------------------------------------------------------------- #
+# A ResNet implementation based on https://arxiv.org/pdf/1512.03385.pdf       #
+# using the model architecture for CIFAR-10 (see section 4.2 of the paper).   #
+# Parts of the code reference the official PyTorch implementation:            #
+# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py  #
+# ---------------------------------------------------------------------------- #
+
+# Imports
+import torch  # PyTorch core
+import torch.nn as nn  # neural network modules
+import torchvision  # vision datasets and models
+import torchvision.transforms as transforms  # image preprocessing
+from torch.cuda.amp import autocast, GradScaler  # mixed-precision training (CUDA only)
+
+# Device configuration: prefer MPS on Apple Silicon, then CUDA, else CPU
+if torch.backends.mps.is_available():
+    device = torch.device('mps')  # Apple Silicon acceleration
+elif torch.cuda.is_available():
+    device = torch.device('cuda')  # NVIDIA GPU acceleration
+else:
+    device = torch.device('cpu')  # CPU training
+print(f"Using device: {device}")
+
+
+# 3x3 convolution helper
+# ResNet uses 3x3 convolutions heavily; this wrapper reduces repetition
+def conv3x3(in_channels, out_channels, stride=1):
+    # - in_channels: number of input channels
+    # - out_channels: number of output channels
+    # - kernel_size: 3x3 kernel
+    # - stride: defaults to 1
+    # - padding: 1, so the spatial size is preserved
+    # - bias: omitted because a batch-norm layer follows
+    return nn.Conv2d(in_channels, out_channels, kernel_size=3,
+                     stride=stride, padding=1, bias=False)
+
+
+# Residual block
+# The core ResNet component; the residual connection eases training of deep networks
+class ResidualBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
+        super(ResidualBlock, self).__init__()
+        # First 3x3 conv: may change the channel count and spatial size
+        self.conv1 = conv3x3(in_channels, out_channels, stride)
+        self.bn1 = nn.BatchNorm2d(out_channels)  # batch norm
+        self.relu = nn.ReLU(inplace=True)  # ReLU; inplace=True saves memory
+        # Second 3x3 conv: keeps channels and size unchanged
+        self.conv2 = conv3x3(out_channels, out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)  # batch norm
+        # Downsample module: used when input/output channels or sizes differ
+        self.downsample = downsample
+
+    def forward(self, x):
+        # Keep the input for the residual (shortcut) connection
+        residual = x
+
+        # Main path: conv -> bn -> relu -> conv -> bn
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        # Shortcut path: downsample if channels or size need adjusting
+        if self.downsample:
+            residual = self.downsample(x)
+
+        # Residual connection: main-path output + shortcut output
+        out += residual
+        out = self.relu(out)  # final activation
+
+ return out
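+
+# In symbols: the block computes out = relu(F(x) + shortcut(x)), where F is the
+# conv-bn-relu-conv-bn main path; when the shapes already match, shortcut is the identity.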
+
+
+# The main ResNet network
+class ResNet(nn.Module):
+    def __init__(self, block, layers, num_classes=10):
+        super(ResNet, self).__init__()
+        # Initial number of input channels
+        self.in_channels = 16
+
+        # Stem: 3x3 conv + batch norm + ReLU
+        # CIFAR-10 input has 3 channels; the output has 16
+        self.conv = conv3x3(3, 16)
+        self.bn = nn.BatchNorm2d(16)
+        self.relu = nn.ReLU(inplace=True)
+
+        # Residual stages:
+        # layer1: 16 channels, layers[0] blocks, stride 1
+        # layer2: 32 channels, layers[1] blocks, stride 2 (halves the size)
+        # layer3: 64 channels, layers[2] blocks, stride 2 (halves the size)
+        self.layer1 = self.make_layer(block, 16, layers[0])
+        self.layer2 = self.make_layer(block, 32, layers[1], 2)
+        self.layer3 = self.make_layer(block, 64, layers[2], 2)
+
+        # Global average pooling: turns 64x8x8 feature maps into 64x1x1
+        self.avg_pool = nn.AvgPool2d(8)
+
+        # Fully connected layer: maps the 64-dim features to 10 classes
+        self.fc = nn.Linear(64, num_classes)
+
+    # Helper that builds one residual stage
+    def make_layer(self, block, out_channels, blocks, stride=1):
+        downsample = None
+
+        # Downsample when the stride is not 1 or the channel counts differ
+        if (stride != 1) or (self.in_channels != out_channels):
+            downsample = nn.Sequential(
+                conv3x3(self.in_channels, out_channels, stride=stride),
+                nn.BatchNorm2d(out_channels)
+            )
+
+        layers = []
+        # First block of the stage (may carry the downsample)
+        layers.append(block(self.in_channels, out_channels, stride, downsample))
+        self.in_channels = out_channels  # update the running channel count
+
+        # Remaining blocks of the stage (no downsampling needed)
+        for i in range(1, blocks):
+            layers.append(block(out_channels, out_channels))
+
+ return nn.Sequential(*layers)
+
+    def forward(self, x):
+        # Input x: [batch_size, 3, 32, 32]
+
+        # Stem convolution
+        out = self.conv(x)  # [batch_size, 16, 32, 32]
+        out = self.bn(out)  # batch norm
+        out = self.relu(out)  # activation
+
+        # Residual stage 1: size unchanged
+        out = self.layer1(out)  # [batch_size, 16, 32, 32]
+
+        # Residual stage 2: size halved
+        out = self.layer2(out)  # [batch_size, 32, 16, 16]
+
+        # Residual stage 3: size halved
+        out = self.layer3(out)  # [batch_size, 64, 8, 8]
+
+        # Global average pooling
+        out = self.avg_pool(out)  # [batch_size, 64, 1, 1]
+
+        # Flatten the features
+        out = out.view(out.size(0), -1)  # [batch_size, 64]
+
+        # Fully connected classifier
+        out = self.fc(out)  # [batch_size, 10]
+
+ return out
+
+# Learning-rate update helper
+def update_lr(optimizer, lr):
+    # Set the new rate on every parameter group held by the optimizer
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
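+
+# Usage sketch: update_lr(optimizer, 1e-4) sets lr=1e-4 on every parameter group.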
+
+if __name__ == '__main__':
+    # Main entry point
+
+    # Hyper-parameters, tuned down for a Mac
+    num_epochs = 20  # fewer epochs (originally 80; adjust as needed)
+    batch_size = 32  # smaller batches to fit Mac memory (originally 100)
+    learning_rate = 0.001  # initial learning rate
+
+    # Image preprocessing: data augmentation improves generalization
+    transform = transforms.Compose([
+        transforms.Pad(4),  # pad by 4 pixels: 32x32 -> 40x40
+        transforms.RandomHorizontalFlip(),  # random horizontal flip
+        transforms.RandomCrop(32),  # randomly crop back to 32x32
+        transforms.ToTensor()  # convert to a tensor
+    ])
+
+    # CIFAR-10 dataset
+    # Training set: with data augmentation
+    train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                 train=True,  # training split
+                                                 transform=transform,  # apply augmentation
+                                                 download=True)  # download automatically
+
+    # Test set: only converted to tensors, no augmentation
+    test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                train=False,  # test split
+                                                transform=transforms.ToTensor())
+
+    # Data loaders, tuned for throughput
+    # Training loader: multi-worker loading + pinned memory
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,
+                                               shuffle=True,  # shuffle the data
+                                               num_workers=4,  # worker processes (tune to CPU cores)
+                                               pin_memory=True,  # pinned memory speeds up host-to-device copies
+                                               persistent_workers=True)  # keep workers alive between epochs
+
+    # Test loader: multi-worker loading + pinned memory
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False,
+                                              num_workers=2,
+                                              pin_memory=True)
+
+    # Initialize the ResNet model
+    # ResidualBlock is the basic unit; [2, 2, 2] means two blocks per stage.
+    # Note this is not ResNet-18: counting the stem conv (1), three stages of
+    # 2 blocks x 2 convs each (12), and the final FC (1) gives a 14-layer net.
+    model = ResNet(ResidualBlock, [2, 2, 2]).to(device)
+    # Loss and optimizer
+    # Cross-entropy loss: suited to classification, softmax included
+    criterion = nn.CrossEntropyLoss()
+
+    # Adam: an adaptive-learning-rate optimizer with fast convergence
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Mixed-precision setup: torch.cuda.amp only supports CUDA, so enable the
+    # scaler on CUDA alone and fall back to full precision elsewhere (incl. MPS)
+    scaler = GradScaler(enabled=(device.type == 'cuda'))
+
+    # Train the model
+    total_step = len(train_loader)  # steps per epoch
+    curr_lr = learning_rate  # current learning rate
+
+    for epoch in range(num_epochs):  # loop over epochs
+        model.train()  # make sure the model is in training mode
+        for i, (images, labels) in enumerate(train_loader):  # loop over batches
+            # Move the data to the target device
+            images = images.to(device, non_blocking=True)  # non-blocking copy
+            labels = labels.to(device, non_blocking=True)  # non-blocking copy
+
+            # Forward pass (mixed precision on CUDA only)
+            with autocast(enabled=(device.type == 'cuda')):
+                outputs = model(images)
+                loss = criterion(outputs, labels)
+
+            # Backward pass and optimization (mixed precision)
+            optimizer.zero_grad(set_to_none=True)  # cheaper gradient reset
+            scaler.scale(loss).backward()  # scale the loss, then backpropagate
+            scaler.step(optimizer)  # update the parameters
+            scaler.update()  # update the loss scale
+
+            # Print the loss every 100 steps
+            if (i + 1) % 100 == 0:
+                print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+        # Decay the learning rate every 20 epochs
+        if (epoch + 1) % 20 == 0:
+            curr_lr /= 3  # divide the learning rate by 3
+            update_lr(optimizer, curr_lr)  # push the new rate into the optimizer
+    # Test the model
+    model.eval()  # evaluation mode (batch norm uses running statistics)
+
+    with torch.no_grad():  # no gradients: saves memory and compute
+        correct = 0  # number of correct predictions
+        total = 0  # total number of samples
+
+        for images, labels in test_loader:  # iterate over the test set
+            images = images.to(device)
+            labels = labels.to(device)
+
+            outputs = model(images)  # predict
+            _, predicted = torch.max(outputs.data, 1)  # predicted class
+
+            total += labels.size(0)  # update the sample count
+            correct += (predicted == labels).sum().item()  # update the correct count
+
+        # Compute and print the accuracy
+        print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))
+
+    # Save the model parameters to a checkpoint file
+    torch.save(model.state_dict(), 'resnet.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/deep_residual_network/main.py b/tutorials/02-intermediate/deep_residual_network/main.py
deleted file mode 100644
index 69dbe5fb..00000000
--- a/tutorials/02-intermediate/deep_residual_network/main.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# ---------------------------------------------------------------------------- #
-# An implementation of https://arxiv.org/pdf/1512.03385.pdf #
-# See section 4.2 for the model architecture on CIFAR-10 #
-# Some part of the code was referenced from below #
-# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py #
-# ---------------------------------------------------------------------------- #
-
-import torch
-import torch.nn as nn
-import torchvision
-import torchvision.transforms as transforms
-
-
-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-# Hyper-parameters
-num_epochs = 80
-batch_size = 100
-learning_rate = 0.001
-
-# Image preprocessing modules
-transform = transforms.Compose([
- transforms.Pad(4),
- transforms.RandomHorizontalFlip(),
- transforms.RandomCrop(32),
- transforms.ToTensor()])
-
-# CIFAR-10 dataset
-train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
- train=True,
- transform=transform,
- download=True)
-
-test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
- train=False,
- transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
-
-# 3x3 convolution
-def conv3x3(in_channels, out_channels, stride=1):
- return nn.Conv2d(in_channels, out_channels, kernel_size=3,
- stride=stride, padding=1, bias=False)
-
-# Residual block
-class ResidualBlock(nn.Module):
- def __init__(self, in_channels, out_channels, stride=1, downsample=None):
- super(ResidualBlock, self).__init__()
- self.conv1 = conv3x3(in_channels, out_channels, stride)
- self.bn1 = nn.BatchNorm2d(out_channels)
- self.relu = nn.ReLU(inplace=True)
- self.conv2 = conv3x3(out_channels, out_channels)
- self.bn2 = nn.BatchNorm2d(out_channels)
- self.downsample = downsample
-
- def forward(self, x):
- residual = x
- out = self.conv1(x)
- out = self.bn1(out)
- out = self.relu(out)
- out = self.conv2(out)
- out = self.bn2(out)
- if self.downsample:
- residual = self.downsample(x)
- out += residual
- out = self.relu(out)
- return out
-
-# ResNet
-class ResNet(nn.Module):
- def __init__(self, block, layers, num_classes=10):
- super(ResNet, self).__init__()
- self.in_channels = 16
- self.conv = conv3x3(3, 16)
- self.bn = nn.BatchNorm2d(16)
- self.relu = nn.ReLU(inplace=True)
- self.layer1 = self.make_layer(block, 16, layers[0])
- self.layer2 = self.make_layer(block, 32, layers[1], 2)
- self.layer3 = self.make_layer(block, 64, layers[2], 2)
- self.avg_pool = nn.AvgPool2d(8)
- self.fc = nn.Linear(64, num_classes)
-
- def make_layer(self, block, out_channels, blocks, stride=1):
- downsample = None
- if (stride != 1) or (self.in_channels != out_channels):
- downsample = nn.Sequential(
- conv3x3(self.in_channels, out_channels, stride=stride),
- nn.BatchNorm2d(out_channels))
- layers = []
- layers.append(block(self.in_channels, out_channels, stride, downsample))
- self.in_channels = out_channels
- for i in range(1, blocks):
- layers.append(block(out_channels, out_channels))
- return nn.Sequential(*layers)
-
- def forward(self, x):
- out = self.conv(x)
- out = self.bn(out)
- out = self.relu(out)
- out = self.layer1(out)
- out = self.layer2(out)
- out = self.layer3(out)
- out = self.avg_pool(out)
- out = out.view(out.size(0), -1)
- out = self.fc(out)
- return out
-
-model = ResNet(ResidualBlock, [2, 2, 2]).to(device)
-
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# For updating learning rate
-def update_lr(optimizer, lr):
- for param_group in optimizer.param_groups:
- param_group['lr'] = lr
-
-# Train the model
-total_step = len(train_loader)
-curr_lr = learning_rate
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- images = images.to(device)
- labels = labels.to(device)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
- # Decay learning rate
- if (epoch+1) % 20 == 0:
- curr_lr /= 3
- update_lr(optimizer, curr_lr)
-
-# Test the model
-model.eval()
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.to(device)
- labels = labels.to(device)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
- print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'resnet.ckpt')
diff --git a/tutorials/02-intermediate/language_model/main.py b/tutorials/02-intermediate/language_model/language_model.py
similarity index 79%
rename from tutorials/02-intermediate/language_model/main.py
rename to tutorials/02-intermediate/language_model/language_model.py
index ef135bb7..e534a71d 100644
--- a/tutorials/02-intermediate/language_model/main.py
+++ b/tutorials/02-intermediate/language_model/language_model.py
@@ -8,7 +8,9 @@
# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Support CUDA, MPS, and CPU devices
+device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
# Hyper-parameters
embed_size = 128
@@ -31,22 +33,22 @@
class RNNLM(nn.Module):
def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
super(RNNLM, self).__init__()
- self.embed = nn.Embedding(vocab_size, embed_size)
- self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
- self.linear = nn.Linear(hidden_size, vocab_size)
-
+ self.embed = nn.Embedding(vocab_size, embed_size) # word embedding layer
+ self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) # LSTM layer
+ self.linear = nn.Linear(hidden_size, vocab_size) # output layer
+
def forward(self, x, h):
- # Embed word ids to vectors
- x = self.embed(x)
-
- # Forward propagate LSTM
- out, (h, c) = self.lstm(x, h)
-
- # Reshape output to (batch_size*sequence_length, hidden_size)
- out = out.reshape(out.size(0)*out.size(1), out.size(2))
-
- # Decode hidden states of all time steps
- out = self.linear(out)
+ # Embed word IDs into dense vector representations
+ x = self.embed(x) # [batch_size, seq_length, embed_size]
+
+ # Forward propagate the LSTM
+ out, (h, c) = self.lstm(x, h) # out: [batch_size, seq_length, hidden_size]
+
+ # Reshape the output to match the fully connected layer's input format
+ out = out.reshape(out.size(0) * out.size(1), out.size(2)) # [batch_size*seq_length, hidden_size]
+
+ # Predict the probability distribution over the next word
+ out = self.linear(out) # [batch_size*seq_length, vocab_size]
return out, (h, c)
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)
diff --git a/tutorials/02-intermediate/recurrent_neural_network/main.py b/tutorials/02-intermediate/recurrent_neural_network/main.py
deleted file mode 100644
index c138c5ad..00000000
--- a/tutorials/02-intermediate/recurrent_neural_network/main.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import torch
-import torch.nn as nn
-import torchvision
-import torchvision.transforms as transforms
-
-
-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-# Hyper-parameters
-sequence_length = 28
-input_size = 28
-hidden_size = 128
-num_layers = 2
-num_classes = 10
-batch_size = 100
-num_epochs = 2
-learning_rate = 0.01
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=True,
- transform=transforms.ToTensor(),
- download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data/',
- train=False,
- transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
- batch_size=batch_size,
- shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False)
-
-# Recurrent neural network (many-to-one)
-class RNN(nn.Module):
- def __init__(self, input_size, hidden_size, num_layers, num_classes):
- super(RNN, self).__init__()
- self.hidden_size = hidden_size
- self.num_layers = num_layers
- self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
- self.fc = nn.Linear(hidden_size, num_classes)
-
- def forward(self, x):
- # Set initial hidden and cell states
- h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
- c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
-
- # Forward propagate LSTM
- out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size)
-
- # Decode the hidden state of the last time step
- out = self.fc(out[:, -1, :])
- return out
-
-model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
-
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
- for i, (images, labels) in enumerate(train_loader):
- images = images.reshape(-1, sequence_length, input_size).to(device)
- labels = labels.to(device)
-
- # Forward pass
- outputs = model(images)
- loss = criterion(outputs, labels)
-
- # Backward and optimize
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if (i+1) % 100 == 0:
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
- .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-model.eval()
-with torch.no_grad():
- correct = 0
- total = 0
- for images, labels in test_loader:
- images = images.reshape(-1, sequence_length, input_size).to(device)
- labels = labels.to(device)
- outputs = model(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
- print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
diff --git a/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py
new file mode 100644
index 00000000..7f0300fc
--- /dev/null
+++ b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py
@@ -0,0 +1,186 @@
+# Import the PyTorch core libraries
+import torch
+import torch.nn as nn
+# Import TorchVision for data loading and preprocessing
+import torchvision
+import torchvision.transforms as transforms
+
+# Device configuration: automatically select a GPU if available, otherwise use the CPU
+# MPS is the Apple Silicon GPU backend, so we check for MPS support here as well
+device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
+
+"""
+RNN(循环神经网络)的用途与应用场景
+RNN(Recurrent Neural Network,循环神经网络)是一类专门用于处理序列数据的深度学习模型,
+其核心特点是能够记忆之前的信息并用于当前决策,这使得它在各种需要处理时序依赖关系的任务中表现出色。
+RNN的核心特性
+RNN通过在网络中引入循环连接,使模型能够:
+
+处理任意长度的序列数据
+捕捉序列中的时间依赖关系
+保留序列的上下文信息
+
+RNN的主要应用场景
+1. 自然语言处理(NLP)
+文本分类:情感分析、垃圾邮件检测、新闻分类
+语言建模:预测下一个词的概率分布
+机器翻译:将一种语言翻译成另一种语言
+命名实体识别:识别文本中的人名、地名、组织名等
+文本生成:自动生成文章、诗歌、对话等
+2. 时间序列预测
+股票价格预测:基于历史价格预测未来走势
+天气预报:基于气象数据预测未来天气
+电力负荷预测:预测未来电力需求
+销售预测:预测产品未来销量
+3. 语音处理
+语音识别:将语音转换为文本
+语音合成:将文本转换为语音
+说话人识别:识别说话人的身份
+4. 图像与视频分析
+图像描述生成:为图像生成文字描述
+视频分析:行为识别、动作检测
+手写体识别:如代码示例中的MNIST数字分类
+"""
+# Define the RNN model class, inheriting from nn.Module
+class RNN(nn.Module):
+ def __init__(self, input_size, hidden_size, num_layers, num_classes):
+ """
+ RNN模型初始化函数
+ :param input_size: 输入特征维度 (MNIST图像的每行像素数:28)
+ :param hidden_size: 隐藏层维度 (LSTM单元的隐藏状态大小:128)
+ :param num_layers: LSTM层数 (2层)
+ :param num_classes: 分类数量 (MNIST有10个数字类别)
+ """
+ super(RNN, self).__init__()
+ self.hidden_size = hidden_size # hidden state size
+ self.num_layers = num_layers # number of LSTM layers
+
+ # Define the LSTM layer:
+ # - input_size: input feature dimension
+ # - hidden_size: hidden state dimension
+ # - num_layers: number of stacked LSTM layers
+ # - batch_first=True: inputs/outputs are shaped (batch_size, seq_length, feature_size)
+ self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
+
+ # Fully connected layer: maps the LSTM output to class scores
+ self.fc = nn.Linear(hidden_size, num_classes)
+
+ def forward(self, x):
+ """
+ 前向传播函数
+ :param x: 输入张量,形状为(batch_size, sequence_length, input_size)
+ :return: 输出张量,形状为(batch_size, num_classes)
+ """
+ # 初始化LSTM的隐藏状态h0和细胞状态c0
+ # 形状:(num_layers, batch_size, hidden_size)
+ h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
+ c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
+
+ # 前向传播LSTM
+ # out: LSTM的输出,形状为(batch_size, seq_length, hidden_size)
+ # _: 包含最终隐藏状态和细胞状态的元组(这里未使用)
+ out, _ = self.lstm(x, (h0, c0))
+
+ # 解码最后一个时间步的隐藏状态用于分类
+ # out[:, -1, :] 表示取所有样本的最后一个时间步的隐藏状态
+ out = self.fc(out[:, -1, :])
+ return out
+
+
+if __name__ == '__main__':
+ # Hyper-parameters
+ sequence_length = 28 # sequence length (number of MNIST image rows: 28)
+ input_size = 28 # input feature dimension (number of MNIST image columns: 28)
+ hidden_size = 128 # hidden state dimension
+ num_layers = 2 # number of LSTM layers
+ num_classes = 10 # number of classes (digits 0-9)
+ batch_size = 100 # batch size
+ num_epochs = 2 # number of training epochs
+ learning_rate = 0.01 # learning rate
+
+ # Load the MNIST dataset
+ # Training set
+ train_dataset = torchvision.datasets.MNIST(
+ root='../../data/', # where to store the dataset
+ train=True, # training split
+ transform=transforms.ToTensor(), # convert to a Tensor scaled to [0, 1]
+ download=True # download automatically if not present locally
+ )
+
+ # Test set
+ test_dataset = torchvision.datasets.MNIST(
+ root='../../data/',
+ train=False, # test split
+ transform=transforms.ToTensor()
+ )
+
+ # Data loaders
+ train_loader = torch.utils.data.DataLoader(
+ dataset=train_dataset,
+ batch_size=batch_size,
+ shuffle=True # shuffle the training data
+ )
+
+ test_loader = torch.utils.data.DataLoader(
+ dataset=test_dataset,
+ batch_size=batch_size,
+ shuffle=False # keep the test data in order
+ )
+
+ # Instantiate the RNN model (many-to-one: many time steps in, one output)
+ model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
+
+ # Loss function and optimizer
+ # Cross-entropy loss: suited to multi-class classification
+ criterion = nn.CrossEntropyLoss()
+ # Adam optimizer: an adaptive learning-rate algorithm
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+ # Train the model
+ total_step = len(train_loader) # steps per epoch
+ for epoch in range(num_epochs): # iterate over epochs
+ for i, (images, labels) in enumerate(train_loader): # iterate over batches
+ # Reshape the images into sequences:
+ # original MNIST shape: (batch_size, 1, 28, 28)
+ # reshaped: (batch_size, sequence_length=28, input_size=28)
+ # i.e. treat each 28x28 image as 28 time steps of 28 pixels each
+ images = images.reshape(-1, sequence_length, input_size).to(device)
+ labels = labels.to(device) # move the labels to the device
+
+ # Forward pass
+ outputs = model(images) # model predictions
+ loss = criterion(outputs, labels) # compute the loss
+
+ # Backward pass and optimization
+ optimizer.zero_grad() # clear accumulated gradients
+ loss.backward() # backpropagate to compute gradients
+ optimizer.step() # update the parameters
+
+ # Print training progress
+ if (i + 1) % 100 == 0:
+ print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+ .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+ # Test the model
+ model.eval() # switch to evaluation mode (disables dropout, etc.)
+ with torch.no_grad(): # disable gradient tracking to save memory and compute
+ correct = 0 # number of correct predictions
+ total = 0 # total number of samples
+ for images, labels in test_loader: # iterate over the test set
+ # Reshape the images and move them to the device
+ images = images.reshape(-1, sequence_length, input_size).to(device)
+ labels = labels.to(device)
+ outputs = model(images) # model predictions
+
+ # torch.max returns (max values, indices); the index is the predicted class
+ _, predicted = torch.max(outputs.data, 1)
+
+ total += labels.size(0) # update the total sample count
+ correct += (predicted == labels).sum().item() # update the correct count
+
+ # Print the test accuracy
+ print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+ # Save the model weights
+ torch.save(model.state_dict(), 'model.ckpt')
+ print("Model weights saved to 'model.ckpt'")
\ No newline at end of file
diff --git a/tutorials/03-advanced/generative_adversarial_network/main.py b/tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py
similarity index 100%
rename from tutorials/03-advanced/generative_adversarial_network/main.py
rename to tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py
diff --git a/tutorials/compilers/Inductor_demo.py b/tutorials/compilers/Inductor_demo.py
new file mode 100644
index 00000000..1fbff4a1
--- /dev/null
+++ b/tutorials/compilers/Inductor_demo.py
@@ -0,0 +1,15 @@
+import torch
+
+def foo1(x1, x2):
+ a = torch.neg(x1)
+ b = torch.maximum(x2, a)
+ y = torch.cat([b], dim=0)
+ return y
+
+
+# TORCH_COMPILE_DEBUG=1 python xx.py
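+# (TORCH_COMPILE_DEBUG=1 makes torch.compile dump its intermediate artifacts,
+# including the traced FX graphs and the code Inductor generates, into a local
+# torch_compile_debug/ directory for inspection.)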
+if __name__ == '__main__':
+ x1 = torch.randint(256, (1, 8), dtype=torch.uint8)
+ x2 = torch.randint(256, (8390, 8), dtype=torch.uint8)
+ compiled_foo1 = torch.compile(foo1)
+ result = compiled_foo1(x1, x2)
diff --git a/tutorials/compilers/basic_demo.py b/tutorials/compilers/basic_demo.py
new file mode 100644
index 00000000..e9fd7d2b
--- /dev/null
+++ b/tutorials/compilers/basic_demo.py
@@ -0,0 +1,203 @@
+import torch
+
+# def foo(x, y):
+# a = torch.sin(x)
+# b = torch.cos(y)
+# return a + b
+#
+#
+# opt_foo1 = torch.compile(foo)
+# print(opt_foo1(torch.randn(3, 3), torch.randn(3, 3)))
+#
+#
+# @torch.compile
+# def opt_foo2(x, y):
+# a = torch.sin(x)
+# b = torch.cos(y)
+# return a + b
+#
+#
+# print(opt_foo2(torch.randn(3, 3), torch.randn(3, 3)))
+#
+# def inner(x):
+# return torch.sin(x)
+#
+#
+# @torch.compile
+# def outer(x, y):
+# a = inner(x)
+# b = torch.cos(y)
+# return a + b
+#
+#
+# print(outer(torch.randn(3, 3), torch.randn(3, 3)))
+#
+
+# t = torch.randn(10, 100)
+#
+#
+# class MyModule(torch.nn.Module):
+# def __init__(self):
+# super().__init__()
+# self.lin = torch.nn.Linear(3, 3)
+#
+# def forward(self, x):
+# return torch.nn.functional.relu(self.lin(x))
+#
+#
+# mod1 = MyModule()
+# mod1.compile()
+# print(mod1(torch.randn(3, 3)))
+#
+# mod2 = MyModule()
+# mod2 = torch.compile(mod2)
+# print(mod2(torch.randn(3, 3)))
+
+
+# Demonstrating speedups
+
+def foo3(x):
+ y = x + 1
+ z = torch.nn.functional.relu(y)
+ u = z * 2
+ return u
+
+
+# Returns the result of running `fn()` and the time it took for `fn()` to run,
+# in seconds. We use CUDA events and synchronization for the most accurate
+# measurements.
+def timed(fn):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ result = fn()
+ end.record()
+ torch.cuda.synchronize()
+ return result, start.elapsed_time(end) / 1000
+
+
+opt_foo3 = torch.compile(foo3)
+inp = torch.randn(4096, 4096).cuda()
+
+
+def first_run():
+ torch._logging.set_logs(graph_code=True)
+ """
+ 请注意,torch.compile 似乎比即时执行要花费长得多的时间才能完成。这是因为 torch.compile 在最初几次执行时需要额外时间来编译模型。
+ torch.compile 会尽可能重用已编译的代码,因此如果我们多运行几次优化后的模型,应该会看到与即时执行相比有显著的性能提升。
+ """
+ print("compile:", timed(lambda: opt_foo3(inp))[1])
+ print("eager:", timed(lambda: foo3(inp))[1])
+
+
+"""
+eager time 0: 0.027955583572387695
+eager time 1: 0.0004986880123615265
+eager time 2: 0.00045683199167251585
+eager time 3: 0.00045158401131629945
+eager time 4: 0.00045363199710845946
+eager time 5: 0.00045363199710845946
+eager time 6: 0.0004556800127029419
+eager time 7: 0.0004505600035190582
+eager time 8: 0.00045043200254440307
+eager time 9: 0.0004546560049057007
+~~~~~~~~~~
+compile time 0: 0.434231201171875
+compile time 1: 0.00026624000072479246
+compile time 2: 0.00023552000522613525
+compile time 3: 0.0002234240025281906
+compile time 4: 0.00021913599967956544
+compile time 5: 0.00022220799326896668
+compile time 6: 0.0002181120067834854
+compile time 7: 0.0002242559939622879
+compile time 8: 0.0002181120067834854
+compile time 9: 0.00022118400037288665
+~~~~~~~~~~
+(eval) eager median: 0.0004541440010070801, compile median: 0.00022281599789857864, speedup: 2.038201948200314x
+"""
+
+
+def many_runs():
+ # turn off logging for now to prevent spam
+ torch._logging.set_logs(graph_code=False)
+ eager_times = []
+ for i in range(10):
+ _, eager_time = timed(lambda: foo3(inp))
+ eager_times.append(eager_time)
+ print(f"eager time {i}: {eager_time}")
+ print("~" * 10)
+
+ compile_times = []
+ for i in range(10):
+ _, compile_time = timed(lambda: opt_foo3(inp))
+ compile_times.append(compile_time)
+ print(f"compile time {i}: {compile_time}")
+ print("~" * 10)
+
+ import numpy as np
+
+ eager_med = np.median(eager_times)
+ compile_med = np.median(compile_times)
+ speedup = eager_med / compile_med
+ assert speedup > 1
+ print(
+ f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+ )
+ print("~" * 10)
+
+
+def bar1(a, b):
+ x = a / (torch.abs(a) + 1)
+ if b.sum() < 0:
+ b = b * -1
+ return x * b
+
+
+def bar(a, b):
+ x = a / (torch.abs(a) + 1)
+ b = torch.where(b.sum() < 0, -b, b)
+ return x * b
+
+
+# Graph Breaks 图中断
+"""“图中断”这一术语源于torch.compile尝试捕获并优化PyTorch操作图这一事实。当遇到不支持的Python代码时,这个图就必须被“中断”。
+图中断会导致优化机会的损失,这可能仍然不尽如人意,但总比出现无声的错误或硬崩溃要好。"""
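+
+# A quick way to see the break in `bar1` above is torch._dynamo.explain (a sketch;
+# the exact report format varies across PyTorch versions):
+def explain_graph_breaks_demo():
+    explanation = torch._dynamo.explain(bar1)(torch.randn(10), torch.randn(10))
+    # The report lists the captured graphs and the reason the data-dependent
+    # `if b.sum() < 0` forced a break.
+    print(explanation)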
+
+
+
+@torch.compile(fullgraph=True)
+def bar_fixed(a, b):
+ x = a / (torch.abs(a) + 1)
+
+ def true_branch(y):
+ return y * -1
+
+ def false_branch(y):
+ # NOTE: torch.cond doesn't allow aliased outputs
+ return y.clone()
+
+ b = torch.cond(b.sum() < 0, true_branch, false_branch, (b,))
+ return x * b
+
+
+def graph_breaks_fixed_demo():
+ torch._logging.set_logs(graph_code=True)
+ inp1 = torch.ones(10)
+ inp2 = torch.ones(10)
+ fixed = bar_fixed(inp1, inp2)
+ fixed1 = bar_fixed(inp1, -inp2)
+ print(f"fixed: {fixed}")
+ print(f"fixed1: {fixed1}")
+
+
+def graph_breaks_demo():
+ torch._logging.set_logs(graph_code=True)
+ opt_bar = torch.compile(bar)
+ inp1 = torch.ones(10)
+ inp2 = torch.ones(10)
+ opt_bar(inp1, inp2)
+ opt_bar(inp1, -inp2)
+
+
+if __name__ == '__main__':
+ graph_breaks_fixed_demo()
diff --git a/tutorials/compilers/benchmark_time.py b/tutorials/compilers/benchmark_time.py
new file mode 100644
index 00000000..53e9a642
--- /dev/null
+++ b/tutorials/compilers/benchmark_time.py
@@ -0,0 +1,68 @@
+import pickle
+import re
+
+import torch
+from torch.utils.benchmark import CallgrindStats, Language, Timer
+
+
+def cudaclu_timer():
+ cpp_timer = Timer(
+ "x * y;",
+ """
+ auto x = torch::ones({128});
+ auto y = torch::ones({128});
+ """,
+ language=Language.CPP,
+ )
+
+ print(cpp_timer.blocked_autorange(min_run_time=1))
+
+
+# A/B testing with Callgrind
+"""
+Instruction counts are most useful in that they allow fine-grained comparison of
+computations, which is essential when analyzing performance. To demonstrate this in
+practice, let's compare multiplying two size-128 Tensors with a {128} x {1}
+multiplication, where the second Tensor is broadcast.
+"""
+
+
+def call_grind_timer():
+ broadcasting_stats = Timer(
+ "x * y;",
+ """
+ auto x = torch::ones({128});
+ auto y = torch::ones({1});
+ """,
+ language=Language.CPP,
+ ).collect_callgrind().as_standardized().stats(inclusive=False)
+ # Let's round trip `broadcasting_stats` just to show that we can.
+ broadcasting_stats = pickle.loads(pickle.dumps(broadcasting_stats))
+
+ cpp_timer = Timer(
+ "x * y;",
+ """
+ auto x = torch::ones({128});
+ auto y = torch::ones({128});
+ """,
+ language=Language.CPP,
+ )
+
+ print(cpp_timer.blocked_autorange(min_run_time=1))
+ stats: CallgrindStats = cpp_timer.collect_callgrind()
+ inclusive_stats = stats.as_standardized().stats(inclusive=False)
+ print(inclusive_stats[:10])
+ # And now to diff the two tasks:
+ delta = broadcasting_stats - inclusive_stats
+
+ def extract_fn_name(fn: str):
+ """Trim everything except the function name."""
+ fn = ":".join(fn.split(":")[1:])
+ return re.sub(r"\(.+\)", "(...)", fn)
+
+ # We use `.transform` to make the diff readable:
+ print(delta.transform(extract_fn_name))
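+
+# Note: collect_callgrind() runs the workload under Valgrind, so it requires
+# valgrind to be installed and is far slower than wall-clock timing; the payoff
+# is deterministic instruction counts.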
+
+
+if __name__ == '__main__':
+ cudaclu_timer()
diff --git a/tutorials/compilers/compiled_autograd_demo.py b/tutorials/compilers/compiled_autograd_demo.py
new file mode 100644
index 00000000..49e90b36
--- /dev/null
+++ b/tutorials/compilers/compiled_autograd_demo.py
@@ -0,0 +1,58 @@
+import torch
+
+
+class Model(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.linear = torch.nn.Linear(10, 10)
+
+ def forward(self, x):
+ return self.linear(x)
+
+
+"""
+Python 解释器调用 Dynamo,因为此调用被装饰了 @torch.compile。
+Dynamo 拦截 Python 字节码,模拟其执行并将操作记录到图中。
+AOTDispatcher 禁用钩子,并调用自动梯度引擎来计算 model.linear.weight 和
+model.linear.bias 的梯度,并将操作记录到图中。使用 torch.autograd.Function,AOTDispatcher 重写了 train 的前向和反向传播实现。
+Inductor 生成一个对应于 AOTDispatcher 前向和反向传播优化实现的函数。
+Dynamo 设置优化后的函数,以便 Python 解释器接下来进行评估。
+Python 解释器执行优化后的函数,该函数执行 loss = model(x).sum()。
+Python 解释器执行 loss.backward(),调用自动梯度引擎,该引擎会路由到已编译的自动梯度引擎,因为我们将 torch._dynamo.config.compiled_autograd = True 设置为 True。
+已编译的自动梯度计算 model.linear.weight 和 model.linear.bias 的梯度,并将操作记录到图中,包括它遇到的任何钩子。
+在此过程中,它将记录 AOTDispatcher 之前重写的反向传播。然后,已编译的自动梯度生成一个新函数,该函数对应于 loss.backward()
+的完全跟踪实现,并以推理模式使用 torch.compile 执行它。
+相同的步骤将递归应用于已编译的自动梯度图,但这次 AOTDispatcher 将不需要划分图。
+
+
+"""
+
+
+def train_demo():
+ model = Model()
+ x = torch.randn(10)
+ torch._dynamo.config.compiled_autograd = True
+ @torch.compile
+ def train(model, x):
+ loss = model(x).sum()
+ loss.backward()
+ train(model, x)
+
+def train_demo1():
+ # Alternative API: compile the forward and the backward separately instead of
+ # wrapping the whole training step in a compiled `train` function.
+ model = Model()
+ x = torch.randn(10)
+ model = torch.compile(model)
+ loss = model(x).sum()
+ torch._dynamo.config.compiled_autograd = True
+ torch.compile(lambda: loss.backward(), fullgraph=True)()
+# TORCH_LOGS="compiled_autograd" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+# TORCH_LOGS="compiled_autograd_verbose" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+if __name__ == '__main__':
+ train_demo1()
diff --git a/tutorials/compilers/custom_onnxscript_demo.py b/tutorials/compilers/custom_onnxscript_demo.py
new file mode 100644
index 00000000..d73705e6
--- /dev/null
+++ b/tutorials/compilers/custom_onnxscript_demo.py
@@ -0,0 +1,43 @@
+import torch
+import onnxscript
+
+# Opset 18 is the standard supported version as of PyTorch 2.6
+from onnxscript import opset18 as op
+class GeluModel(torch.nn.Module):
+ def forward(self, input_x):
+ return torch.ops.aten.gelu(input_x)
+
+
+# Create a namespace for the custom operator using ONNX Script
+# ``com.microsoft`` is an official ONNX Runtime namespace
+microsoft_op = onnxscript.values.Opset(domain="com.microsoft", version=1)
+
+# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator.
+# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml
+# NOTE: All attributes must be annotated with type hints.
+# The function must be scripted using the ``@onnxscript.script()`` decorator when
+# using operators from custom domains. This may be improved in future versions.
+from onnxscript import FLOAT
+
+
+@onnxscript.script(microsoft_op)
+def custom_aten_gelu(self: FLOAT, approximate: str = "none") -> FLOAT:
+ return microsoft_op.Gelu(self)
+x = torch.tensor([1.0])
+
+onnx_program = torch.onnx.export(
+ GeluModel().eval(),
+ (x,),
+ dynamo=True,
+ custom_translation_table={
+ torch.ops.aten.gelu.default: custom_aten_gelu,
+ },
+)
+
+# Optimize the ONNX graph to remove redundant nodes
+onnx_program.optimize()
+print(onnx_program.model)
+
+result = onnx_program(x)[0]
+print(f"Result: {result}")
+torch.testing.assert_close(result, torch.ops.aten.gelu(x))
\ No newline at end of file
diff --git a/tutorials/compilers/fx_demo.py b/tutorials/compilers/fx_demo.py
new file mode 100644
index 00000000..60194dae
--- /dev/null
+++ b/tutorials/compilers/fx_demo.py
@@ -0,0 +1,119 @@
+import torch
+import torch.fx
+import torchvision.models as models
+import statistics
+import tabulate
+import time
+from typing import Any, Dict, List
+from torch.fx import Interpreter
+
+
+# A profiling interpreter built on torch.fx.Interpreter
+class ProfilingInterpreter(Interpreter):
+ def __init__(self, mod: torch.nn.Module):
+ # Rather than have the user symbolically trace their model,
+ # we're going to do it in the constructor. As a result, the
+ # user can pass in any ``Module`` without having to worry about
+ # symbolic tracing APIs
+ gm = torch.fx.symbolic_trace(mod)
+ super().__init__(gm)
+
+ # We are going to store away two things here:
+ #
+ # 1. A list of total runtimes for ``mod``. In other words, we are
+ # storing away the time ``mod(...)`` took each time this
+ # interpreter is called.
+ self.total_runtime_sec: List[float] = []
+ # 2. A map from ``Node`` to a list of times (in seconds) that
+ # node took to run. This can be seen as similar to (1) but
+ # for specific sub-parts of the model.
+ self.runtimes_sec: Dict[torch.fx.Node, List[float]] = {}
+
+ ######################################################################
+ # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run``
+ # method is the top-level entry point for execution of the model. We will
+ # want to intercept this so that we can record the total runtime of the
+ # model.
+
+ def run(self, *args) -> Any:
+ # Record the time we started running the model
+ t_start = time.time()
+ # Run the model by delegating back into Interpreter.run()
+ return_val = super().run(*args)
+ # Record the time we finished running the model
+ t_end = time.time()
+ # Store the total elapsed time this model execution took in the
+ # ``ProfilingInterpreter``
+ self.total_runtime_sec.append(t_end - t_start)
+ return return_val
+
+ ######################################################################
+ # Now, let's override ``run_node``. ``Interpreter`` calls ``run_node`` each
+ # time it executes a single node. We will intercept this so that we
+ # can measure and record the time taken for each individual call in
+ # the model.
+
+ def run_node(self, n: torch.fx.Node) -> Any:
+ # Record the time we started running the op
+ t_start = time.time()
+ # Run the op by delegating back into Interpreter.run_node()
+ return_val = super().run_node(n)
+ # Record the time we finished running the op
+ t_end = time.time()
+ # If we don't have an entry for this node in our runtimes_sec
+ # data structure, add one with an empty list value.
+ self.runtimes_sec.setdefault(n, [])
+ # Record the total elapsed time for this single invocation
+ # in the runtimes_sec data structure
+ self.runtimes_sec[n].append(t_end - t_start)
+ return return_val
+
+ ######################################################################
+ # Finally, we are going to define a method (one which doesn't override
+ # any ``Interpreter`` method) that provides us a nice, organized view of
+ # the data we have collected.
+
+ def summary(self, should_sort: bool = False) -> str:
+ # Build up a list of summary information for each node
+ node_summaries: List[List[Any]] = []
+ # Calculate the mean runtime for the whole network. Because the
+ # network may have been called multiple times during profiling,
+ # we need to summarize the runtimes. We choose to use the
+ # arithmetic mean for this.
+ mean_total_runtime = statistics.mean(self.total_runtime_sec)
+
+ # For each node, record summary statistics
+ for node, runtimes in self.runtimes_sec.items():
+ # Similarly, compute the mean runtime for ``node``
+ mean_runtime = statistics.mean(runtimes)
+ # For easier understanding, we also compute the percentage
+ # time each node took with respect to the whole network.
+ pct_total = mean_runtime / mean_total_runtime * 100
+ # Record the node's type, name of the node, mean runtime, and
+ # percent runtime.
+ node_summaries.append(
+ [node.op, str(node), mean_runtime, pct_total])
+
+ # One of the most important questions to answer when doing performance
+ # profiling is "Which op(s) took the longest?". We can make this easy
+ # to see by providing sorting functionality in our summary view
+ if should_sort:
+ node_summaries.sort(key=lambda s: s[2], reverse=True)
+
+ # Use the ``tabulate`` library to create a well-formatted table
+ # presenting our summary information
+ headers: List[str] = [
+ 'Op type', 'Op', 'Average runtime (s)', 'Pct total runtime'
+ ]
+ return tabulate.tabulate(node_summaries, headers=headers)
+
+
+if __name__ == '__main__':
+ rn18 = models.resnet18()
+ rn18.eval()
+ input = torch.randn(5, 3, 224, 224)
+ # output = rn18(input)
+ # traced_rn18 = torch.fx.symbolic_trace(rn18)
+ # print(traced_rn18.graph)
+
+ interp = ProfilingInterpreter(rn18)
+ interp.run(input)
+ print(interp.summary(True))
diff --git a/tutorials/compilers/nvidia_demo.py b/tutorials/compilers/nvidia_demo.py
new file mode 100644
index 00000000..1c496595
--- /dev/null
+++ b/tutorials/compilers/nvidia_demo.py
@@ -0,0 +1,191 @@
+# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in
+# order to reproduce the speedup numbers shown below and documented elsewhere.
+
+import torch
+import warnings
+from torchvision.models import densenet121
+import numpy as np
+
+gpu_ok = False
+if torch.cuda.is_available():
+ device_cap = torch.cuda.get_device_capability()
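+    # (7, 0), (8, 0) and (9, 0) are the compute capabilities of V100, A100 and H100.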
+ if device_cap in ((7, 0), (8, 0), (9, 0)):
+ gpu_ok = True
+
+if not gpu_ok:
+ warnings.warn(
+ "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower "
+ "than expected."
+ )
+
+
+# Returns the result of running `fn()` and the time it took for `fn()` to run,
+# in seconds. We use CUDA events and synchronization for the most accurate
+# measurements.
+def timed(fn):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ result = fn()
+ end.record()
+ torch.cuda.synchronize()
+ return result, start.elapsed_time(end) / 1000
+
+
+# Generates random input and targets data for the model, where `b` is
+# batch size.
+def generate_data(b):
+ return (
+ torch.randn(b, 3, 128, 128).cuda(),
+ torch.randint(1000, (b,)).cuda(),
+ )
+
+
+N_ITERS = 10
+
+
+def init_model():
+ return densenet121().cuda()
+
+
+model = init_model()
+
+# Note that we generally recommend directly compiling a torch.nn.Module by calling
+# its .compile() method.
+model_opt = init_model()
+model_opt.compile(mode="reduce-overhead")
+
+
+def first_demo():
+ inp = generate_data(16)[0]
+ with torch.no_grad():
+ print("eager:", timed(lambda: model(inp))[1])
+ print("compile:", timed(lambda: model_opt(inp))[1])
+
+
+"""
+(eval) eager median: 0.01525604772567749, compile median: 0.003931119918823242, speedup: 3.8808400762916184x
+eager eval time 0: 0.2951763916015625
+eager eval time 1: 0.01678335952758789
+eager eval time 2: 0.015734944343566894
+eager eval time 3: 0.015243231773376465
+eager eval time 4: 0.015268863677978516
+eager eval time 5: 0.01522979164123535
+eager eval time 6: 0.015177727699279785
+eager eval time 7: 0.015617024421691895
+eager eval time 8: 0.015202367782592773
+eager eval time 9: 0.015126527786254883
+~~~~~~~~~~
+compile eval time 0: 5.565470703125
+compile eval time 1: 0.24912281799316408  (the second run is still slow, though much faster than the first: "reduce-overhead" mode runs several warm-up iterations for CUDA graphs)
+compile eval time 2: 0.00450867223739624
+compile eval time 3: 0.004577280044555664
+compile eval time 4: 0.003706687927246094
+compile eval time 5: 0.0037672960758209227
+compile eval time 6: 0.003935231924057007
+compile eval time 7: 0.003768320083618164
+compile eval time 8: 0.003927007913589477
+compile eval time 9: 0.0038635520935058594
+"""
+
+
+def predict_many_demo():
+ eager_times = []
+ for i in range(N_ITERS):
+ inp = generate_data(16)[0]
+ with torch.no_grad():
+ _, eager_time = timed(lambda: model(inp))
+ eager_times.append(eager_time)
+ print(f"eager eval time {i}: {eager_time}")
+
+ print("~" * 10)
+
+ compile_times = []
+ for i in range(N_ITERS):
+ inp = generate_data(16)[0]
+ with torch.no_grad():
+ _, compile_time = timed(lambda: model_opt(inp))
+ compile_times.append(compile_time)
+ print(f"compile eval time {i}: {compile_time}")
+ print("~" * 10)
+
+ eager_med = np.median(eager_times)
+ compile_med = np.median(compile_times)
+ speedup = eager_med / compile_med
+ assert speedup > 1
+ print(
+ f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+ )
+ print("~" * 10)
+
+
+opt = torch.optim.Adam(model.parameters())
+
+
+def train(mod, data):
+ opt.zero_grad(True)
+ pred = mod(data[0])
+ loss = torch.nn.CrossEntropyLoss()(pred, data[1])
+ loss.backward()
+ opt.step()
+
+"""
+eager train time 0: 0.6821947631835937
+eager train time 1: 0.0516577262878418
+eager train time 2: 0.048728256225585936
+eager train time 3: 0.047841407775878905
+eager train time 4: 0.04823257446289062
+eager train time 5: 0.048595008850097654
+eager train time 6: 0.057622528076171874
+eager train time 7: 0.05626262283325195
+eager train time 8: 0.057923583984375
+eager train time 9: 0.058123264312744144
+~~~~~~~~~~
+
+compile train time 0: 141.419421875
+compile train time 1: 8.4247080078125
+compile train time 2: 0.018790399551391602
+compile train time 3: 0.010836992263793945
+compile train time 4: 0.010805248260498047
+compile train time 5: 0.010437631607055664
+compile train time 6: 0.010218496322631837
+compile train time 7: 0.012146688461303711
+compile train time 8: 0.012992511749267579
+compile train time 9: 0.012563455581665038
+~~~~~~~~~~
+(train) eager median: 0.05396017456054687, compile median: 0.012355072021484375, speedup: 4.3674512351457695x
+"""
+def train_many_demo():
+ eager_times = []
+ for i in range(N_ITERS):
+ inp = generate_data(16)
+ _, eager_time = timed(lambda: train(model, inp))
+ eager_times.append(eager_time)
+ print(f"eager train time {i}: {eager_time}")
+ print("~" * 10)
+ # Note that because we are compiling a regular Python function, we do not
+ # call any .compile() method.
+ train_opt = torch.compile(train, mode="reduce-overhead")
+
+ compile_times = []
+ for i in range(N_ITERS):
+ inp = generate_data(16)
+ _, compile_time = timed(lambda: train_opt(model, inp))
+ compile_times.append(compile_time)
+ print(f"compile train time {i}: {compile_time}")
+ print("~" * 10)
+
+ eager_med = np.median(eager_times)
+ compile_med = np.median(compile_times)
+ speedup = eager_med / compile_med
+ assert speedup > 1
+ print(
+ f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+ )
+ print("~" * 10)
+
+
+if __name__ == '__main__':
+ train_many_demo()
diff --git a/tutorials/compilers/onnx_demo.py b/tutorials/compilers/onnx_demo.py
new file mode 100644
index 00000000..b3fba746
--- /dev/null
+++ b/tutorials/compilers/onnx_demo.py
@@ -0,0 +1,109 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import onnx
+import onnxruntime
+import onnxscript
+import os
+
+
+def get_version():
+ print(torch.__version__)
+ print(onnxscript.__version__)
+ print(onnxruntime.__version__)
+
+
+"""简单的图像分类器模型"""
+
+
+class ImageClassifierModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv1 = nn.Conv2d(1, 6, 5)
+ self.conv2 = nn.Conv2d(6, 16, 5)
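+        # With 32x32 inputs: conv1(5x5) -> 28x28 -> 2x2 pool -> 14x14 ->
+        # conv2(5x5) -> 10x10 -> 2x2 pool -> 5x5, hence 16 * 5 * 5 below.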
+ self.fc1 = nn.Linear(16 * 5 * 5, 120)
+ self.fc2 = nn.Linear(120, 84)
+ self.fc3 = nn.Linear(84, 10)
+
+ def forward(self, x: torch.Tensor):
+ x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+ x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+ x = torch.flatten(x, 1)
+ x = F.relu(self.fc1(x))
+ x = F.relu(self.fc2(x))
+ x = self.fc3(x)
+ return x
+
+
+def onnx_demo():
+ # Create example inputs (with a fixed random seed for reproducibility)
+ torch.manual_seed(42)
+ example_inputs = (torch.randn(1, 1, 32, 32),)
+ onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs]
+ print(f"Input length: {len(onnx_inputs)}")
+ print(f"Sample input shape: {onnx_inputs[0].shape}")
+
+ # Create one PyTorch model instance, used both for export and for comparison
+ torch_model = ImageClassifierModel()
+
+ # Generate the ONNX model (passing in the same model instance)
+ model_2_onnx(torch_model, example_inputs)
+
+ # Load the ONNX model and run inference
+ ort_session = onnxruntime.InferenceSession(
+ "./image_classifier_model.onnx", providers=["CPUExecutionProvider"]
+ )
+
+ onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in
+ zip(ort_session.get_inputs(), onnx_inputs)}
+
+ # ONNX Runtime returns a list of outputs
+ onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0]
+
+ # Run inference with the same PyTorch model
+ torch_outputs = torch_model(*example_inputs)
+
+ print(f"PyTorch output shape: {torch_outputs.shape}")
+ print(f"ONNX Runtime output shape: {onnxruntime_outputs.shape}")
+
+ # Compare the whole tensors directly instead of element by element
+ try:
+ torch.testing.assert_close(torch_outputs, torch.tensor(onnxruntime_outputs), rtol=1e-3, atol=1e-3)
+ print("PyTorch and ONNX Runtime output matched!")
+ except AssertionError as e:
+ print(f"Outputs didn't match: {e}")
+ # Print detailed difference information
+ print("\nPyTorch output:")
+ print(torch_outputs)
+ print("\nONNX Runtime output:")
+ print(onnxruntime_outputs)
+ print("\nDifference:")
+ print(torch_outputs - torch.tensor(onnxruntime_outputs))
+
+ print(f"Output length: {onnxruntime_outputs.shape[1]}")
+ print(f"Sample output: {onnxruntime_outputs[0][:5]}...")
+
+
+def model_2_onnx(torch_model, example_inputs):
+ # Export the model to ONNX, using the default opset version
+ torch.onnx.export(
+ torch_model,
+ example_inputs,
+ "image_classifier_model.onnx",
+ export_params=True,
+ # opset_version is left unset so PyTorch picks a suitable version
+ do_constant_folding=True,
+ input_names=['input'],
+ output_names=['output'],
+ dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
+ )
+
+ # Validate the exported model
+ onnx_model = onnx.load("image_classifier_model.onnx")
+ onnx.checker.check_model(onnx_model)
+ print("ONNX model generated and validated successfully!")
+ print(f"ONNX model opset version: {onnx_model.opset_import[0].version}")
+
+
+if __name__ == '__main__':
+ onnx_demo()
\ No newline at end of file
diff --git a/tutorials/compilers/onnxscript_demo.py b/tutorials/compilers/onnxscript_demo.py
new file mode 100644
index 00000000..80ac8aee
--- /dev/null
+++ b/tutorials/compilers/onnxscript_demo.py
@@ -0,0 +1,43 @@
+import torch
+import onnxscript
+
+# Opset 18 is the standard supported version as of PyTorch 2.6
+from onnxscript import opset18 as op
+
+
+# Create a model that uses the operator torch.ops.aten.add.Tensor
+class Model(torch.nn.Module):
+ def forward(self, input_x, input_y):
+ return torch.ops.aten.add.Tensor(input_x, input_y)
+
+
+# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator.
+# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml
+# All attributes must be annotated with type hints.
+def custom_aten_add(self, other, alpha: float = 1.0):
+ if alpha != 1.0:
+ alpha = op.CastLike(alpha, other)
+ other = op.Mul(other, alpha)
+ # To distinguish the custom implementation from the builtin one, we switch the order of the inputs
+ return op.Add(other, self)
+
+
+x = torch.tensor([1.0])
+y = torch.tensor([2.0])
+
+# Then we provide the custom implementation to the ONNX exporter as a ``custom_translation_table``.
+onnx_program = torch.onnx.export(
+ Model().eval(),
+ (x, y),
+ dynamo=True,
+ custom_translation_table={
+ torch.ops.aten.add.Tensor: custom_aten_add,
+ },
+)
+# Optimize the ONNX graph to remove redundant nodes
+onnx_program.optimize()
+print(onnx_program.model)
+
+result = onnx_program(x, y)[0]
+print(f"Result: {result}")
+torch.testing.assert_close(result, torch.tensor([3.0]))
\ No newline at end of file
diff --git a/tutorials/compilers/set_stance_demo.py b/tutorials/compilers/set_stance_demo.py
new file mode 100644
index 00000000..255bb49f
--- /dev/null
+++ b/tutorials/compilers/set_stance_demo.py
@@ -0,0 +1,100 @@
+import torch
+
+
+@torch.compile
+def my_big_model(x):
+ return torch.relu(x)
+
+
+# fail_on_recompile: prevent recompilation
+def fail_on_recompile():
+ # first compilation
+ my_big_model(torch.randn(3))
+
+ with torch.compiler.set_stance("fail_on_recompile"):
+ my_big_model(torch.randn(3)) # no recompilation - OK
+ try:
+ # The input shape changes here, which triggers a recompilation
+ my_big_model(torch.randn(4)) # recompilation - error
+ except Exception as e:
+ print(e)
+
+
+@torch.compile
+def my_huge_model(x):
+ if torch.compiler.is_compiling():
+ return x + 1
+ else:
+ return x - 1
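+# During tracing torch.compiler.is_compiling() is True, so the compiled artifact
+# returns x + 1 while a genuine eager run returns x - 1; the prints below use
+# this to reveal which path actually executed.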
+
+
+"""
+报错过于 disruptive,我们可以改用 "eager_on_recompile",它将导致 torch.compile 回退到立即执行模式而不是报错。
+如果预计重新编译不会频繁发生,但一旦需要,我们宁愿承担立即执行的成本而不是重新编译的成本,那么这可能很有用。
+"""
+
+
+def eager_on_recompile():
+ # first compilation
+ print(my_huge_model(torch.zeros(3))) # 1
+ with torch.compiler.set_stance("eager_on_recompile"):
+ print(my_huge_model(torch.zeros(3))) # 1
+ print(my_huge_model(torch.zeros(4))) # -1
+ print(my_huge_model(torch.zeros(3))) # 1
+
+
+# Measuring the performance gain
+# Returns the result of running `fn()` and the time it took for `fn()` to run,
+# in seconds. We use CUDA events and synchronization for the most accurate
+# measurements.
+def timed(fn):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ result = fn()
+ end.record()
+ torch.cuda.synchronize()
+ return result, start.elapsed_time(end) / 1000
+
+
+@torch.compile
+def my_gigantic_model(x, y):
+ x = x @ y
+ x = x @ y
+ x = x @ y
+ return x
+
+
+"""
+eager: 0.0004822399914264679
+compiled: 0.00010444799810647964
+"""
+
+
+def force_eager_demo():
+ inps = torch.randn(5, 5), torch.randn(5, 5)
+ with torch.compiler.set_stance("force_eager"):
+ print("eager:", timed(lambda: my_gigantic_model(*inps))[1])
+ # warmups
+ for _ in range(3):
+ my_gigantic_model(*inps)
+ print("compiled:", timed(lambda: my_gigantic_model(*inps))[1])
+
+
+@torch.compile
+def my_humongous_model(x):
+ return torch.sin(x, x)
+
+def fast_find_error():
+ try:
+ # sin() takes 1 positional argument but 2 were given
+ with torch.compiler.set_stance("force_eager"):
+ print(my_humongous_model(torch.randn(3)))
+ # this call to the compiled model won't run
+ print(my_humongous_model(torch.randn(3)))
+ except Exception as e:
+ print(e)
+
+if __name__ == '__main__':
+ fast_find_error()
+
diff --git a/tutorials/compilers/torch_compile.py b/tutorials/compilers/torch_compile.py
new file mode 100644
index 00000000..77ffce45
--- /dev/null
+++ b/tutorials/compilers/torch_compile.py
@@ -0,0 +1,45 @@
+import torch
+import torch.utils.benchmark as benchmark
+
+model = torch.nn.Sequential(
+ *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
+)
+input = torch.rand(1024, device="cuda")
+output = model(input)
+output.sum().backward()
+opt = torch.optim.Adam(model.parameters(), lr=0.01)
+
+
+@torch.compile(fullgraph=False)
+def fn():
+ opt.step()
+
+
+def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
+ t0 = benchmark.Timer(
+ stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
+ )
+ return t0.blocked_autorange().mean * 1e6
+
+
+def warmup():
+ for _ in range(5):
+ fn()
+
+
+def diff():
+ eager_runtime = benchmark_torch_function_in_microseconds(opt.step)
+ compiled_runtime = benchmark_torch_function_in_microseconds(fn)
+
+ assert eager_runtime > compiled_runtime
+
+ print(f"eager runtime: {eager_runtime}us")
+ print(f"compiled runtime: {compiled_runtime}us")
+
+
+if __name__ == '__main__':
+ warmup() # compile and warm up `fn` before timing the steady state
+ diff()
+
diff --git a/tutorials/ddp/basic_ddp_demo.py b/tutorials/ddp/basic_ddp_demo.py
new file mode 100644
index 00000000..3058607d
--- /dev/null
+++ b/tutorials/ddp/basic_ddp_demo.py
@@ -0,0 +1,137 @@
+import os
+import tempfile
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.optim as optim
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+
+# On Windows platform, the torch.distributed package only
+# supports Gloo backend, FileStore and TcpStore.
+# For FileStore, set init_method parameter in init_process_group
+# to a local file. Example as follow:
+# init_method="file:///f:/libtmp/some_file"
+# dist.init_process_group(
+# "gloo",
+# rank=rank,
+# init_method=init_method,
+# world_size=world_size)
+# For TcpStore, same way as on Linux.
+
+def setup(rank, world_size):
+ os.environ['MASTER_ADDR'] = 'localhost'
+ os.environ['MASTER_PORT'] = '12355'
+
+ # We want to be able to train our model on an accelerator
+ # such as CUDA, MPS, MTIA, or XPU.
+ acc = torch.accelerator.current_accelerator()
+ print('Accelerator:', acc)
+ backend = torch.distributed.get_default_backend_for_device(acc)
+ # initialize the process group
+ dist.init_process_group(backend, rank=rank, world_size=world_size)
+
+
+def cleanup():
+ dist.destroy_process_group()
+
+
+class ToyModel(nn.Module):
+ def __init__(self):
+ super(ToyModel, self).__init__()
+ self.net1 = nn.Linear(10, 10)
+ self.relu = nn.ReLU()
+ self.net2 = nn.Linear(10, 5)
+
+ def forward(self, x):
+ return self.net2(self.relu(self.net1(x)))
+
+
+def demo_basic(rank, world_size):
+ print(f"Running basic DDP example on rank {rank}.")
+ setup(rank, world_size)
+
+ # create model and move it to GPU with id rank
+ model = ToyModel().to(rank)
+ ddp_model = DDP(model, device_ids=[rank])
+
+ loss_fn = nn.MSELoss()
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+ optimizer.zero_grad()
+ outputs = ddp_model(torch.randn(20, 10))
+ labels = torch.randn(20, 5).to(rank)
+ loss_fn(outputs, labels).backward()
+ optimizer.step()
+
+ cleanup()
+ print(f"Finished running basic DDP example on rank {rank}.")
+
+
+def run_demo(demo_fn, world_size):
+ mp.spawn(demo_fn,
+ args=(world_size,),
+ nprocs=world_size,
+ join=True)
+
+
+def demo_checkpoint(rank, world_size):
+ print(f"Running DDP checkpoint example on rank {rank}.")
+ setup(rank, world_size)
+
+ model = ToyModel().to(rank)
+ ddp_model = DDP(model, device_ids=[rank])
+
+ CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"
+ print(f"Checkpoint path: {CHECKPOINT_PATH}")
+ if rank == 0:
+ # All processes should see same parameters as they all start from same
+ # random parameters and gradients are synchronized in backward passes.
+ # Therefore, saving it in one process is sufficient.
+ torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)
+
+ # Use a barrier() to make sure that process 1 loads the model after process
+ # 0 saves it.
+ dist.barrier()
+ # We want to be able to train our model on an accelerator
+ # such as CUDA, MPS, MTIA, or XPU.
+ acc = torch.accelerator.current_accelerator()
+ # configure map_location properly
+ map_location = {f'{acc}:0': f'{acc}:{rank}'}
+ ddp_model.load_state_dict(
+ torch.load(CHECKPOINT_PATH, map_location=map_location, weights_only=True))
+
+ loss_fn = nn.MSELoss()
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+ optimizer.zero_grad()
+ outputs = ddp_model(torch.randn(20, 10))
+ labels = torch.randn(20, 5).to(rank)
+
+ loss_fn(outputs, labels).backward()
+ optimizer.step()
+
+ # Not necessary to use a dist.barrier() to guard the file deletion below
+ # as the AllReduce ops in the backward pass of DDP already served as
+ # a synchronization.
+
+ if rank == 0:
+ os.remove(CHECKPOINT_PATH)
+
+ cleanup()
+ print(f"Finished running DDP checkpoint example on rank {rank}.")
+
+
+if __name__ == '__main__':
+ run_demo(demo_checkpoint, world_size=2)
diff --git a/tutorials/ddp/distributed_demo.py b/tutorials/ddp/distributed_demo.py
new file mode 100644
index 00000000..272a27ed
--- /dev/null
+++ b/tutorials/ddp/distributed_demo.py
@@ -0,0 +1,79 @@
+"""run.py:"""
+#!/usr/bin/env python
+import os
+import sys
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+# def run(rank, size):
+#     """ A simple distributed function example. """
+#     print(f"Rank {rank} of {size} is running.")
+#     # A simple distributed communication example
+#     tensor = torch.tensor([rank], dtype=torch.float32)
+#     dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+#     print(f"Rank {rank} reduced tensor: {tensor.item()}")
+#
+# """Blocking point-to-point communication."""
+#
+#
+# def run(rank, size):
+# tensor = torch.zeros(1)
+# if rank == 0:
+# tensor += 1
+# # Send the tensor to process 1
+# dist.send(tensor=tensor, dst=1)
+# else:
+# # Receive tensor from process 0
+# dist.recv(tensor=tensor, src=0)
+# print('Rank ', rank, ' has data ', tensor[0])
+"""Non-blocking point-to-point communication."""
+
+
+# def run(rank, size):
+# tensor = torch.zeros(1)
+# req = None
+# if rank == 0:
+# tensor += 1
+# # Send the tensor to process 1
+# req = dist.isend(tensor=tensor, dst=1)
+# print('Rank 0 started sending')
+# else:
+# # Receive tensor from process 0
+# req = dist.irecv(tensor=tensor, src=0)
+# print('Rank 1 started receiving')
+# req.wait()
+# print('Rank ', rank, ' has data ', tensor[0])
+def run(rank, size):
+ """ Simple collective communication. """
+ group = dist.new_group([0, 1])
+ tensor = torch.ones(1)
+ print(f"tensor: {tensor}")
+ dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)
+ print('Rank ', rank, ' has data ', tensor[0])
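+    # With world_size=2 and a tensor of ones on every rank, all_reduce(SUM)
+    # leaves the value 2 on both ranks.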
+
+
+def init_process(rank, size, fn, backend='gloo'):
+ """ Initialize the distributed environment. """
+ os.environ['MASTER_ADDR'] = '127.0.0.1'
+ os.environ['MASTER_PORT'] = '29500'
+ dist.init_process_group(backend, rank=rank, world_size=size)
+ fn(rank, size)
+
+
+if __name__ == "__main__":
+ world_size = 2
+ processes = []
+ if "google.colab" in sys.modules:
+ print("Running in Google Colab")
+ mp.get_context("spawn")
+ else:
+ mp.set_start_method("spawn")
+ for rank in range(world_size):
+ p = mp.Process(target=init_process, args=(rank, world_size, run))
+ p.start()
+ processes.append(p)
+
+ for p in processes:
+ p.join()
diff --git a/tutorials/ddp/pipelining_tutorial.py b/tutorials/ddp/pipelining_tutorial.py
new file mode 100644
index 00000000..f0337ab6
--- /dev/null
+++ b/tutorials/ddp/pipelining_tutorial.py
@@ -0,0 +1,120 @@
+import os
+from dataclasses import dataclass
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed.pipelining import pipeline, SplitPoint, PipelineStage, ScheduleGPipe
+
+global rank, device, pp_group, stage_index, num_stages
+
+
+@dataclass
+class ModelArgs:
+ dim: int = 512
+ n_layers: int = 8
+ n_heads: int = 8
+ vocab_size: int = 10000
+
+
+class Transformer(nn.Module):
+ def __init__(self, model_args: ModelArgs):
+ super().__init__()
+
+ self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
+
+ # Using a ModuleDict lets us delete layers without affecting names,
+ # ensuring checkpoints will correctly save and load.
+ self.layers = torch.nn.ModuleDict()
+ for layer_id in range(model_args.n_layers):
+ self.layers[str(layer_id)] = nn.TransformerDecoderLayer(model_args.dim, model_args.n_heads)
+
+ self.norm = nn.LayerNorm(model_args.dim)
+ self.output = nn.Linear(model_args.dim, model_args.vocab_size)
+
+ def forward(self, tokens: torch.Tensor):
+ # Handling layers being 'None' at runtime enables easy pipeline splitting
+ h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
+
+ for layer in self.layers.values():
+ h = layer(h, h)
+
+ h = self.norm(h) if self.norm else h
+ output = self.output(h).clone() if self.output else h
+ return output
+
+
+def init_distributed():
+ global rank, device, pp_group, stage_index, num_stages
+ rank = int(os.environ["LOCAL_RANK"])
+ world_size = int(os.environ["WORLD_SIZE"])
+ device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu")
+ dist.init_process_group()
+
+ # This group can be a sub-group in the N-D parallel case
+ pp_group = dist.new_group()
+ stage_index = rank
+ num_stages = world_size
+
+
+def manual_model_split(model) -> PipelineStage:
+ if stage_index == 0:
+ # prepare the first stage model
+ for i in range(4, 8):
+ del model.layers[str(i)]
+ model.norm = None
+ model.output = None
+
+ elif stage_index == 1:
+ # prepare the second stage model
+ for i in range(4):
+ del model.layers[str(i)]
+ model.tok_embeddings = None
+
+ stage = PipelineStage(
+ model,
+ stage_index,
+ num_stages,
+ device,
+ )
+ return stage
+
+
+if __name__ == "__main__":
+ init_distributed()
+ num_microbatches = 4
+ model_args = ModelArgs()
+ model = Transformer(model_args)
+
+ # Dummy data
+ x = torch.ones(32, 500, dtype=torch.long)
+ y = torch.randint(0, model_args.vocab_size, (32, 500), dtype=torch.long)
+ example_input_microbatch = x.chunk(num_microbatches)[0]
+
+ # Option 1: Manual model splitting
+ stage = manual_model_split(model)
+
+ # Option 2: Tracer model splitting
+ # stage = tracer_model_split(model, example_input_microbatch)
+
+ model.to(device)
+ x = x.to(device)
+ y = y.to(device)
+
+
+ def tokenwise_loss_fn(outputs, targets):
+ loss_fn = nn.CrossEntropyLoss()
+ outputs = outputs.reshape(-1, model_args.vocab_size)
+ targets = targets.reshape(-1)
+ return loss_fn(outputs, targets)
+
+
+ schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn)
+
+ if rank == 0:
+ schedule.step(x)
+ elif rank == 1:
+ losses = []
+ output = schedule.step(target=y, losses=losses)
+ print(f"losses: {losses}")
+ dist.destroy_process_group()
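+
+# Launch with one process per pipeline stage, e.g. (assumed invocation):
+#   torchrun --nnodes 1 --nproc_per_node 2 pipelining_tutorial.py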
diff --git a/tutorials/ddp/pytorch_elastic.py b/tutorials/ddp/pytorch_elastic.py
new file mode 100644
index 00000000..01a44ff6
--- /dev/null
+++ b/tutorials/ddp/pytorch_elastic.py
@@ -0,0 +1,47 @@
+import os
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+class ToyModel(nn.Module):
+ def __init__(self):
+ super(ToyModel, self).__init__()
+ self.net1 = nn.Linear(10, 10)
+ self.relu = nn.ReLU()
+ self.net2 = nn.Linear(10, 5)
+
+ def forward(self, x):
+ return self.net2(self.relu(self.net1(x)))
+
+
+def demo_basic():
+ torch.accelerator.set_device_index(int(os.environ["LOCAL_RANK"]))
+ acc = torch.accelerator.current_accelerator()
+ print('Accelerator:', acc)
+ backend = torch.distributed.get_default_backend_for_device(acc)
+ dist.init_process_group(backend)
+ rank = dist.get_rank()
+ print(f"Start running basic DDP example on rank {rank}.")
+ # create model and move it to GPU with id rank
+ device_id = rank % torch.accelerator.device_count()
+ print(f"Device ID: {device_id}")
+ model = ToyModel().to(device_id)
+ ddp_model = DDP(model, device_ids=[device_id])
+ loss_fn = nn.MSELoss()
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+ optimizer.zero_grad()
+ outputs = ddp_model(torch.randn(20, 10))
+ labels = torch.randn(20, 5).to(device_id)
+ loss_fn(outputs, labels).backward()
+ optimizer.step()
+ dist.destroy_process_group()
+ print(f"Finished running basic DDP example on rank {rank}.")
+"""
+torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py
+"""
+if __name__ == "__main__":
+ demo_basic()
diff --git a/tutorials/tcp/libuv_demo.py b/tutorials/tcp/libuv_demo.py
new file mode 100644
index 00000000..1c9d5ac8
--- /dev/null
+++ b/tutorials/tcp/libuv_demo.py
@@ -0,0 +1,30 @@
+import logging
+import os
+
+from time import perf_counter
+
+import torch
+import torch.distributed as dist
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+ # Env var are preset when launching the benchmark
+ env_rank = os.environ.get("RANK", 0)
+ env_world_size = os.environ.get("WORLD_SIZE", 1)
+ env_master_addr = os.environ.get("MASTER_ADDR", "localhost")
+ env_master_port = os.environ.get("MASTER_PORT", "23456")
+
+ start = perf_counter()
+ tcp_store = dist.TCPStore(
+ env_master_addr,
+ int(env_master_port),
+ world_size=int(env_world_size),
+ is_master=(int(env_rank) == 0),
+ )
+ end = perf_counter()
+ time_elapsed = end - start
+ logger.info(
+ f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds."
+ )
diff --git a/tutorials/torch-distributed/init-process-group.py b/tutorials/torch-distributed/init-process-group.py
new file mode 100644
index 00000000..c034fe29
--- /dev/null
+++ b/tutorials/torch-distributed/init-process-group.py
@@ -0,0 +1,25 @@
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+
+
+def init_process(rank, world_size):
+ print(f"进程已启动: 此进程的 rank 是 {rank}")
+
+
+def main():
+ os.environ['MASTER_ADDR'] = 'localhost'
+ os.environ['MASTER_PORT'] = '29500'
+ world_size = torch.cuda.device_count()
+ print(f"准备启动 {world_size} 个进程...")
+ mp.spawn(
+ init_process,
+ args=(world_size,),
+ nprocs=world_size,
+ join=True
+ )
+
+if __name__ == "__main__":
+ main()
diff --git a/tutorials/torch-distributed/readme.md b/tutorials/torch-distributed/readme.md
new file mode 100644
index 00000000..0373b825
--- /dev/null
+++ b/tutorials/torch-distributed/readme.md
@@ -0,0 +1,94 @@
+
+## init_process_group
+
+```python
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+
+def init_process(rank, world_size):
+ print(f"进程已启动: 此进程的 rank 是 {rank}")
+
+ # 设置当前进程使用的 GPU
+ torch.cuda.set_device(rank)
+
+ try:
+ # 加入进程组
+ print(f"进程 {rank} 正在加入进程组...")
+ dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+ print(f"进程 {rank} 已成功加入进程组")
+
+ # 验证身份
+ assert rank == dist.get_rank()
+ assert world_size == dist.get_world_size()
+
+ # 准备当前进程的信息
+ process_info = (
+ f"\n进程 {rank} 信息:\n"
+ f"- Device: {torch.cuda.current_device()}\n"
+ f"- GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}\n"
+ )
+
+ # 将字符串转换为固定长度的张量
+ max_len = 100 # 确保足够长以容纳信息
+ process_info_tensor = torch.zeros(max_len, dtype=torch.int32, device='cuda')
+ process_info_bytes = process_info.encode('utf-8')
+ process_info_tensor[:len(process_info_bytes)] = torch.tensor([b for b in process_info_bytes], dtype=torch.int32)
+
+ # 创建用于收集所有进程信息的张量列表
+ gathered_tensors = [torch.zeros(max_len, dtype=torch.int32, device='cuda') for _ in range(world_size)]
+
+ # 使用 all_gather 收集所有进程的信息
+ dist.all_gather(gathered_tensors, process_info_tensor)
+
+        if rank == 0:
+            print("=============== All process info ===============")
+            for tensor in gathered_tensors:
+                info_bytes = tensor.cpu().numpy().astype('uint8').tobytes()
+                info_str = info_bytes.decode('utf-8', 'ignore').strip('\x00')
+                print(info_str)
+
+        # Create a tensor and run a collective
+        tensor = torch.ones(1).cuda() * rank
+        print(f"Process {rank} original tensor value: {tensor.item()}")
+
+        # Synchronization point for all processes
+        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+        print(f"Process {rank} final tensor value: {tensor.item()}")
+
+ finally:
+ dist.destroy_process_group()
+
+def main():
+ os.environ['MASTER_ADDR'] = 'localhost'
+ os.environ['MASTER_PORT'] = '29500'
+
+ world_size = torch.cuda.device_count()
+    print(f"Preparing to launch {world_size} processes...")
+
+ mp.spawn(
+ init_process,
+ args=(world_size,),
+ nprocs=world_size,
+ join=True
+ )
+
+    #! Equivalent to launching the processes manually:
+    # processes = []
+    # for rank in range(world_size):
+    #     p = mp.Process(target=init_process, args=(rank, world_size))
+    #     p.start()
+    #     processes.append(p)
+
+    # # Equivalent to the effect of join=True
+    # for p in processes:
+    #     p.join()
+
+if __name__ == "__main__":
+ main()
+```
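+
+For launcher-based setups, the same group can be created without `mp.spawn`:
+`torchrun` presets `RANK`, `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT`, so
+`init_process_group` can read everything from the environment. A minimal
+sketch (assuming one GPU per local rank):
+
+```python
+import os
+import torch
+import torch.distributed as dist
+
+# Launch with: torchrun --nproc_per_node=<num_gpus> <script>.py
+dist.init_process_group(backend="nccl")  # rank/world_size come from env vars
+local_rank = int(os.environ["LOCAL_RANK"])
+torch.cuda.set_device(local_rank)
+
+t = torch.ones(1, device="cuda") * dist.get_rank()
+dist.all_reduce(t, op=dist.ReduceOp.SUM)  # every rank ends up with the sum
+print(f"rank {dist.get_rank()}: {t.item()}")
+dist.destroy_process_group()
+```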
\ No newline at end of file
diff --git a/tutorials/torch-layout/demo1.py b/tutorials/torch-layout/demo1.py
new file mode 100644
index 00000000..72f49858
--- /dev/null
+++ b/tutorials/torch-layout/demo1.py
@@ -0,0 +1,144 @@
+import torch
+
+
+def stride_demo():
+    """
+    Contiguous vs. non-contiguous layout.
+
+    A tensor is contiguous when walking its logical shape (e.g. 2x3, 3x2) in
+    C order (row-major) visits strictly increasing, adjacent memory addresses.
+
+    Why memory layout matters:
+    - Performance: contiguous tensors get better CPU/GPU cache hit rates;
+      non-contiguous tensors can be slower due to strided memory access.
+    - Operation limits: some PyTorch ops (e.g. view, resize_) require a
+      contiguous tensor and raise RuntimeError otherwise.
+    - Memory efficiency: ops like transpose change the layout metadata
+      instead of copying data, which saves memory.
+    """
+    # Integers 0..11 arranged as 3 rows x 4 columns
+    x = torch.arange(12).view(3, 4)
+    print(f"view: {x}")
+    print(f"shape: {x.shape}")
+    # stride: elements to skip in memory to move one step along each dim;
+    # here the next row skips 4 elements and the next column skips 1
+    print(f"stride: {x.stride()}")
+    print(f"is_contiguous: {x.is_contiguous()}")
+    print("\n***********************************\n")
+    y = x.t()
+
+    # No data is moved here: the transpose only swaps the view's strides,
+    # i.e. "how to read the data from memory"
+    print(f"y=x.t() is_contiguous: {y.is_contiguous()}")
+    print(f"y: {y}")
+    print(f"y shape: {y.shape}")
+    print(f"y stride: {y.stride()}")
+    print("\n***********************************\n")
+    # .contiguous() materializes a fresh row-major copy of y
+    z = y.contiguous()
+    print(f"z is_contiguous: {z.is_contiguous()}")
+    print(f"z: {z}")
+    print(f"z shape: {z.shape}")
+    print(f"z stride: {z.stride()}")
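+
+    # The docstring's "operation limits" point in action: view() on the
+    # non-contiguous transpose raises, which is why contiguous() (or
+    # reshape()) is needed first
+    try:
+        y.view(-1)
+    except RuntimeError as e:
+        print(f"view on non-contiguous tensor failed: {e}")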
+
+
+def storage_demo():
+    print(f"PyTorch version: {torch.__version__}")
+    print(
+        f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"
+    )
+
+    x = torch.tensor([[0, 1, 2], [3, 4, 5]], dtype=torch.float32)
+    storage_x = x.storage()
+    # 1. storage is always 1-D, no matter how many dims the tensor has
+    print(f"storage:\n{storage_x}\n")
+    print("storage type:", type(storage_x))  # torch.storage.TypedStorage
+    print("storage length:", len(storage_x))  # 6 (total number of elements)
+    print("id(storage_x):", id(storage_x))
+
+    # 2. The transpose shares the same storage (no physical copy is made)
+    y = x.t()
+    storage_y = y.storage()
+    print("x data pointer:", x.data_ptr())
+    print("y data pointer:", y.data_ptr())
+    print("same data pointer for x and y:", x.data_ptr() == y.data_ptr())
+    print("id(storage_y):", id(storage_y))
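+
+    # 3. storage_offset() pinpoints where a view starts inside the shared
+    # buffer; the byte delta between data pointers matches offset * itemsize
+    row = x[1]  # second row, still backed by x's storage
+    print("row.storage_offset():", row.storage_offset())  # 3
+    print(
+        "data_ptr delta matches offset:",
+        row.data_ptr() - x.data_ptr() == row.storage_offset() * x.element_size(),
+    )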
+
+
+def shared_storage_demo():
+    """
+    Show that when several tensors share one storage, modifying one of them
+    is visible through all the others.
+    """
+    print("=" * 60)
+    print("Demonstrating tensors that share one storage")
+    print("=" * 60)
+
+    # Original tensor
+    x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=torch.float32)
+
+    # Several tensors sharing x's storage
+    y = x.t()            # transpose
+    z = x.view(-1)       # flattened to 1-D
+    w = x[1:, 2:]        # slice
+    v = x.reshape(2, 6)  # reshape (shares storage when possible)
+
+    print("\n[Initial state]")
+    print(f"x (original) =\n{x}")
+    print(f"y (transpose) =\n{y}")
+    print(f"z (flattened) = {z}")
+    print(f"w (slice) =\n{w}")
+    print(f"v (reshape) =\n{v}")
+
+    # Check sharing via data_ptr comparisons
+    print("\n[Memory address check]")
+    print(f"x.data_ptr() = {x.data_ptr()}")
+    print(f"y.data_ptr() = {y.data_ptr()}")
+    print(f"z.data_ptr() = {z.data_ptr()}")
+    print(f"w.data_ptr() = {w.data_ptr()} (slices start at an offset)")
+    print(f"v.data_ptr() = {v.data_ptr()}")
+    print(f"\nx and y share the underlying memory: {x.data_ptr() == y.data_ptr()}")
+    print(f"x and z share the underlying memory: {x.data_ptr() == z.data_ptr()}")
+    print(f"x and v share the underlying memory: {x.data_ptr() == v.data_ptr()}")
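+
+    # data_ptr() differs for the slice w because it points at w's first
+    # element; comparing the underlying storages instead (untyped_storage()
+    # exists in recent PyTorch, assumed >= 2.0) shows w still shares x's buffer
+    print(
+        f"x and w share the same storage: "
+        f"{x.untyped_storage().data_ptr() == w.untyped_storage().data_ptr()}"
+    )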
+
+    # Modify one element of x
+    print("\n[Set x[0, 0] = 999]")
+    x[0, 0] = 999
+
+    print(f"x =\n{x}")
+    print(f"y =\n{y}")  # y[0, 0] also becomes 999
+    print(f"z = {z}")   # z[0] also becomes 999
+    print(f"w =\n{w}")  # w is unaffected: it is the slice [1:, 2:]
+    print(f"v =\n{v}")  # v[0, 0] also becomes 999
+
+    # Modify one element of y
+    print("\n[Set y[1, 1] = 888]")
+    y[1, 1] = 888
+
+    print(f"x =\n{x}")  # x[1, 1] also becomes 888
+    print(f"y =\n{y}")
+    print(f"z = {z}")   # z[5] also becomes 888
+    print(f"v =\n{v}")  # the matching position in v changes too
+
+    # Modify one element of z
+    print("\n[Set z[10] = 777]")
+    z[10] = 777
+
+    print(f"x =\n{x}")  # x[2, 2] also becomes 777
+    print(f"y =\n{y}")
+    print(f"z = {z}")
+    print(f"w =\n{w}")  # w[1, 0] also becomes 777 (w is x[1:, 2:])
+
+    # Contrast: clone() creates a genuinely independent copy
+    print("\n[Contrast: clone() makes an independent copy]")
+    x_copy = x.clone()
+    print(f"x_copy.data_ptr() = {x_copy.data_ptr()}")
+    print(f"x_copy shares memory with x: {x_copy.data_ptr() == x.data_ptr()}")
+
+    x[0, 1] = 666
+    print("\nAfter setting x[0, 1] = 666:")
+    print(f"x =\n{x}")
+    print(f"x_copy =\n{x_copy} (unaffected)")
+
+ print("\n" + "=" * 60)
+
+
+if __name__ == "__main__":
+    stride_demo()
+    storage_demo()
+    shared_storage_demo()
+
+    # A contiguous (2, 3, 4) tensor has stride (12, 4, 1): one step along
+    # dim 0 skips 3*4=12 elements, along dim 1 skips 4, along dim 2 skips 1
+    x = torch.randn(2, 3, 4)
+    print(f"x shape: {x.shape}")
+    print(f"x stride: {x.stride()}")