diff --git a/.gitignore b/.gitignore
index 7eade253..40ac8ab9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,17 @@
 *.pkl
 *.zip
+*.pth
+*.txt
+*.ckpt
+*.pyc
+*.onnx
+*.data
+*.lock
+*/__pycache__/
+*.DS_Store
+*.idea/
+*.pytest_cache/
+*.ruff_cache/
 data/
 .ipynb_checkpoints
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..9227b116
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "justMyCode": true
+        }
+    ]
+}
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..a6c53728
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,14 @@
+{
+    "python.defaultInterpreterPath": "${workspaceFolder}/pytorch-tutorial/bin/python",
+    "python.terminal.activateEnvironment": true,
+    "python.analysis.typeCheckingMode": "basic",
+    "[python]": {
+        "editor.defaultFormatter": "charliermarsh.ruff",
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+            "source.fixAll": "explicit",
+            "source.organizeImports": "explicit"
+        }
+    }
+}
+
diff --git a/README.md b/README.md
index 59ac3300..6b0b5bb3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
+
+## uv
+```bash
+uv venv --python 3.11      # create a virtual environment with Python 3.11
+source .venv/bin/activate
+uv add xxx                 # add a dependency (replace xxx with the package name)
+uv sync
+```

--------------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..8bf6d2a4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,59 @@
+[project]
+name = "tutorials"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.9"
+authors = [
+    { name = "lihanghang", email = "lihanghang@guazi.com" }  # table format, per the current packaging spec
+]
+dependencies = [
+    "matplotlib>=3.9.4",
+    "onnx>=1.19.1",
+    "onnxruntime>=1.20.1",
+    "onnxscript>=0.5.7",
+    "pandas>=2.3.3",
+    "setuptools>=80.9.0",
+    "tabulate>=0.9.0",
+    "tensordict>=0.10.0",
+    "torchvision>=0.23.0",
+]
+
+[tool.poetry.dependencies]
+aiohttp = "3.12.14"               # async HTTP client/server library
+urllib3 = "2.6.2"                 # HTTP client with connection pooling and thread safety
+orjson = ">=3.9.14,<4.0.0"        # high-performance JSON serialization/deserialization
+uuid = "^1.30"                    # generating and manipulating UUIDs
+torch = "2.8.0"                   # PyTorch core library, the deep learning framework
+contourpy = "1.3.0"               # contour plotting, a matplotlib dependency
+cycler = "0.12.1"                 # cycling style utility, a matplotlib dependency
+fonttools = "4.60.2"              # font file handling, a matplotlib dependency
+kiwisolver = "1.4.7"              # constraint solver, a matplotlib dependency
+pyparsing = "3.3.1"               # string parsing, a matplotlib dependency
+importlib-resources = "6.5.2"     # access resources inside Python packages
+matplotlib = "3.9.4"              # data visualization: charts and figures
+python-dateutil = "2.9.0.post0"   # extended date/time handling
+six = "1.17.0"                    # Python 2/3 compatibility library
+click = "8.1.8"                   # command-line interface toolkit
+joblib = "1.5.3"                  # parallel computing and task scheduling, common in ML
+nltk = "3.9.2"                    # natural language toolkit: corpora and algorithms
+regex = "2025.11.3"               # extended regular expressions beyond the standard re module
+tqdm = "4.67.1"                   # progress bars for loops and tasks
+pycocotools = "2.0.11"            # COCO dataset utilities for detection and segmentation
+argparse = "1.4.0"                # command-line argument parsing
+pandas = "2.3.3"
+pytz = "2025.2"
+tzdata = "2025.3"
+tensordict = "0.10.0"
+cloudpickle = "3.1.2"
+importlib-metadata = "8.7.1"
+pyvers = "0.1.0"
+pillow = "11.3.0"
+setuptools = "80.9.0"
+
+[[tool.poetry.source]]
+name = "huaweicloud"
+url = "https://mirrors.huaweicloud.com/repository/pypi/simple/"
+priority = "primary"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/tutorials/01-basics/feedforward_neural_network/main.py b/tutorials/01-basics/feedforward_neural_network/main.py
index 0c766a7e..0fb48bbf 100644
--- a/tutorials/01-basics/feedforward_neural_network/main.py
+++ b/tutorials/01-basics/feedforward_neural_network/main.py
@@ -2,93 +2,121 @@
 import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
+import ssl

+# Feed-forward neural network
+"""
+A feed-forward network is the network *structure*; backpropagation is the core
+algorithm used to *train* it. The architecture is the network's "skeleton": it
+fixes how neurons are grouped into layers, how layers connect, how many neurons
+each layer has, and which activation functions are used. It determines how
+information flows through the network and underlies the model's ability to fit
+data; backpropagation is the method that shapes that skeleton.
+"""
+ssl._create_default_https_context = ssl._create_unverified_context

-# Device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-# Hyper-parameters
-input_size = 784
-hidden_size = 500
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset
-train_dataset = torchvision.datasets.MNIST(root='../../data',
-                                           train=True,
-                                           transform=transforms.ToTensor(),
-                                           download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data',
-                                          train=False,
-                                          transform=transforms.ToTensor())
-
-# Data loader
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=batch_size,
-                                           shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                          batch_size=batch_size,
-                                          shuffle=False)

 # Fully connected neural network with one hidden layer
 class NeuralNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
         super(NeuralNet, self).__init__()
-        self.fc1 = nn.Linear(input_size, hidden_size)
-        self.relu = nn.ReLU()
-        self.fc2 = nn.Linear(hidden_size, num_classes)
-
+        self.fc1 = nn.Linear(input_size, hidden_size)   # input layer -> hidden layer
+        self.relu = nn.ReLU()                           # activation function
+        self.fc2 = nn.Linear(hidden_size, num_classes)  # hidden layer -> output layer
+
     def forward(self, x):
-        out = self.fc1(x)
-        out = self.relu(out)
-        out = self.fc2(out)
+        out = self.fc1(x)     # linear transform
+        out = self.relu(out)  # non-linear activation
+        out = self.fc2(out)   # linear transform
         return out

-model = NeuralNet(input_size, hidden_size, num_classes).to(device)
-
-# Loss and optimizer
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
-    for i, (images, labels) in enumerate(train_loader):
-        # Move tensors to the configured device
-        images = images.reshape(-1, 28*28).to(device)
-        labels = labels.to(device)
-
-        # Forward pass
-        outputs = model(images)
-        loss = criterion(outputs, labels)
-
-        # Backward and optimize
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        if (i+1) % 100 == 0:
-            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
-                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
-    correct = 0
-    total = 0
-    for images, labels in test_loader:
-        images = images.reshape(-1, 28*28).to(device)
-        labels = labels.to(device)
-        outputs = model(images)
-        _, predicted = torch.max(outputs.data, 1)
-        total += labels.size(0)
-        correct += (predicted == labels).sum().item()
-
-    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+"""
+FNN compared with the models covered earlier:
+Model                 Structure                      Capacity                            Use cases
+Linear regression     simple (single layer)          linear relationships only           simple regression
+Logistic regression   simple (layer + activation)    linearly separable classification   simple classification
+Feed-forward network  complex (multi-layer + act.)   complex non-linear relationships    complex classification/regression
+"""
+"""
+What an FNN is for: learning a complex mapping from inputs to outputs, in two main kinds of task:
+
+Classification: assigning inputs to discrete classes (the MNIST digits in this script)
+Regression: predicting continuous values (house prices, stock prices)
+Its power comes from stacking layers with non-linear activations, which lets it
+approximate almost any function (the universal approximation theorem).
+"""
+if __name__ == '__main__':
+
+    # Device configuration
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # Hyper-parameters
+    input_size = 784
+    hidden_size = 500
+    num_classes = 10
+    num_epochs = 5
+    batch_size = 100
+    learning_rate = 0.001
+
+    # MNIST dataset
+    train_dataset = torchvision.datasets.MNIST(root='../../data',
+                                               train=True,
+                                               transform=transforms.ToTensor(),
+                                               download=True)
+
+    test_dataset = torchvision.datasets.MNIST(root='../../data',
+                                              train=False,
+                                              transform=transforms.ToTensor())
+
+    # Data loader
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,
+                                               shuffle=True)
+
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False)
+
+    model = NeuralNet(input_size, hidden_size, num_classes).to(device)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    total_step = len(train_loader)
+    for epoch in range(num_epochs):
+        for i, (images, labels) in enumerate(train_loader):
+            # Move tensors to the configured device
+            images = images.reshape(-1, 28 * 28).to(device)
+            labels = labels.to(device)
+
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if (i + 1) % 100 == 0:
+                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+    # Test the model
+    # In test phase, we don't need to compute gradients (for memory efficiency)
+    with torch.no_grad():
+        correct = 0
+        total = 0
+        for images, labels in test_loader:
+            images = images.reshape(-1, 28 * 28).to(device)
+            labels = labels.to(device)
+            outputs = model(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+
+        print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
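Note: a minimal sketch of how the saved checkpoint could be reloaded for inference (assumes the same `NeuralNet` class and the `model.ckpt` file produced by the script above; the dummy input and CPU mapping are illustrative):

```python
import torch

# Rebuild the architecture, then load only the parameters (state dict).
model = NeuralNet(input_size=784, hidden_size=500, num_classes=10)
model.load_state_dict(torch.load('model.ckpt', map_location='cpu'))
model.eval()  # inference mode

with torch.no_grad():
    x = torch.randn(1, 784)      # one flattened 28x28 image (dummy input)
    logits = model(x)
    pred = logits.argmax(dim=1)  # predicted class index
```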
diff --git a/tutorials/01-basics/life/README.md b/tutorials/01-basics/life/README.md
new file mode 100644
index 00000000..3942dda3
--- /dev/null
+++ b/tutorials/01-basics/life/README.md
@@ -0,0 +1,193 @@
+## 1. Core goal of a recommender system
+Match users with content or products they are likely to be interested in, while helping the platform:
+* improve user retention and engagement
+* increase spending and watch time
+* make better use of platform resources
+* maximize business value
+
+## 2. Basic pipeline
+1. Data collection
+   * behavior data: views, clicks, purchases, ratings, favorites, shares
+   * item features: category, price, brand, description; a video's tags, duration, creator
+   * user features: age, gender, region, preferences
+2. Preprocessing
+   * clean dirty data (outliers, missing values)
+   * feature engineering (extraction, encoding, normalization)
+   * build the user-item interaction matrix
+3. Model training
+   * learn user-preference patterns with machine learning or deep learning
+   * common algorithms: collaborative filtering, content-based recommendation, deep recommendation models
+4. Recommendation generation
+   * predict each user's interest in candidate items
+   * rank by predicted interest and produce the recommendation list
+5. Online serving and evaluation
+   * show recommendations to users in real time
+   * collect feedback and keep improving the model
+
+## 3. Main algorithms
+### 3.1 Collaborative filtering (CF)
+The classic approach, built on the assumption that **similar users like similar items**:
+
+(1) User-based CF
+* idea: find users whose tastes resemble the target user's, and recommend what those neighbors like
+* steps: compute user-user similarity (cosine similarity, Pearson correlation); pick the K most similar neighbors; recommend items they like that the target user has not seen
+
+(2) Item-based CF
+* idea: find items similar to those the target user already likes, and recommend them
+* steps: compute item-item similarity (cosine, adjusted cosine); for each liked item, find the K most similar items; recommend those (see the sketch after this section)
+* traits: low computational cost, stable results; common on e-commerce platforms (e.g. Amazon)
+
+### 3.2 Content-based recommendation
+* idea: recommend items whose features resemble those of items the user liked before
+* steps: extract item feature vectors (category, brand, price; a video's tags, style); learn the user's preference weights from their history; rank items by how well they match those preferences
+* traits: independent of other users' data, so it handles cold start; common for video/music (e.g. Spotify)
+
+### 3.3 Hybrid recommendation
+Combine several algorithms to get the strengths of each. Common schemes:
+* weighted: fuse the scores of different algorithms
+* feature combination: feed features from several algorithms into one model
+* cascade: one algorithm generates candidates, another re-ranks them
+* switching: pick the algorithm per scenario or user segment
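To make the item-based CF steps above concrete, here is a minimal sketch (the toy rating matrix and variable names are made up for the example):

```python
import numpy as np

# Toy user-item rating matrix: rows = users, columns = items, 0 = unrated.
R = np.array([[5, 3, 0, 1],
              [4, 0, 0, 1],
              [1, 1, 0, 5],
              [0, 0, 5, 4]], dtype=float)

# Item-item cosine similarity (columns are item rating vectors).
norms = np.linalg.norm(R, axis=0, keepdims=True)
sim = (R.T @ R) / (norms.T @ norms + 1e-9)

# Predict user 0's interest as a similarity-weighted average of their ratings.
user = R[0]
scores = sim @ user / (np.abs(sim).sum(axis=1) + 1e-9)
scores[user > 0] = -np.inf  # mask items the user already rated
print("recommend item:", int(scores.argmax()))
```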
+### 3.4 Deep learning in recommender systems
+Modern recommenders increasingly use deep neural networks:
+
+(1) Matrix factorization (MF, SVD++)
+* map users and items into a low-dimensional latent vector space
+* predict a user's rating of an item via the inner product of the two vectors
+
+(2) Neural recommendation models
+* DNN (deep neural network): learns complex user-item interaction patterns
+* CNN (convolutional network): extracts local features from item text or images
+* RNN/LSTM (recurrent networks): capture the temporal order of user behavior sequences
+* attention mechanisms: identify the key items or features in a user's history
+
+(3) Classic deep recommendation models
+* Wide & Deep: combines memorization (wide part) with generalization (deep part)
+* DeepFM: fuses factorization machines (FM) with a DNN to learn high-order feature interactions automatically
+* DIN (Deep Interest Network): uses attention to capture dynamic user interest
+* BERT4Rec: models user behavior sequences with a Transformer
+
+## 4. E-commerce platforms vs. video sites
+The core principles are the same, but business scenarios and user behavior differ:
+
+| Dimension | E-commerce (products) | Video site (content) |
+| --- | --- | --- |
+| User goal | explicit (buy a specific product) | vague (entertainment, killing time) |
+| Decision cycle | long (comparison, deliberation) | short (seconds to decide whether to watch) |
+| Item traits | many categories, large price spread | uniform format, varied duration |
+| Key features | price, brand, sales, reviews | content quality, creator, freshness, tags |
+| Optimization focus | conversion, order value, repurchase | CTR, watch time, completion rate |
+| Latency requirements | moderate (purchase decisions are slow) | very high (interest shifts fast) |
+| Cold-start challenges | new users, new products | new users, new content, new creators |
+
+## 5. Evaluation metrics
+* accuracy metrics: precision (share of recommended items the user actually likes), recall (share of liked items that get recommended), F1 (harmonic mean of the two)
+* ranking metrics: NDCG (Normalized Discounted Cumulative Gain), MAP (Mean Average Precision)
+* business metrics: CTR, CVR, average watch time, retention, revenue/GMV
+
+## 6. Challenges
+* cold start: new users and new items lack interaction data
+* data sparsity: the user-item matrix is very sparse; most users interact with few items
+* real-time constraints: respond to shifting interests under massive concurrency
+* fairness and diversity: avoid filter bubbles (recommending only what the user already likes); keep results diverse and fair
+* privacy: exploit behavioral data while protecting users and complying with regulations (GDPR, personal-data protection laws)
+
+## 7. Trends
+* multimodal recommendation: fuse text, image, audio and video signals
+* federated learning: train cross-platform models without exposing raw user data
+* reinforcement learning: optimize the recommendation policy through interaction with the environment
+* causal recommendation: move from correlation to causation for better explainability
+* LLMs + recommendation: use pretrained large models for richer semantic understanding
+
+
+
+## Core technology: face recognition
+
+#### 1. Phone unlock and face payment both rely on face recognition. The basic pipeline:
+* image capture: the front camera captures the user's face
+* face detection: locate and crop the face region in the image
+* feature extraction: turn the face image into a feature vector the computer can compare
+* feature matching: compare the extracted features against the enrolled user's features
+* decision: accept or reject based on the match score and a threshold
+
+## 2. Deep learning in face recognition
+Modern phones almost all use deep learning, especially convolutional neural networks (CNNs). These are close relatives of the feed-forward networks covered in this tutorial, just with more structure:
+
+1. Face detection
+* dedicated detection networks (MTCNN, RetinaFace, ...)
+* accurate even under difficult lighting, pose and occlusion
+* usually multi-stage: candidate boxes -> box regression -> facial landmark localization
+2. Feature extraction
+* deep CNNs (FaceNet, ArcFace, SphereFace, ...) trained on large face datasets learn discriminative features
+* key property: embeddings of the same face under different conditions stay close; embeddings of different faces stay far apart
+3. Feature matching and verification
+* metric learning maps faces into a high-dimensional feature space
+* compare embeddings with a similarity measure (Euclidean distance, cosine similarity), as sketched below
+* a threshold decides whether two faces belong to the same person
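A minimal sketch of that matching step, assuming `emb_a` and `emb_b` are embeddings produced by some face encoder; the 0.6 threshold is purely illustrative:

```python
import torch
import torch.nn.functional as F

def same_person(emb_a: torch.Tensor, emb_b: torch.Tensor, threshold: float = 0.6) -> bool:
    """Compare two face embeddings with cosine similarity."""
    sim = F.cosine_similarity(emb_a.unsqueeze(0), emb_b.unsqueeze(0)).item()
    return sim >= threshold

# Dummy 128-d embeddings standing in for encoder outputs.
a, b = torch.randn(128), torch.randn(128)
print(same_person(a, b))
```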
+
+## 3. Smartphone unlock vs. face payment
+Similar principles, very different security requirements:
+
+| Dimension | Phone unlock | Face payment |
+| --- | --- | --- |
+| Security level | medium (prevent casual unauthorized use) | very high (money at stake) |
+| Liveness detection | basic (blink, head turn) | advanced (3D structured light, infrared imaging) |
+| Anti-spoofing | limited (may be fooled by a high-resolution photo) | strong (resists photos, videos, 3D masks) |
+| Hardware | ordinary camera | dedicated sensors (3D structured light, ToF) |
+| Verification threshold | lenient (favors user experience) | strict (favors security) |
+
+## 4. Key security technique: liveness detection
+To defeat photo and video spoofing, face-payment systems use liveness detection:
+
+* 2D liveness: analyze expression changes (blinking, mouth opening) and texture cues
+* 3D structured light: project an infrared dot pattern to recover the face's 3D structure (e.g. Face ID on iPhone)
+* infrared imaging: an IR camera distinguishes a real face from a photo or screen
+* multispectral imaging: combine visible and infrared light for higher accuracy
+
+## 5. Connection to the PyTorch material
+A face recognition system is far more complex than the feed-forward network in this tutorial, but the core principles are the same:
+
+* network structure: layered architectures with non-linear activations (e.g. ReLU)
+* training: large labeled datasets, backpropagation, and an optimizer such as Adam
+* loss functions: face recognition uses specialized losses (Triplet Loss, ArcFace Loss), but the idea is still to minimize the gap between predictions and targets
+* evaluation: accuracy, recall and other metrics measured on a test set
\ No newline at end of file
diff --git a/tutorials/01-basics/linear_regression/main.py b/tutorials/01-basics/linear_regression/main.py
index b3715d99..15d513f2 100644
--- a/tutorials/01-basics/linear_regression/main.py
+++ b/tutorials/01-basics/linear_regression/main.py
@@ -3,53 +3,73 @@
 import numpy as np
 import matplotlib.pyplot as plt

+# Linear regression
-# Hyper-parameters
-input_size = 1
-output_size = 1
-num_epochs = 60
-learning_rate = 0.001
-
-# Toy dataset
-x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
-                    [9.779], [6.182], [7.59], [2.167], [7.042],
-                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
-
-y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
-                    [3.366], [2.596], [2.53], [1.221], [2.827],
-                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
-
-# Linear regression model
-model = nn.Linear(input_size, output_size)
-
-# Loss and optimizer
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-
-# Train the model
-for epoch in range(num_epochs):
-    # Convert numpy arrays to torch tensors
-    inputs = torch.from_numpy(x_train)
-    targets = torch.from_numpy(y_train)
-
-    # Forward pass
-    outputs = model(inputs)
-    loss = criterion(outputs, targets)
-
-    # Backward and optimize
-    optimizer.zero_grad()
-    loss.backward()
-    optimizer.step()
-
-    if (epoch+1) % 5 == 0:
-        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
-
-# Plot the graph
-predicted = model(torch.from_numpy(x_train)).detach().numpy()
-plt.plot(x_train, y_train, 'ro', label='Original data')
-plt.plot(x_train, predicted, label='Fitted line')
-plt.legend()
-plt.show()
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
\ No newline at end of file
+if __name__ == '__main__':
+    # Hyper-parameters
+    input_size = 1
+    output_size = 1
+    num_epochs = 60
+    learning_rate = 0.001
+
+    # Toy dataset
+    x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
+                        [9.779], [6.182], [7.59], [2.167], [7.042],
+                        [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
+
+    y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
+                        [3.366], [2.596], [2.53], [1.221], [2.827],
+                        [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
+
+    # Linear regression model
+    # nn.Linear is a single linear (fully connected / affine) layer;
+    # PyTorch initializes the weight w and bias b automatically.
+    model = nn.Linear(input_size, output_size)
+
+    # Loss and optimizer
+    # The loss function measures the gap between predictions and targets;
+    # here we use mean squared error (MSE).
+    criterion = nn.MSELoss()
+    # The optimizer updates the model parameters (w and b). A learning rate
+    # that is too large makes training unstable; too small makes it slow.
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    for epoch in range(num_epochs):
+        # Convert numpy arrays to torch tensors
+        inputs = torch.from_numpy(x_train)
+        targets = torch.from_numpy(y_train)
+
+        # Forward pass
+        outputs = model(inputs)
+        loss = criterion(outputs, targets)
+
+        # Backward and optimize
+        optimizer.zero_grad()
+        # Backpropagation: compute the gradient of the loss
+        # with respect to the model parameters.
+        loss.backward()
+        # Update the parameters using the computed gradients.
+        optimizer.step()
+
+        if (epoch + 1) % 5 == 0:
+            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
+    """
+    Training used x_train (inputs) and y_train (targets). After training we
+    still want to see how well the model fits the training data: predict on
+    the same x_train to get `predicted`, then plot it against the true
+    y_train to visualize the learned linear relationship.
+    """
+    # .detach() removes the tensor from the computation graph.
+    # Plot the graph
+    predicted = model(
+        torch.from_numpy(x_train)).detach().numpy()  # convert back to NumPy, since matplotlib expects NumPy arrays
+    plt.plot(x_train, y_train, 'ro', label='Original data')
+    plt.plot(x_train, predicted, label='Fitted line')
+    plt.legend()
+    plt.show()
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
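As a sanity check on what SGD converges toward, the same toy problem can be solved in closed form with the least-squares normal equation (a sketch; `x_train` and `y_train` are the arrays from the script above):

```python
import numpy as np

# Design matrix with a bias column of ones: X @ [w, b]^T ≈ y.
X = np.hstack([x_train, np.ones_like(x_train)])
w, b = np.linalg.lstsq(X, y_train, rcond=None)[0].ravel()
print(f"closed-form fit: y = {w:.4f} * x + {b:.4f}")
```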
diff --git a/tutorials/01-basics/logistic_regression/main.py b/tutorials/01-basics/logistic_regression/main.py
index c7eb378b..ea3c4b29 100644
--- a/tutorials/01-basics/logistic_regression/main.py
+++ b/tutorials/01-basics/logistic_regression/main.py
@@ -2,75 +2,109 @@
 import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
+import ssl

+# Logistic regression
+ssl._create_default_https_context = ssl._create_unverified_context
-# Hyper-parameters
-input_size = 28 * 28    # 784
-num_classes = 10
-num_epochs = 5
-batch_size = 100
-learning_rate = 0.001
-
-# MNIST dataset (images and labels)
-train_dataset = torchvision.datasets.MNIST(root='../../data',
-                                           train=True,
-                                           transform=transforms.ToTensor(),
-                                           download=True)
-
-test_dataset = torchvision.datasets.MNIST(root='../../data',
-                                          train=False,
-                                          transform=transforms.ToTensor())
-
-# Data loader (input pipeline)
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=batch_size,
-                                           shuffle=True)
-
-test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                          batch_size=batch_size,
-                                          shuffle=False)
-
-# Logistic regression model
-model = nn.Linear(input_size, num_classes)
-
-# Loss and optimizer
-# nn.CrossEntropyLoss() computes softmax internally
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
-
-# Train the model
-total_step = len(train_loader)
-for epoch in range(num_epochs):
-    for i, (images, labels) in enumerate(train_loader):
-        # Reshape images to (batch_size, input_size)
-        images = images.reshape(-1, input_size)
-
-        # Forward pass
-        outputs = model(images)
-        loss = criterion(outputs, labels)
-
-        # Backward and optimize
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        if (i+1) % 100 == 0:
-            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
-                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
-
-# Test the model
-# In test phase, we don't need to compute gradients (for memory efficiency)
-with torch.no_grad():
-    correct = 0
-    total = 0
-    for images, labels in test_loader:
-        images = images.reshape(-1, input_size)
-        outputs = model(images)
-        _, predicted = torch.max(outputs.data, 1)
-        total += labels.size(0)
-        correct += (predicted == labels).sum()
-
-    print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
-
-# Save the model checkpoint
-torch.save(model.state_dict(), 'model.ckpt')
+if __name__ == '__main__':
+    # Hyper-parameters
+    input_size = 28 * 28  # 784
+    num_classes = 10
+    num_epochs = 5
+    batch_size = 100
+    learning_rate = 0.001
+
+    # MNIST dataset (images and labels)
+    train_dataset = torchvision.datasets.MNIST(root='../../data',
+                                               train=True,
+                                               transform=transforms.ToTensor(),
+                                               download=True)
+
+    test_dataset = torchvision.datasets.MNIST(root='../../data',
+                                              train=False,
+                                              transform=transforms.ToTensor())
+
+    # Data loader (input pipeline)
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,
+                                               shuffle=True)
+
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False)
+
+    # Logistic regression model
+    # Logistic regression (multi-class): outputs one score per class.
+    # Logistic regression: output dimension = number of classes (10 digits here).
+    # Linear regression: output dimension = 1 (a single continuous value).
+    model = nn.Linear(input_size, num_classes)
+
+    # Loss and optimizer
+    # nn.CrossEntropyLoss() computes softmax internally
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    total_step = len(train_loader)
+    for epoch in range(num_epochs):
+        for i, (images, labels) in enumerate(train_loader):
+            # Reshape images to (batch_size, input_size)
+            images = images.reshape(-1, input_size)
+            # CrossEntropyLoss already includes the softmax activation.
+            # Forward pass
+            outputs = model(images)  # outputs are raw scores (logits)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if (i + 1) % 100 == 0:
+                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+    """
+    The core differences between linear and logistic regression:
+
+    Task: regression vs. classification
+    Output handling: raw value vs. mapping through an activation function
+    Loss: MSE vs. cross-entropy
+    """
+    # Test the model
+    # In test phase, we don't need to compute gradients (for memory efficiency)
+    with torch.no_grad():
+        correct = 0
+        total = 0
+        for images, labels in test_loader:
+            images = images.reshape(-1, input_size)
+            outputs = model(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum()
+
+        print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
+
+    # Save the model checkpoint
+    torch.save(model.state_dict(), 'model.ckpt')
+
+    """
+    MSE vs. cross-entropy at a glance:
+
+    Definition:
+        MSE:            $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
+        CE (binary):    $CE = -\frac{1}{n}\sum_{i=1}^{n}[y_i\log\hat{y}_i + (1-y_i)\log(1-\hat{y}_i)]$
+        CE (multi):     $CE = -\frac{1}{n}\sum_{i=1}^{n}\sum_{c=1}^{C}y_{ic}\log\hat{y}_{ic}$
+    Task:
+        MSE: regression (continuous values: prices, temperatures)
+        CE:  classification (discrete classes: image or text labels)
+    Output range:
+        MSE: predictions can be any real number in (-inf, +inf)
+        CE:  predictions must form a probability distribution in (0, 1)
+    Activation pairing:
+        MSE: usually none (linear output), sometimes sigmoid/tanh to bound the range
+        CE:  sigmoid for binary, softmax for multi-class
+             (PyTorch's CrossEntropyLoss applies softmax internally)
+    Gradient behavior:
+        MSE: gradient proportional to the error (y - y_hat); large errors can destabilize training
+        CE:  gradient tied to the probability gap; more stable, well suited to classification
+
+    Scenario                                   Loss                    Code
+    continuous targets (price, temperature)    MSE                     criterion = nn.MSELoss()
+    binary classification (spam detection)     binary cross-entropy    criterion = nn.BCELoss()
+    multi-class (MNIST digit recognition)      cross-entropy           criterion = nn.CrossEntropyLoss()
+
+    In short: MSE for regression, cross-entropy for classification -- one of the
+    "golden rules" of deep learning.
+    """
\ No newline at end of file
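Since the comments above stress that `nn.CrossEntropyLoss` applies softmax internally, a quick sketch verifying the equivalence on random logits (shapes chosen arbitrarily):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)           # batch of 4, 10 classes
target = torch.randint(0, 10, (4,))

ce = torch.nn.CrossEntropyLoss()(logits, target)
nll = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(ce, nll))        # True: CE == log-softmax + NLL
```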
+ """ \ No newline at end of file diff --git a/tutorials/01-basics/pytorch_basics/main.py b/tutorials/01-basics/pytorch_basics/main.py index 744400c2..16dbf19d 100644 --- a/tutorials/01-basics/pytorch_basics/main.py +++ b/tutorials/01-basics/pytorch_basics/main.py @@ -1,10 +1,27 @@ -import torch -import torchvision -import torch.nn as nn +import ssl + import numpy as np +import torch +import torch.nn as nn +import torchvision import torchvision.transforms as transforms +class CustomDataset(torch.utils.data.Dataset): + def __init__(self): + # 初始化一些示例数据 + self.data = torch.randn(100, 3) # 100个样本,每个样本3个特征 + self.labels = torch.randint(0, 2, (100,)) # 100个标签,0或1 + + def __getitem__(self, index): + # 返回数据对 (特征, 标签) + return self.data[index], self.labels[index] + + def __len__(self): + # 返回数据集大小 + return len(self.data) + + # ================================================================== # # Table of Contents # # ================================================================== # @@ -15,175 +32,264 @@ # 4. Input pipline (Line 104 to 129) # 5. Input pipline for custom dataset (Line 136 to 156) # 6. Pretrained model (Line 163 to 176) -# 7. Save and load model (Line 183 to 189) +# 7. Save and load model (Line 183 to 189) +# (CNN)的基本原理 是什么? # ================================================================== # # 1. Basic autograd example 1 # # ================================================================== # -# Create tensors. -x = torch.tensor(1., requires_grad=True) -w = torch.tensor(2., requires_grad=True) -b = torch.tensor(3., requires_grad=True) - -# Build a computational graph. -y = w * x + b # y = 2 * x + 3 - -# Compute gradients. -y.backward() - -# Print out the gradients. -print(x.grad) # x.grad = 2 -print(w.grad) # w.grad = 1 -print(b.grad) # b.grad = 1 - - -# ================================================================== # -# 2. Basic autograd example 2 # -# ================================================================== # - -# Create tensors of shape (10, 3) and (10, 2). -x = torch.randn(10, 3) -y = torch.randn(10, 2) - -# Build a fully connected layer. -linear = nn.Linear(3, 2) -print ('w: ', linear.weight) -print ('b: ', linear.bias) - -# Build loss function and optimizer. -criterion = nn.MSELoss() -optimizer = torch.optim.SGD(linear.parameters(), lr=0.01) - -# Forward pass. -pred = linear(x) - -# Compute loss. -loss = criterion(pred, y) -print('loss: ', loss.item()) - -# Backward pass. -loss.backward() - -# Print out the gradients. -print ('dL/dw: ', linear.weight.grad) -print ('dL/db: ', linear.bias.grad) - -# 1-step gradient descent. -optimizer.step() - -# You can also perform gradient descent at the low level. -# linear.weight.data.sub_(0.01 * linear.weight.grad.data) -# linear.bias.data.sub_(0.01 * linear.bias.grad.data) - -# Print out the loss after 1-step gradient descent. -pred = linear(x) -loss = criterion(pred, y) -print('loss after 1 step optimization: ', loss.item()) - - -# ================================================================== # -# 3. Loading data from numpy # -# ================================================================== # - -# Create a numpy array. -x = np.array([[1, 2], [3, 4]]) - -# Convert the numpy array to a torch tensor. -y = torch.from_numpy(x) - -# Convert the torch tensor to a numpy array. -z = y.numpy() - - -# ================================================================== # -# 4. Input pipeline # -# ================================================================== # - -# Download and construct CIFAR-10 dataset. 

    # ================================================================== #
    #                     2. Basic autograd example 2                    #
    # ================================================================== #

-# Create tensors of shape (10, 3) and (10, 2).
-x = torch.randn(10, 3)
-y = torch.randn(10, 2)
-
-# Build a fully connected layer.
-linear = nn.Linear(3, 2)
-print ('w: ', linear.weight)
-print ('b: ', linear.bias)
-
-# Build loss function and optimizer.
-criterion = nn.MSELoss()
-optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
-
-# Forward pass.
-pred = linear(x)
-
-# Compute loss.
-loss = criterion(pred, y)
-print('loss: ', loss.item())
-
-# Backward pass.
-loss.backward()
-
-# Print out the gradients.
-print ('dL/dw: ', linear.weight.grad)
-print ('dL/db: ', linear.bias.grad)
-
-# 1-step gradient descent.
-optimizer.step()
-
-# You can also perform gradient descent at the low level.
-# linear.weight.data.sub_(0.01 * linear.weight.grad.data)
-# linear.bias.data.sub_(0.01 * linear.bias.grad.data)
-
-# Print out the loss after 1-step gradient descent.
-pred = linear(x)
-loss = criterion(pred, y)
-print('loss after 1 step optimization: ', loss.item())
-
+    # Create tensors of shape (10, 3) and (10, 2).
+    """
+    torch.randn(10, 3): a (10, 3) tensor sampled from a standard normal distribution.
+        10 is the batch size (10 samples processed at once); 3 features per sample.
+    torch.randn(10, 2): a (10, 2) tensor used as the model's target output.
+        The same batch size of 10; each sample has 2 expected output values.
+    """
+    x = torch.randn(10, 3)
+    y = torch.randn(10, 2)
+    print('x: ', x)
+    print('y: ', y)
+    # Build a fully connected layer.
+    """
+    The first argument, 3, is the input feature dimension;
+    the second, 2, is the output feature dimension.
+    The linear layer initializes its weight and bias automatically.
+    """
+    linear = nn.Linear(3, 2)
+    print('w: ', linear.weight)
+    print('b: ', linear.bias)
+
+    # Build loss function and optimizer.
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
+
+    # Forward pass.
+    pred = linear(x)
+
+    # Compute loss.
+    loss = criterion(pred, y)
+    print('loss: ', loss.item())
+
+    # Backward pass.
+    loss.backward()
+
+    # Print out the gradients.
+    print('dL/dw: ', linear.weight.grad)
+    print('dL/db: ', linear.bias.grad)
+
+    # 1-step gradient descent.
+    optimizer.step()
+
+    # You can also perform gradient descent at the low level.
+    # linear.weight.data.sub_(0.01 * linear.weight.grad.data)
+    # linear.bias.data.sub_(0.01 * linear.bias.grad.data)
+
+    # Print out the loss after 1-step gradient descent.
+    pred = linear(x)
+    loss = criterion(pred, y)
+    print('loss after 1 step optimization: ', loss.item())
+    """
+    Why this code matters -- these are the basic steps of building a network:
+    prepare input and target data; define the model (here a single linear
+    layer); inspect and understand its parameters.
+    In practice the next additions are: a loss function (MSE, cross-entropy, ...),
+    an optimizer (SGD, Adam, ...), and the loop of forward pass, loss
+    computation, backward pass and parameter update.
+    These are the core PyTorch concepts underlying every larger model.
+    """

    # ================================================================== #
    #                     3. Loading data from numpy                     #
    # ================================================================== #

-# Create a numpy array.
-x = np.array([[1, 2], [3, 4]])
-
-# Convert the numpy array to a torch tensor.
-y = torch.from_numpy(x)
-
-# Convert the torch tensor to a numpy array.
-z = y.numpy()
-
+    # Create a numpy array.
+    x = np.array([[1, 2], [3, 4]])
+
+    # Convert the numpy array to a torch tensor.
+    y = torch.from_numpy(x)
+
+    # Convert the torch tensor to a numpy array.
+    z = y.numpy()
+    # x == z: [[True, True], [True, True]]
+    print(f"x==z: {x == z}")
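A caveat worth knowing here (my addition, easy to verify): `torch.from_numpy` shares memory with the source array, so in-place changes propagate both ways:

```python
import numpy as np
import torch

a = np.array([1.0, 2.0])
t = torch.from_numpy(a)  # t shares a's memory; no copy is made
a[0] = 99.0
print(t)                 # tensor([99., 2.], dtype=torch.float64) -- the tensor sees the change
```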

    # ================================================================== #
    #                         4. Input pipeline                          #
    # ================================================================== #

-# Download and construct CIFAR-10 dataset.
-train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
-                                             train=True,
-                                             transform=transforms.ToTensor(),
-                                             download=True)
-
-# Fetch one data pair (read data from disk).
-image, label = train_dataset[0]
-print (image.size())
-print (label)
-
-# Data loader (this provides queues and threads in a very simple way).
-train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                           batch_size=64,
-                                           shuffle=True)
-
-# When iteration starts, queue and thread start to load data from files.
-data_iter = iter(train_loader)
-
-# Mini-batch images and labels.
-images, labels = data_iter.next()
-
-# Actual usage of the data loader is as below.
-for images, labels in train_loader:
-    # Training code should be written here.
-    pass
-
+    # Download and construct CIFAR-10 dataset.
+    print("""
+    # CIFAR-10 is a classic image-classification dataset:
+    # 60000 32x32 color images
+    # 10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck
+    # 6000 images per class
+    # training set: 50000 images
+    # test set: 10000 images
+    # Preprocessing: ToTensor scales images to the [0, 1] range;
+    # a further transforms.Normalize step (e.g. mean/std 0.5 per channel)
+    # is common but not applied here.
+    """)
+    train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                 train=True,
+                                                 transform=transforms.ToTensor(),
+                                                 download=True)
+
+    # Fetch one data pair (read data from disk).
+    image, label = train_dataset[0]
+    print(f"image.size(): {image.size()}")
+    print(f"label: {label}")
+
+    # Data loader (this provides queues and threads in a very simple way).
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=64,
+                                               shuffle=True)
+
+    # When iteration starts, queue and thread start to load data from files.
+    data_iter = iter(train_loader)
+
+    # Mini-batch images and labels.
+    images, labels = next(data_iter)
+    print(f"images.size(): {images.size()}")
+    print(f"labels: {labels}")
+    # Actual usage of the data loader is as below.
+    for images, labels in train_loader:
+        # Training code should be written here.
+        pass

    # ================================================================== #
    #                5. Input pipeline for custom dataset                #
    # ================================================================== #

-# You should build your custom dataset as below.
-class CustomDataset(torch.utils.data.Dataset):
-    def __init__(self):
-        # TODO
-        # 1. Initialize file paths or a list of file names.
-        pass
-    def __getitem__(self, index):
-        # TODO
-        # 1. Read one data from file (e.g. using numpy.fromfile, PIL.Image.open).
-        # 2. Preprocess the data (e.g. torchvision.Transform).
-        # 3. Return a data pair (e.g. image and label).
-        pass
-    def __len__(self):
-        # You should change 0 to the total size of your dataset.
-        return 0
-
+    # You should build your custom dataset as below
+    # (see the CustomDataset class at the top of this file).
+
+    # You can then use the prebuilt data loader.
+    custom_dataset = CustomDataset()
+    train_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
+                                               batch_size=64,
+                                               shuffle=True)
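A tiny usage sketch (my addition) showing that the custom dataset actually feeds batches; shapes follow the `CustomDataset` defined at the top of this file:

```python
# Pull one batch from the custom-dataset loader built above.
features, labels = next(iter(train_loader))
print(features.shape)  # torch.Size([64, 3]) -- batch of 64, 3 features each
print(labels.shape)    # torch.Size([64])
```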

    # ================================================================== #
    #                        6. Pretrained model                         #
    # ================================================================== #

-# Download and load the pretrained ResNet-18.
-resnet = torchvision.models.resnet18(pretrained=True)
-
-# If you want to finetune only the top layer of the model, set as below.
-for param in resnet.parameters():
-    param.requires_grad = False
-
-# Replace the top layer for finetuning.
-resnet.fc = nn.Linear(resnet.fc.in_features, 100)  # 100 is an example.
-
-# Forward pass.
-images = torch.randn(64, 3, 224, 224)
-outputs = resnet(images)
-print (outputs.size())     # (64, 100)
-
-
-# ================================================================== #
-#                      7. Save and load the model                    #
-# ================================================================== #
-
-# Save and load the entire model.
-torch.save(resnet, 'model.ckpt')
-model = torch.load('model.ckpt')
-
-# Save and load only the model parameters (recommended).
-torch.save(resnet.state_dict(), 'params.ckpt')
-resnet.load_state_dict(torch.load('params.ckpt'))
+    """
+    torchvision.models: the model module of torchvision, with many classic
+        predefined computer-vision models (ResNet, VGG, AlexNet, ...).
+    resnet18: the 18-layer variant of the ResNet (Residual Network) family.
+    pretrained=True: the key argument -- load weights pretrained on ImageNet.
+    """
+    """
+    A pretrained model is one that has already been trained on a large dataset
+    (here ImageNet: ~14 million images, 1000 classes). Its weights encode
+    general visual features (edges, textures, shapes), so it can be used
+    directly for inference or as the starting point for transfer learning.
+    """
+    # Download and load the pretrained ResNet-18.
+    resnet = torchvision.models.resnet18(pretrained=True)
+    # The classic recipe for model fine-tuning:
+    # If you want to finetune only the top layer of the model, set as below.
+    """
+    Effect: set requires_grad to False on all of ResNet-18's original
+    parameters, so the backward pass computes no gradients for them and they
+    are never updated. Rationale: the lower (convolutional) layers already
+    encode general visual features (edges, textures, shapes) that transfer to
+    most image tasks, so they need not be relearned.
+    """
+    for param in resnet.parameters():
+        param.requires_grad = False
+
+    # Replace the top layer for finetuning.
+    """
+    Effect: replace ResNet-18's fully connected (fc) layer with a new one that
+    outputs 100 classes instead of the original 1000 (ImageNet's class count).
+    Rationale: the fc layer maps features to the class space, so swapping it
+    adapts the model to a different classification task.
+    """
+    resnet.fc = nn.Linear(resnet.fc.in_features, 100)  # 100 is an example.
+
+    # Forward pass.
+    images = torch.randn(64, 3, 224, 224)
+    outputs = resnet(images)
+    print(outputs.size())  # (64, 100)
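A practical follow-on (my sketch, not in the original): with the backbone frozen, the optimizer would typically be handed only the parameters that still require gradients:

```python
# After the freeze, only the new fc layer has requires_grad=True,
# so the optimizer can be restricted to exactly those parameters.
trainable = [p for p in resnet.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(trainable, lr=0.001, momentum=0.9)
```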

    # ================================================================== #
    #                      7. Save and load the model                    #
    # ================================================================== #
+    """
+    torch.save(resnet, 'model.ckpt') saves the complete model object:
+    the whole Python object is serialized via pickle, including
+    - the network structure (ResNet-18's conv, pooling and fully connected layers)
+    - all parameters (weights and biases)
+    - the optimizer state, if the model holds one
+    - other Python objects attached to the model (class definition, imports)
+    """
+    # Save and load the entire model.
+    torch.save(resnet, 'model.ckpt')
+    model = torch.load('model.ckpt', weights_only=False)
+    """
+    state_dict() saves only the parameter state dictionary:
+    - a dict mapping each learnable parameter's name to its value
+    - no network structure information
+    - no Python class definitions or dependencies
+    In essence: the model's "weights" without its "skeleton".
+    """
+    # Save and load only the model parameters (recommended).
+    torch.save(resnet.state_dict(), 'params.ckpt')
+    resnet.load_state_dict(torch.load('params.ckpt'))
diff --git a/tutorials/01-mine/FashionMNIST.py b/tutorials/01-mine/FashionMNIST.py
new file mode 100644
index 00000000..fe897f3e
--- /dev/null
+++ b/tutorials/01-mine/FashionMNIST.py
@@ -0,0 +1,149 @@
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+
+# Device configuration: prefer the GPU, otherwise the CPU;
+# Apple M-series chips can use MPS.
+# if torch.backends.mps.is_available():
+#     device = torch.device('mps')   # Apple Silicon (M-series) acceleration
+# elif torch.cuda.is_available():
+#     device = torch.device('cuda')  # NVIDIA GPU acceleration
+# else:
+#     device = torch.device('cpu')   # CPU training
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28 * 28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10)
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+
+# In a single training loop, the model predicts on the training set (fed in
+# batches) and adjusts its parameters by backpropagating the prediction error.
+def train(dataloader, model, loss_fn, optimizer):
+    size = len(dataloader.dataset)
+    model.train()
+    for batch, (X, y) in enumerate(dataloader):
+        X, y = X.to(device), y.to(device)
+        # Compute prediction error
+        pred = model(X)
+        loss = loss_fn(pred, y)
+
+        # Backpropagation
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        if batch % 100 == 0:
+            loss, current = loss.item(), (batch + 1) * len(X)
+            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
+
+# Check performance on the test set to make sure the model is learning.
+def test(dataloader, model, loss_fn):
+    size = len(dataloader.dataset)
+    num_batches = len(dataloader)
+    model.eval()
+    test_loss, correct = 0, 0
+    with torch.no_grad():
+        for X, y in dataloader:
+            X, y = X.to(device), y.to(device)
+            pred = model(X)
+            test_loss += loss_fn(pred, y).item()
+            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
+    test_loss /= num_batches
+    correct /= size
+    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+
+"""
+Download the data
+Create the model
+Optimize the model parameters
+Save the model
+Load the model
+Predict on new data
+"""
+if __name__ == '__main__':
+    # Download training data from open datasets.
+    training_data = datasets.FashionMNIST(
+        root="data",
+        train=True,
+        download=True,
+        transform=ToTensor(),
+    )
+
+    # Download test data from open datasets.
+    # A dataset object containing images and their corresponding labels.
+    test_data = datasets.FashionMNIST(
+        root="data",
+        train=False,
+        download=True,
+        transform=ToTensor(),
+    )
+
+    batch_size = 64
+    # Create data loaders.
+    train_dataloader = DataLoader(training_data, batch_size=batch_size)
+    test_dataloader = DataLoader(test_data, batch_size=batch_size)
+
+    for X, y in test_dataloader:
+        print(f"Shape of X [N, C, H, W]: {X.shape}")
+        print(f"Shape of y: {y.shape} {y.dtype}")
+        break
+    # Creating Models
+    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+    print(f"Using {device} device")
+    # Define model
+
+    model = NeuralNetwork().to(device)
+    print(f"model: {model}")
+    # To train a model, we need a loss function and an optimizer.
+    loss_fn = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+    epochs = 5
+    for t in range(epochs):
+        print(f"Epoch {t + 1}\n-------------------------------")
+        train(train_dataloader, model, loss_fn, optimizer)
+        test(test_dataloader, model, loss_fn)
+    print("Done!")
+    torch.save(model.state_dict(), "model.pth")
+    print("Saved PyTorch Model State to model.pth")
+    model.load_state_dict(torch.load("model.pth", weights_only=True))
+    classes = [
+        "T-shirt/top",
+        "Trouser",
+        "Pullover",
+        "Dress",
+        "Coat",
+        "Sandal",
+        "Shirt",
+        "Sneaker",
+        "Bag",
+        "Ankle boot",
+    ]
+
+    model.eval()  # put the model into evaluation mode
+    x, y = test_data[0][0], test_data[0][1]
+    print(f"x: {x} ,y: {y}")
+    with torch.no_grad():
+        x = x.to(device)
+        # "Logits" are the raw outputs of the model's final layer (usually a
+        # linear layer, nn.Linear). They are not normalized, so they are not
+        # probabilities; a softmax turns them into a probability distribution.
+        pred = model(x)  # model prediction: logits for the 10 classes
+        # probs = torch.softmax(pred, dim=1)
+        # print(probs)
+
+        print(f"pred: {pred}")
+        # For classification we usually just take the class with the largest
+        # logit; converting to probabilities is unnecessary:
+        predicted, actual = classes[pred[0].argmax(0)], classes[y]
+        print(f'Predicted: "{predicted}", Actual: "{actual}"')
\ No newline at end of file
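A footnote on the commented-out softmax above (a sketch; `pred` is the logits tensor from the script):

```python
probs = torch.softmax(pred, dim=1)         # normalize logits into probabilities
print(probs.sum(dim=1))                    # each row sums to 1
print(probs.argmax(1) == pred.argmax(1))   # softmax is monotonic: same argmax
```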
diff --git a/tutorials/01-mine/autograd.py b/tutorials/01-mine/autograd.py
new file mode 100644
index 00000000..ee0818f8
--- /dev/null
+++ b/tutorials/01-mine/autograd.py
@@ -0,0 +1,91 @@
+# Differentiation splits a whole into infinitesimally small pieces to get an instantaneous rate of
+# change; integration adds those pieces back together to get an accumulated total. They are inverse
+# operations, like taking building blocks apart versus reassembling them.
+# Intuitively: for a moving car, differentiation gives the instantaneous speed at a single second
+# (not the average speed); for a hillside, it gives the slope at a single point.
+"""
+Definite integral: computing an "area / total" (the core application).
+For y = f(x) on the interval [a, b], the definite integral \int_{a}^{b} f(x)\,dx splits [a, b] into
+infinitely many tiny subintervals, each contributing a thin rectangle (height f(x), width dx);
+summing all those rectangle areas gives the integral.
+Indefinite integral: the inverse of differentiation.
+Knowing the slope at every point, recover the curve; knowing the instantaneous speed, recover the
+displacement. Example: given the derivative y' = 2x, the indefinite integral is y = x^2 + C
+(C an arbitrary constant), since x^2, x^2 + 1 and x^2 + 100 all have derivative 2x.
+"""
+
+# %matplotlib inline
+
+import math
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import torch
+
+
+def demo1():
+    """
+    Consider the simplest one-layer neural network,
+    with input x, parameters w and b, and some loss function. It can be defined in PyTorch in the following manner:
+    """
+    x = torch.ones(5)  # input tensor
+    y = torch.zeros(3)  # expected output
+    w = torch.randn(5, 3, requires_grad=True)
+    b = torch.randn(3, requires_grad=True)
+    z = torch.matmul(x, w) + b
+    loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
+    print(f"Gradient function for z = {z.grad_fn}")
+    print(f"Gradient function for loss = {loss.grad_fn}")
+    # Computing Gradients
+    loss.backward()
+    """
+    Only the leaf nodes of the computation graph that have requires_grad=True
+    expose a grad attribute; gradients of all other nodes are unavailable.
+    """
+    print(w.grad)
+    print(b.grad)
+
+
+def sin_demo():
+    a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+    print(a)
+    b = torch.sin(a)
+    plt.plot(a.detach(), b.detach())
+    print(b)
+
+
+BATCH_SIZE = 16
+DIM_IN = 1000
+HIDDEN_SIZE = 100
+DIM_OUT = 10
+
+class TinyModel(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyModel, self).__init__()
+
+        self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE)
+        self.relu = torch.nn.ReLU()
+        self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = self.relu(x)
+        x = self.layer2(x)
+        return x
+
+def sin_demo1():
+    some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
+    ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
+
+    model = TinyModel()
+    print(model.layer2.weight[0][0:10])  # just a small slice
+    print(model.layer2.weight.grad)      # None: no backward pass has run yet
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    prediction = model(some_input)
+    loss = (ideal_output - prediction).pow(2).sum()
+    print(loss)
+    loss.backward()
+    print(model.layer2.weight[0][0:10])
+    print(model.layer2.weight.grad[0][0:10])
+    optimizer.step()
+    print(model.layer2.weight[0][0:10])
+    print(model.layer2.weight.grad[0][0:10])
+
+
+
+if __name__ == '__main__':
+    sin_demo1()
diff --git a/tutorials/01-mine/build_neural_network.py b/tutorials/01-mine/build_neural_network.py
new file mode 100644
index 00000000..03bd0f23
--- /dev/null
+++ b/tutorials/01-mine/build_neural_network.py
@@ -0,0 +1,69 @@
+# Building a neural network
+import os
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28*28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10),
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+
+if __name__ == '__main__':
+    model = NeuralNetwork().to(device)
+    print(model)
+    # To use the model, we pass it the input data. This executes the model's
+    # forward, along with some background operations.
+    # Do not call model.forward() directly!
+    X = torch.rand(1, 28, 28, device=device)
+    logits = model(X)
+    print(logits)
+    pred_probab = nn.Softmax(dim=1)(logits)
+    y_pred = pred_probab.argmax(1)
+    print(f"Predicted class: {y_pred}")
+    input_image = torch.rand(3, 28, 28)
+    print(input_image.size())
+    # nn.Flatten converts each 2D 28x28 image into a contiguous array of 784
+    # pixel values (keeping the minibatch dimension at dim=0).
+    flatten = nn.Flatten()
+    flat_image = flatten(input_image)
+    print(flat_image.size())
+    # A linear layer applies a linear transformation to the input using its
+    # stored weight and bias.
+    layer1 = nn.Linear(in_features=28 * 28, out_features=20)
+    hidden1 = layer1(flat_image)
+    print(hidden1.size())
+    print(f"Before ReLU: {hidden1}\n\n")
+    hidden1 = nn.ReLU()(hidden1)
+    print(f"After ReLU: {hidden1}")
+    seq_modules = nn.Sequential(
+        flatten,
+        layer1,
+        nn.ReLU(),
+        nn.Linear(20, 10)
+    )
+    input_image = torch.rand(3, 28, 28)
+    logits = seq_modules(input_image)
+    softmax = nn.Softmax(dim=1)
+    pred_probab = softmax(logits)
+    print(f"Model structure: {model}\n\n")
+
+    for name, param in model.named_parameters():
+        print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
\ No newline at end of file
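A small companion sketch (my addition) that totals the parameters the loop above prints, using the same `model`:

```python
# Sum the element counts of all registered parameters.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total:,}  trainable: {trainable:,}")
```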
diff --git a/tutorials/01-mine/control_flow_weight_sharing.py b/tutorials/01-mine/control_flow_weight_sharing.py
new file mode 100644
index 00000000..f4dc4732
--- /dev/null
+++ b/tutorials/01-mine/control_flow_weight_sharing.py
@@ -0,0 +1,75 @@
+import random
+import torch
+import math
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+"""
+Module          torch.nn (neural network)                          torch.optim (optimizer)
+Responsibility  define the model structure, run the forward pass   update the parameters, implement optimization algorithms
+What it is      the model's architect and calculator               the model's tuner
+What it manages layers, parameters, the computation graph          the update direction and step size of the parameters
+Key output      the model's predictions                            new parameter values
+Analogy         the car's engine and body blueprint                the driver and navigation system
+"""
+class DynamicNet(torch.nn.Module):
+    def __init__(self):
+        """
+        In the constructor we instantiate five parameters and assign them as members.
+        """
+        super().__init__()
+        self.a = torch.nn.Parameter(torch.randn(()))
+        self.b = torch.nn.Parameter(torch.randn(()))
+        self.c = torch.nn.Parameter(torch.randn(()))
+        self.d = torch.nn.Parameter(torch.randn(()))
+        self.e = torch.nn.Parameter(torch.randn(()))
+
+    def forward(self, x):
+        """
+        For the forward pass of the model, we randomly choose either 4 or 5
+        and reuse the e parameter to compute the contribution of these orders.
+
+        Since each forward pass builds a dynamic computation graph, we can use normal
+        Python control-flow operators like loops or conditional statements when
+        defining the forward pass of the model.
+
+        Here we also see that it is perfectly safe to reuse the same parameter many
+        times when defining a computational graph.
+        """
+        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
+        for exp in range(4, random.randint(4, 6)):
+            y = y + self.e * x ** exp
+        return y
+
+    def string(self):
+        """
+        Just like any class in Python, you can also define custom method on PyTorch modules
+        """
+        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
+
+
+# Create Tensors to hold input and outputs.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device)
+y = torch.sin(x).to(device)
+
+# Construct our model by instantiating the class defined above
+model = DynamicNet().to(device)
+
+# Construct our loss function and an Optimizer. Training this strange model with
+# vanilla stochastic gradient descent is tough, so we use momentum
+criterion = torch.nn.MSELoss(reduction='sum')
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
+for t in range(30000):
+    # Forward pass: Compute predicted y by passing x to the model
+    y_pred = model(x)
+
+    # Compute and print loss
+    loss = criterion(y_pred, y)
+    if t % 2000 == 1999:
+        print(t, loss.item())
+
+    # Zero gradients, perform a backward pass, and update the weights.
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+print(f'Result: {model.string()}')
\ No newline at end of file
diff --git a/tutorials/01-mine/cpu_2_gpu.py b/tutorials/01-mine/cpu_2_gpu.py
new file mode 100644
index 00000000..0413258b
--- /dev/null
+++ b/tutorials/01-mine/cpu_2_gpu.py
@@ -0,0 +1,59 @@
+"""
+Transferring data from the CPU to the GPU is a fundamental operation in many PyTorch applications,
+so it is essential to understand the most effective tools and options for moving data between devices.
+This tutorial explores two key methods for device-to-device transfer in PyTorch:
+pin_memory() and to() with the non_blocking=True option.
+"""
+
+import contextlib
+import torch
+from torch.cuda import Stream
+
+s = Stream()
+
+torch.manual_seed(42)
+t1_cpu_pinned = torch.randn(1024 ** 2 * 5, pin_memory=True)
+t2_cpu_paged = torch.randn(1024 ** 2 * 5, pin_memory=False)
+t3_cuda = torch.randn(1024 ** 2 * 5, device="cuda:0")
+
+assert torch.cuda.is_available()
+device = torch.device("cuda", torch.cuda.current_device())
+
+
+# The function we want to profile
+def inner(pinned: bool, streamed: bool):
+    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
+        if pinned:
+            t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
+        else:
+            t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
+        t_star_cuda_h2d_event = s.record_event()
+    # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
+    # done in the other stream
+    t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
+    t3_cuda_h2d_event = torch.cuda.current_stream().record_event()
+    t_star_cuda_h2d_event.synchronize()
+    t3_cuda_h2d_event.synchronize()
+
+
+# Our profiler: profiles the `inner` function and stores the results in a .json file
+def benchmark_with_profiler(
+        pinned,
+        streamed,
+) -> None:
+    torch._C._profiler._set_cuda_sync_enabled_val(True)
+    wait, warmup, active = 1, 1, 2
+    num_steps = wait + warmup + active
+    rank = 0
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        schedule=torch.profiler.schedule(
+            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
+        ),
+    ) as prof:
+        for step_idx in range(1, num_steps + 1):
+            inner(streamed=streamed, pinned=pinned)
+            if rank is None or rank == 0:
+                prof.step()
+    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")
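The essence of the two options that docstring names, in minimal form (a sketch; the tensor size is arbitrary and a CUDA device is assumed):

```python
import torch

cpu_t = torch.randn(1024, 1024).pin_memory()  # page-locked (pinned) host memory
gpu_t = cpu_t.to("cuda", non_blocking=True)   # asynchronous host-to-device copy
torch.cuda.synchronize()                      # wait for the copy before timing or reading back on the CPU
```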
diff --git a/tutorials/01-mine/custom_autograd.py b/tutorials/01-mine/custom_autograd.py
new file mode 100644
index 00000000..181b434d
--- /dev/null
+++ b/tutorials/01-mine/custom_autograd.py
@@ -0,0 +1,88 @@
+import torch
+import math
+
+
+class LegendrePolynomial3(torch.autograd.Function):
+    """
+    We can implement our own custom autograd Functions by subclassing
+    torch.autograd.Function and implementing the forward and backward passes
+    which operate on Tensors.
+    """
+
+    @staticmethod
+    def forward(ctx, input):
+        """
+        In the forward pass we receive a Tensor containing the input and return
+        a Tensor containing the output. ctx is a context object that can be used
+        to stash information for backward computation. You can cache tensors for
+        use in the backward pass using the ``ctx.save_for_backward`` method. Other
+        objects can be stored directly as attributes on the ctx object, such as
+        ``ctx.my_object = my_object``. Check out the "Extending torch.autograd"
+        notes for further details.
+        """
+        ctx.save_for_backward(input)
+        return 0.5 * (5 * input ** 3 - 3 * input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        In the backward pass we receive a Tensor containing the gradient of the loss
+        with respect to the output, and we need to compute the gradient of the loss
+        with respect to the input.
+        """
+        input, = ctx.saved_tensors
+        return grad_output * 1.5 * (5 * input ** 2 - 1)
+
+
+dtype = torch.float
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# device = torch.device("cuda:0")  # Uncomment this to run on GPU
+
+# Create Tensors to hold input and outputs.
+# By default, requires_grad=False, which indicates that we do not need to
+# compute gradients with respect to these Tensors during the backward pass.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+y = torch.sin(x)
+
+# Create random Tensors for weights. For this example, we need
+# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
+# not too far from the correct result to ensure convergence.
+# Setting requires_grad=True indicates that we want to compute gradients with
+# respect to these Tensors during the backward pass.
+a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
+c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
+d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)
+
+learning_rate = 5e-6
+for t in range(2000):
+    # To apply our Function, we use Function.apply method. We alias this as 'P3'.
+    P3 = LegendrePolynomial3.apply
+
+    # Forward pass: compute predicted y using operations; we compute
+    # P3 using our custom autograd operation.
+    y_pred = a + b * P3(c + d * x)
+
+    # Compute and print loss
+    loss = (y_pred - y).pow(2).sum()
+    if t % 100 == 99:
+        print(t, loss.item())
+
+    # Use autograd to compute the backward pass.
+    loss.backward()
+
+    # Update weights using gradient descent
+    with torch.no_grad():
+        a -= learning_rate * a.grad
+        b -= learning_rate * b.grad
+        c -= learning_rate * c.grad
+        d -= learning_rate * d.grad
+
+        # Manually zero the gradients after updating weights
+        a.grad = None
+        b.grad = None
+        c.grad = None
+        d.grad = None
+
+print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')
\ No newline at end of file
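A standard way to validate a custom Function like this (my addition): `torch.autograd.gradcheck` compares the analytic backward against numerical finite differences:

```python
from torch.autograd import gradcheck

# gradcheck needs double precision for reliable comparisons.
test_input = torch.randn(20, dtype=torch.double, requires_grad=True)
print(gradcheck(LegendrePolynomial3.apply, (test_input,)))  # True if they match
```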
diff --git a/tutorials/01-mine/dataset_demo.py b/tutorials/01-mine/dataset_demo.py
new file mode 100644
index 00000000..9e91ff6a
--- /dev/null
+++ b/tutorials/01-mine/dataset_demo.py
@@ -0,0 +1,87 @@
+import os
+import pandas as pd
+from torchvision.io import decode_image
+import torch
+from torch.utils.data import Dataset
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+import matplotlib.pyplot as plt
+from torch.utils.data import DataLoader
+
+
+class CustomImageDataset(Dataset):
+    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
+        self.img_labels = pd.read_csv(annotations_file)
+        self.img_dir = img_dir
+        self.transform = transform
+        self.target_transform = target_transform
+
+    def __len__(self):
+        return len(self.img_labels)
+
+    def __getitem__(self, idx):
+        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
+        image = decode_image(img_path)
+        label = self.img_labels.iloc[idx, 1]
+        if self.transform:
+            image = self.transform(image)
+        if self.target_transform:
+            label = self.target_transform(label)
+        return image, label
+
+
+def plt_show(training_data):
+    labels_map = {
+        0: "T-Shirt",
+        1: "Trouser",
+        2: "Pullover",
+        3: "Dress",
+        4: "Coat",
+        5: "Sandal",
+        6: "Shirt",
+        7: "Sneaker",
+        8: "Bag",
+        9: "Ankle Boot",
+    }
+    figure = plt.figure(figsize=(8, 8))
+    cols, rows = 3, 3
+    for i in range(1, cols * rows + 1):
+        sample_idx = torch.randint(len(training_data), size=(1,)).item()
+        img, label = training_data[sample_idx]
+        figure.add_subplot(rows, cols, i)
+        plt.title(labels_map[label])
+        plt.axis("off")
+        plt.imshow(img.squeeze(), cmap="gray")
+    plt.show()
+
+
+if __name__ == '__main__':
+    training_data = datasets.FashionMNIST(
+        root="data",
+        train=True,
+        download=True,
+        transform=ToTensor()
+    )
+
+    test_data = datasets.FashionMNIST(
+        root="data",
+        train=False,
+        download=True,
+        transform=ToTensor()
+    )
+    # plt_show(training_data)
+
+    train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
+    test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
+    # Display image and label.
+    train_features, train_labels = next(iter(train_dataloader))
+    print(f"Feature batch shape: {train_features.size()}")
+    print(f"Labels batch shape: {train_labels.size()}")
+    img = train_features[0].squeeze()
+    label = train_labels[0]
+    plt.imshow(img, cmap="gray")
+    plt.show()
+    print(f"Label: {label}")
\ No newline at end of file
diff --git a/tutorials/01-mine/neural_network.md b/tutorials/01-mine/neural_network.md
new file mode 100644
index 00000000..31366da2
--- /dev/null
+++ b/tutorials/01-mine/neural_network.md
@@ -0,0 +1,165 @@
+## 1. **What a neural network is: a data-processing "pipeline"**
+
+In PyTorch, a **neural network** is essentially **a computation graph made of trainable parameters (weights and biases)**. Its job is to:
+
+- take an input (usually a tensor: an image's pixel matrix, a text's vector representation)
+- apply a series of **linear transforms** (matrix multiplications) and **non-linear activations** (ReLU, Sigmoid, ...)
+- produce a prediction (class probabilities for classification, a numeric value for regression)
+
+You can think of it as a **"smart function"**:
+
+\(y = f(x; W, b)\)
+
+where:
+
+- x is the input (a tensor)
+- W and b are the network's **trainable parameters** (tensors)
+- f is the network's computation logic (composed of layers)
+- y is the output (a tensor)
+
+------
+
**PyTorch 神经网络的核心组成** + +在 PyTorch 中,神经网络通常由以下几个关键部分构成: + +### (1)`torch.nn.Module`:网络的 “容器” + +- 所有神经网络都必须继承自 `torch.nn.Module` 类 + +- 它是一个 + + + + 参数化的容器 + + ,可以包含: + + - 网络的层(如 `nn.Linear`、`nn.Conv2d`、`nn.ReLU` 等) + - 可训练的参数(`nn.Parameter`) + - 自定义的计算逻辑 + +例如: + +```python +import torch +import torch.nn as nn + +class MyNet(nn.Module): + def __init__(self): + super(MyNet, self).__init__() + # 定义层(包含可训练参数) + self.fc1 = nn.Linear(10, 20) # 输入10维,输出20维 + self.relu = nn.ReLU() # 非线性激活 + self.fc2 = nn.Linear(20, 2) # 输出2类 + + def forward(self, x): + # 定义数据流动的路径(前向传播) + x = self.fc1(x) # 线性变换:x @ W1 + b1 + x = self.relu(x) # 非线性激活 + x = self.fc2(x) # 线性变换:x @ W2 + b2 + return x +``` + +------ + +### (2)**层(Layer):网络的 “基本单元”** + +层是神经网络的核心组件,每个层都是一个 **参数化的函数**,负责对输入张量进行特定的变换。常见的层包括: + +| 层类型 | 作用 | 数学表达(简化) | +| -------------------- | ------------------------------ | ---------------------------------------- | +| `nn.Linear(in, out)` | 线性变换(全连接层) | \(y = xW^T + b\) | +| `nn.Conv2d(in, out)` | 二维卷积(提取空间特征) | \(y = \text{Conv}(x, W) + b\) | +| `nn.ReLU()` | 非线性激活(增加模型表达能力) | \(y = \max(0, x)\) | +| `nn.Softmax(dim)` | 归一化输出为概率分布 | \(y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}\) | + +这些层的本质都是 **对张量的运算**,而层中的 `weight` 和 `bias` 是 **可训练的张量**(`nn.Parameter` 类型),会在训练过程中通过梯度下降更新。 + +------ + +### (3)**前向传播(forward):数据的 “流动路径”** + +- `forward` 方法定义了 **数据如何在网络中流动** +- 输入张量 `x` 依次经过各层的变换,最终得到输出张量 +- 这个过程就是 **计算图的构建过程**(PyTorch 会自动记录运算,用于反向传播) + +例如: + +```python +net = MyNet() +x = torch.randn(3, 10) # 3个样本,每个样本10维特征 +y = net(x) # 前向传播:x → fc1 → relu → fc2 → y +print(y.shape) # 输出: torch.Size([3, 2]) +``` + +------ + +### (4)**参数(Parameter):网络的 “可训练变量”** + +- 网络的参数(权重 W 和偏置 b)是 `nn.Parameter` 类型的张量 +- 它们会被自动注册到网络的 `parameters()` 或 `named_parameters()` 方法中 +- 在训练时,优化器(如 `torch.optim.SGD`)会根据梯度更新这些参数 + +查看网络参数: + +```python +for name, param in net.named_parameters(): + print(name, param.shape) +``` + +输出: + +```plaintext +fc1.weight torch.Size([20, 10]) +fc1.bias torch.Size([20]) +fc2.weight torch.Size([2, 20]) +fc2.bias torch.Size([2]) +``` + +------ + +## 3. **神经网络的 “学习” 过程** + +神经网络之所以能 “智能”,是因为它的参数可以通过 **数据驱动的方式更新**(即训练过程),核心步骤如下: + +### (1)定义损失函数(衡量预测与真实值的差距) + +```python +loss_fn = nn.CrossEntropyLoss() # 分类任务常用损失 +``` + +### (2)定义优化器(负责更新参数) + +```python +optimizer = torch.optim.SGD(net.parameters(), lr=0.01) # 随机梯度下降 +``` + +### (3)训练循环(前向传播 → 计算损失 → 反向传播 → 更新参数) + +```python +for epoch in range(100): + # 前向传播 + y_pred = net(x) + + # 计算损失 + loss = loss_fn(y_pred, y_true) # y_true是真实标签 + + # 反向传播(计算梯度) + optimizer.zero_grad() # 清空上一轮梯度 + loss.backward() # 自动计算参数的梯度 + + # 更新参数(梯度下降) + optimizer.step() +``` + +通过这个过程,网络的参数会逐渐调整,使得预测结果越来越接近真实值。 + +------ + +## 4. 
**Key Properties of PyTorch Neural Networks**
+
+- **Modularity**: layers can be composed like building blocks
+- **Automatic differentiation**: `backward()` computes gradients for you, no manual derivation required
+- **Flexibility**: `forward` may contain arbitrarily complex logic (loops, conditionals)
+- **GPU acceleration**: a single `.to("cuda")` call runs the network on a GPU
\ No newline at end of file
diff --git a/tutorials/01-mine/tensors_demo.py b/tutorials/01-mine/tensors_demo.py
new file mode 100644
index 00000000..3bbede3f
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo.py
@@ -0,0 +1,69 @@
+import torch
+import numpy as np
+
+
+if __name__ == '__main__':
+    # Initializing a Tensor
+    data = [[1, 2], [3, 4]]
+    x_data = torch.tensor(data)
+    np_array = np.array(data)
+    x_np = torch.from_numpy(np_array)
+    print(f"x_np.numpy() == np_array: {x_np.numpy() == np_array}")
+    # The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.
+    x_ones = torch.ones_like(x_data)  # retains the properties of x_data
+    print(f"Ones Tensor: \n {x_ones} \n")
+    x_rand = torch.rand_like(x_data, dtype=torch.float)  # overrides the datatype of x_data
+    print(f"Random Tensor: \n {x_rand} \n")
+    shape = (2, 3,)
+    rand_tensor = torch.rand(shape)
+    ones_tensor = torch.ones(shape)
+    zeros_tensor = torch.zeros(shape)
+
+    print(f"Random Tensor: \n {rand_tensor} \n")
+    print(f"Ones Tensor: \n {ones_tensor} \n")
+    print(f"Zeros Tensor: \n {zeros_tensor}")
+
+    tensor = torch.rand(3, 4)
+
+    print(f"Shape of tensor: {tensor.shape}")
+    print(f"Datatype of tensor: {tensor.dtype}")
+    print(f"Device tensor is stored on: {tensor.device}")
+    # Operations on Tensors
+    # Over 1,200 tensor operations are available: arithmetic, linear algebra, matrix manipulation (transposing, indexing, slicing), sampling, and more.
+    # Tensors are created on the CPU by default; move them to the accelerator explicitly with .to (after checking availability). Keep in mind that copying large tensors across devices can be expensive in time and memory!
+    # We move our tensor to the current accelerator if available
+    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+    print(f"Using {device} device")
+    # tensor = tensor.to(device)
+    tensor = torch.ones(4, 4)
+    print(f"First row: {tensor[0]}")
+    print(f"First column: {tensor[:, 0]}")
+    print(f"Last column: {tensor[..., -1]}")
+    tensor[:, 1] = 0
+    print(tensor)
+    # Joining tensors
+    t1 = torch.cat([tensor, tensor, tensor], dim=1)
+    print(f"t1: \n {t1} \n")
+    # Arithmetic operations
+    # This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value
+    # ``tensor.T`` returns the transpose of a tensor
+    print(f"tensor: \n {tensor} \n")
+    print(f"tensor.T: \n {tensor.T} \n")
+    y1 = tensor @ tensor.T  # matrix multiplication
+    y2 = tensor.matmul(tensor.T)  # matrix multiplication
+
+    y3 = torch.rand_like(y1)
+    print(f"y1: \n {y1} \n, \ny2: \n {y2} \n, \ny3: \n {y3}")
+    torch.matmul(tensor, tensor.T, out=y3)
+
+    # This computes the element-wise product. z1, z2, z3 will have the same value
+    z1 = tensor * tensor
+    z2 = tensor.mul(tensor)
+
+    z3 = torch.rand_like(tensor)
+    torch.mul(tensor, tensor, out=z3)
+    print(f"agg tensor: \n {tensor} \n")
+    # tensor.sum() adds up every element of the tensor and returns a scalar (a one-element tensor).
+    agg = tensor.sum()
+    agg_item = agg.item()
+    print(agg_item, type(agg_item))
diff --git a/tutorials/01-mine/tensors_demo2.py b/tutorials/01-mine/tensors_demo2.py
new file mode 100644
index 00000000..71d383e5
--- /dev/null
+++ b/tutorials/01-mine/tensors_demo2.py
@@ -0,0 +1,191 @@
+import math
+
+import torch  # for all things PyTorch
+import torch.nn as nn  # for torch.nn.Module, the parent object for PyTorch models
+import torch.nn.functional as F  # for the activation function
+
+"""
+    Tensor operations
+"""
+
+
+def tensors_demo():
+    z = torch.zeros(5, 3)
+    print(z)
+    # These zeros are 32-bit floats, PyTorch's default dtype.
+    print(z.dtype)
+    i = torch.ones((5, 3), dtype=torch.int16)
+    print(i)
+    print(i.dtype)
+    torch.manual_seed(1729)
+    r1 = torch.rand(2, 2)
+    print('A random tensor:')
+    print(r1)
+
+    r2 = torch.rand(2, 2)
+    print('\nA different random tensor:')
+    print(r2)  # new values
+
+    torch.manual_seed(1729)
+    r3 = torch.rand(2, 2)
+    print('\nShould match r1:')
+    print(r3)  # repeats values of r1 because of re-seed
+    ones = torch.ones(2, 3)
+    print(ones)
+
+    twos = torch.ones(2, 3) * 2  # every element is multiplied by 2
+    print(twos)
+
+    threes = ones + twos  # addition allowed because the shapes match
+    print(threes)  # tensors are added element-wise
+    print(threes.shape)  # this has the same dimensions as the input tensors
+
+    r1 = torch.rand(2, 3)
+    r2 = torch.rand(3, 2)
+    # uncomment this line to get a runtime error
+    # r3 = r1 + r2
+    r = (torch.rand(2, 2) - 0.5) * 2  # values between -1 and 1
+    print('A random matrix, r:')
+    print(r)
+
+    # Common mathematical operations are supported:
+    print('\nAbsolute value of r:')
+    print(torch.abs(r))
+
+    # ...as are trigonometric functions:
+    print('\nInverse sine of r:')
+    """
+    Takes the arcsine of every element.
+    """
+    print(torch.asin(r))
+
+    # ...and linear algebra operations like determinant and singular value decomposition
+    print('\nDeterminant of r:')
+    """
+    Computes the determinant of a square matrix: a scalar describing how the
+    matrix scales volume and whether it flips orientation. For example:
+    r = torch.tensor([[1.0, 2.0],
+                      [3.0, 4.0]])
+    print(torch.det(r))
+    = 1 * 4 - 2 * 3 = -2
+    """
+    print(torch.det(r))
+    print('\nSingular value decomposition of r:')
+    print(torch.svd(r))
+
+    # ...and statistical and aggregate operations:
+    print('\nAverage and standard deviation of r:')
+    # Computes the standard deviation and the mean of the tensor in one call.
+    print(torch.std_mean(r))
+    print('\nMaximum value of r:')
+    print(torch.max(r))
+
+
+class LeNet(nn.Module):
+
+    def __init__(self):
+        super(LeNet, self).__init__()
+        # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
+        # kernel
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        # an affine operation: y = Wx + b
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    """
+    Forward pass
+    The forward pass feeds input data through the network to produce the model's
+    prediction. Think of it this way:
+    the input (say an image or a piece of text) enters the first layer and flows
+    through each layer's computation in turn (convolutions, matrix multiplies,
+    activation functions, and so on).
+    Each layer transforms the previous layer's output, and the final layer emits
+    the prediction (class probabilities for classification, a value for regression).
+    What is it for?
+    Producing predictions: feed in a photo of a cat and the forward pass outputs the probability the model assigns to "cat".
+    Computing the loss: compare the prediction with the ground-truth label ("cat") and measure the error with a loss function.
+
+    Backward pass
+    The backward pass starts from the loss and computes, from the last layer back to the first, the gradient of every parameter. A gradient measures how strongly a parameter change affects the loss; optimizers such as SGD or Adam use these gradients to update the parameters and shrink the loss.
+    Think of it this way:
+    starting from the loss value, gradients are propagated backwards through every layer via the chain rule.
+    The gradients tell us how much each parameter should be adjusted to make the prediction more accurate.
+    What is it for?
+    Computing gradients: obtain the gradient of every parameter (weights w, biases b). Updating parameters: the optimizer adjusts parameters along those gradients so predictions move ever closer to the targets.
+
+    Forward pass: input data -> model -> prediction -> loss.
+    Backward pass: loss -> gradients -> parameter updates.
+    """
+    def forward(self, x):
+        # Max pooling over a (2, 2) window
+        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+        # If the size is a square you can only specify a single number
+        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+        x = x.view(-1, self.num_flat_features(x))
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+    def num_flat_features(self, x):
+        size = x.size()[1:]  # all dimensions except the batch dimension
+        num_features = 1
+        for s in size:
+            num_features *= s
+        return num_features
+
+
+def pytorch_model_demo():
+    net = LeNet()
+    print(net)
+    input = torch.rand(1, 1, 32, 32)  # stand-in for a 32x32 black & white image
+    print('\nImage batch shape:')
+    print(input.shape)
+
+    output = net(input)  # we don't call forward() directly
+    print('\nRaw output:')
+    print(output)
+    print(output.shape)
+
+
+def tensors_demo1():
+    x = torch.empty(3, 4)
+    print(type(x))
+    print(x)
+    # Random tensors and seeding
+    torch.manual_seed(1729)
+    random1 = torch.rand(2, 3)
+    print(random1)
+
+    random2 = torch.rand(2, 3)
+    print(random2)
+
+    torch.manual_seed(1729)
+    random3 = torch.rand(2, 3)
+    print(random3)
+
+    random4 = torch.rand(2, 3)
+    print(random4)
+    # In Brief: Tensor Broadcasting
+    # Broadcasting lets element-wise operations "virtually expand" tensors of different shapes so the shapes become compatible.
+    """
+    The core idea of broadcasting:
+    no data is actually copied; tensors are expanded only logically, following a
+    fixed set of rules that aligns tensors of different shapes.
+    The rules:
+    Rule 1: dimension alignment.
+    Compare the two tensors' sizes starting from the last dimension:
+    equal sizes are compatible;
+    a size of 1 is compatible (it will be expanded);
+    different sizes where neither is 1 are incompatible and raise an error.
+    Rule 2: dimension expansion.
+    A size-1 dimension is (logically) repeated along that axis until it matches
+    the other tensor's corresponding size.
+    """
+    rand = torch.rand(2, 4)
+    doubled = rand * (torch.ones(1, 4) * 2)
+
+    print(f"rand: {rand}")
+    print(f"doubled: {doubled}")
+
+if __name__ == '__main__':
+    tensors_demo1()
diff --git a/tutorials/01-mine/transforms.py b/tutorials/01-mine/transforms.py
new file mode 100644
index 00000000..c8dd4645
--- /dev/null
+++ b/tutorials/01-mine/transforms.py
@@ -0,0 +1,21 @@
+import torch
+from torchvision import datasets
+from torchvision.transforms import ToTensor, Lambda
+
+
+"""
+Data does not always come in the final processed form required for training machine learning algorithms, so we use transforms to massage it into shape.
+All TorchVision datasets take two parameters -- transform to modify the features and target_transform to modify the labels --
+both of which accept callables containing the transformation logic. The torchvision.transforms module ships several commonly used transforms.
+"""
+if __name__ == '__main__':
+    # ToTensor converts a PIL image or NumPy ndarray into a FloatTensor
+
+This tutorial is part of a three-part series:
+
+* `NLP From Scratch: Classifying Names with a Character-Level RNN `__
+* `NLP From Scratch: Generating Names with a Character-Level RNN `__
+* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__
+
+This is the third and final tutorial on doing **NLP From Scratch**, where we
+write our own classes and functions to preprocess the data to do our NLP
+modeling tasks.
+
+In this project we will be teaching a neural network to translate from
+French to English (the code below is adapted to the ``cmn-eng``
+Chinese-English pair file, but the idea is the same).
+
+.. code-block:: sh
+
+    [KEY: > input, = target, < output]
+
+    > il est en train de peindre un tableau .
+    = he is painting a picture .
+    < he is painting a picture .
+
+    > pourquoi ne pas essayer ce vin delicieux ?
+    = why not try that delicious wine ?
+    < why not try that delicious wine ?
+ + > elle n est pas poete mais romanciere . + = she is not a poet but a novelist . + < she not not a poet but a novelist . + + > vous etes trop maigre . + = you re too skinny . + < you re all alone . + +... to varying degrees of success. + +This is made possible by the simple but powerful idea of the `sequence +to sequence network `__, in which two +recurrent neural networks work together to transform one sequence to +another. An encoder network condenses an input sequence into a vector, +and a decoder network unfolds that vector into a new sequence. + +.. figure:: /_static/img/seq-seq-images/seq2seq.png + :alt: + +To improve upon this model we'll use an `attention +mechanism `__, which lets the decoder +learn to focus over a specific range of the input sequence. + +**Recommended Reading:** + +I assume you have at least installed PyTorch, know Python, and +understand Tensors: + +- https://pytorch.org/ For installation instructions +- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general +- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview +- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user + + +It would also be useful to know about Sequence to Sequence networks and +how they work: + +- `Learning Phrase Representations using RNN Encoder-Decoder for + Statistical Machine Translation `__ +- `Sequence to Sequence Learning with Neural + Networks `__ +- `Neural Machine Translation by Jointly Learning to Align and + Translate `__ +- `A Neural Conversational Model `__ + +You will also find the previous tutorials on +:doc:`/intermediate/char_rnn_classification_tutorial` +and :doc:`/intermediate/char_rnn_generation_tutorial` +helpful as those concepts are very similar to the Encoder and Decoder +models, respectively. 
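+
+In code terms, the encoder/decoder handshake at the heart of this tutorial is
+just the following (a minimal sketch using the ``EncoderRNN`` and
+``AttnDecoderRNN`` modules defined further down in this file):
+
+.. code-block:: python
+
+    encoder_outputs, encoder_hidden = encoder(input_tensor)           # condense the input sequence
+    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)  # unfold it into a new sequence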
+
+**Requirements**
+"""
+from __future__ import unicode_literals, print_function, division
+from io import open
+import unicodedata
+import re
+import random
+
+import torch
+import torch.nn as nn
+from torch import optim
+import torch.nn.functional as F
+
+import time
+import math
+
+import numpy as np
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
+# Special token indices
+SOS_token = 0
+EOS_token = 1
+
+class Lang:
+    def __init__(self, name):
+        self.name = name
+        self.word2index = {}
+        self.word2count = {}
+        self.index2word = {0: "SOS", 1: "EOS"}
+        self.n_words = 2  # Count SOS and EOS
+
+    def addSentence(self, sentence):
+        """Add a whole sentence to the vocabulary."""
+        # Check whether the sentence contains Chinese characters
+        has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence)
+        if has_chinese:  # Chinese is handled character by character
+            for char in sentence:
+                self.addWord(char)
+        else:  # English is handled word by word
+            for word in sentence.split(' '):
+                self.addWord(word)
+
+    def addWord(self, word):
+        """Add a single word to the vocabulary."""
+        if word not in self.word2index:
+            self.word2index[word] = self.n_words
+            self.word2count[word] = 1
+            self.index2word[self.n_words] = word
+            self.n_words += 1
+        else:
+            self.word2count[word] += 1
+
+def unicodeToAscii(s):
+    return ''.join(
+        c for c in unicodedata.normalize('NFD', s)
+        if unicodedata.category(c) != 'Mn'
+    )
+
+
+def normalizeString(s):
+    s = s.strip()
+    # Treat sentences that start with a Latin letter as English
+    if s and s[0].isalpha() and s[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
+        # English preprocessing
+        s = unicodeToAscii(s.lower())
+        s = re.sub(r"([.!?])", r" \1", s)
+        s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
+    else:
+        # Chinese preprocessing: only collapse extra whitespace
+        # (r'\\s+' would match a literal backslash, not whitespace)
+        s = re.sub(r'\s+', ' ', s)
+    return s.strip()
+
+def readLangs(lang1, lang2, reverse=False):
+    print("Reading lines...")
+
+    # Read the file and split into lines
+    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').
\ + read().strip().split('\n') + + # Split every line into pairs and normalize + pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] + + # Reverse pairs, make Lang instances + if reverse: + pairs = [list(reversed(p)) for p in pairs] + input_lang = Lang(lang2) + output_lang = Lang(lang1) + else: + input_lang = Lang(lang1) + output_lang = Lang(lang2) + + return input_lang, output_lang, pairs + + +MAX_LENGTH = 10 + +# 更宽松的英文前缀列表 +eng_prefixes = ( + "i ", "you ", "he ", "she ", "we ", "they ", + "it ", "this ", "that ", "there ", "the ", + "a ", "an ", "my ", "your ", "his ", "her ", + "our ", "their " +) + + +def filterPair(p): + # 判断输入是英文还是中文,分别计算长度 + if p[0] and p[0][0].isalpha() and p[0][0].lower() in 'abcdefghijklmnopqrstuvwxyz': + input_length = len(p[0].split(' ')) + else: + input_length = len(p[0]) + + output_length = len(p[1].split(' ')) + + # 仅保留长度合适的句子对,移除eng_prefixes限制 + return input_length <= MAX_LENGTH - 1 and output_length <= MAX_LENGTH - 1 + + +def filterPairs(pairs): + return [pair for pair in pairs if filterPair(pair)] + +def prepareData(lang1, lang2, reverse=False): + input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) + print("Read %s sentence pairs" % len(pairs)) + pairs = filterPairs(pairs) + print("前5个句子对示例:") + for i, pair in enumerate(pairs[:5]): + print(f" 中文: {pair[0]}") + print(f" 英文: {pair[1]}") + print() + print("Trimmed to %s sentence pairs" % len(pairs)) + print("Counting words...") + for pair in pairs: + input_lang.addSentence(pair[0]) + output_lang.addSentence(pair[1]) + print("Counted words:") + print(input_lang.name, input_lang.n_words) + print(output_lang.name, output_lang.n_words) + return input_lang, output_lang, pairs + + +input_lang, output_lang, pairs = prepareData('cmn', 'eng', True) +print(random.choice(pairs)) + +class EncoderRNN(nn.Module): + def __init__(self, input_size, hidden_size, dropout_p=0.1): + super(EncoderRNN, self).__init__() + self.hidden_size = hidden_size + + self.embedding = nn.Embedding(input_size, hidden_size) + self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) + self.dropout = nn.Dropout(dropout_p) + + def forward(self, input): + embedded = self.dropout(self.embedding(input)) + output, hidden = self.gru(embedded) + return output, hidden + + +class DecoderRNN(nn.Module): + def __init__(self, hidden_size, output_size): + super(DecoderRNN, self).__init__() + self.embedding = nn.Embedding(output_size, hidden_size) + self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) + self.out = nn.Linear(hidden_size, output_size) + + def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): + batch_size = encoder_outputs.size(0) + decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) + decoder_hidden = encoder_hidden + decoder_outputs = [] + + for i in range(MAX_LENGTH): + decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden) + decoder_outputs.append(decoder_output) + + if target_tensor is not None: + # Teacher forcing: Feed the target as the next input + decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing + else: + # Without teacher forcing: use its own predictions as the next input + _, topi = decoder_output.topk(1) + decoder_input = topi.squeeze(-1).detach() # detach from history as input + + decoder_outputs = torch.cat(decoder_outputs, dim=1) + decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) + return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training 
loop + + def forward_step(self, input, hidden): + output = self.embedding(input) + output = F.relu(output) + output, hidden = self.gru(output, hidden) + output = self.out(output) + return output, hidden + +class BahdanauAttention(nn.Module): + def __init__(self, hidden_size): + super(BahdanauAttention, self).__init__() + self.Wa = nn.Linear(hidden_size, hidden_size) + self.Ua = nn.Linear(hidden_size, hidden_size) + self.Va = nn.Linear(hidden_size, 1) + + def forward(self, query, keys): + scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys))) + scores = scores.squeeze(2).unsqueeze(1) + + weights = F.softmax(scores, dim=-1) + context = torch.bmm(weights, keys) + + return context, weights + + +class AttnDecoderRNN(nn.Module): + def __init__(self, hidden_size, output_size, dropout_p=0.1): + super(AttnDecoderRNN, self).__init__() + self.embedding = nn.Embedding(output_size, hidden_size) + self.attention = BahdanauAttention(hidden_size) + self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True) + self.out = nn.Linear(hidden_size, output_size) + self.dropout = nn.Dropout(dropout_p) + + def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): + batch_size = encoder_outputs.size(0) + decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) + decoder_hidden = encoder_hidden + decoder_outputs = [] + attentions = [] + + for i in range(MAX_LENGTH): + decoder_output, decoder_hidden, attn_weights = self.forward_step( + decoder_input, decoder_hidden, encoder_outputs + ) + decoder_outputs.append(decoder_output) + attentions.append(attn_weights) + + if target_tensor is not None: + # Teacher forcing: Feed the target as the next input + decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing + else: + # Without teacher forcing: use its own predictions as the next input + _, topi = decoder_output.topk(1) + decoder_input = topi.squeeze(-1).detach() # detach from history as input + + decoder_outputs = torch.cat(decoder_outputs, dim=1) + decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) + attentions = torch.cat(attentions, dim=1) + + return decoder_outputs, decoder_hidden, attentions + + def forward_step(self, input, hidden, encoder_outputs): + embedded = self.dropout(self.embedding(input)) + + query = hidden.permute(1, 0, 2) + context, attn_weights = self.attention(query, encoder_outputs) + input_gru = torch.cat((embedded, context), dim=2) + + output, hidden = self.gru(input_gru, hidden) + output = self.out(output) + + return output, hidden, attn_weights + + +def indexesFromSentence(lang, sentence): + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in sentence) + indexes = [] + + if has_chinese: + for char in sentence: + if char in lang.word2index: + indexes.append(lang.word2index[char]) + else: + indexes.append(2) # UNK的索引是2 + else: + for word in sentence.split(' '): + if word in lang.word2index: + indexes.append(lang.word2index[word]) + else: + indexes.append(2) # UNK的索引是2 + return indexes + + +def tensorFromSentence(lang, sentence): + indexes = indexesFromSentence(lang, sentence) + indexes.append(EOS_token) + return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1) + + +def tensorsFromPair(pair): + input_tensor = tensorFromSentence(input_lang, pair[0]) + target_tensor = tensorFromSentence(output_lang, pair[1]) + return (input_tensor, target_tensor) + + +def get_dataloader(batch_size): + # 使用全局的input_lang, output_lang, pairs + n = len(pairs) + input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) + 
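+    # Note: these id buffers are zero-initialized, so slots after the appended
+    # EOS keep index 0 (SOS_token), which effectively acts as padding below.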
target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) + + for idx, (inp, tgt) in enumerate(pairs): + inp_ids = indexesFromSentence(input_lang, inp) + tgt_ids = indexesFromSentence(output_lang, tgt) + + # 确保句子长度不超过MAX_LENGTH-1,然后添加EOS_token + inp_ids = inp_ids[:MAX_LENGTH - 1] + tgt_ids = tgt_ids[:MAX_LENGTH - 1] + + inp_ids.append(EOS_token) + tgt_ids.append(EOS_token) + + input_ids[idx, :len(inp_ids)] = inp_ids + target_ids[idx, :len(tgt_ids)] = tgt_ids + + train_data = TensorDataset(torch.LongTensor(input_ids).to(device), + torch.LongTensor(target_ids).to(device)) + + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) + return input_lang, output_lang, train_dataloader + +def train_epoch(dataloader, encoder, decoder, encoder_optimizer, + decoder_optimizer, criterion): + total_loss = 0 + for data in dataloader: + input_tensor, target_tensor = data + + encoder_optimizer.zero_grad() + decoder_optimizer.zero_grad() + + encoder_outputs, encoder_hidden = encoder(input_tensor) + decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor) + + loss = criterion( + decoder_outputs.view(-1, decoder_outputs.size(-1)), + target_tensor.view(-1) + ) + loss.backward() + + encoder_optimizer.step() + decoder_optimizer.step() + + total_loss += loss.item() + + return total_loss / len(dataloader) + + + +def asMinutes(s): + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + + +def timeSince(since, percent): + now = time.time() + s = now - since + es = s / (percent) + rs = es - s + return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) + + +def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, + print_every=100, plot_every=100): + start = time.time() + plot_losses = [] + print_loss_total = 0 # Reset every print_every + plot_loss_total = 0 # Reset every plot_every + + encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate) + decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate) + criterion = nn.NLLLoss() + + for epoch in range(1, n_epochs + 1): + loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) + print_loss_total += loss + plot_loss_total += loss + + if epoch % print_every == 0: + print_loss_avg = print_loss_total / print_every + print_loss_total = 0 + print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs), + epoch, epoch / n_epochs * 100, print_loss_avg)) + + if epoch % plot_every == 0: + plot_loss_avg = plot_loss_total / plot_every + plot_losses.append(plot_loss_avg) + plot_loss_total = 0 + + showPlot(plot_losses) + + +import matplotlib.pyplot as plt + +plt.switch_backend('agg') +import matplotlib.ticker as ticker +import numpy as np + + +def showPlot(points): + plt.figure() + fig, ax = plt.subplots() + # this locator puts ticks at regular intervals + loc = ticker.MultipleLocator(base=0.2) + ax.yaxis.set_major_locator(loc) + plt.plot(points) + +def evaluate(encoder, decoder, sentence, input_lang, output_lang): + with torch.no_grad(): + input_tensor = tensorFromSentence(input_lang, sentence) + + encoder_outputs, encoder_hidden = encoder(input_tensor) + decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden) + + _, topi = decoder_outputs.topk(1) + decoded_ids = topi.squeeze() + + decoded_words = [] + for idx in decoded_ids: + if idx.item() == EOS_token: + decoded_words.append('') + break + decoded_words.append(output_lang.index2word[idx.item()]) + 
return decoded_words, decoder_attn + + +def evaluateRandomly(encoder, decoder, n=10): + for i in range(n): + pair = random.choice(pairs) + print('>', pair[0]) + print('=', pair[1]) + output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang) + output_sentence = ' '.join(output_words) + print('<', output_sentence) + print('') + + +hidden_size = 128 +batch_size = 32 + +input_lang, output_lang, train_dataloader = get_dataloader(batch_size) + +encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device) +decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device) + +# train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5) +# +# ###################################################################### +# # +# # Set dropout layers to ``eval`` mode +# encoder.eval() +# decoder.eval() +# evaluateRandomly(encoder, decoder) + + +###################################################################### +# Visualizing Attention +# --------------------- +# +# A useful property of the attention mechanism is its highly interpretable +# outputs. Because it is used to weight specific encoder outputs of the +# input sequence, we can imagine looking where the network is focused most +# at each time step. +# +# You could simply run ``plt.matshow(attentions)`` to see attention output +# displayed as a matrix. For a better viewing experience we will do the +# extra work of adding axes and labels: +# + +def showAttention(input_sentence, output_words, attentions): + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(attentions.cpu().numpy(), cmap='bone') + fig.colorbar(cax) + + # Set up axes + ax.set_xticklabels([''] + input_sentence.split(' ') + + [''], rotation=90) + ax.set_yticklabels([''] + output_words) + + # Show label at every tick + ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) + ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) + + plt.show() + + +def evaluateAndShowAttention(input_sentence): + # 根据输入句子的语言类型选择正确的语言对象 + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in input_sentence) + if has_chinese: + # 中文句子,使用output_lang作为输入语言 + output_words, attentions = evaluate(encoder, decoder, input_sentence, output_lang, input_lang) + else: + # 英文句子,使用input_lang作为输入语言 + output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang) + + # 显示结果 + print('input =', input_sentence) + print('output =', ' '.join(output_words)) + showAttention(input_sentence, output_words, attentions[0, :len(output_words), :]) + +# test_sentences = [ +# 'i am anxious', # 从训练数据前缀中选取 +# 'he is happy', +# 'you are welcome' +# ] +# +# for sent in test_sentences: +# evaluateAndShowAttention(sent) +evaluateAndShowAttention('i am anxious') +evaluateAndShowAttention('he is happy') +evaluateAndShowAttention('you are welcome?') diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py new file mode 100644 index 00000000..4af9cc56 --- /dev/null +++ b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/bidirectional_recurrent_neural_network.py @@ -0,0 +1,111 @@ +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + +# Device configuration +# 支持CUDA、MPS和CPU设备 +# 优化前 +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# 优化后 +device = torch.device('mps' if 
torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu') +print(f"使用设备: {device}") + +# BiRNN是双向循环神经网络(Bidirectional Recurrent Neural Network)的缩写, +# 是一种能够同时利用序列过去和未来信息的神经网络架构。 +# Bidirectional recurrent neural network (many-to-one) +class BiRNN(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, num_classes): + super(BiRNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + self.fc = nn.Linear(hidden_size * 2, num_classes) # 2 for bidirection + + def forward(self, x): + # Set initial states + h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 2 for bidirection + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) + + # Forward propagate LSTM + out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2) + + # Decode the hidden state of the last time step + out = self.fc(out[:, -1, :]) + return out + + +if __name__ == '__main__': + + # Hyper-parameters + sequence_length = 28 + input_size = 28 + hidden_size = 128 + num_layers = 2 + num_classes = 10 + batch_size = 100 + num_epochs = 2 + learning_rate = 0.003 + + # MNIST dataset + train_dataset = torchvision.datasets.MNIST(root='../../data/', + train=True, + transform=transforms.ToTensor(), + download=True) + + test_dataset = torchvision.datasets.MNIST(root='../../data/', + train=False, + transform=transforms.ToTensor()) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=batch_size, + shuffle=True) + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=batch_size, + shuffle=False) + + model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device) + + # Loss and optimizer + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # Train the model + total_step = len(train_loader) + for epoch in range(num_epochs): + for i, (images, labels) in enumerate(train_loader): + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + + # Forward pass + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # Test the model + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # Save the model checkpoint + torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py b/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py deleted file mode 100644 index a0ecd773..00000000 --- a/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py +++ /dev/null @@ -1,102 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device 
configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -sequence_length = 28 -input_size = 28 -hidden_size = 128 -num_layers = 2 -num_classes = 10 -batch_size = 100 -num_epochs = 2 -learning_rate = 0.003 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Bidirectional recurrent neural network (many-to-one) -class BiRNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - # Set initial states - h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection - c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) - - # Forward propagate LSTM - out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size*2) - - # Decode the hidden state of the last time step - out = self.fc(out[:, -1, :]) - return out - -model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py new file mode 100644 index 00000000..b200bb68 --- /dev/null +++ b/tutorials/02-intermediate/convolutional_neural_network/convolutional_neural_network.py @@ -0,0 +1,123 @@ +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + + +# 卷积神经网络 + +# Convolutional neural network (two convolutional layers) +class ConvNet(nn.Module): + # 网络层定义 + def __init__(self, num_classes=10): + 
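+        # Shape walk-through: input [N, 1, 28, 28] -> layer1 -> [N, 16, 14, 14]
+        # -> layer2 -> [N, 32, 7, 7] -> flatten -> fc -> [N, num_classes]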
super(ConvNet, self).__init__() + # 第一层卷积层:输入通道1(MNIST灰度图),输出通道16,卷积核5x5,步长1,填充2 + # 卷积后尺寸保持不变:(28-5+2*2)/1 + 1 = 28 + self.layer1 = nn.Sequential( + nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2), + nn.BatchNorm2d(16), # 批标准化:加速训练,提高稳定性 + nn.ReLU(), # 激活函数:引入非线性 + nn.MaxPool2d(kernel_size=2, stride=2) # 池化层:尺寸减半为14x14 无参数的下采样操作 + ) + # 第二层卷积层:输入通道16,输出通道32,卷积核5x5,步长1,填充2 + # 卷积后尺寸保持不变:(14-5+2*2)/1 + 1 = 14 + self.layer2 = nn.Sequential( + nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), + nn.BatchNorm2d(32), # 批标准化 + nn.ReLU(), # 激活函数 + nn.MaxPool2d(kernel_size=2, stride=2) # 池化层:尺寸减半为7x7 + ) + # 全连接层:输入尺寸7x7x32,输出类别数 + self.fc = nn.Linear(7 * 7 * 32, num_classes) + # 前向传播定义 + def forward(self, x): + # 前向传播路径 + out = self.layer1(x) # 输入x经过第一层卷积层,输出尺寸:[batch_size, 16, 14, 14] + out = self.layer2(out) # 输出经过第二层卷积层,输出尺寸:[batch_size, 32, 7, 7] + out = out.reshape(out.size(0), -1) # 展平:[batch_size, 32*7*7] + out = self.fc(out) # 全连接层分类,输出尺寸:[batch_size, num_classes] + return out + + +if __name__ == '__main__': + + # Device configuration + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + + # Hyper parameters + num_epochs = 5 + num_classes = 10 + batch_size = 100 + learning_rate = 0.001 + + # MNIST dataset + train_dataset = torchvision.datasets.MNIST(root='../../data/', + train=True, + transform=transforms.ToTensor(), + download=True) + + test_dataset = torchvision.datasets.MNIST(root='../../data/', + train=False, + transform=transforms.ToTensor()) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=batch_size, + shuffle=True) + + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, + batch_size=batch_size, + shuffle=False) + + model = ConvNet(num_classes).to(device) + + # Loss and optimizer + # 损失函数:衡量模型预测与真实标签的差异程度 + # CrossEntropyLoss适用于分类任务,内部整合了softmax和负对数似然 + # 公式:loss = -sum(y_true * log(y_pred)),其中y_true是one-hot编码的真实标签 + criterion = nn.CrossEntropyLoss() + + # 优化器:根据损失函数的梯度来更新模型参数,最小化损失 + # Adam是一种常用的自适应学习率优化算法,结合了Momentum和RMSProp的优点 + # 参数说明: + # - model.parameters(): 需要优化的模型参数集合 + # - lr: 学习率,控制参数更新的步长大小 + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # Train the model + total_step = len(train_loader) # 每个epoch的总步数 = 训练数据集大小 / 批大小 + for epoch in range(num_epochs): # 训练num_epochs轮 + for i, (images, labels) in enumerate(train_loader): # 遍历每个批次 + images = images.to(device) # 将图像数据移至指定设备 + labels = labels.to(device) # 将标签移至指定设备 + + # Forward pass(前向传播):模型对输入图像进行预测 + outputs = model(images) # outputs形状:[batch_size, num_classes] + loss = criterion(outputs, labels) # 计算损失:模型预测与真实标签的差异 + + # Backward and optimize(反向传播与参数优化) + optimizer.zero_grad() # 清除之前的梯度,避免梯度累积 + loss.backward() # 反向传播:计算所有可训练参数的梯度 + optimizer.step() # 优化器更新参数:根据梯度调整模型权重 + + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # Test the model + model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + images = images.to(device) + labels = labels.to(device) + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # Save the model checkpoint + torch.save(model.state_dict(), 'model.ckpt') \ No newline 
at end of file diff --git a/tutorials/02-intermediate/convolutional_neural_network/main.py b/tutorials/02-intermediate/convolutional_neural_network/main.py deleted file mode 100644 index ec904f1f..00000000 --- a/tutorials/02-intermediate/convolutional_neural_network/main.py +++ /dev/null @@ -1,100 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - -# Hyper parameters -num_epochs = 5 -num_classes = 10 -batch_size = 100 -learning_rate = 0.001 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Convolutional neural network (two convolutional layers) -class ConvNet(nn.Module): - def __init__(self, num_classes=10): - super(ConvNet, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2), - nn.BatchNorm2d(16), - nn.ReLU(), - nn.MaxPool2d(kernel_size=2, stride=2)) - self.layer2 = nn.Sequential( - nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), - nn.BatchNorm2d(32), - nn.ReLU(), - nn.MaxPool2d(kernel_size=2, stride=2)) - self.fc = nn.Linear(7*7*32, num_classes) - - def forward(self, x): - out = self.layer1(x) - out = self.layer2(out) - out = out.reshape(out.size(0), -1) - out = self.fc(out) - return out - -model = ConvNet(num_classes).to(device) - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py new file mode 100644 index 00000000..d9aa9fe4 --- /dev/null +++ b/tutorials/02-intermediate/deep_residual_network/deep_residual_network.py @@ -0,0 +1,274 @@ +# ---------------------------------------------------------------------------- # +# ResNet模型实现:基于论文https://arxiv.org/pdf/1512.03385.pdf # +# 
采用CIFAR-10数据集的模型架构(见论文4.2节) # +# 部分代码参考PyTorch官方实现: # +# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py # +# ---------------------------------------------------------------------------- # + +# 导入必要的库 +import torch # PyTorch核心库 +import torch.nn as nn # 神经网络模块 +import torchvision # 计算机视觉库,提供数据集和模型 +import torchvision.transforms as transforms # 图像预处理工具 +from torch.cuda.amp import autocast, GradScaler # 混合精度训练 + +# 设备配置:优先使用GPU,否则使用CPU,M系列芯片可使用MPS +if torch.backends.mps.is_available(): + device = torch.device('mps') # Apple Silicon M系列芯片加速 +elif torch.cuda.is_available(): + device = torch.device('cuda') # NVIDIA GPU加速 +else: + device = torch.device('cpu') # CPU训练 +print(f"使用设备: {device}") + + +# 3x3卷积层封装函数 +# ResNet大量使用3x3卷积,此函数简化代码复用 +def conv3x3(in_channels, out_channels, stride=1): + # 定义3x3卷积: + # - in_channels: 输入通道数 + # - out_channels: 输出通道数 + # - kernel_size: 卷积核大小 + # - stride: 步长,默认1 + # - padding: 填充,设置为1保持尺寸不变 + # - bias: 不使用偏置,因为后续会接批标准化层 + return nn.Conv2d(in_channels, out_channels, kernel_size=3, + stride=stride, padding=1, bias=False) + + +# 残差块(Residual Block)定义 +# ResNet的核心组件,通过残差连接解决深度网络训练问题 +class ResidualBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, downsample=None): + super(ResidualBlock, self).__init__() + # 第一个3x3卷积层:可能改变通道数和尺寸 + self.conv1 = conv3x3(in_channels, out_channels, stride) + self.bn1 = nn.BatchNorm2d(out_channels) # 批标准化层 + self.relu = nn.ReLU(inplace=True) # ReLU激活函数,inplace=True节省内存 + # 第二个3x3卷积层:保持通道数和尺寸不变 + self.conv2 = conv3x3(out_channels, out_channels) + self.bn2 = nn.BatchNorm2d(out_channels) # 批标准化层 + # 下采样模块:当输入输出通道数或尺寸不匹配时使用 + self.downsample = downsample + + def forward(self, x): + # 保存输入作为残差连接 + residual = x + + # 主路径:两次卷积+激活 + out = self.conv1(x) # 第一次卷积 + out = self.bn1(out) # 批标准化 + out = self.relu(out) # 激活函数 + + out = self.conv2(out) # 第二次卷积 + out = self.bn2(out) # 批标准化 + + # 残差路径:如果需要下采样则调整尺寸和通道数 + if self.downsample: + residual = self.downsample(x) + + # 残差连接:主路径输出 + 残差路径输出 + out += residual + out = self.relu(out) # 最终激活 + + return out + + +# ResNet主网络定义 +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes=10): + super(ResNet, self).__init__() + # 初始输入通道数 + self.in_channels = 16 + + # 第一层:3x3卷积 + 批标准化 + ReLU + # CIFAR-10输入为3通道,输出16通道 + self.conv = conv3x3(3, 16) + self.bn = nn.BatchNorm2d(16) + self.relu = nn.ReLU(inplace=True) + + # 构建残差层: + # layer1: 16通道,layers[0]个残差块,步长1 + # layer2: 32通道,layers[1]个残差块,步长2(尺寸减半) + # layer3: 64通道,layers[2]个残差块,步长2(尺寸减半) + self.layer1 = self.make_layer(block, 16, layers[0]) + self.layer2 = self.make_layer(block, 32, layers[1], 2) + self.layer3 = self.make_layer(block, 64, layers[2], 2) + + # 全局平均池化:将64x8x8特征图转为64x1x1 + self.avg_pool = nn.AvgPool2d(8) + + # 全连接层:将64维特征映射到10个类别 + self.fc = nn.Linear(64, num_classes) + + # 创建残差层的辅助函数 + def make_layer(self, block, out_channels, blocks, stride=1): + downsample = None + + # 当步长不为1或输入输出通道数不匹配时,需要下采样 + if (stride != 1) or (self.in_channels != out_channels): + downsample = nn.Sequential( + conv3x3(self.in_channels, out_channels, stride=stride), + nn.BatchNorm2d(out_channels) + ) + + layers = [] + # 添加第一个残差块(可能包含下采样) + layers.append(block(self.in_channels, out_channels, stride, downsample)) + self.in_channels = out_channels # 更新输入通道数 + + # 添加剩余的残差块(不需要下采样) + for i in range(1, blocks): + layers.append(block(out_channels, out_channels)) + + return nn.Sequential(*layers) + + def forward(self, x): + # 输入x: [batch_size, 3, 32, 32] + + # 初始卷积层 + out = self.conv(x) # [batch_size, 16, 32, 32] + 
out = self.bn(out)  # batch normalization
+        out = self.relu(out)  # activation
+
+        # Residual stage 1: spatial size unchanged
+        out = self.layer1(out)  # [batch_size, 16, 32, 32]
+
+        # Residual stage 2: spatial size halved
+        out = self.layer2(out)  # [batch_size, 32, 16, 16]
+
+        # Residual stage 3: spatial size halved again
+        out = self.layer3(out)  # [batch_size, 64, 8, 8]
+
+        # Global average pooling
+        out = self.avg_pool(out)  # [batch_size, 64, 1, 1]
+
+        # Flatten the features
+        out = out.view(out.size(0), -1)  # [batch_size, 64]
+
+        # Final linear classifier
+        out = self.fc(out)  # [batch_size, 10]
+
+        return out
+# Learning-rate update helper
+def update_lr(optimizer, lr):
+    # Walk every parameter group in the optimizer and set the new learning rate
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+if __name__ == '__main__':
+    # Entry point
+
+    # Hyper-parameters, tuned down for a Mac
+    num_epochs = 20  # fewer epochs (originally 80; adjust as needed)
+    batch_size = 32  # smaller batches to fit Mac memory (originally 100)
+    learning_rate = 0.001  # initial learning rate
+
+    # Image preprocessing: data augmentation improves generalization
+    transform = transforms.Compose([
+        transforms.Pad(4),  # pad by 4 pixels, 32x32 -> 40x40
+        transforms.RandomHorizontalFlip(),  # random horizontal flip
+        transforms.RandomCrop(32),  # random crop back to 32x32
+        transforms.ToTensor()  # convert to a Tensor
+    ])
+
+    # CIFAR-10 dataset
+    # Training set: with data augmentation
+    train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                 train=True,  # training split
+                                                 transform=transform,  # apply augmentation
+                                                 download=True)  # download if missing
+
+    # Test set: only converted to a Tensor, no augmentation
+    test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
+                                                train=False,  # test split
+                                                transform=transforms.ToTensor())
+
+    # Data loaders, tuned
+    # Training loader: multi-worker loading + pinned memory
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                               batch_size=batch_size,  # batch size
+                                               shuffle=True,  # shuffle the data
+                                               num_workers=4,  # worker processes (match your CPU cores)
+                                               pin_memory=True,  # pinned memory speeds up host-to-device copies
+                                               persistent_workers=True)  # keep workers alive between epochs
+
+    # Test loader: multi-worker loading + pinned memory
+    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                              batch_size=batch_size,
+                                              shuffle=False,
+                                              num_workers=2,
+                                              pin_memory=True)
+
+    # Instantiate the ResNet
+    # ResidualBlock is the basic unit; [2, 2, 2] puts two residual blocks in each of the three stages.
+    # Layer count: stem conv (1) + 3 stages * 2 blocks * 2 convs (12) + final FC (1) = 14 weighted
+    # layers, i.e. the paper's 6n+2 CIFAR-10 ResNet with n = 2 (not a ResNet-18).
+    model = ResNet(ResidualBlock, [2, 2, 2]).to(device)
+    # Loss function and optimizer
+    # Cross-entropy loss: standard for classification, softmax included
+    criterion = nn.CrossEntropyLoss()
+
+    # Adam: adaptive learning-rate optimizer that converges quickly
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Mixed-precision setup
+    # torch.cuda.amp only supports CUDA, so AMP must stay disabled on MPS and CPU
+    scaler = GradScaler(enabled=(device.type == 'cuda'))
+
+    # Train the model
+    total_step = len(train_loader)  # steps per epoch
+    curr_lr = learning_rate  # current learning rate
+
+    for epoch in range(num_epochs):  # loop over epochs
+        model.train()  # make sure the model is in training mode
+        for i, (images, labels) in enumerate(train_loader):  # loop over batches
+            # Move the batch to the target device
+            images = images.to(device, non_blocking=True)  # non-blocking copy
+            labels = labels.to(device, non_blocking=True)  # non-blocking copy
+
+            # Forward pass: model prediction (mixed precision on CUDA only)
+            with autocast(enabled=(device.type == 'cuda')):
+                outputs = model(images)
+                loss = criterion(outputs, labels)
+
+            # Backward pass and optimization (mixed precision on CUDA only)
+            optimizer.zero_grad(set_to_none=True)  # cheaper way to clear gradients
+            scaler.scale(loss).backward()  # scale the loss and backpropagate
+            scaler.step(optimizer)  # update the parameters
+            scaler.update()  # update the scale factor
+
+            # Print the loss every 100 steps
+            if (i + 1) % 100 == 0:
+                print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
+                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
+
+        # Decay the learning rate every 20 epochs
+        if (epoch + 1) % 20 == 0:
+            curr_lr /= 3  # divide the learning rate by 3
+            update_lr(optimizer, curr_lr)  # push it into the optimizer
+    # Test the model
+    model.eval()  # evaluation mode
+
+    with torch.no_grad():  # no gradients: saves memory and compute
+        correct = 0  # number of correct predictions
+        total = 0  # number of samples
+
+        for images, labels in test_loader:  # loop over the test set
+            images = images.to(device)
+            labels =
labels.to(device) + + outputs = model(images) # 模型预测 + _, predicted = torch.max(outputs.data, 1) # 获取预测类别 + + total += labels.size(0) # 更新总样本数 + correct += (predicted == labels).sum().item() # 更新正确预测数 + + # 计算并打印准确率 + print('Accuracy of the model on the test images: {} %'.format(100 * correct / total)) + + # 保存模型参数到文件 + torch.save(model.state_dict(), 'resnet.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/deep_residual_network/main.py b/tutorials/02-intermediate/deep_residual_network/main.py deleted file mode 100644 index 69dbe5fb..00000000 --- a/tutorials/02-intermediate/deep_residual_network/main.py +++ /dev/null @@ -1,170 +0,0 @@ -# ---------------------------------------------------------------------------- # -# An implementation of https://arxiv.org/pdf/1512.03385.pdf # -# See section 4.2 for the model architecture on CIFAR-10 # -# Some part of the code was referenced from below # -# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py # -# ---------------------------------------------------------------------------- # - -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -num_epochs = 80 -batch_size = 100 -learning_rate = 0.001 - -# Image preprocessing modules -transform = transforms.Compose([ - transforms.Pad(4), - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32), - transforms.ToTensor()]) - -# CIFAR-10 dataset -train_dataset = torchvision.datasets.CIFAR10(root='../../data/', - train=True, - transform=transform, - download=True) - -test_dataset = torchvision.datasets.CIFAR10(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# 3x3 convolution -def conv3x3(in_channels, out_channels, stride=1): - return nn.Conv2d(in_channels, out_channels, kernel_size=3, - stride=stride, padding=1, bias=False) - -# Residual block -class ResidualBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = conv3x3(in_channels, out_channels, stride) - self.bn1 = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(out_channels, out_channels) - self.bn2 = nn.BatchNorm2d(out_channels) - self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - -# ResNet -class ResNet(nn.Module): - def __init__(self, block, layers, num_classes=10): - super(ResNet, self).__init__() - self.in_channels = 16 - self.conv = conv3x3(3, 16) - self.bn = nn.BatchNorm2d(16) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, 16, layers[0]) - self.layer2 = self.make_layer(block, 32, layers[1], 2) - self.layer3 = self.make_layer(block, 64, layers[2], 2) - self.avg_pool = nn.AvgPool2d(8) - self.fc = nn.Linear(64, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = 
nn.Sequential( - conv3x3(self.in_channels, out_channels, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(self.in_channels, out_channels, stride, downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels)) - return nn.Sequential(*layers) - - def forward(self, x): - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out - -model = ResNet(ResidualBlock, [2, 2, 2]).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# For updating learning rate -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - -# Train the model -total_step = len(train_loader) -curr_lr = learning_rate -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}" - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - - # Decay learning rate - if (epoch+1) % 20 == 0: - curr_lr /= 3 - update_lr(optimizer, curr_lr) - -# Test the model -model.eval() -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - print('Accuracy of the model on the test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'resnet.ckpt') diff --git a/tutorials/02-intermediate/language_model/main.py b/tutorials/02-intermediate/language_model/language_model.py similarity index 79% rename from tutorials/02-intermediate/language_model/main.py rename to tutorials/02-intermediate/language_model/language_model.py index ef135bb7..e534a71d 100644 --- a/tutorials/02-intermediate/language_model/main.py +++ b/tutorials/02-intermediate/language_model/language_model.py @@ -8,7 +8,9 @@ # Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +# 支持CUDA、MPS和CPU设备 +device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu') +print(f"使用设备: {device}") # Hyper-parameters embed_size = 128 @@ -31,22 +33,22 @@ class RNNLM(nn.Module): def __init__(self, vocab_size, embed_size, hidden_size, num_layers): super(RNNLM, self).__init__() - self.embed = nn.Embedding(vocab_size, embed_size) - self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) - self.linear = nn.Linear(hidden_size, vocab_size) - + self.embed = nn.Embedding(vocab_size, embed_size) # 词嵌入层 + self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) # LSTM层 + self.linear = nn.Linear(hidden_size, vocab_size) # 输出层 + def forward(self, x, h): - # Embed word ids to vectors - x = self.embed(x) - - # Forward propagate LSTM - out, (h, c) = self.lstm(x, h) - - # Reshape output to (batch_size*sequence_length, hidden_size) - out = out.reshape(out.size(0)*out.size(1), 
out.size(2)) - - # Decode hidden states of all time steps - out = self.linear(out) + # 词嵌入:将单词ID转换为向量表示 + x = self.embed(x) # [batch_size, seq_length, embed_size] + + # LSTM前向传播 + out, (h, c) = self.lstm(x, h) # out: [batch_size, seq_length, hidden_size] + + # 重塑输出以匹配全连接层输入格式 + out = out.reshape(out.size(0) * out.size(1), out.size(2)) # [batch_size*seq_length, hidden_size] + + # 预测下一个单词的概率分布 + out = self.linear(out) # [batch_size*seq_length, vocab_size] return out, (h, c) model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device) diff --git a/tutorials/02-intermediate/recurrent_neural_network/main.py b/tutorials/02-intermediate/recurrent_neural_network/main.py deleted file mode 100644 index c138c5ad..00000000 --- a/tutorials/02-intermediate/recurrent_neural_network/main.py +++ /dev/null @@ -1,103 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms - - -# Device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# Hyper-parameters -sequence_length = 28 -input_size = 28 -hidden_size = 128 -num_layers = 2 -num_classes = 10 -batch_size = 100 -num_epochs = 2 -learning_rate = 0.01 - -# MNIST dataset -train_dataset = torchvision.datasets.MNIST(root='../../data/', - train=True, - transform=transforms.ToTensor(), - download=True) - -test_dataset = torchvision.datasets.MNIST(root='../../data/', - train=False, - transform=transforms.ToTensor()) - -# Data loader -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True) - -test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - shuffle=False) - -# Recurrent neural network (many-to-one) -class RNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - # Set initial hidden and cell states - h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) - c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) - - # Forward propagate LSTM - out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size) - - # Decode the hidden state of the last time step - out = self.fc(out[:, -1, :]) - return out - -model = RNN(input_size, hidden_size, num_layers, num_classes).to(device) - - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -total_step = len(train_loader) -for epoch in range(num_epochs): - for i, (images, labels) in enumerate(train_loader): - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if (i+1) % 100 == 0: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch+1, num_epochs, i+1, total_step, loss.item())) - -# Test the model -model.eval() -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - images = images.reshape(-1, sequence_length, input_size).to(device) - labels = labels.to(device) - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += 
labels.size(0) - correct += (predicted == labels).sum().item() - - print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) - -# Save the model checkpoint -torch.save(model.state_dict(), 'model.ckpt') \ No newline at end of file diff --git a/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py new file mode 100644 index 00000000..7f0300fc --- /dev/null +++ b/tutorials/02-intermediate/recurrent_neural_network/recurrent_neural_network.py @@ -0,0 +1,186 @@ +# 导入PyTorch核心库 +import torch +import torch.nn as nn +# 导入TorchVision用于数据加载和预处理 +import torchvision +import torchvision.transforms as transforms + +# 设备配置:自动选择GPU(如果可用)否则使用CPU +# MPS是Apple Silicon GPU支持,这里也可以加上对MPS的支持 +device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')) + +""" +RNN(循环神经网络)的用途与应用场景 +RNN(Recurrent Neural Network,循环神经网络)是一类专门用于处理序列数据的深度学习模型, +其核心特点是能够记忆之前的信息并用于当前决策,这使得它在各种需要处理时序依赖关系的任务中表现出色。 +RNN的核心特性 +RNN通过在网络中引入循环连接,使模型能够: + +处理任意长度的序列数据 +捕捉序列中的时间依赖关系 +保留序列的上下文信息 + +RNN的主要应用场景 +1. 自然语言处理(NLP) +文本分类:情感分析、垃圾邮件检测、新闻分类 +语言建模:预测下一个词的概率分布 +机器翻译:将一种语言翻译成另一种语言 +命名实体识别:识别文本中的人名、地名、组织名等 +文本生成:自动生成文章、诗歌、对话等 +2. 时间序列预测 +股票价格预测:基于历史价格预测未来走势 +天气预报:基于气象数据预测未来天气 +电力负荷预测:预测未来电力需求 +销售预测:预测产品未来销量 +3. 语音处理 +语音识别:将语音转换为文本 +语音合成:将文本转换为语音 +说话人识别:识别说话人的身份 +4. 图像与视频分析 +图像描述生成:为图像生成文字描述 +视频分析:行为识别、动作检测 +手写体识别:如代码示例中的MNIST数字分类 +""" +# 定义RNN模型类,继承自nn.Module +class RNN(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, num_classes): + """ + RNN模型初始化函数 + :param input_size: 输入特征维度 (MNIST图像的每行像素数:28) + :param hidden_size: 隐藏层维度 (LSTM单元的隐藏状态大小:128) + :param num_layers: LSTM层数 (2层) + :param num_classes: 分类数量 (MNIST有10个数字类别) + """ + super(RNN, self).__init__() + self.hidden_size = hidden_size # 隐藏层大小 + self.num_layers = num_layers # LSTM层数 + + # 定义LSTM层: + # - input_size: 输入特征维度 + # - hidden_size: 隐藏层维度 + # - num_layers: LSTM层数 + # - batch_first=True: 输入输出形状为(batch_size, seq_length, feature_size) + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) + + # 全连接层:将LSTM输出映射到分类结果 + self.fc = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + """ + 前向传播函数 + :param x: 输入张量,形状为(batch_size, sequence_length, input_size) + :return: 输出张量,形状为(batch_size, num_classes) + """ + # 初始化LSTM的隐藏状态h0和细胞状态c0 + # 形状:(num_layers, batch_size, hidden_size) + h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) + c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) + + # 前向传播LSTM + # out: LSTM的输出,形状为(batch_size, seq_length, hidden_size) + # _: 包含最终隐藏状态和细胞状态的元组(这里未使用) + out, _ = self.lstm(x, (h0, c0)) + + # 解码最后一个时间步的隐藏状态用于分类 + # out[:, -1, :] 表示取所有样本的最后一个时间步的隐藏状态 + out = self.fc(out[:, -1, :]) + return out + + +if __name__ == '__main__': + # 超参数设置 + sequence_length = 28 # 序列长度 (MNIST图像的行数:28) + input_size = 28 # 输入特征维度 (MNIST图像的列数:28) + hidden_size = 128 # 隐藏层维度 + num_layers = 2 # LSTM层数 + num_classes = 10 # 分类数量 (0-9数字) + batch_size = 100 # 批次大小 + num_epochs = 2 # 训练轮数 + learning_rate = 0.01 # 学习率 + + # MNIST数据集加载 + # 训练集 + train_dataset = torchvision.datasets.MNIST( + root='../../data/', # 数据集保存路径 + train=True, # 训练集 + transform=transforms.ToTensor(), # 转换为Tensor并归一化到[0,1] + download=True # 自动下载(如果本地没有) + ) + + # 测试集 + test_dataset = torchvision.datasets.MNIST( + root='../../data/', + train=False, # 测试集 + transform=transforms.ToTensor() + ) + + # 数据加载器 + train_loader = 
torch.utils.data.DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True # 训练时打乱数据 + ) + + test_loader = torch.utils.data.DataLoader( + dataset=test_dataset, + batch_size=batch_size, + shuffle=False # 测试时不打乱数据 + ) + + # 实例化RNN模型(many-to-one架构:多个时间步输入,一个输出) + model = RNN(input_size, hidden_size, num_layers, num_classes).to(device) + + # 损失函数和优化器 + # 交叉熵损失:适用于多分类任务 + criterion = nn.CrossEntropyLoss() + # Adam优化器:自适应学习率优化算法 + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # 训练模型 + total_step = len(train_loader) # 每个epoch的总步数 + for epoch in range(num_epochs): # 遍历每个epoch + for i, (images, labels) in enumerate(train_loader): # 遍历每个批次 + # 将图像重塑为序列数据: + # MNIST图像原始形状:(batch_size, 1, 28, 28) + # 重塑后形状:(batch_size, sequence_length=28, input_size=28) + # 即把28x28的图像看作28个时间步,每个时间步输入28个像素 + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) # 标签移至设备 + + # 前向传播 + outputs = model(images) # 模型预测 + loss = criterion(outputs, labels) # 计算损失 + + # 反向传播和优化 + optimizer.zero_grad() # 清除梯度 + loss.backward() # 反向传播计算梯度 + optimizer.step() # 更新参数 + + # 打印训练信息 + if (i + 1) % 100 == 0: + print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + # 测试模型 + model.eval() # 设置模型为评估模式(关闭dropout等) + with torch.no_grad(): # 关闭梯度计算,节省内存和计算 + correct = 0 # 正确预测数 + total = 0 # 总样本数 + for images, labels in test_loader: # 遍历测试集 + # 重塑图像并移至设备 + images = images.reshape(-1, sequence_length, input_size).to(device) + labels = labels.to(device) + outputs = model(images) # 模型预测 + + # 获取预测结果:torch.max返回最大值和索引,索引即为预测类别 + _, predicted = torch.max(outputs.data, 1) + + total += labels.size(0) # 更新总样本数 + correct += (predicted == labels).sum().item() # 更新正确预测数 + + # 打印测试准确率 + print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) + + # 保存模型权重 + torch.save(model.state_dict(), 'model.ckpt') + print("Model weights saved to 'model.ckpt'") \ No newline at end of file diff --git a/tutorials/03-advanced/generative_adversarial_network/main.py b/tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py similarity index 100% rename from tutorials/03-advanced/generative_adversarial_network/main.py rename to tutorials/03-advanced/generative_adversarial_network/generative_adversarial_network.py diff --git a/tutorials/compilers/Inductor_demo.py b/tutorials/compilers/Inductor_demo.py new file mode 100644 index 00000000..1fbff4a1 --- /dev/null +++ b/tutorials/compilers/Inductor_demo.py @@ -0,0 +1,15 @@ +import torch + +def foo1(x1, x2): + a = torch.neg(x1) + b = torch.maximum(x2, a) + y = torch.cat([b], dim=0) + return y + + +# TORCH_COMPILE_DEBUG=1 python xx.py +if __name__ == '__main__': + x1 = torch.randint(256, (1, 8), dtype=torch.uint8) + x2 = torch.randint(256, (8390, 8), dtype=torch.uint8) + compiled_foo1 = torch.compile(foo1) + result = compiled_foo1(x1, x2) diff --git a/tutorials/compilers/basic_demo.py b/tutorials/compilers/basic_demo.py new file mode 100644 index 00000000..e9fd7d2b --- /dev/null +++ b/tutorials/compilers/basic_demo.py @@ -0,0 +1,203 @@ +import torch + +# def foo(x, y): +# a = torch.sin(x) +# b = torch.cos(y) +# return a + b +# +# +# opt_foo1 = torch.compile(foo) +# print(opt_foo1(torch.randn(3, 3), torch.randn(3, 3))) +# +# +# @torch.compile +# def opt_foo2(x, y): +# a = torch.sin(x) +# b = torch.cos(y) +# return a + b +# +# +# print(opt_foo2(torch.randn(3, 3), torch.randn(3, 3))) +# +# def inner(x): +# return 
torch.sin(x)
+#
+#
+# @torch.compile
+# def outer(x, y):
+#     a = inner(x)
+#     b = torch.cos(y)
+#     return a + b
+#
+#
+# print(outer(torch.randn(3, 3), torch.randn(3, 3)))
+#
+
+# t = torch.randn(10, 100)
+#
+#
+# class MyModule(torch.nn.Module):
+#     def __init__(self):
+#         super().__init__()
+#         self.lin = torch.nn.Linear(3, 3)
+#
+#     def forward(self, x):
+#         return torch.nn.functional.relu(self.lin(x))
+#
+#
+# mod1 = MyModule()
+# mod1.compile()
+# print(mod1(torch.randn(3, 3)))
+#
+# mod2 = MyModule()
+# mod2 = torch.compile(mod2)
+# print(mod2(torch.randn(3, 3)))
+
+
+# Demonstrating Speedups
+
+def foo3(x):
+    y = x + 1
+    z = torch.nn.functional.relu(y)
+    u = z * 2
+    return u
+
+
+# Returns the result of running `fn()` and the time it took for `fn()` to run,
+# in seconds. We use CUDA events and synchronization for the most accurate
+# measurements.
+def timed(fn):
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    result = fn()
+    end.record()
+    torch.cuda.synchronize()
+    return result, start.elapsed_time(end) / 1000
+
+
+opt_foo3 = torch.compile(foo3)
+inp = torch.randn(4096, 4096).cuda()
+
+
+def first_run():
+    """
+    Note that torch.compile appears to take much longer to finish than eager
+    execution. That is because torch.compile needs extra time on the first few
+    runs to compile the model. torch.compile reuses compiled code whenever it
+    can, so running the optimized model a few more times should show a clear
+    speedup over eager execution.
+    """
+    torch._logging.set_logs(graph_code=True)
+    print("compile:", timed(lambda: opt_foo3(inp))[1])
+    print("eager:", timed(lambda: foo3(inp))[1])
+
+
+"""
+eager time 0: 0.027955583572387695
+eager time 1: 0.0004986880123615265
+eager time 2: 0.00045683199167251585
+eager time 3: 0.00045158401131629945
+eager time 4: 0.00045363199710845946
+eager time 5: 0.00045363199710845946
+eager time 6: 0.0004556800127029419
+eager time 7: 0.0004505600035190582
+eager time 8: 0.00045043200254440307
+eager time 9: 0.0004546560049057007
+~~~~~~~~~~
+compile time 0: 0.434231201171875
+compile time 1: 0.00026624000072479246
+compile time 2: 0.00023552000522613525
+compile time 3: 0.0002234240025281906
+compile time 4: 0.00021913599967956544
+compile time 5: 0.00022220799326896668
+compile time 6: 0.0002181120067834854
+compile time 7: 0.0002242559939622879
+compile time 8: 0.0002181120067834854
+compile time 9: 0.00022118400037288665
+~~~~~~~~~~
+(eval) eager median: 0.0004541440010070801, compile median: 0.00022281599789857864, speedup: 2.038201948200314x
+"""
+
+
+def many_runs():
+    # turn off logging for now to prevent spam
+    torch._logging.set_logs(graph_code=False)
+    eager_times = []
+    for i in range(10):
+        _, eager_time = timed(lambda: foo3(inp))
+        eager_times.append(eager_time)
+        print(f"eager time {i}: {eager_time}")
+    print("~" * 10)
+
+    compile_times = []
+    for i in range(10):
+        _, compile_time = timed(lambda: opt_foo3(inp))
+        compile_times.append(compile_time)
+        print(f"compile time {i}: {compile_time}")
+    print("~" * 10)
+
+    import numpy as np
+
+    eager_med = np.median(eager_times)
+    compile_med = np.median(compile_times)
+    speedup = eager_med / compile_med
+    assert speedup > 1
+    print(
+        f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+    )
+    print("~" * 10)
+
+
+def bar1(a, b):
+    x = a / (torch.abs(a) + 1)
+    if b.sum() < 0:
+        b = b * -1
+    return x * b
+
+
+def bar(a, b):
+    x = a / (torch.abs(a) + 1)
+    b = torch.where(b.sum() < 0, -b, b)
+    return x * b
+
+
+# Graph Breaks
+"""The term "graph break" comes from the fact that torch.compile tries to capture
+and optimize a graph of PyTorch operations. When unsupported Python code is
+encountered, that graph has to be "broken". Graph breaks cost optimization
+opportunities, which may still be undesirable, but it beats silent errors or
+hard crashes."""
+
+
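+# Not in the original tutorial: a quick way to locate breaks is
+# ``torch._dynamo.explain``, which reruns a function under Dynamo and reports
+# each graph break with its reason. ``bar1`` above breaks on the
+# data-dependent ``if b.sum() < 0``, so it makes a good test subject.
+def explain_graph_breaks_demo():
+    explanation = torch._dynamo.explain(bar1)(torch.ones(10), torch.ones(10))
+    print(f"graph breaks: {explanation.graph_break_count}")
+    print(explanation.break_reasons)
+
+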
+@torch.compile(fullgraph=True)
+def bar_fixed(a, b):
+    x = a / (torch.abs(a) + 1)
+
+    def true_branch(y):
+        return y * -1
+
+    def false_branch(y):
+        # NOTE: torch.cond doesn't allow aliased outputs
+        return y.clone()
+
+    # Assign the cond result to ``b`` (the original code overwrote ``x`` here,
+    # which dropped the ``a / (|a| + 1)`` term from the product below).
+    b = torch.cond(b.sum() < 0, true_branch, false_branch, (b,))
+    return x * b
+
+
+def graph_breaks_fixed_demo():
+    torch._logging.set_logs(graph_code=True)
+    inp1 = torch.ones(10)
+    inp2 = torch.ones(10)
+    fixed = bar_fixed(inp1, inp2)
+    fixed1 = bar_fixed(inp1, -inp2)
+    print(f"fixed: {fixed}")
+    print(f"fixed1: {fixed1}")
+
+
+def graph_breaks_demo():
+    torch._logging.set_logs(graph_code=True)
+    opt_bar = torch.compile(bar)
+    inp1 = torch.ones(10)
+    inp2 = torch.ones(10)
+    opt_bar(inp1, inp2)
+    opt_bar(inp1, -inp2)
+
+
+if __name__ == '__main__':
+    graph_breaks_fixed_demo()
diff --git a/tutorials/compilers/benchmark_time.py b/tutorials/compilers/benchmark_time.py
new file mode 100644
index 00000000..53e9a642
--- /dev/null
+++ b/tutorials/compilers/benchmark_time.py
@@ -0,0 +1,68 @@
+import torch
+import torch.utils.benchmark as benchmark
+from torch.utils.benchmark import Language
+from torch.utils.benchmark import Timer
+import pickle
+import re
+from torch.utils.benchmark import CallgrindStats, FunctionCounts
+
+
+def cudaclu_timer():
+    cpp_timer = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({128});
+        """,
+        language=Language.CPP,
+    )
+
+    print(cpp_timer.blocked_autorange(min_run_time=1))
+
+
+# A/B testing with Callgrind
+"""
+Instruction counts are most useful because they allow fine-grained comparisons
+between computations, which is essential when analyzing performance.
+To demonstrate this in practice, let's compare multiplying two size-128 Tensors
+against a {128} x {1} multiplication, where the second Tensor is broadcast.
+"""
+
+
+def call_grind_timer():
+    broadcasting_stats = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({1});
+        """,
+        language=Language.CPP,
+    ).collect_callgrind().as_standardized().stats(inclusive=False)
+    # Let's round trip `broadcasting_stats` just to show that we can.
+    broadcasting_stats = pickle.loads(pickle.dumps(broadcasting_stats))
+
+    cpp_timer = Timer(
+        "x * y;",
+        """
+        auto x = torch::ones({128});
+        auto y = torch::ones({128});
+        """,
+        language=Language.CPP,
+    )
+
+    print(cpp_timer.blocked_autorange(min_run_time=1))
+    stats: CallgrindStats = cpp_timer.collect_callgrind()
+    inclusive_stats = stats.as_standardized().stats(inclusive=False)
+    print(inclusive_stats[:10])
+    # And now to diff the two tasks:
+    delta = broadcasting_stats - inclusive_stats
+
+    def extract_fn_name(fn: str):
+        """Trim everything except the function name."""
+        fn = ":".join(fn.split(":")[1:])
+        return re.sub(r"\(.+\)", "(...)", fn)
+
+    # We use `.transform` to make the diff readable:
+    print(delta.transform(extract_fn_name))
+
+
+if __name__ == '__main__':
+    cudaclu_timer()
diff --git a/tutorials/compilers/compiled_autograd_demo.py b/tutorials/compilers/compiled_autograd_demo.py
new file mode 100644
index 00000000..49e90b36
--- /dev/null
+++ b/tutorials/compilers/compiled_autograd_demo.py
@@ -0,0 +1,58 @@
+import torch
+
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(10, 10)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
+"""
+The Python interpreter calls Dynamo, because this call was decorated with @torch.compile.
+Dynamo intercepts the Python bytecode, simulates its execution, and records the
+operations into a graph.
+AOTDispatcher disables hooks and calls the autograd engine to compute gradients for
+model.linear.weight and model.linear.bias, recording the operations into a graph.
+Using torch.autograd.Function, AOTDispatcher rewrites the forward and backward
+implementation of train.
+Inductor generates a function corresponding to an optimized implementation of the
+AOTDispatcher forward and backward.
+Dynamo sets the optimized function to be evaluated next by the Python interpreter.
+The Python interpreter executes the optimized function, which runs loss = model(x).sum().
+The Python interpreter executes loss.backward(), calling into the autograd engine,
+which routes to the compiled autograd engine because we set
+torch._dynamo.config.compiled_autograd = True.
+Compiled autograd computes the gradients for model.linear.weight and model.linear.bias,
+recording the operations into a graph, including any hooks it encounters. In doing so
+it records the backward previously rewritten by AOTDispatcher. Compiled autograd then
+generates a new function corresponding to a fully-traced implementation of
+loss.backward(), and executes it with torch.compile in inference mode.
+The same steps apply recursively to the compiled autograd graph, but this time
+AOTDispatcher will not need to partition the graph.
+"""
+
+
+def train_demo():
+    model = Model()
+    x = torch.randn(10)
+    torch._dynamo.config.compiled_autograd = True
+    @torch.compile
+    def train(model, x):
+        loss = model(x).sum()
+        loss.backward()
+    train(model, x)
+
+
+def train_demo1():
+    # Inline variant: compile the forward and the backward separately instead
+    # of wrapping the whole training step in @torch.compile (the original mixed
+    # both styles, which compiled the already-compiled step).
+    model = Model()
+    x = torch.randn(10)
+    torch._dynamo.config.compiled_autograd = True
+    model = torch.compile(model)
+    loss = model(x).sum()
+    torch.compile(lambda: loss.backward(), fullgraph=True)()
+
+
+# TORCH_LOGS="compiled_autograd" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+# TORCH_LOGS="compiled_autograd_verbose" CUDA_VISIBLE_DEVICES=1 python compiled_autograd_demo.py
+if __name__ == '__main__':
+    train_demo1()
diff --git a/tutorials/compilers/custom_onnxscript_demo.py b/tutorials/compilers/custom_onnxscript_demo.py
new file mode 100644
index 00000000..d73705e6
--- /dev/null
+++ b/tutorials/compilers/custom_onnxscript_demo.py
@@ -0,0 +1,43 @@
+import torch
+import onnxscript
+
+# Opset 18 is the standard supported version as of PyTorch 2.6
+from onnxscript import opset18 as op
+
+
+class GeluModel(torch.nn.Module):
+    def forward(self, input_x):
+        return torch.ops.aten.gelu(input_x)
+
+
+# Create a namespace for the custom operator using ONNX Script
+# ``com.microsoft`` is an official ONNX Runtime namespace
+microsoft_op = onnxscript.values.Opset(domain="com.microsoft", version=1)
+
+# NOTE: The function signature (including 
parameter names) must match the signature of the unsupported PyTorch operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# NOTE: All attributes must be annotated with type hints. +# The function must be scripted using the ``@onnxscript.script()`` decorator when +# using operators from custom domains. This may be improved in future versions. +from onnxscript import FLOAT + + +@onnxscript.script(microsoft_op) +def custom_aten_gelu(self: FLOAT, approximate: str = "none") -> FLOAT: + return microsoft_op.Gelu(self) +x = torch.tensor([1.0]) + +onnx_program = torch.onnx.export( + GeluModel().eval(), + (x,), + dynamo=True, + custom_translation_table={ + torch.ops.aten.gelu.default: custom_aten_gelu, + }, +) + +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() +print(onnx_program.model) + +result = onnx_program(x)[0] +print(f"Result: {result}") +torch.testing.assert_close(result, torch.ops.aten.gelu(x)) \ No newline at end of file diff --git a/tutorials/compilers/fx_demo.py b/tutorials/compilers/fx_demo.py new file mode 100644 index 00000000..60194dae --- /dev/null +++ b/tutorials/compilers/fx_demo.py @@ -0,0 +1,119 @@ +import torch +import torch.fx +import torchvision.models as models +import statistics, tabulate, time +from typing import Any, Dict, List +from torch.fx import Interpreter + + +# 创建性能分析解释器 +class ProfilingInterpreter(Interpreter): + def __init__(self, mod: torch.nn.Module): + # Rather than have the user symbolically trace their model, + # we're going to do it in the constructor. As a result, the + # user can pass in any ``Module`` without having to worry about + # symbolic tracing APIs + gm = torch.fx.symbolic_trace(mod) + super().__init__(gm) + + # We are going to store away two things here: + # + # 1. A list of total runtimes for ``mod``. In other words, we are + # storing away the time ``mod(...)`` took each time this + # interpreter is called. + self.total_runtime_sec: List[float] = [] + # 2. A map from ``Node`` to a list of times (in seconds) that + # node took to run. This can be seen as similar to (1) but + # for specific sub-parts of the model. + self.runtimes_sec: Dict[torch.fx.Node, List[float]] = {} + + ###################################################################### + # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` + # method is the top-level entry point for execution of the model. We will + # want to intercept this so that we can record the total runtime of the + # model. + + def run(self, *args) -> Any: + # Record the time we started running the model + t_start = time.time() + # Run the model by delegating back into Interpreter.run() + return_val = super().run(*args) + # Record the time we finished running the model + t_end = time.time() + # Store the total elapsed time this model execution took in the + # ``ProfilingInterpreter`` + self.total_runtime_sec.append(t_end - t_start) + return return_val + + ###################################################################### + # Now, let's override ``run_node``. ``Interpreter`` calls ``run_node`` each + # time it executes a single node. We will intercept this so that we + # can measure and record the time taken for each individual call in + # the model. 
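+    # (Side note, not from the original: ``time.time()`` has fairly coarse
+    # resolution; ``time.perf_counter()`` is the usual choice when
+    # finer-grained per-op timings matter.)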
+ + def run_node(self, n: torch.fx.Node) -> Any: + # Record the time we started running the op + t_start = time.time() + # Run the op by delegating back into Interpreter.run_node() + return_val = super().run_node(n) + # Record the time we finished running the op + t_end = time.time() + # If we don't have an entry for this node in our runtimes_sec + # data structure, add one with an empty list value. + self.runtimes_sec.setdefault(n, []) + # Record the total elapsed time for this single invocation + # in the runtimes_sec data structure + self.runtimes_sec[n].append(t_end - t_start) + return return_val + + ###################################################################### + # Finally, we are going to define a method (one which doesn't override + # any ``Interpreter`` method) that provides us a nice, organized view of + # the data we have collected. + + def summary(self, should_sort: bool = False) -> str: + # Build up a list of summary information for each node + node_summaries: List[List[Any]] = [] + # Calculate the mean runtime for the whole network. Because the + # network may have been called multiple times during profiling, + # we need to summarize the runtimes. We choose to use the + # arithmetic mean for this. + mean_total_runtime = statistics.mean(self.total_runtime_sec) + + # For each node, record summary statistics + for node, runtimes in self.runtimes_sec.items(): + # Similarly, compute the mean runtime for ``node`` + mean_runtime = statistics.mean(runtimes) + # For easier understanding, we also compute the percentage + # time each node took with respect to the whole network. + pct_total = mean_runtime / mean_total_runtime * 100 + # Record the node's type, name of the node, mean runtime, and + # percent runtime. + node_summaries.append( + [node.op, str(node), mean_runtime, pct_total]) + + # One of the most important questions to answer when doing performance + # profiling is "Which op(s) took the longest?". We can make this easy + # to see by providing sorting functionality in our summary view + if should_sort: + node_summaries.sort(key=lambda s: s[2], reverse=True) + + # Use the ``tabulate`` library to create a well-formatted table + # presenting our summary information + headers: List[str] = [ + 'Op type', 'Op', 'Average runtime (s)', 'Pct total runtime' + ] + return tabulate.tabulate(node_summaries, headers=headers) + + +if __name__ == '__main__': + rn18 = models.resnet18() + rn18.eval() + input = torch.randn(5, 3, 224, 224) + # output = rn18(input) + # traced_rn18 = torch.fx.symbolic_trace(rn18) + # print(traced_rn18.graph) + + interp = ProfilingInterpreter(rn18) + interp.run(input) + print(interp.summary(True)) diff --git a/tutorials/compilers/nvidia_demo.py b/tutorials/compilers/nvidia_demo.py new file mode 100644 index 00000000..1c496595 --- /dev/null +++ b/tutorials/compilers/nvidia_demo.py @@ -0,0 +1,191 @@ +# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in +# order to reproduce the speedup numbers shown below and documented elsewhere. + +import torch +import warnings +from torchvision.models import densenet121 +import numpy as np + +gpu_ok = False +if torch.cuda.is_available(): + device_cap = torch.cuda.get_device_capability() + if device_cap in ((7, 0), (8, 0), (9, 0)): + gpu_ok = True + +if not gpu_ok: + warnings.warn( + "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " + "than expected." + ) + + +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. 
We use CUDA events and synchronization for the most accurate +# measurements. +def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +# Generates random input and targets data for the model, where `b` is +# batch size. +def generate_data(b): + return ( + torch.randn(b, 3, 128, 128).to().cuda(), + torch.randint(1000, (b,)).cuda(), + ) + + +N_ITERS = 10 + + +def init_model(): + return densenet121().cuda() + + +model = init_model() + +# Note that we generally recommend directly compiling a torch.nn.Module by calling +# its .compile() method. +model_opt = init_model() +model_opt.compile(mode="reduce-overhead") + + +def first_demo(): + inp = generate_data(16)[0] + with torch.no_grad(): + print("eager:", timed(lambda: model(inp))[1]) + print("compile:", timed(lambda: model_opt(inp))[1]) + + +""" +(eval) eager median: 0.01525604772567749, compile median: 0.003931119918823242, speedup: 3.8808400762916184x +eager eval time 0: 0.2951763916015625 +eager eval time 1: 0.01678335952758789 +eager eval time 2: 0.015734944343566894 +eager eval time 3: 0.015243231773376465 +eager eval time 4: 0.015268863677978516 +eager eval time 5: 0.01522979164123535 +eager eval time 6: 0.015177727699279785 +eager eval time 7: 0.015617024421691895 +eager eval time 8: 0.015202367782592773 +eager eval time 9: 0.015126527786254883 +~~~~~~~~~~ +compile eval time 0: 5.565470703125 +compile eval time 1: 0.24912281799316408 第二次还是慢了,尽管比第一次运行快得多。这是因为 "reduce-overhead" 模式会为 CUDA 图运行几次预热迭代。 +compile eval time 2: 0.00450867223739624 +compile eval time 3: 0.004577280044555664 +compile eval time 4: 0.003706687927246094 +compile eval time 5: 0.0037672960758209227 +compile eval time 6: 0.003935231924057007 +compile eval time 7: 0.003768320083618164 +compile eval time 8: 0.003927007913589477 +compile eval time 9: 0.0038635520935058594 +""" + + +def predict_many_demo(): + eager_times = [] + for i in range(N_ITERS): + inp = generate_data(16)[0] + with torch.no_grad(): + _, eager_time = timed(lambda: model(inp)) + eager_times.append(eager_time) + print(f"eager eval time {i}: {eager_time}") + + print("~" * 10) + + compile_times = [] + for i in range(N_ITERS): + inp = generate_data(16)[0] + with torch.no_grad(): + _, compile_time = timed(lambda: model_opt(inp)) + compile_times.append(compile_time) + print(f"compile eval time {i}: {compile_time}") + print("~" * 10) + + import numpy as np + + eager_med = np.median(eager_times) + compile_med = np.median(compile_times) + speedup = eager_med / compile_med + assert speedup > 1 + print( + f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x" + ) + print("~" * 10) + + +opt = torch.optim.Adam(model.parameters()) + + +def train(mod, data): + opt.zero_grad(True) + pred = mod(data[0]) + loss = torch.nn.CrossEntropyLoss()(pred, data[1]) + loss.backward() + opt.step() + +""" +eager train time 0: 0.6821947631835937 +eager train time 1: 0.0516577262878418 +eager train time 2: 0.048728256225585936 +eager train time 3: 0.047841407775878905 +eager train time 4: 0.04823257446289062 +eager train time 5: 0.048595008850097654 +eager train time 6: 0.057622528076171874 +eager train time 7: 0.05626262283325195 +eager train time 8: 0.057923583984375 +eager train time 9: 0.058123264312744144 +~~~~~~~~~~ + +compile train time 0: 141.419421875 +compile train time 1: 8.4247080078125 +compile train time 2: 
0.018790399551391602 +compile train time 3: 0.010836992263793945 +compile train time 4: 0.010805248260498047 +compile train time 5: 0.010437631607055664 +compile train time 6: 0.010218496322631837 +compile train time 7: 0.012146688461303711 +compile train time 8: 0.012992511749267579 +compile train time 9: 0.012563455581665038 +~~~~~~~~~~ +(train) eager median: 0.05396017456054687, compile median: 0.012355072021484375, speedup: 4.3674512351457695x +""" +def train_many_demo(): + eager_times = [] + for i in range(N_ITERS): + inp = generate_data(16) + _, eager_time = timed(lambda: train(model, inp)) + eager_times.append(eager_time) + print(f"eager train time {i}: {eager_time}") + print("~" * 10) + # Note that because we are compiling a regular Python function, we do not + # call any .compile() method. + train_opt = torch.compile(train, mode="reduce-overhead") + + compile_times = [] + for i in range(N_ITERS): + inp = generate_data(16) + _, compile_time = timed(lambda: train_opt(model, inp)) + compile_times.append(compile_time) + print(f"compile train time {i}: {compile_time}") + print("~" * 10) + + eager_med = np.median(eager_times) + compile_med = np.median(compile_times) + speedup = eager_med / compile_med + assert speedup > 1 + print( + f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x" + ) + print("~" * 10) + + +if __name__ == '__main__': + train_many_demo() diff --git a/tutorials/compilers/onnx_demo.py b/tutorials/compilers/onnx_demo.py new file mode 100644 index 00000000..b3fba746 --- /dev/null +++ b/tutorials/compilers/onnx_demo.py @@ -0,0 +1,109 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import onnx +import onnxruntime +import onnxscript +import os + + +def get_version(): + print(torch.__version__) + print(onnxscript.__version__) + print(onnxruntime.__version__) + + +"""简单的图像分类器模型""" + + +class ImageClassifierModel(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x: torch.Tensor): + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def onnx_demo(): + # 创建示例输入(固定随机种子以确保可重复性) + torch.manual_seed(42) + example_inputs = (torch.randn(1, 1, 32, 32),) + onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs] + print(f"Input length: {len(onnx_inputs)}") + print(f"Sample input shape: {onnx_inputs[0].shape}") + + # 创建一个PyTorch模型实例,用于导出和比较 + torch_model = ImageClassifierModel() + + # 生成ONNX模型(传入同一个模型实例) + model_2_onnx(torch_model, example_inputs) + + # 加载ONNX模型并运行推理 + ort_session = onnxruntime.InferenceSession( + "./image_classifier_model.onnx", providers=["CPUExecutionProvider"] + ) + + onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in + zip(ort_session.get_inputs(), onnx_inputs)} + + # ONNX Runtime returns a list of outputs + onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0] + + # 使用同一个PyTorch模型进行推理 + torch_outputs = torch_model(*example_inputs) + + print(f"PyTorch output shape: {torch_outputs.shape}") + print(f"ONNX Runtime output shape: {onnxruntime_outputs.shape}") + + # 直接比较整个张量而不是逐个元素比较 + try: + torch.testing.assert_close(torch_outputs, torch.tensor(onnxruntime_outputs), rtol=1e-3, atol=1e-3) + print("PyTorch and 
ONNX Runtime output matched!") + except AssertionError as e: + print(f"Outputs didn't match: {e}") + # 输出详细的差异信息 + print("\nPyTorch output:") + print(torch_outputs) + print("\nONNX Runtime output:") + print(onnxruntime_outputs) + print("\nDifference:") + print(torch_outputs - torch.tensor(onnxruntime_outputs)) + + print(f"Output length: {onnxruntime_outputs.shape[1]}") + print(f"Sample output: {onnxruntime_outputs[0][:5]}...") + + +def model_2_onnx(torch_model, example_inputs): + # 导出模型为ONNX格式,使用默认的opset版本 + torch.onnx.export( + torch_model, + example_inputs, + "image_classifier_model.onnx", + export_params=True, + # 不指定opset_version,让PyTorch自动选择合适的版本 + do_constant_folding=True, + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + ) + + # 验证导出的模型 + onnx_model = onnx.load("image_classifier_model.onnx") + onnx.checker.check_model(onnx_model) + print("ONNX model generated and validated successfully!") + print(f"ONNX model opset version: {onnx_model.opset_import[0].version}") + + +if __name__ == '__main__': + onnx_demo() \ No newline at end of file diff --git a/tutorials/compilers/onnxscript_demo.py b/tutorials/compilers/onnxscript_demo.py new file mode 100644 index 00000000..80ac8aee --- /dev/null +++ b/tutorials/compilers/onnxscript_demo.py @@ -0,0 +1,43 @@ +import torch +import onnxscript + +# Opset 18 is the standard supported version as of PyTorch 2.6 +from onnxscript import opset18 as op + + +# Create a model that uses the operator torch.ops.aten.add.Tensor +class Model(torch.nn.Module): + def forward(self, input_x, input_y): + return torch.ops.aten.add.Tensor(input_x, input_y) + + +# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# All attributes must be annotated with type hints. +def custom_aten_add(self, other, alpha: float = 1.0): + if alpha != 1.0: + alpha = op.CastLike(alpha, other) + other = op.Mul(other, alpha) + # To distinguish the custom implementation from the builtin one, we switch the order of the inputs + return op.Add(other, self) + + +x = torch.tensor([1.0]) +y = torch.tensor([2.0]) + +# Then we provide the custom implementation to the ONNX exporter as a ``custom_translation_table``. 
+onnx_program = torch.onnx.export( + Model().eval(), + (x, y), + dynamo=True, + custom_translation_table={ + torch.ops.aten.add.Tensor: custom_aten_add, + }, +) +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() +print(onnx_program.model) + +result = onnx_program(x, y)[0] +print(f"Result: {result}") +torch.testing.assert_close(result, torch.tensor([3.0])) \ No newline at end of file diff --git a/tutorials/compilers/set_stance_demo.py b/tutorials/compilers/set_stance_demo.py new file mode 100644 index 00000000..255bb49f --- /dev/null +++ b/tutorials/compilers/set_stance_demo.py @@ -0,0 +1,100 @@ +import torch + + +@torch.compile +def my_big_model(x): + return torch.relu(x) + + +# fail_on_recompile 防止重新编译 +def fail_on_recompile(): + # first compilation + my_big_model(torch.randn(3)) + + with torch.compiler.set_stance("fail_on_recompile"): + my_big_model(torch.randn(3)) # no recompilation - OK + try: + # 这里 shape 改变了,会触发 recompilation + my_big_model(torch.randn(4)) # recompilation - error + except Exception as e: + print(e) + + +@torch.compile +def my_huge_model(x): + if torch.compiler.is_compiling(): + return x + 1 + else: + return x - 1 + + +""" +报错过于 disruptive,我们可以改用 "eager_on_recompile",它将导致 torch.compile 回退到立即执行模式而不是报错。 +如果预计重新编译不会频繁发生,但一旦需要,我们宁愿承担立即执行的成本而不是重新编译的成本,那么这可能很有用。 +""" + + +def eager_on_recompile(): + # first compilation + print(my_huge_model(torch.zeros(3))) # 1 + with torch.compiler.set_stance("eager_on_recompile"): + print(my_huge_model(torch.zeros(3))) # 1 + print(my_huge_model(torch.zeros(4))) # -1 + print(my_huge_model(torch.zeros(3))) # 1 + + +# 衡量性能提升 +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. We use CUDA events and synchronization for the most accurate +# measurements. 
+def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +@torch.compile +def my_gigantic_model(x, y): + x = x @ y + x = x @ y + x = x @ y + return x + + +""" +eager: 0.0004822399914264679 +compiled: 0.00010444799810647964 +""" + + +def force_eager_demo(): + inps = torch.randn(5, 5), torch.randn(5, 5) + with torch.compiler.set_stance("force_eager"): + print("eager:", timed(lambda: my_gigantic_model(*inps))[1]) + # warmups + for _ in range(3): + my_gigantic_model(*inps) + print("compiled:", timed(lambda: my_gigantic_model(*inps))[1]) + + +@torch.compile +def my_humongous_model(x): + return torch.sin(x, x) + +def fast_find_error(): + try: + # sin() takes 1 positional argument but 2 were given + with torch.compiler.set_stance("force_eager"): + print(my_humongous_model(torch.randn(3))) + # this call to the compiled model won't run + print(my_humongous_model(torch.randn(3))) + except Exception as e: + print(e) + +if __name__ == '__main__': + fast_find_error() + diff --git a/tutorials/compilers/torch_compile.py b/tutorials/compilers/torch_compile.py new file mode 100644 index 00000000..77ffce45 --- /dev/null +++ b/tutorials/compilers/torch_compile.py @@ -0,0 +1,45 @@ +import torch +import torch.utils.benchmark as benchmark +from torch.utils.benchmark import Language +from torch.utils.benchmark import Timer +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") +output = model(input) +output.sum().backward() +opt = torch.optim.Adam(model.parameters(), lr=0.01) + + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + + +def benchmark_torch_function_in_microseconds(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return t0.blocked_autorange().mean * 1e6 + + +def warmup(): + for _ in range(5): + fn() + + +def diff(): + eager_runtime = benchmark_torch_function_in_microseconds(opt.step) + compiled_runtime = benchmark_torch_function_in_microseconds(fn) + + assert eager_runtime > compiled_runtime + + print(f"eager runtime: {eager_runtime}us") + print(f"compiled runtime: {compiled_runtime}us") + + + + +if __name__ == '__main__': + diff() + diff --git a/tutorials/ddp/basic_ddp_demo.py b/tutorials/ddp/basic_ddp_demo.py new file mode 100644 index 00000000..3058607d --- /dev/null +++ b/tutorials/ddp/basic_ddp_demo.py @@ -0,0 +1,137 @@ +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.optim as optim +import os +from torch.nn.parallel import DistributedDataParallel as DDP + +import os +import sys +import tempfile +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp + +from torch.nn.parallel import DistributedDataParallel as DDP + + +# On Windows platform, the torch.distributed package only +# supports Gloo backend, FileStore and TcpStore. +# For FileStore, set init_method parameter in init_process_group +# to a local file. Example as follow: +# init_method="file:///f:/libtmp/some_file" +# dist.init_process_group( +# "gloo", +# rank=rank, +# init_method=init_method, +# world_size=world_size) +# For TcpStore, same way as on Linux. 
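+
+# A minimal sketch of the FileStore variant described above. This helper is
+# hypothetical (it is not used by the demos below), and the shared file path
+# is an assumption -- any path visible to every process works.
+def setup_file_store(rank, world_size, init_file="/tmp/ddp_init_file"):
+    # Gloo plus a shared init file is the portable choice on Windows.
+    dist.init_process_group(
+        "gloo",
+        init_method=f"file://{init_file}",
+        rank=rank,
+        world_size=world_size,
+    )
+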
+ +def setup(rank, world_size): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + + # We want to be able to train our model on an `accelerator `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() + print('Accelerator:', acc) + backend = torch.distributed.get_default_backend_for_device(acc) + # initialize the process group + dist.init_process_group(backend, rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +def demo_basic(rank, world_size): + print(f"Running basic DDP example on rank {rank}.") + setup(rank, world_size) + + # create model and move it to GPU with id rank + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + cleanup() + print(f"Finished running basic DDP example on rank {rank}.") + + +def run_demo(demo_fn, world_size): + mp.spawn(demo_fn, + args=(world_size,), + nprocs=world_size, + join=True) + + +def demo_checkpoint(rank, world_size): + print(f"Running DDP checkpoint example on rank {rank}.") + setup(rank, world_size) + + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint" + print(f"Checkpoint path: {CHECKPOINT_PATH}") + if rank == 0: + # All processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes. + # Therefore, saving it in one process is sufficient. + torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) + + # Use a barrier() to make sure that process 1 loads the model after process + # 0 saves it. + dist.barrier() + # We want to be able to train our model on an `accelerator `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() + # configure map_location properly + map_location = {f'{acc}:0': f'{acc}:{rank}'} + ddp_model.load_state_dict( + torch.load(CHECKPOINT_PATH, map_location=map_location, weights_only=True)) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + + loss_fn(outputs, labels).backward() + optimizer.step() + + # Not necessary to use a dist.barrier() to guard the file deletion below + # as the AllReduce ops in the backward pass of DDP already served as + # a synchronization. + + if rank == 0: + os.remove(CHECKPOINT_PATH) + + cleanup() + print(f"Finished running DDP checkpoint example on rank {rank}.") + + +if __name__ == '__main__': + run_demo(demo_checkpoint, world_size=2) diff --git a/tutorials/ddp/distributed_demo.py b/tutorials/ddp/distributed_demo.py new file mode 100644 index 00000000..272a27ed --- /dev/null +++ b/tutorials/ddp/distributed_demo.py @@ -0,0 +1,79 @@ +"""run.py:""" +# !/usr/bin/env python +import os +import sys +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +# def run(rank, size): +# """ Distributed function to be implemented later. 
""" +# """ 分布式函数示例 """ +# print(f"Rank {rank} of {size} is running.") +# # 简单的分布式通信示例 +# tensor = torch.tensor([rank], dtype=torch.float32) +# dist.all_reduce(tensor, op=dist.ReduceOp.SUM) +# print(f"Rank {rank} reduced tensor: {tensor.item()}") +# +# """Blocking point-to-point communication.""" +# +# +# def run(rank, size): +# tensor = torch.zeros(1) +# if rank == 0: +# tensor += 1 +# # Send the tensor to process 1 +# dist.send(tensor=tensor, dst=1) +# else: +# # Receive tensor from process 0 +# dist.recv(tensor=tensor, src=0) +# print('Rank ', rank, ' has data ', tensor[0]) +"""Non-blocking point-to-point communication.""" + + +# def run(rank, size): +# tensor = torch.zeros(1) +# req = None +# if rank == 0: +# tensor += 1 +# # Send the tensor to process 1 +# req = dist.isend(tensor=tensor, dst=1) +# print('Rank 0 started sending') +# else: +# # Receive tensor from process 0 +# req = dist.irecv(tensor=tensor, src=0) +# print('Rank 1 started receiving') +# req.wait() +# print('Rank ', rank, ' has data ', tensor[0]) +def run(rank, size): + """ Simple collective communication. """ + group = dist.new_group([0, 1]) + tensor = torch.ones(1) + print(f"tensor: {tensor}") + dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group) + print('Rank ', rank, ' has data ', tensor[0]) + + +def init_process(rank, size, fn, backend='gloo'): + """ Initialize the distributed environment. """ + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29500' + dist.init_process_group(backend, rank=rank, world_size=size) + fn(rank, size) + + +if __name__ == "__main__": + world_size = 2 + processes = [] + if "google.colab" in sys.modules: + print("Running in Google Colab") + mp.get_context("spawn") + else: + mp.set_start_method("spawn") + for rank in range(world_size): + p = mp.Process(target=init_process, args=(rank, world_size, run)) + p.start() + processes.append(p) + + for p in processes: + p.join() diff --git a/tutorials/ddp/pipelining_tutorial.py b/tutorials/ddp/pipelining_tutorial.py new file mode 100644 index 00000000..f0337ab6 --- /dev/null +++ b/tutorials/ddp/pipelining_tutorial.py @@ -0,0 +1,120 @@ +import os +import torch.distributed as dist +import torch +from torch.distributed.pipelining import pipeline, SplitPoint, PipelineStage, ScheduleGPipe +import torch +import torch.nn as nn +from dataclasses import dataclass + +global rank, device, pp_group, stage_index, num_stages + + +@dataclass +class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = 10000 + + +class Transformer(nn.Module): + def __init__(self, model_args: ModelArgs): + super().__init__() + + self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) + + # Using a ModuleDict lets us delete layers witout affecting names, + # ensuring checkpoints will correctly save and load. 
+ self.layers = torch.nn.ModuleDict() + for layer_id in range(model_args.n_layers): + self.layers[str(layer_id)] = nn.TransformerDecoderLayer(model_args.dim, model_args.n_heads) + + self.norm = nn.LayerNorm(model_args.dim) + self.output = nn.Linear(model_args.dim, model_args.vocab_size) + + def forward(self, tokens: torch.Tensor): + # Handling layers being 'None' at runtime enables easy pipeline splitting + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, h) + + h = self.norm(h) if self.norm else h + output = self.output(h).clone() if self.output else h + return output + + +def init_distributed(): + global rank, device, pp_group, stage_index, num_stages + rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu") + dist.init_process_group() + + # This group can be a sub-group in the N-D parallel case + pp_group = dist.new_group() + stage_index = rank + num_stages = world_size + + +def manual_model_split(model) -> PipelineStage: + if stage_index == 0: + # prepare the first stage model + for i in range(4, 8): + del model.layers[str(i)] + model.norm = None + model.output = None + + elif stage_index == 1: + # prepare the second stage model + for i in range(4): + del model.layers[str(i)] + model.tok_embeddings = None + + stage = PipelineStage( + model, + stage_index, + num_stages, + device, + ) + return stage + + +if __name__ == "__main__": + init_distributed() + num_microbatches = 4 + model_args = ModelArgs() + model = Transformer(model_args) + + # Dummy data + x = torch.ones(32, 500, dtype=torch.long) + y = torch.randint(0, model_args.vocab_size, (32, 500), dtype=torch.long) + example_input_microbatch = x.chunk(num_microbatches)[0] + + # Option 1: Manual model splitting + stage = manual_model_split(model) + + # Option 2: Tracer model splitting + # stage = tracer_model_split(model, example_input_microbatch) + + model.to(device) + x = x.to(device) + y = y.to(device) + + + def tokenwise_loss_fn(outputs, targets): + loss_fn = nn.CrossEntropyLoss() + outputs = outputs.reshape(-1, model_args.vocab_size) + targets = targets.reshape(-1) + return loss_fn(outputs, targets) + + + schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn) + + if rank == 0: + schedule.step(x) + elif rank == 1: + losses = [] + output = schedule.step(target=y, losses=losses) + print(f"losses: {losses}") + dist.destroy_process_group() diff --git a/tutorials/ddp/pytorch_elastic.py b/tutorials/ddp/pytorch_elastic.py new file mode 100644 index 00000000..01a44ff6 --- /dev/null +++ b/tutorials/ddp/pytorch_elastic.py @@ -0,0 +1,47 @@ +import os +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim + +from torch.nn.parallel import DistributedDataParallel as DDP + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +def demo_basic(): + torch.accelerator.set_device_index(int(os.environ["LOCAL_RANK"])) + acc = torch.accelerator.current_accelerator() + print('Accelerator:', acc) + backend = torch.distributed.get_default_backend_for_device(acc) + dist.init_process_group(backend) + rank = dist.get_rank() + print(f"Start running basic DDP example on rank {rank}.") + # create model and 
move it to GPU with id rank + device_id = rank % torch.accelerator.device_count() + print(f"Device ID: {device_id}") + model = ToyModel().to(device_id) + ddp_model = DDP(model, device_ids=[device_id]) + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(device_id) + loss_fn(outputs, labels).backward() + optimizer.step() + dist.destroy_process_group() + print(f"Finished running basic DDP example on rank {rank}.") +""" +torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py +""" +if __name__ == "__main__": + demo_basic() diff --git a/tutorials/tcp/libuv_demo.py b/tutorials/tcp/libuv_demo.py new file mode 100644 index 00000000..1c9d5ac8 --- /dev/null +++ b/tutorials/tcp/libuv_demo.py @@ -0,0 +1,30 @@ +import logging +import os + +from time import perf_counter + +import torch +import torch.distributed as dist + +logging.basicConfig(level=logging.INFO) +logger: logging.Logger = logging.getLogger(__name__) + +if __name__ == '__main__': + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + start = perf_counter() + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) diff --git a/tutorials/torch-distributed/init-process-group.py b/tutorials/torch-distributed/init-process-group.py new file mode 100644 index 00000000..c034fe29 --- /dev/null +++ b/tutorials/torch-distributed/init-process-group.py @@ -0,0 +1,25 @@ +import os +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + + + +def init_process(rank, world_size): + print(f"进程已启动: 此进程的 rank 是 {rank}") + + +def main(): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + world_size = torch.cuda.device_count() + print(f"准备启动 {world_size} 个进程...") + mp.spawn( + init_process, + args=(world_size,), + nprocs=world_size, + join=True + ) + +if __name__ == "__main__": + main() diff --git a/tutorials/torch-distributed/readme.md b/tutorials/torch-distributed/readme.md new file mode 100644 index 00000000..0373b825 --- /dev/null +++ b/tutorials/torch-distributed/readme.md @@ -0,0 +1,94 @@ +
+init_process_group + +```python +import os +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + + +def init_process(rank, world_size): + print(f"进程已启动: 此进程的 rank 是 {rank}") + + # 设置当前进程使用的 GPU + torch.cuda.set_device(rank) + + try: + # 加入进程组 + print(f"进程 {rank} 正在加入进程组...") + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + print(f"进程 {rank} 已成功加入进程组") + + # 验证身份 + assert rank == dist.get_rank() + assert world_size == dist.get_world_size() + + # 准备当前进程的信息 + process_info = ( + f"\n进程 {rank} 信息:\n" + f"- Device: {torch.cuda.current_device()}\n" + f"- GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}\n" + ) + + # 将字符串转换为固定长度的张量 + max_len = 100 # 确保足够长以容纳信息 + process_info_tensor = torch.zeros(max_len, dtype=torch.int32, device='cuda') + process_info_bytes = process_info.encode('utf-8') + process_info_tensor[:len(process_info_bytes)] = torch.tensor([b for b in process_info_bytes], dtype=torch.int32) + + # 创建用于收集所有进程信息的张量列表 + gathered_tensors = [torch.zeros(max_len, dtype=torch.int32, device='cuda') for _ in range(world_size)] + + # 使用 all_gather 收集所有进程的信息 + dist.all_gather(gathered_tensors, process_info_tensor) + + + if rank == 0: + print("=============== 所有进程信息 ===============") + for tensor in gathered_tensors: + info_bytes = tensor.cpu().numpy().astype('uint8').tobytes() + info_str = info_bytes.decode('utf-8', 'ignore').strip('\x00') + print(info_str) + + # 创建张量并进行通信 + tensor = torch.ones(1).cuda() * rank + print(f"进程 {rank} 的原始张量值: {tensor.item()}") + + # 所有进程同步点 + dist.all_reduce(tensor, op=dist.ReduceOp.SUM) + print(f"进程 {rank} 的最终张量值: {tensor.item()}") + + finally: + dist.destroy_process_group() + +def main(): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + + world_size = torch.cuda.device_count() + print(f"准备启动 {world_size} 个进程...") + + mp.spawn( + init_process, + args=(world_size,), + nprocs=world_size, + join=True + ) + + #! 等价于通过以下代码启动进程 + # processes = [] + # for rank in range(world_size): + # p = mp.Process(target=init_process, args=(rank, world_size)) + # p.start() + # processes.append(p) + + # # 相当于 join=True 的效果 + # for p in processes: + # p.join() + +if __name__ == "__main__": + main() +``` + +
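+
+No extra launcher is needed here: `mp.spawn` starts one worker process per
+GPU itself, so (assuming a machine with at least two CUDA GPUs) running
+`python init-process-group.py` is enough.
+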
\ No newline at end of file diff --git a/tutorials/torch-layout/demo1.py b/tutorials/torch-layout/demo1.py new file mode 100644 index 00000000..72f49858 --- /dev/null +++ b/tutorials/torch-layout/demo1.py @@ -0,0 +1,144 @@ +import torch + + +def stride_demo(): + """ + 连续和非连续布局 + 1.按照张量的逻辑形状(比如 2x3、3x2),以C 风格(行优先) 遍历每一个元素时,访问的内存地址是否是连续递增的。 + 为什么内存布局重要? + 性能影响:连续布局的张量访问内存时,CPU/GPU 的缓存命中率更高,运算速度更快;非连续张量可能因为内存跳跃访问导致性能下降。 + 操作限制:部分 PyTorch 操作(如view、resize_)仅支持连续张量,非连续张量会抛出RuntimeError。 + 内存效率:像转置这样的操作通过修改布局而非复制数据,能节省大量内存。 + """ + # 从0到12 分成三行四列 + x = torch.arange(12).view(3, 4) + print(f"view: {x}") + print(f"shape: {x.shape}") + # 在指定维度上从一个元素调到下一个元素所需的距离 dim 是两个数字,为了访问下一行需要往前移动4步,下一列应该向前移动1步 + print(f"stride: {x.stride()}") + print(f"is_contiguous: {x.is_contiguous()}") + print("\n***********************************\n") + y = x.t() + + # 这里没有真正的转置,只是改变了视图的 stride “如何从内存中读取数据” + print(f"v_t is_contiguous: {y.is_contiguous()}") + print(f"v_t: {y}") + print(f"v_t shape: {y.shape}") + print(f"v_t stride: {y.stride()}") + print("\n***********************************\n") + z = y.contiguous() + print(f"v_t is_contiguous: {z.is_contiguous()}") + print(f"v_t: {z}") + print(f"v_t shape: {z.shape}") + print(f"v_t stride: {z.stride()}") + + +def storage_demo(): + print(f"PyTorch版本: {torch.__version__}") + print( + f"操作系统: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}" + ) + + x = torch.tensor([[0, 1, 2], [3, 4, 5]], dtype=torch.float32) + storage_x = x.storage() + # 1. storage永远是一维的,不管Tensor是几维 + print(f"storage:\n{storage_x}\n") + print("storage的类型:", type(storage_x)) # torch.storage._TypedStorage + print("storage的长度:", len(storage_x)) # 6(元素总数) + print("storage_x的id:", id(storage_x)) + + # 2. 转置后的Tensor共享同一个storage(物理内存没复制) + y = x.t() + storage_y = y.storage() + print("x的数据指针:", x.data_ptr()) + print("y的数据指针:", y.data_ptr()) + print("x和y的数据指针是否相同:", x.data_ptr() == y.data_ptr()) + + print("storage_y的id:", id(storage_y)) + + +def shared_storage_demo(): + """ + 验证多个tensor共享storage时,修改一个会影响其他所有tensor + """ + print("=" * 60) + print("验证多个tensor共享storage的行为") + print("=" * 60) + + # 创建原始tensor + x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=torch.float32) + + # 创建多个共享storage的tensor + y = x.t() # 转置 + z = x.view(-1) # 展平成一维 + w = x[1:, 2:] # 切片 + v = x.reshape(2, 6) # reshape(如果可能,会共享storage) + + print("\n【初始状态】") + print(f"x (原始) =\n{x}") + print(f"y (转置) =\n{y}") + print(f"z (展平) = {z}") + print(f"w (切片) =\n{w}") + print(f"v (reshape) =\n{v}") + + # 验证它们是否共享存储(通过data_ptr比较) + print("\n【内存地址验证】") + print(f"x.data_ptr() = {x.data_ptr()}") + print(f"y.data_ptr() = {y.data_ptr()}") + print(f"z.data_ptr() = {z.data_ptr()}") + print(f"w.data_ptr() = {w.data_ptr()} (切片会有偏移)") + print(f"v.data_ptr() = {v.data_ptr()}") + print(f"\nx和y是否共享底层内存: {x.data_ptr() == y.data_ptr()}") + print(f"x和z是否共享底层内存: {x.data_ptr() == z.data_ptr()}") + print(f"x和v是否共享底层内存: {x.data_ptr() == v.data_ptr()}") + + # 修改x的一个元素 + print("\n【修改x[0, 0] = 999】") + x[0, 0] = 999 + + print(f"x =\n{x}") + print(f"y =\n{y}") # y[0, 0]应该也变成999 + print(f"z = {z}") # z[0]应该也变成999 + print(f"w =\n{w}") # w不受影响,因为它是切片[1:, 2:] + print(f"v =\n{v}") # v[0, 0]应该也变成999 + + # 修改y的一个元素 + print("\n【修改y[1, 1] = 888】") + y[1, 1] = 888 + + print(f"x =\n{x}") # x[1, 1]应该也变成888 + print(f"y =\n{y}") + print(f"z = {z}") # z[5]应该也变成888 + print(f"v =\n{v}") # v相应位置也会变化 + + # 修改z的一个元素 + print("\n【修改z[10] = 777】") + z[10] = 777 + + print(f"x =\n{x}") # x[2, 2]应该也变成777 + print(f"y =\n{y}") + print(f"z = {z}") + print(f"w 
=\n{w}") # w[1, 0]应该也变成777(因为w是x[1:, 2:]) + + # 对比:使用clone()创建真正的副本 + print("\n【对比:使用clone()创建独立副本】") + x_copy = x.clone() + print(f"x_copy.data_ptr() = {x_copy.data_ptr()}") + print(f"x_copy与x是否共享内存: {x_copy.data_ptr() == x.data_ptr()}") + + x[0, 1] = 666 + print("\n修改x[0, 1] = 666后:") + print(f"x =\n{x}") + print(f"x_copy =\n{x_copy} (不受影响)") + + print("\n" + "=" * 60) + + +if __name__ == "__main__": + # shared_storage_demo() + x = torch.randn(2, 3, 4) + print(f"x: {x}") + print(f"x shape: {x.shape}") + print(f"x stride: {x.stride()}") + # import json + # print(json.dumps(x.tolist(), indent=4))
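+    # For a contiguous tensor of shape (2, 3, 4) the stride is computed
+    # right-to-left: (3 * 4, 4, 1) == (12, 4, 1), i.e. stepping along dim 0
+    # skips one whole 3x4 block of 12 elements in storage.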