
Commit 890054c ("Lesson 10" / 第10节课程)

1 parent: 991df7f

10 files changed: +86 additions, −26 deletions

demo/main.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -17,13 +17,13 @@ int32_t generate(const model::LLama2Model& model, const std::string& sentence, i
     pos_tensor.index<int32_t>(0) = pos;
     if (pos < prompt_len - 1) {
       tensor::Tensor input = model.fill_input(pos_tensor, prompt_embedding, is_prompt);
-      next = model.forward(input, pos_tensor, is_prompt, next);
+      model.predict(input, pos_tensor, is_prompt, next);
     } else {
       is_prompt = false;
       tokens = std::vector<int32_t>{next};
       const auto& token_embedding = model.embedding(tokens);
       tensor::Tensor input = model.fill_input(pos_tensor, token_embedding, is_prompt);
-      model.forward(input, pos_tensor, is_prompt, next);
+      model.predict(input, pos_tensor, is_prompt, next);
     }
     if (next == model.get_eos()) {
       break;
```
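The two call-site changes track an API split: the old `forward` both ran the network and picked the next token, and the first call site even assigned its return value to `next`; the new `predict` takes over the full step, writing the sampled token through the `next` reference, while `forward` is now the bare network pass. A condensed sketch of how the loop above drives the new API (names such as `total_steps` are assumptions, not the repo's exact code):

```cpp
// Prefill feeds prompt tokens; decode feeds the previously generated token.
// predict() = forward() (network pass) + post_processing() (token selection).
for (int32_t pos = 0; pos < total_steps; ++pos) {
  pos_tensor.index<int32_t>(0) = pos;
  const bool in_prefill = pos < prompt_len - 1;
  const auto& emb = in_prefill ? prompt_embedding
                               : model.embedding(std::vector<int32_t>{next});
  tensor::Tensor input = model.fill_input(pos_tensor, emb, in_prefill);
  model.predict(input, pos_tensor, /*is_prompt=*/in_prefill, next);
  if (next == model.get_eos()) break;
}
```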

imgs/qa.jpg

1.02 MB

kuiper/include/model/llama2.h

Lines changed: 4 additions & 1 deletion

```diff
@@ -36,9 +36,12 @@ class LLama2Model : public Model {
 
   base::Status init(base::DeviceType device_type) override;
 
-  base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
+  base::Status predict(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
                        bool is_prompt, int& next) const override;
 
+  base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
+                       int& next) const override;
+
   std::vector<int32_t> encode(const std::string& sentence) const override;
 
   int32_t get_eos() const override;
```

kuiper/include/model/model.h

Lines changed: 4 additions & 1 deletion

```diff
@@ -18,9 +18,12 @@ class Model {
 
   virtual base::Status init(base::DeviceType device_type) = 0;
 
-  virtual base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
+  virtual base::Status predict(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
                                bool is_prompt, int& next) const = 0;
 
+  virtual base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
+                               int& next) const = 0;
+
   virtual int32_t get_eos() const = 0;
 
   base::ModelType model_type() const;
```
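Seen from the base class, the commit turns one entry point into two: `forward` is now the pure network pass, and `predict` layers token selection on top. A minimal usage sketch (assuming `model`, `input`, and `pos_tensor` are set up as in `demo/main.cpp`):

```cpp
int next = -1;

// Full decode step: network pass plus the post-processing that writes the
// chosen token id into `next`.
base::Status s1 = model.predict(input, pos_tensor, /*is_prompt=*/false, next);

// Network pass only, e.g. to benchmark the layers without sampling.
base::Status s2 = model.forward(input, pos_tensor, next);
```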

kuiper/source/model/llama2.cpp

Lines changed: 11 additions & 2 deletions

```diff
@@ -131,7 +131,7 @@ base::Status LLama2Model::init(base::DeviceType device_type) {
 }
 
 base::Status LLama2Model::forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
-                                  bool is_prompt, int& next) const {
+                                  int& next) const {
   if (input.is_empty()) {
     return base::error::InvalidArgument("The input tensor is empty.");
   }
@@ -149,7 +149,6 @@ base::Status LLama2Model::forward(const tensor::Tensor& input, const tensor::Ten
     feed_forward(layer_idx, input);
   }
   cls_logits(input);
-  next = post_processing(pos_tensor, is_prompt);
   return base::error::Success();
 }
 
@@ -674,6 +673,16 @@ void LLama2Model::attention_qkv(int32_t layer_idx, const tensor::Tensor& pos_ten
   STATUS_CHECK(llama_layers_->rope_layer_->forward(query, key, pos_tensor, tensor::Tensor{}));
 }
 
+base::Status LLama2Model::predict(const tensor::Tensor& input, const tensor::Tensor& pos_tensor,
+                                  bool is_prompt, int& next) const {
+  auto status = forward(input, pos_tensor, next);
+  if (!status) {
+    return status;
+  }
+  next = post_processing(pos_tensor, is_prompt);
+  return base::error::Success();
+}
+
 void LLama2Model::attention_mha(int32_t layer_idx, const tensor::Tensor& pos_tensor) const {
   CHECK(llama_layers_ != nullptr);
   // mha
```
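The new `predict` checks `if (!status)`, which presumes `base::Status` converts to bool and is truthy on success. The repo's real `base::Status` lives in the base module and is not shown here; a minimal sketch of a status type that supports this pattern:

```cpp
#include <string>

// Minimal status type sketch supporting `if (!status) return status;`.
class Status {
 public:
  Status() = default;  // default-constructed means success
  Status(int code, std::string message)
      : code_(code), message_(std::move(message)) {}

  // Truthy exactly when the operation succeeded.
  explicit operator bool() const { return code_ == 0; }

  int code() const { return code_; }
  const std::string& message() const { return message_; }

 private:
  int code_ = 0;
  std::string message_;
};
```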

kuiper/source/op/kernels/cuda/rmsnorm_kernel.cu

Lines changed: 5 additions & 15 deletions

```diff
@@ -64,22 +64,12 @@ void rmsnorm_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight
   float* in_ptr = const_cast<float*>(input.ptr<float>());
   float* wei_ptr = const_cast<float*>(weight.ptr<float>());
   float* out_ptr = const_cast<float*>(output.ptr<float>());
-  if (size < 1024) {
-    constexpr int threads_num = 128;
-    if (stream) {
-      cudaStream_t stream_ = static_cast<cudaStream_t>(stream);
-      row_rmsnorm_f32<128><<<1, threads_num, 0, stream_>>>(in_ptr, wei_ptr, out_ptr, size, eps);
-    } else {
-      row_rmsnorm_f32<128><<<1, threads_num>>>(in_ptr, wei_ptr, out_ptr, size, eps);
-    }
+  constexpr int threads_num = 128;
+  if (stream) {
+    cudaStream_t stream_ = static_cast<cudaStream_t>(stream);
+    row_rmsnorm_f32<128><<<1, threads_num, 0, stream_>>>(in_ptr, wei_ptr, out_ptr, size, eps);
   } else {
-    constexpr int threads_num = 1024;
-    if (stream) {
-      cudaStream_t stream_ = static_cast<cudaStream_t>(stream);
-      row_rmsnorm_f32<1024><<<1, threads_num, 0, stream_>>>(in_ptr, wei_ptr, out_ptr, size, eps);
-    } else {
-      row_rmsnorm_f32<1024><<<1, threads_num>>>(in_ptr, wei_ptr, out_ptr, size, eps);
-    }
+    row_rmsnorm_f32<128><<<1, threads_num>>>(in_ptr, wei_ptr, out_ptr, size, eps);
   }
 }
 }  // namespace kernel
```
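Collapsing the `size < 1024` branch into a single 128-thread launch only works if `row_rmsnorm_f32` lets each thread stride across the whole row rather than assuming one element per thread. That kernel is not shown in this diff; the following is a sketch of the strided-reduction shape under that assumption (the repo's version may vectorize with `float4` or otherwise differ), using CUB's block reduction:

```cuda
#include <cub/block/block_reduce.cuh>

// One block normalizes one row: each of the kThreads threads accumulates a
// strided slice of the squared sum, a block reduction combines the partials,
// and every thread then rescales its slice.
template <int kThreads>
__global__ void row_rmsnorm_sketch(const float* in, const float* wei,
                                   float* out, int size, float eps) {
  using BlockReduce = cub::BlockReduce<float, kThreads>;
  __shared__ typename BlockReduce::TempStorage tmp;
  __shared__ float scale;

  float sum = 0.f;
  for (int i = threadIdx.x; i < size; i += kThreads) {
    sum += in[i] * in[i];
  }
  sum = BlockReduce(tmp).Sum(sum);  // aggregate valid on thread 0 only
  if (threadIdx.x == 0) {
    scale = rsqrtf(sum / static_cast<float>(size) + eps);
  }
  __syncthreads();

  for (int i = threadIdx.x; i < size; i += kThreads) {
    out[i] = wei[i] * (in[i] * scale);
  }
}
```

With a strided loop like this, the 128-thread variant handles any `size`; the removed 1024-thread branch presumably bought more parallelism per row but was not needed for correctness.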

kuiper/source/op/kernels/cuda/rope_kernel.cu

Lines changed: 4 additions & 3 deletions

```diff
@@ -21,9 +21,10 @@ __global__ void rope_kernel_cu_fp32(int pos, int dim, int kv_dim, int head_size,
   float val = static_cast<float>(pos) * freq;
   float fcr = cosf(val);
   float fci = sinf(val);
-  bool is_greater = idx >= kv_dim;
-
-  return rope_calc(fcr, fci, const_cast<float*>(input_q), idx) ;
+  rope_calc(fcr, fci, const_cast<float*>(input_q), idx);
+  if (idx >= kv_dim) {
+    return;
+  }
   rope_calc(fcr, fci, const_cast<float*>(input_k), idx);
 }
 
```
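This is a straight bug fix: the old code computed an unused `is_greater` flag and then returned after rotating only the query, so the key was never rotated. The fixed kernel rotates the query at every `idx` and the key only while `idx < kv_dim`, since the key can be narrower than the query under grouped-query attention. `rope_calc` itself is not in the diff; a hypothetical version, for illustration only:

```cuda
// Hypothetical rope_calc (the repo's version may differ): rotate the adjacent
// pair (vec[idx], vec[idx + 1]) by the angle whose cosine is fcr, sine fci.
__device__ void rope_calc(float fcr, float fci, float* vec, int idx) {
  float v0 = vec[idx];
  float v1 = vec[idx + 1];
  vec[idx] = v0 * fcr - v1 * fci;
  vec[idx + 1] = v0 * fci + v1 * fcr;
}
```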

readme.md

Lines changed: 10 additions & 2 deletions

```diff
@@ -1,16 +1,21 @@
 # Build a Large-Model Inference Framework from Scratch
 > Walks you through writing an LLM framework from scratch, with LLama inference and CUDA acceleration
 
-**🙋🙋🙋 The build-your-own inference framework course is in full swing, just 178 RMB; add me on WeChat below to learn more**
+**🙋🙋🙋 The build-your-own inference framework course is in full swing; add me on WeChat below to learn more**
+
+
 
 <img src="./imgs/me.jpg" alt="me" height="360px" width="300px" />
 
+
+
 ## Demo
 > LLama1.1b fp32 model, video not sped up, running on an Nvidia 3060 laptop at 60.34 token/s
+
 ![](./imgs/do.gif)
 
 ## Course Outline
-Just 178 RMB, just 178 RMB, just 178 RMB! Important things deserve saying three times!!!
+
 
 **Part 1: Overall architecture and design**
 > Learn architectural thinking, so you don't end up only optimizing local implementations
@@ -68,6 +73,9 @@
 *This part has several subsections*
 32. Summary
 
+## Course FAQ
+
+<img src="./imgs/qa.jpg" style="zoom: 67%;" />
 
 ## Third-party Dependencies
 1. google glog https://github.com/google/glog
```

test/test_op/test_load.cpp

Lines changed: 46 additions & 0 deletions

```diff
@@ -0,0 +1,46 @@
+#include <cuda_runtime_api.h>
+#include <fcntl.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <model/config.h>
+#include <sys/mman.h>
+#include "../source/op/kernels/kernels_interface.h"
+#include "base/buffer.h"
+
+TEST(test_load, load_model_config) {
+  std::string model_path = "./tmp/test.bin";
+  int32_t fd = open(model_path.data(), O_RDONLY);
+  ASSERT_NE(fd, -1);
+
+  FILE* file = fopen(model_path.data(), "rb");
+  ASSERT_NE(file, nullptr);
+
+  auto config = model::ModelConfig{};
+  fread(&config, sizeof(model::ModelConfig), 1, file);
+  ASSERT_EQ(config.dim, 16);
+  ASSERT_EQ(config.hidden_dim, 128);
+  ASSERT_EQ(config.layer_num, 256);
+}
+
+TEST(test_load, load_model_weight) {
+  std::string model_path = "./tmp/test.bin";
+  int32_t fd = open(model_path.data(), O_RDONLY);
+  ASSERT_NE(fd, -1);
+
+  FILE* file = fopen(model_path.data(), "rb");
+  ASSERT_NE(file, nullptr);
+
+  auto config = model::ModelConfig{};
+  fread(&config, sizeof(model::ModelConfig), 1, file);
+
+  fseek(file, 0, SEEK_END);
+  auto file_size = ftell(file);
+
+  void* data = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
+  float* weight_data =
+      reinterpret_cast<float*>(static_cast<int8_t*>(data) + sizeof(model::ModelConfig));
+
+  for (int i = 0; i < config.dim * config.hidden_dim; ++i) {
+    ASSERT_EQ(*(weight_data + i), float(i));
+  }
+}
```
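Both tests read `./tmp/test.bin` as a `model::ModelConfig` header followed by `dim * hidden_dim` sequential floats (they never `fclose`/`close`/`munmap`, which a short-lived test binary tolerates). A sketch of a generator that would satisfy the assertions, assuming the config begins with the three `int32_t` fields the test reads; the real struct in `model/config.h` likely carries more fields, given the 8.03 KB checked-in file:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for model::ModelConfig; this layout is an assumption.
struct ModelConfig {
  int32_t dim;
  int32_t hidden_dim;
  int32_t layer_num;
};

int main() {
  ModelConfig config{16, 128, 256};
  FILE* file = std::fopen("./tmp/test.bin", "wb");
  if (!file) return 1;
  std::fwrite(&config, sizeof(config), 1, file);

  // dim * hidden_dim floats with value == index, matching the weight test.
  std::vector<float> weights(config.dim * config.hidden_dim);
  for (size_t i = 0; i < weights.size(); ++i) {
    weights[i] = static_cast<float>(i);
  }
  std::fwrite(weights.data(), sizeof(float), weights.size(), file);
  std::fclose(file);
  return 0;
}
```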

tmp/test.bin

8.03 KB
Binary file not shown.
