buddy-compiler · gxsoar · Nov 25, 2023 · Nov 25, 2023 · Nov 27, 2023 · Nov 27, 2023
diff --git a/benchmarks/DeepLearning/Models/LLama2/README.md b/benchmarks/DeepLearning/Models/LLama2/README.md
@@ -0,0 +1,50 @@
+# LLama2 benchmark
+
+This benchmark is intended to measure vicuna end-to-end inference with PyTorch, both in eager mode and with TorchDynamo.
+
+## Tools
+In order to run these Python files, you need to install `pytorch` (2.0 or later, which bundles `torchdynamo` as `torch._dynamo`), `transformers`, `tqdm`, and `numpy`.
+
+```
+pip install torch
+pip install transformers
+pip install tqdm numpy
+```
+
+## Benchmarks
+Run `python pytorch_cpu.py` or `python torchdynamo_cpu.py` to get the average CPU time per round of inference.
+
+Run `python pytorch_gpu.py` or `python torchdynamo_gpu.py` to get the average GPU time per round of inference.
+
+If the model cannot be loaded onto your GPU, you can load it with `model = AutoModel.from_pretrained('Model/', torch_dtype=torch.float16)`, which loads the weights as float16.
+
+## Results
+Run on
+
+* Model: vicuna-7b
+
+* OS: Ubuntu 22.04.1 LTS
+
+* CPU: Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz
+
+* GPU: NVIDIA GeForce RTX 3090
+
+* CUDA: 12.0
+
+* Python: python3.9
+
+* pytorch: 2.0.0+cu118
+
+* Anaconda: Miniconda3
+
+**CPU time per round of inference**:
+
+pytorch average time per round of inference: 982.439 ms
+
+pytorch with torchdynamo average time per round of inference: 977.569 ms
+
+**GPU time per round of inference**:
+
+pytorch average time per round of inference: 25.337 ms
+
+pytorch with torchdynamo average time per round of inference: 19.131 ms
diff --git a/benchmarks/DeepLearning/Models/LLama2/pytorch_cpu.py b/benchmarks/DeepLearning/Models/LLama2/pytorch_cpu.py
@@ -0,0 +1,52 @@
+"""Benchmark eager PyTorch inference on the CPU.
+
+Loads the model and tokenizer from ``Model/``, runs warm-up rounds, then
+prints the average wall-clock time per forward pass in milliseconds.
+"""
+
+import gc
+import time
+
+import numpy as np
+import torch
+import torch._dynamo as dynamo
+import tqdm
+from transformers import AutoModel
+from transformers import AutoTokenizer
+
+model = AutoModel.from_pretrained("Model/")
+tokenizer = AutoTokenizer.from_pretrained("Model/")
+# Target device and number of timed forward passes.
+device = "cpu"
+repetitions = 300
+
+# Inference benchmark: eval() disables train-only behaviour such as dropout.
+model = model.to(device).eval()
+input_x = "Hello, world"
+input_y = "你好，世界！"
+
+inputs = tokenizer.encode_plus(input_x, input_y, return_tensors="pt")
+inputs = inputs.to(device)
+
+# Untimed warm-up rounds so start-up costs do not skew the measurement.
+print("warm up ...\n")
+with torch.no_grad():
+    for _ in range(100):
+        _ = model(**inputs)
+
+# One slot per timed round, in milliseconds.
+timings = np.zeros((repetitions, 1))
+
+# CPU work is synchronous, so a wall-clock timer measures it directly; CUDA
+# events and torch.cuda.synchronize() would require a GPU that is never used.
+print("testing ...\n")
+with torch.no_grad():
+    for rep in tqdm.tqdm(range(repetitions)):
+        start = time.perf_counter()
+        _ = model(**inputs)
+        end = time.perf_counter()
+        # perf_counter() returns seconds; convert to milliseconds.
+        timings[rep] = (end - start) * 1000.0
+
+avg = timings.sum() / repetitions
+print("\navg={}ms\n".format(avg))
diff --git a/benchmarks/DeepLearning/Models/LLama2/pytorch_gpu.py b/benchmarks/DeepLearning/Models/LLama2/pytorch_gpu.py
@@ -0,0 +1,52 @@
+"""Benchmark eager PyTorch float16 inference on the GPU.
+
+Prints the average CUDA-event time per forward pass in milliseconds.
+"""
+
+import gc
+
+import numpy as np
+import torch
+import torch._dynamo as dynamo
+import tqdm
+from transformers import AutoModel
+from transformers import AutoTokenizer
+
+model = AutoModel.from_pretrained("Model/", torch_dtype=torch.float16)
+tokenizer = AutoTokenizer.from_pretrained("Model/")
+device = "cuda"
+repetitions = 300
+
+# Inference benchmark: eval() disables train-only behaviour such as dropout.
+model = model.to(device).eval()
+input_x = "Hello, world"
+input_y = "你好，世界！"
+
+inputs = tokenizer.encode_plus(input_x, input_y, return_tensors="pt")
+inputs = inputs.to(device)
+
+# The GPU may be in a sleep state to save energy, so it needs to be warmed up.
+print("warm up ...\n")
+with torch.no_grad():
+    for _ in range(100):
+        _ = model(**inputs)
+
+# Wait for all queued GPU work to finish before the timed section starts.
+torch.cuda.synchronize()
+
+starter = torch.cuda.Event(enable_timing=True)
+ender = torch.cuda.Event(enable_timing=True)
+timings = np.zeros((repetitions, 1))
+
+print("testing ...\n")
+with torch.no_grad():
+    for rep in tqdm.tqdm(range(repetitions)):
+        starter.record()
+        _ = model(**inputs)
+        ender.record()
+        # elapsed_time() is only valid once both events have completed.
+        torch.cuda.synchronize()
+        timings[rep] = starter.elapsed_time(ender)
+
+avg = timings.sum() / repetitions
+print("\navg={}ms\n".format(avg))
diff --git a/benchmarks/DeepLearning/Models/LLama2/torchdynamo_cpu.py b/benchmarks/DeepLearning/Models/LLama2/torchdynamo_cpu.py
@@ -0,0 +1,54 @@
+"""Benchmark TorchDynamo (inductor backend) inference on the CPU.
+
+Loads the model and tokenizer from ``Model/``, runs warm-up rounds, then
+prints the average wall-clock time per forward pass in milliseconds.
+"""
+
+import gc
+import time
+
+import numpy as np
+import torch
+import torch._dynamo as dynamo
+import tqdm
+from transformers import AutoModel
+from transformers import AutoTokenizer
+
+model = AutoModel.from_pretrained("Model/")
+tokenizer = AutoTokenizer.from_pretrained("Model/")
+# This is the CPU variant of the benchmark, so the model must stay on the CPU.
+device = "cpu"
+repetitions = 300
+
+# Inference benchmark: eval() disables train-only behaviour such as dropout.
+model = model.to(device).eval()
+model = dynamo.optimize("inductor")(model)
+
+input_x = "Hello, world"
+input_y = "你好，世界！"
+
+inputs = tokenizer.encode_plus(input_x, input_y, return_tensors="pt")
+inputs = inputs.to(device)
+
+# Untimed warm-up rounds; the first call also triggers inductor compilation.
+print("warm up ...\n")
+with torch.no_grad():
+    for _ in range(100):
+        _ = model(**inputs)
+
+# One slot per timed round, in milliseconds.
+timings = np.zeros((repetitions, 1))
+
+# CPU work is synchronous, so a wall-clock timer measures it directly; CUDA
+# events and torch.cuda.synchronize() would require a GPU that is never used.
+print("testing ...\n")
+with torch.no_grad():
+    for rep in tqdm.tqdm(range(repetitions)):
+        start = time.perf_counter()
+        _ = model(**inputs)
+        end = time.perf_counter()
+        # perf_counter() returns seconds; convert to milliseconds.
+        timings[rep] = (end - start) * 1000.0
+
+avg = timings.sum() / repetitions
+print("\navg={}ms\n".format(avg))
diff --git a/benchmarks/DeepLearning/Models/LLama2/torchdynamo_gpu.py b/benchmarks/DeepLearning/Models/LLama2/torchdynamo_gpu.py
@@ -0,0 +1,54 @@
+"""Benchmark TorchDynamo (inductor backend) float16 inference on the GPU.
+
+Prints the average CUDA-event time per forward pass in milliseconds.
+"""
+
+import gc
+
+import numpy as np
+import torch
+import torch._dynamo as dynamo
+import tqdm
+from transformers import AutoModel
+from transformers import AutoTokenizer
+
+model = AutoModel.from_pretrained("Model/", torch_dtype=torch.float16)
+tokenizer = AutoTokenizer.from_pretrained("Model/")
+# This is the GPU variant of the benchmark, so the model must run on CUDA.
+device = "cuda"
+repetitions = 300
+
+# Inference benchmark: eval() disables train-only behaviour such as dropout.
+model = model.to(device).eval()
+model = dynamo.optimize("inductor")(model)
+
+input_x = "Hello, world"
+input_y = "你好，世界！"
+
+inputs = tokenizer.encode_plus(input_x, input_y, return_tensors="pt")
+inputs = inputs.to(device)
+
+# Warm up the GPU; the first call also triggers inductor compilation.
+print("warm up ...\n")
+with torch.no_grad():
+    for _ in range(100):
+        _ = model(**inputs)
+
+# Wait for all queued GPU work to finish before the timed section starts.
+torch.cuda.synchronize()
+
+starter = torch.cuda.Event(enable_timing=True)
+ender = torch.cuda.Event(enable_timing=True)
+timings = np.zeros((repetitions, 1))
+
+print("testing ...\n")
+with torch.no_grad():
+    for rep in tqdm.tqdm(range(repetitions)):
+        starter.record()
+        _ = model(**inputs)
+        ender.record()
+        # elapsed_time() is only valid once both events have completed.
+        torch.cuda.synchronize()
+        timings[rep] = starter.elapsed_time(ender)
+
+avg = timings.sum() / repetitions
+print("\navg={}ms\n".format(avg))