Add metal_profiler to profile the GPU on macOS

junjihashimoto · junjihashimoto · commit a3cea26b33a9 · 2024-09-27T15:43:43.000+09:00
diff --git a/examples/hello_world/Makefile b/examples/hello_world/Makefile
@@ -9,18 +9,24 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
 else
     STDLIB := -stdlib=libc++
 endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+# Compiler and linker flags shared by the build rules. No source file is
+# listed here, so each rule has to name its own inputs on the command line.
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -ldl -ldawn
 
 run: ./build/$(TARGET) dawnlib
 	$(LIBSPEC) && ./build/$(TARGET)
 
+# Launch the profiling binary with METAL_CAPTURE_ENABLED=1 in its environment.
+run_with_profile: ./build/$(TARGET)_profile dawnlib
+	$(LIBSPEC) && METAL_CAPTURE_ENABLED=1 ./build/$(TARGET)_profile
+
 dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_party/lib/libdawn.dylib),,run_setup)
 
 run_setup: check-python
 	cd $(GPUCPP) && python3 setup.py
 
 build/$(TARGET): run.cpp
-	mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o ./build/$(TARGET)
+	mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o $@ $<
+
+# Profiling build (macOS only): compiles the example together with the
+# Objective-C++ capture helper and links the frameworks the helper imports.
+# The helper is a prerequisite so that editing it triggers a rebuild, and the
+# framework name uses its real spelling ("Metal") so the link also succeeds on
+# case-sensitive volumes.
+build/$(TARGET)_profile: run_profile.cpp $(GPUCPP)/metal_profiler.mm
+	mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o $@ $^ -framework Metal -framework Foundation
 
 debug: run.cpp
 	mkdir -p build && $(CXX) $(FLAGS) -g -o ./build/$(TARGET)
diff --git a/examples/hello_world/run_profile.cpp b/examples/hello_world/run_profile.cpp
@@ -0,0 +1,57 @@
+#include "gpu.hpp"
+#include "metal_profiler.hpp"
+#include <array>
+#include <cstdio>
+#include <future>
+
+using namespace gpu; // createContext, createTensor, createKernel,
+                     // createShader, dispatchKernel, wait, toCPU
+                     // Tensor, Kernel, Context, Shape, kf32
+
+// WGSL source for an element-wise GELU compute shader (tanh formulation).
+// `{{precision}}` and `{{workgroupSize}}` are placeholders in the text; main()
+// passes this string to createKernel together with 256 and kf32, which are
+// presumably substituted for them by gpu.hpp (not visible from this file).
+// Each invocation handles one index and skips indices past the end of `inp`.
+// For x > 10.0 the shader stores x itself instead of the tanh expression.
+// NOTE(review): `dummy` is declared at the same @group(0) @binding(1) slot as
+// `out` and is never referenced by the entry point; confirm it is intentional.
+static const char *kGelu = R"(
+const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
+    let i: u32 = GlobalInvocationID.x;
+    if (i < arrayLength(&inp)) {
+        let x: f32 = inp[i];
+        out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR 
+                 * (x + .044715 * x * x * x))), x, x > 10.0);
+    }
+}
+)";
+
+// Runs the GELU kernel once over N floats between startCapture() and
+// stopCapture(), copies the result back to the host and prints the first
+// twelve input/output pairs.
+int main(int argc, char **argv) {
+  // ANSI escape sequence: clear the screen and move the cursor to the top-left.
+  printf("\033[2J\033[1;1H");
+  printf("\nHello gpu.cpp!\n");
+  printf("--------------\n\n");
+
+  static constexpr size_t N = 10000;
+  Context ctx = createContext();
+
+  // Host-side buffers; the input holds 0.0, 0.1, 0.2, ... as dummy data.
+  std::array<float, N> hostInput;
+  std::array<float, N> hostOutput;
+  for (size_t idx = 0; idx != N; ++idx) {
+    hostInput[idx] = static_cast<float>(idx) / 10.0;
+  }
+
+  // Device tensors: the input is created from host data, the output is not.
+  Tensor gpuInput = createTensor(ctx, Shape{N}, kf32, hostInput.data());
+  Tensor gpuOutput = createTensor(ctx, Shape{N}, kf32);
+
+  Kernel geluKernel = createKernel(ctx, {kGelu, 256, kf32},
+                                   Bindings{gpuInput, gpuOutput},
+                                   /* nWorkgroups */ {cdiv(N, 256), 1, 1});
+
+  // The promise/future pair is handed to dispatchKernel and wait respectively.
+  std::promise<void> done;
+  std::future<void> doneFuture = done.get_future();
+
+  // Only the dispatch and the wait sit inside the capture window.
+  startCapture();
+  dispatchKernel(ctx, geluKernel, done);
+  wait(ctx, doneFuture);
+  stopCapture();
+
+  toCPU(ctx, gpuOutput, hostOutput.data(), sizeof(hostOutput));
+  for (size_t row = 0; row < 12; ++row) {
+    printf("  gelu(%.2f) = %.2f\n", hostInput[row], hostOutput[row]);
+  }
+  printf("  ...\n\n");
+  printf("Computed %zu values of GELU(x)\n\n", N);
+  return 0;
+}
diff --git a/metal_profiler.hpp b/metal_profiler.hpp
@@ -0,0 +1,6 @@
+#ifndef METAL_PROFILER_HPP
+#define METAL_PROFILER_HPP
+
+// C-linkage entry points for programmatic Metal GPU capture. The definitions
+// live in metal_profiler.mm, which must be compiled and linked alongside the
+// caller. The declarations are only visible when __APPLE__ is defined.
+#ifdef __APPLE__
+extern "C" {
+  /// Begins a Metal capture written to "gpu.cpp.gputrace" in the current
+  /// working directory. Logs and returns without capturing unless the
+  /// METAL_CAPTURE_ENABLED environment variable is set to a true value.
+  void startCapture();
+
+  /// Ends the capture on the shared Metal capture manager.
+  void stopCapture();
+}
+#endif // __APPLE__
+
+#endif // METAL_PROFILER_HPP
diff --git a/metal_profiler.mm b/metal_profiler.mm
@@ -0,0 +1,46 @@
+#import <Foundation/Foundation.h>
+#import <Metal/Metal.h>
+#import <QuartzCore/CAMetalLayer.h>
+
+
+extern "C" {
+  /// Starts a Metal GPU capture whose output is the trace document
+  /// "gpu.cpp.gputrace" (relative to the current working directory).
+  ///
+  /// The function logs a message and returns without capturing when:
+  ///   - the METAL_CAPTURE_ENABLED environment variable is unset or false,
+  ///   - the capture manager cannot write GPU trace documents,
+  ///   - no default Metal device is available,
+  ///   - an existing trace at the output path cannot be removed, or
+  ///   - the capture manager refuses to start.
+  /// An existing trace is only deleted once the checks above have passed, so
+  /// a previous capture is not lost when a new one cannot begin.
+  void startCapture() {
+    // The visible caller is a plain C++ main() with no autorelease pool of
+    // its own, so temporaries created here are drained before returning.
+    @autoreleasepool {
+      NSString *enabledFlag =
+          [NSProcessInfo processInfo].environment[@"METAL_CAPTURE_ENABLED"];
+      if (![enabledFlag boolValue]) {
+        NSLog(@"METAL_CAPTURE_ENABLED is not set. Please set it to 1 to enable Metal capture.");
+        return;
+      }
+
+      MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
+      if (![captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
+        NSLog(@"Capturing to a GPU trace document is not supported in this process.");
+        return;
+      }
+
+      id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+      if (!device) {
+        NSLog(@"MTLCreateSystemDefaultDevice returned nil. Metal may not be supported on this system.");
+        return;
+      }
+
+      // Single source of truth for the output location.
+      NSString *tracePath = @"gpu.cpp.gputrace";
+
+      // Remove any trace left at the output path by an earlier run.
+      NSFileManager *fileManager = [NSFileManager defaultManager];
+      if ([fileManager fileExistsAtPath:tracePath]) {
+        NSError *removeError = nil;
+        // Judge success by the return value; the error pointer is only
+        // meaningful when the call reports failure.
+        if (![fileManager removeItemAtPath:tracePath error:&removeError]) {
+          NSLog(@"Error deleting existing gpu.cpp.gputrace directory: %@", removeError);
+          return;
+        }
+        NSLog(@"Deleted existing gpu.cpp.gputrace directory.");
+      }
+
+      MTLCaptureDescriptor *descriptor = [[MTLCaptureDescriptor alloc] init];
+      descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+      descriptor.outputURL = [NSURL fileURLWithPath:tracePath];
+      descriptor.captureObject = device;
+
+      NSError *captureError = nil;
+      if (![captureManager startCaptureWithDescriptor:descriptor error:&captureError]) {
+        NSLog(@" error capturing mtl => %@ ", [captureError localizedDescription]);
+      }
+    }
+  }
+
+  /// Asks the shared Metal capture manager to stop capturing.
+  void stopCapture() {
+    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
+    [captureManager stopCapture];
+  }
+}