Skip to content

Commit a3cea26

Browse files
Add metal_profiler to profile gpu on macos
1 parent 01cbcf9 commit a3cea26

File tree

4 files changed

+117
-2
lines changed

4 files changed

+117
-2
lines changed

examples/hello_world/Makefile

+8-2
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,24 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
99
else
1010
STDLIB := -stdlib=libc++
1111
endif
12-
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
12+
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -ldl -ldawn
1313

1414
run: ./build/$(TARGET) dawnlib
1515
$(LIBSPEC) && ./build/$(TARGET)
1616

17+
run_with_profile: ./build/$(TARGET)_profile dawnlib
18+
$(LIBSPEC) && export METAL_CAPTURE_ENABLED=1 && ./build/$(TARGET)_profile
19+
1720
dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_party/lib/libdawn.dylib),,run_setup)
1821

1922
run_setup: check-python
2023
cd $(GPUCPP) && python3 setup.py
2124

2225
build/$(TARGET): run.cpp
23-
mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o ./build/$(TARGET)
26+
mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o $@ $<
27+
28+
# Profiling build (macOS only): compiles run_profile.cpp together with the
# Objective-C++ Metal capture shim and links the frameworks the shim uses.
# The shim is listed as a prerequisite so that editing it triggers a rebuild,
# and the framework is spelled "Metal" because framework lookup is
# case-sensitive on case-sensitive volumes.
build/$(TARGET)_profile: run_profile.cpp $(GPUCPP)/metal_profiler.mm
	mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o $@ $^ -framework Metal -framework Foundation
2430

2531
# Debug build: same output path as the release target, compiled with -g and
# without -DNO_LOG. FLAGS does not name any source file, so the prerequisite
# has to be passed to the compiler explicitly via $<.
debug: run.cpp
	mkdir -p build && $(CXX) $(FLAGS) -g -o ./build/$(TARGET) $<

examples/hello_world/run_profile.cpp

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#include "gpu.hpp"
2+
#include "metal_profiler.hpp"
3+
#include <array>
4+
#include <cstdio>
5+
#include <future>
6+
7+
using namespace gpu; // createContext, createTensor, createKernel,
8+
// createShader, dispatchKernel, wait, toCPU
9+
// Tensor, Kernel, Context, Shape, kf32
10+
11+
// WGSL source of an elementwise GELU kernel (tanh approximation).
// `inp` is read at binding 0 and `out` is written at binding 1; inputs above
// 10.0 are passed through unchanged by the select().
// {{precision}} and {{workgroupSize}} are template placeholders - presumably
// filled in by gpu.cpp from the KernelCode arguments; confirm against gpu.hpp.
// Only the two buffers that the host actually binds are declared: a second
// variable on @binding(1) would collide with `out` as soon as it was used.
static const char *kGelu = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
let i: u32 = GlobalInvocationID.x;
if (i < arrayLength(&inp)) {
let x: f32 = inp[i];
out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
* (x + .044715 * x * x * x))), x, x > 10.0);
}
}
)";
27+
28+
int main(int argc, char **argv) {
29+
printf("\033[2J\033[1;1H");
30+
printf("\nHello gpu.cpp!\n");
31+
printf("--------------\n\n");
32+
33+
Context ctx = createContext();
34+
static constexpr size_t N = 10000;
35+
std::array<float, N> inputArr, outputArr;
36+
for (int i = 0; i < N; ++i) {
37+
inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data
38+
}
39+
Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
40+
Tensor output = createTensor(ctx, Shape{N}, kf32);
41+
std::promise<void> promise;
42+
std::future<void> future = promise.get_future();
43+
Kernel op = createKernel(ctx, {kGelu, 256, kf32},
44+
Bindings{input, output},
45+
/* nWorkgroups */ {cdiv(N, 256), 1, 1});
46+
startCapture();
47+
dispatchKernel(ctx, op, promise);
48+
wait(ctx, future);
49+
stopCapture();
50+
toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
51+
for (int i = 0; i < 12; ++i) {
52+
printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
53+
}
54+
printf(" ...\n\n");
55+
printf("Computed %zu values of GELU(x)\n\n", N);
56+
return 0;
57+
}

metal_profiler.hpp

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#ifndef METAL_PROFILER_HPP
#define METAL_PROFILER_HPP

// Minimal C interface for bracketing GPU work with a Metal capture.
//
//   startCapture();   // begin recording
//   ...dispatch GPU work...
//   stopCapture();    // finish recording
//
// On Apple platforms the functions are implemented in metal_profiler.mm, which
// must be compiled and linked into the program. Everywhere else they are
// no-ops, so instrumented code still builds and runs unchanged.

#ifdef __APPLE__

#ifdef __cplusplus
extern "C" {
#endif

// Starts a Metal capture; see metal_profiler.mm for the conditions under which
// it declines to start and only logs a message.
void startCapture(void);

// Ends the capture started by startCapture().
void stopCapture(void);

#ifdef __cplusplus
}
#endif

#else // !__APPLE__

// Metal is unavailable: keep call sites compiling, do nothing at run time.
static inline void startCapture(void) {}
static inline void stopCapture(void) {}

#endif // __APPLE__

#endif // METAL_PROFILER_HPP

metal_profiler.mm

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#import <Foundation/Foundation.h>
2+
#import <Metal/Metal.h>
3+
#import <QuartzCore/CAMetalLayer.h>
4+
5+
6+
extern "C" {
7+
/// Begins a Metal GPU capture that is written to `gpu.cpp.gputrace`, resolved
/// against the current working directory.
///
/// The capture only starts when the METAL_CAPTURE_ENABLED environment variable
/// holds a truthy value (e.g. "1"); otherwise a hint is logged and the call
/// returns. A trace document left over from a previous run is deleted first.
/// Every failure is reported through NSLog and leaves no capture running.
void startCapture() {
  // Callers are plain C++ and may not have an autorelease pool in place.
  @autoreleasepool {
    NSString *const tracePath = @"gpu.cpp.gputrace";

    if (![[NSProcessInfo processInfo].environment[@"METAL_CAPTURE_ENABLED"] boolValue]) {
      NSLog(@"METAL_CAPTURE_ENABLED is not set. Please set it to 1 to enable Metal capture.");
      return;
    }

    // Check support before touching the filesystem so an old trace is not
    // deleted when a new one cannot be written anyway.
    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
    if (![captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
      NSLog(@"Capturing to a GPU trace document is not supported in this environment.");
      return;
    }

    NSFileManager *fileManager = [NSFileManager defaultManager];
    if ([fileManager fileExistsAtPath:tracePath]) {
      // Success is signalled by the return value; the NSError out-parameter is
      // only meaningful once the call has reported failure.
      NSError *removeError = nil;
      if (![fileManager removeItemAtPath:tracePath error:&removeError]) {
        NSLog(@"Error deleting existing gpu.cpp.gputrace directory: %@", removeError);
        return;
      }
      NSLog(@"Deleted existing gpu.cpp.gputrace directory.");
    }

    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    if (!device) {
      NSLog(@"MTLCreateSystemDefaultDevice returned nil. Metal may not be supported on this system.");
      return;
    }

    MTLCaptureDescriptor *descriptor = [[MTLCaptureDescriptor alloc] init];
    descriptor.captureObject = device;
    descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
    descriptor.outputURL = [NSURL fileURLWithPath:tracePath];

    NSError *captureError = nil;
    if (![captureManager startCaptureWithDescriptor:descriptor error:&captureError]) {
      NSLog(@" error capturing mtl => %@ ", [captureError localizedDescription] );
    }
  }
}
42+
43+
/// Ends the capture begun by startCapture(), if one is running.
///
/// startCapture() can return without starting anything (capture disabled, no
/// Metal device, start failure), so Metal is only asked to stop when the
/// shared capture manager reports a capture in progress.
void stopCapture() {
  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
  if (captureManager.isCapturing) {
    [captureManager stopCapture];
  }
}
46+
}

0 commit comments

Comments
 (0)