Skip to content

Commit 0d40a07

Browse files
Add profiler/metal to profile the GPU on macOS
1 parent 01cbcf9 commit 0d40a07

File tree

6 files changed

+116
-18
lines changed

6 files changed

+116
-18
lines changed

examples/matmul/Makefile

+10
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,21 @@ FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCP
1515
run: ./build/$(TARGET)
1616
$(LIBSPEC) && ./build/$(TARGET)
1717

18+
# These targets only launch the profiling binary; they never create a file with
# their own name, so mark them phony to keep a stray file from disabling them.
.PHONY: run_with_metal_profiler run_with_time_profiler

# Run the profiling build with Metal capture switched on. The binary's
# startCapture() only records when METAL_CAPTURE_ENABLED is truthy.
run_with_metal_profiler: ./build/$(TARGET)_with_metal_profiler
	$(LIBSPEC) && export METAL_CAPTURE_ENABLED=1 && ./build/$(TARGET)_with_metal_profiler

# Run the same binary under Instruments' "Time Profiler" template via xctrace.
# METAL_CAPTURE_ENABLED is intentionally left unset here.
run_with_time_profiler: ./build/$(TARGET)_with_metal_profiler
	$(LIBSPEC) && xcrun xctrace record --template 'Time Profiler' --launch -- ./build/$(TARGET)_with_metal_profiler
23+
1824
# Use clang -v to see the include paths
1925
# Note in this example optimization is turned on
2026
build/$(TARGET): run.cpp
2127
mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET)
2228

29+
# Profiling variant of the matmul binary: additionally compiles the Metal
# capture helper (metal.mm), defines METAL_PROFILER so run.cpp brackets the
# test with startCapture()/stopCapture(), and keeps debug info (-g).
# The framework name is spelled "Metal" (its on-disk name) so the link also
# succeeds on case-sensitive volumes.
build/$(TARGET)_with_metal_profiler: run.cpp
	mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET)_with_metal_profiler $(GPUCPP)/experimental/profiler/metal.mm -framework Metal -framework Foundation -DMETAL_PROFILER -g
	# Point the binary at the bundled libdawn.dylib instead of an @rpath lookup.
	install_name_tool -change @rpath/libdawn.dylib $(GPUCPP)/third_party/lib/libdawn.dylib ./build/$(TARGET)_with_metal_profiler
32+
2333
watch:
2434
@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
2535
mkdir -p build && $(CODEPATH) | entr -s "$(LIBSPEC) && rm -f ./build/$(TARGET) && make -j$(NUM_JOBS) ./build/$(TARGET) && ./build/$(TARGET)"

examples/matmul/run.cpp

+16
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
#include "experimental/wgsl.h" // loopUnrolling
1414
#include "numeric_types/half.hpp"
1515

16+
#ifdef METAL_PROFILER
17+
#include "experimental/profiler/metal.hpp"
18+
#endif
19+
1620
using namespace gpu;
1721

1822
const std::string versionToStr(int version);
@@ -799,7 +803,11 @@ void runTest(int version, size_t M, size_t K, size_t N,
799803
Tensor input = createTensor(ctx, Shape{M, K}, numtype, inputPtr.get());
800804
Tensor weights = createTensor(ctx, Shape{N, K}, numtype, weightsPtr.get()); // column-major
801805

806+
#ifdef METAL_PROFILER
807+
constexpr size_t nIter = 1;
808+
#else
802809
constexpr size_t nIter = 30;
810+
#endif
803811

804812
// Initialize Kernel and bind GPU buffers
805813

@@ -815,8 +823,10 @@ void runTest(int version, size_t M, size_t K, size_t N,
815823
kernels[i] = selectMatmul(ctx, version, {input, weights, outputs[i]}, M, K, N, numtype);
816824
}
817825

826+
#ifndef METAL_PROFILER
818827
printf("[ Press enter to start tests ... ]\n");
819828
getchar();
829+
#endif
820830
LOG(kDefLog, kInfo, "Dispatching Kernel version %d: %s, %d iterations ...",
821831
version, versionToStr(version).c_str(), nIter);
822832

@@ -930,11 +940,17 @@ int main() {
930940
N = 2 * 4096;
931941
}
932942

943+
#ifdef METAL_PROFILER
944+
startCapture();
945+
#endif
933946
if (enableF16) {
934947
runTestWithCheck<half>(version, M, K, N, transposedInput, kTestSize, numtype);
935948
} else {
936949
runTestWithCheck<float>(version, M, K, N, transposedInput, kTestSize, numtype);
937950
}
951+
#ifdef METAL_PROFILER
952+
stopCapture();
953+
#endif
938954

939955
LOG(kDefLog, kInfo, "Done.");
940956
return 0;

experimental/kernels/Makefile

+21-18
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ default: run-native
3232
run_llm.c: ./build/test_gpt2 dawnlib
3333
$(LIBSPEC) && $<
3434

35+
# Launch-only targets: declare them phony so a stray file with the same name
# cannot make them appear up to date.
.PHONY: run_llm.c_with_metal_profiler run_llm.c_with_time_profiler

# Run the profiling build of test_gpt2 with Metal capture enabled.
run_llm.c_with_metal_profiler: ./build/test_gpt2_with_metal_profiler dawnlib
	$(LIBSPEC) && export METAL_CAPTURE_ENABLED=1 && $<

# Run the same binary under Instruments' "Time Profiler" template via xctrace.
run_llm.c_with_time_profiler: ./build/test_gpt2_with_metal_profiler dawnlib
	$(LIBSPEC) && xcrun xctrace record --template 'Time Profiler' --launch -- $<
40+
3541
run_llm.c_train: ./build/train_gpt2 dawnlib
3642
if [ ! -d dev ] ; then ln -s $(GPUCPP)/third_party/llm.c/dev ; fi
3743
if [ ! -f gpt2_tokenizer.bin ] ; then ln -s $(GPUCPP)/third_party/llm.c/gpt2_tokenizer.bin ; fi
@@ -48,8 +54,9 @@ gpt2_124M.bin: llm.c
4854
ln -s ./llm.c/gpt2_tokenizer.bin ; \
4955
fi
5056

51-
build/test_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
52-
mkdir -p build
57+
define preprocess_file
58+
sed -i -e 's/int main(/int MAIN(/g' llm.c/test_gpt2.c
59+
sed -i -e 's/int main(/int MAIN(/g' llm.c/train_gpt2.c
5360
sed -i -e 's/void encoder_forward(/void ENCODER_FORWARD_CPU(/g' llm.c/train_gpt2.c
5461
sed -i -e 's/void layernorm_forward(/void LAYERNORM_FORWARD_CPU(/g' llm.c/train_gpt2.c
5562
sed -i -e 's/void matmul_forward(/void MATMUL_FORWARD_CPU(/g' llm.c/train_gpt2.c
@@ -67,26 +74,22 @@ build/test_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
6774
sed -i -e 's/void crossentropy_softmax_backward(/void CROSSENTROPY_SOFTMAX_BACKWARD_CPU(/g' llm.c/train_gpt2.c
6875
grep -q "^#include \"unittest_kernels.h\"" llm.c/train_gpt2.c || \
6976
printf '1i\n#include "unittest_kernels.h"\n.\nw\nq\n' | ed -s llm.c/train_gpt2.c
77+
endef
78+
79+
build/test_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
80+
mkdir -p build
81+
$(call preprocess_file)
7082
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/test_gpt2.c build/unittest_kernels.o
7183

84+
# Profiling variant of test_gpt2: same preprocessing as build/test_gpt2, plus
# the Metal capture helper (metal.mm) and -DMETAL_PROFILER so that
# unittest_kernels.h supplies a main() that wraps the test in a GPU capture.
# The framework name is spelled "Metal" (its on-disk name) so the link also
# succeeds on case-sensitive volumes.
build/test_gpt2_with_metal_profiler: llm.c build/unittest_kernels.o gpt2_124M.bin
	mkdir -p build
	$(call preprocess_file)
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/test_gpt2.c build/unittest_kernels.o -I$(GPUCPP) $(GPUCPP)/experimental/profiler/metal.mm -framework Metal -framework Foundation -DMETAL_PROFILER -g
	# Point the binary at the bundled libdawn.dylib instead of an @rpath lookup.
	install_name_tool -change @rpath/libdawn.dylib $(GPUCPP)/third_party/lib/libdawn.dylib $@
89+
7290
build/train_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
7391
mkdir -p build
74-
sed -i -e 's/void encoder_forward(/void ENCODER_FORWARD_CPU(/g' llm.c/train_gpt2.c
75-
sed -i -e 's/void layernorm_forward(/void LAYERNORM_FORWARD_CPU(/g' llm.c/train_gpt2.c
76-
sed -i -e 's/void matmul_forward(/void MATMUL_FORWARD_CPU(/g' llm.c/train_gpt2.c
77-
sed -i -e 's/void attention_forward(/void ATTENTION_FORWARD_CPU(/g' llm.c/train_gpt2.c
78-
sed -i -e 's/void gelu_forward(/void GELU_FORWARD_CPU(/g' llm.c/train_gpt2.c
79-
sed -i -e 's/void residual_forward(/void RESIDUAL_FORWARD_CPU(/g' llm.c/train_gpt2.c
80-
sed -i -e 's/void softmax_forward(/void SOFTMAX_FORWARD_CPU(/g' llm.c/train_gpt2.c
81-
sed -i -e 's/void crossentropy_forward(/void CROSSENTROPY_FORWARD_CPU(/g' llm.c/train_gpt2.c
82-
sed -i -e 's/void encoder_backward(/void ENCODER_BACKWARD_CPU(/g' llm.c/train_gpt2.c
83-
sed -i -e 's/void layernorm_backward(/void LAYERNORM_BACKWARD_CPU(/g' llm.c/train_gpt2.c
84-
sed -i -e 's/void matmul_backward(/void MATMUL_BACKWARD_CPU(/g' llm.c/train_gpt2.c
85-
sed -i -e 's/void attention_backward(/void ATTENTION_BACKWARD_CPU(/g' llm.c/train_gpt2.c
86-
sed -i -e 's/void gelu_backward(/void GELU_BACKWARD_CPU(/g' llm.c/train_gpt2.c
87-
sed -i -e 's/void residual_backward(/void RESIDUAL_BACKWARD_CPU(/g' llm.c/train_gpt2.c
88-
sed -i -e 's/void crossentropy_softmax_backward(/void CROSSENTROPY_SOFTMAX_BACKWARD_CPU(/g' llm.c/train_gpt2.c
89-
grep -q "^#include \"unittest_kernels.h\"" llm.c/train_gpt2.c || sed -i '1i#include \"unittest_kernels.h\"' llm.c/train_gpt2.c
92+
$(call preprocess_file)
9093
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/train_gpt2.c build/unittest_kernels.o
9194

9295
build/ops.o: ops.cpp ops.hpp kernels.h llm.c

experimental/kernels/unittest_llmc/unittest_kernels.h

+17
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,23 @@
22
extern "C" {
33
#endif
44

5+
// MAIN is the name the build substitutes for llm.c's `int main(` (see the sed
// step in the kernels Makefile). Without profiling it simply maps back to
// `main`; with METAL_PROFILER it maps to a private entry point that the real
// main() below calls between startCapture() and stopCapture().
#ifndef METAL_PROFILER

#define MAIN main

#else

#include "experimental/profiler/metal.hpp"

#define MAIN main_wrapper

// Forward declaration of the renamed llm.c entry point.
static int main_wrapper(int argc, char *argv[]);

// Program entry point for profiling builds: run the original main inside a
// Metal capture and propagate its exit status.
int main(int argc, char *argv[]) {
  startCapture();
  const int status = main_wrapper(argc, argv);
  stopCapture();
  return status;
}

#endif
21+
522
// -- USE_GPU_FOR_* are the GPU/CPU switching flags for the kernels in llm.c. --
623

724
#define USE_GPU_FOR_ENCODER_FORWARD 1

experimental/profiler/metal.hpp

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#ifndef EXPERIMENTAL_PROFILER_METAL_HPP
#define EXPERIMENTAL_PROFILER_METAL_HPP

// C-linkage entry points of the Metal GPU capture helper implemented in
// experimental/profiler/metal.mm. Only declared on Apple platforms.
//
// The linkage specification is guarded by __cplusplus so this header can be
// included from both C and C++ translation units; `extern "C"` on its own is a
// syntax error in C.
#ifdef __APPLE__

#ifdef __cplusplus
extern "C" {
#endif

// Begin a GPU capture (no-op with a log message unless METAL_CAPTURE_ENABLED
// is set to a truthy value in the environment).
void startCapture(void);

// End the GPU capture started by startCapture().
void stopCapture(void);

#ifdef __cplusplus
}
#endif

#endif // __APPLE__

#endif // EXPERIMENTAL_PROFILER_METAL_HPP

experimental/profiler/metal.mm

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#import <Foundation/Foundation.h>
2+
#import <Metal/Metal.h>
3+
#import <QuartzCore/CAMetalLayer.h>
4+
5+
6+
extern "C" {

/// Starts a Metal GPU capture on the system default device and directs the
/// result to the GPU trace document `gpu.cpp.gputrace` (a relative path).
///
/// The capture is opt-in: unless the environment variable
/// METAL_CAPTURE_ENABLED has a truthy value, the function only logs a hint and
/// returns. Any existing `gpu.cpp.gputrace` is removed first, but only once it
/// is known that a capture can actually be attempted. All failures are logged
/// with NSLog and leave the process running without a capture.
void startCapture() {
  // Callers invoke this at the top of a plain C/C++ main(), where no
  // autorelease pool exists yet; provide one for the temporaries below.
  @autoreleasepool {
    NSString *const tracePath = @"gpu.cpp.gputrace";

    NSDictionary<NSString *, NSString *> *environment =
        [NSProcessInfo processInfo].environment;
    if (![environment[@"METAL_CAPTURE_ENABLED"] boolValue]) {
      NSLog(@"METAL_CAPTURE_ENABLED is not set. Please set it to 1 to enable Metal capture.");
      return;
    }

    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
    if (![captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
      NSLog(@"Capturing to a GPU trace document is not supported in this environment.");
      return;
    }

    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    if (!device) {
      NSLog(@"MTLCreateSystemDefaultDevice returned nil. Metal may not be supported on this system.");
      return;
    }

    // Remove a stale trace from a previous run. Success is judged by the
    // method's return value; the NSError out-parameter is only meaningful
    // when the call reports failure.
    NSFileManager *fileManager = [NSFileManager defaultManager];
    if ([fileManager fileExistsAtPath:tracePath]) {
      NSError *removeError = nil;
      if (![fileManager removeItemAtPath:tracePath error:&removeError]) {
        NSLog(@"Error deleting existing gpu.cpp.gputrace directory: %@", removeError);
        return;
      }
      NSLog(@"Deleted existing gpu.cpp.gputrace directory.");
    }

    MTLCaptureDescriptor *descriptor = [[MTLCaptureDescriptor alloc] init];
    descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
    descriptor.outputURL = [NSURL fileURLWithPath:tracePath];
    descriptor.captureObject = device;

    NSError *captureError = nil;
    if (![captureManager startCaptureWithDescriptor:descriptor error:&captureError]) {
      NSLog(@" error capturing mtl => %@ ", [captureError localizedDescription] );
    }
  }
}

/// Stops the capture started by startCapture().
///
/// Safe to call when no capture is running (for example when
/// METAL_CAPTURE_ENABLED was not set): in that case it does nothing.
void stopCapture() {
  @autoreleasepool {
    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
    if (captureManager.isCapturing) {
      [captureManager stopCapture];
    }
  }
}

}

0 commit comments

Comments
 (0)