Skip to content

Commit 1cca3a7

Browse files
committed
Add More benchmarks
1 parent 3732466 commit 1cca3a7

11 files changed

+410
-0
lines changed

resources/data.bin

491 KB
Binary file not shown.

resources/data_write_map.bin

491 KB
Binary file not shown.

resources/data_write_sys.bin

491 KB
Binary file not shown.

src/bench_fixture.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#include <benchmark/benchmark.h>
2+
#include <array>
3+
#include <iostream>
4+
5+
// Holds sixteen ints that the fixture benchmark overwrites on every iteration.
struct CircularBuffer {
  // Backing storage; value-initialised, so every slot starts at zero.
  std::array<int, 16> data{};

  // Replaces the entire contents of `data` with the elements of `in`.
  void fill(std::array<int, 16> in) {
    this->data = in;
  }

  // Stores `d` in the first slot; the other fifteen slots are left untouched.
  void init(int d) {
    this->data.front() = d;
  }
};
16+
17+
class TestClass : public ::benchmark::Fixture {
18+
public:
19+
virtual void SetUp(::benchmark::State& state) {
20+
buffer.init(10);
21+
std::cout << "Setup Called" << std::endl;
22+
}
23+
24+
virtual void TearDown(::benchmark::State& state) {
25+
std::cout << "TearDown Called" << std::endl;
26+
}
27+
28+
CircularBuffer buffer;
29+
};
30+
31+
BENCHMARK_F(TestClass, FillTest)(::benchmark::State& state) {
32+
std::array<int, 16> d{12};
33+
for (auto _ : state) {
34+
buffer.fill(d);
35+
}
36+
}
37+
38+
BENCHMARK_MAIN();

src/cache_flush.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
//#include <asm/cachectl.h>
2+
3+
// Two parallel arrays of sixteen ints each; this is the object main() flushes.
struct DataSoA {
  int x[16];  // first component array
  int y[16];  // second component array
};
7+
8+
int main() {
9+
DataSoA* d = new DataSoA{};
10+
cacheflush(d, sizeof(DataSoA), BCACHE);
11+
return 0;
12+
}

src/cache_perf.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#include "benchmark/benchmark.h"
2+
#include <atomic>
3+
4+
// Three atomic ints declared back to back, each starting at zero. Per the type
// name they are meant to share one cache line.
struct StructFittingInSameCacheLine {
  std::atomic<int> a{0};  // read by the benchmark
  std::atomic<int> b{0};  // incremented by the benchmark
  std::atomic<int> c{0};  // read by the benchmark
};
9+
10+
// Two atomic ints, each starting at zero, with a 33000-byte filler array
// declared between them so they are far apart in memory.
struct StructFittingInSeperateCacheLine {
  std::atomic<int> a{0};
  char padding[33000];  // spacer only; never read or written
  std::atomic<int> b{0};
};
15+
16+
static void SameReadModifyWrite (benchmark::State& state) {
17+
StructFittingInSameCacheLine temp;
18+
for (auto _ : state) {
19+
auto r = temp.a.load();
20+
temp.b.fetch_add(10);
21+
auto t = temp.c.load();
22+
}
23+
}
24+
25+
BENCHMARK(SameReadModifyWrite);
26+
27+
static void DifferentReadModifyWrite (benchmark::State& state) {
28+
StructFittingInSameCacheLine temp;
29+
for (auto _ : state) {
30+
auto r = temp.a.load();
31+
temp.b.fetch_add(10);
32+
auto t = temp.c.load();
33+
}
34+
}
35+
36+
BENCHMARK(DifferentReadModifyWrite);
37+
38+
BENCHMARK_MAIN();

src/cache_thrashing.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#include <iostream>
2+
#include <chrono>
3+
#include <vector>
4+
#include <atomic>
5+
#include <thread>
6+
7+
using namespace std;
8+
9+
// Two atomic counters declared next to each other with no alignment request.
struct UnaligendAtomic {
  std::atomic<int> x{0};  // updated by ModifyX
  std::atomic<int> y{0};  // updated by ModifyY
};
13+
14+
// Two atomic counters, each forced onto its own 64-byte boundary via alignas.
struct AligendAtomic {
  alignas(64) std::atomic<int> x{0};  // updated by ModifyX
  alignas(64) std::atomic<int> y{0};  // updated by ModifyY
};
18+
19+
// Start gate: ModifyX/ModifyY spin while this is true; main() clears it.
static std::atomic<bool> wait(true);
// Raised by ModifyX once it has been entered.
static std::atomic<bool> x_started(false);
// Raised by ModifyY once it has been entered.
static std::atomic<bool> y_started(false);
22+
23+
template<typename T>
24+
void ModifyX(T& data) {
25+
std::cout << "X Waiting" << std::endl;
26+
x_started = true;
27+
while(wait) {}
28+
for (int i=0; i<100000000;i++) {
29+
data.x = data.x + 1;
30+
}
31+
}
32+
33+
template<typename T>
34+
void ModifyY(T& data) {
35+
y_started = true;
36+
std::cout << "Y Waiting" << std::endl;
37+
while(wait) {}
38+
for (int i=0; i<100000000;i++) {
39+
data.y = data.y + 1;
40+
}
41+
}
42+
43+
int main() {
44+
using CacheStruct = UnaligendAtomic;
45+
CacheStruct data;
46+
std::cout << "Starting Threads" << std::endl;
47+
std::thread t1{ModifyX<CacheStruct>, std::ref(data)};
48+
std::thread t2{ModifyY<CacheStruct>, std::ref(data)};
49+
// Setting it to different cores
50+
cpu_set_t cpu3;
51+
cpu_set_t cpu1;
52+
CPU_ZERO(&cpu3);
53+
CPU_ZERO(&cpu1);
54+
CPU_SET(3, &cpu3);
55+
CPU_SET(1, &cpu1);
56+
57+
pthread_setaffinity_np(t1.native_handle(), sizeof(cpu_set_t), &cpu1);
58+
pthread_setaffinity_np(t2.native_handle(), sizeof(cpu_set_t), &cpu3);
59+
60+
while(!x_started) {}
61+
while(!y_started) {}
62+
63+
wait = false;
64+
std::cout << "Signalled" << std::endl;
65+
t1.join();
66+
t2.join();
67+
}

src/misc_perf.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#include "benchmark/benchmark.h"
2+
3+
void function_with_large_parameters(uint64_t a, uint64_t b,uint64_t c,
4+
uint64_t d,uint64_t e, uint64_t f,
5+
uint64_t g) {
6+
// For comparing apple to apple
7+
uint64_t result = a + b + c;
8+
}
9+
10+
void function_with_small_parameters(uint64_t a, uint64_t b, uint64_t c,
11+
uint64_t d, uint64_t e, uint64_t f) {
12+
uint64_t result = a + b + c;
13+
}
14+
15+
// Plain aggregate bundling the three ints that the struct-passing benchmark
// hands to its callee.
struct Arguments {
  int a;  // first addend
  int b;  // second addend
  int c;  // third addend
};
20+
21+
void function_with_arguments_packed_in_struct(struct Arguments& args) {
22+
int result = args.a + args.b + args.c;
23+
}
24+
25+
void function_with_arguments_not_passed_in_struct(int a, int b, int c) {
26+
int result = a + b + c;
27+
}
28+
29+
// Benchmark body: one call to function_with_large_parameters per iteration,
// always with the constant arguments 1..7.
static void BM_function_with_large_parameters(benchmark::State& state) {
  for (auto _ : state)
    function_with_large_parameters(1, 2, 3, 4, 5, 6, 7);
}
34+
35+
// Benchmark body: one call to function_with_small_parameters per iteration,
// always with the constant arguments 1..6.
static void BM_function_with_small_parameters(benchmark::State& state) {
  for (auto _ : state)
    function_with_small_parameters(1, 2, 3, 4, 5, 6);
}
40+
41+
// Benchmark body: each iteration builds an Arguments{1, 2, 3} and passes it by
// reference to function_with_arguments_packed_in_struct.
static void BM_function_with_arguments_packed_in_struct(benchmark::State& state) {
  for (auto _ : state) {
    // Constructed inside the loop, so building the struct is part of the timing.
    Arguments packed{1, 2, 3};
    function_with_arguments_packed_in_struct(packed);
  }
}
47+
48+
// Benchmark body: one call per iteration to
// function_with_arguments_not_passed_in_struct with the constants 1, 2, 3.
static void BM_function_with_arguments_not_passed_in_struct(benchmark::State& state) {
  for (auto _ : state)
    function_with_arguments_not_passed_in_struct(1, 2, 3);
}
53+
54+
// Polymorphic base used by the virtual-call benchmarks.
class A {
 public:
  // Polymorphic types need a virtual destructor so that deleting through a base
  // pointer is well defined.
  virtual ~A() = default;

  // Operands of getAdd(). They were previously left uninitialised, so a
  // default-constructed object made getAdd() read indeterminate values.
  int x{0};
  int y{0};

  // Returns x + y; virtual so that calls through A* dispatch dynamically.
  virtual int getAdd() {
    return x + y;
  }
};
62+
63+
// Derived type that adds nothing and overrides nothing; it inherits
// A::getAdd unchanged.
class B : public A {};
65+
66+
// Times getAdd() invoked through a base-class pointer to a B object.
static void BM_virtual_function_call(benchmark::State& state) {
  B x;
  // Give the operands defined values; the original read them uninitialised.
  x.x = 1;
  x.y = 2;
  A* y = &x;
  // Hide the pointer's provenance so the compiler cannot see the dynamic type
  // and turn the virtual call into a direct one.
  benchmark::DoNotOptimize(y);
  for (auto _ : state) {
    int sum = y->getAdd();
    // Keep the result alive so the call is not removed as dead code.
    benchmark::DoNotOptimize(sum);
  }
}
73+
74+
static void BM_non_virtual_function_call(benchmark::State& state) {
75+
A x;
76+
for (auto _ : state) {
77+
x.getAdd();
78+
}
79+
}
80+
81+
BENCHMARK(BM_function_with_small_parameters);
82+
BENCHMARK(BM_function_with_large_parameters);
83+
BENCHMARK(BM_function_with_arguments_packed_in_struct);
84+
BENCHMARK(BM_function_with_arguments_not_passed_in_struct);
85+
BENCHMARK(BM_virtual_function_call);
86+
BENCHMARK(BM_non_virtual_function_call);
87+
88+
BENCHMARK_MAIN();

src/vecmemcpy.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#include <benchmark/benchmark.h>
2+
#include <omp.h>
3+
#include <cstdlib>
4+
#include <cstring>
5+
6+
namespace valgo {
// Copies `n` bytes from `src` to `dst` (the ranges must not overlap) in three
// phases:
//   1. head - bytes up to the next 64-byte boundary of `src`, via std::memcpy;
//   2. body - whole 64-byte chunks, copied by an OpenMP-SIMD loop with software
//             prefetch 16 KiB ahead of the current position;
//   3. tail - the remaining (< 64) bytes, via std::memcpy.
//
// Fixes relative to the original:
//   * the head copy is clamped to `n` (it used to copy up to 63 bytes even
//     when n was smaller);
//   * the chunk loop indexes from the aligned base starting at 0 (it used to
//     add the head length twice, skipping data and running past the end);
//   * the chunk loop only runs over complete 64-byte chunks (it used to copy a
//     full chunk even when fewer bytes remained, overrunning both buffers);
//   * all sizes and indices are std::size_t instead of int.
__attribute__((always_inline)) void memcpy(char* dst, char const * src, std::size_t n) {
  constexpr std::size_t kChunk = 64;
  constexpr std::size_t kPrefetchAhead = 16384;

  // Bytes needed to bring `src` up to the next 64-byte boundary, never more than n.
  const std::size_t misalignment = reinterpret_cast<std::uintptr_t>(src) & (kChunk - 1);
  std::size_t head = misalignment ? kChunk - misalignment : 0;
  if (head > n) {
    head = n;
  }
  if (head) {
    std::memcpy(dst, src, head);
  }

  const char* new_src = src + head;
  char* new_dst = dst + head;

  // Number of bytes covered by complete 64-byte chunks after the head.
  const std::size_t body = (n - head) & ~(kChunk - 1);

  for (std::size_t j = 0; j < body; j += kChunk) {
    // Second argument: 0 = read, 1 = write. Third argument: locality hint 0.
    __builtin_prefetch(new_src + j + kPrefetchAhead, 0, 0);
    __builtin_prefetch(new_dst + j + kPrefetchAhead, 1, 0);
    #pragma omp simd simdlen(8) aligned(new_src: 64)
    for (std::size_t i = 0; i < kChunk; i++) {
      new_dst[j + i] = new_src[j + i];
    }
  }

  const std::size_t tail = n - head - body;
  if (tail > 0) {
    std::memcpy(new_dst + body, new_src + body, tail);
  }
}
}
34+
35+
// Fills data[0, n) with pseudo-random bytes taken from std::rand() % 255.
// The index is std::size_t so it matches `n`; the original `int` index mixed
// signedness in the comparison and could overflow for very large buffers.
void RandomData(char* data, std::size_t n) {
  for (std::size_t i = 0; i < n; i++) {
    data[i] = static_cast<char>(std::rand() % 255);
  }
}
40+
41+
// Number of bytes in one kilobyte as used by the fixture's size computation.
constexpr int KiloByte() {
  return 1 << 10;
}
44+
45+
class MemcpyFixture : public ::benchmark::Fixture {
46+
void SetUp(const ::benchmark::State& state) override {
47+
size = state.range(0) * KiloByte();
48+
dest = new char[size];
49+
src = new char[size];
50+
51+
RandomData(src, size);
52+
}
53+
54+
void TearDown(const ::benchmark::State& state) override {
55+
delete[] dest;
56+
delete[] src;
57+
}
58+
59+
public:
60+
char* dest = nullptr;
61+
char* src = nullptr;
62+
int size = 0;
63+
};
64+
65+
// Times std::memcpy over the fixture's buffers and reports throughput.
BENCHMARK_DEFINE_F(MemcpyFixture, StdlibMemcpy)(benchmark::State& st) {
  for (auto _ : st) {
    std::memcpy(dest, src, size);
    // Treat memory as observed so the copy cannot be elided as a dead store.
    benchmark::ClobberMemory();
  }
  st.SetBytesProcessed(static_cast<int64_t>(st.iterations()) * size);
}
70+
71+
// Times valgo::memcpy over the fixture's buffers and reports throughput.
BENCHMARK_DEFINE_F(MemcpyFixture, VectorizedMemcpy)(benchmark::State& st) {
  for (auto _ : st) {
    valgo::memcpy(dest, src, size);
    // Treat memory as observed so the copy cannot be elided as a dead store.
    benchmark::ClobberMemory();
  }
  st.SetBytesProcessed(static_cast<int64_t>(st.iterations()) * size);
}
76+
77+
#define RUN_FOR_ALL_SIZE(f, t) BENCHMARK_REGISTER_F(f, t)->RangeMultiplier(2)->Range(1, 1 << 10);
78+
79+
//RUN_FOR_ALL_SIZE(MemcpyFixture, StdlibMemcpy);
80+
//RUN_FOR_ALL_SIZE(MemcpyFixture, VectorizedMemcpy);
81+
82+
BENCHMARK_REGISTER_F(MemcpyFixture, VectorizedMemcpy)->Args({8 * 1024});
83+
84+
BENCHMARK_MAIN();

src/vecstdmemcpy.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#include <benchmark/benchmark.h>
2+
#include <omp.h>
3+
#include <cstdlib>
4+
#include <cstring>
5+
6+
namespace valgo {
// Byte-by-byte copy of `n` bytes from `src` to `dst`, annotated for OpenMP SIMD
// vectorisation. The index is std::size_t so it matches `n`; the original
// `int` index mixed signedness in the comparison and could overflow for very
// large buffers.
void memcpy(char* dst, char const * src, std::size_t n) {
  #pragma omp simd simdlen(16)
  for (std::size_t i = 0; i < n; i++) {
    dst[i] = src[i];
  }
}
}
14+
15+
// Fills data[0, n) with pseudo-random bytes taken from std::rand() % 255.
// The index is std::size_t so it matches `n`; the original `int` index mixed
// signedness in the comparison and could overflow for very large buffers.
void RandomData(char* data, std::size_t n) {
  for (std::size_t i = 0; i < n; i++) {
    data[i] = static_cast<char>(std::rand() % 255);
  }
}
20+
21+
// Number of bytes in one kilobyte as used by the fixture's size computation.
constexpr int KiloByte() {
  return 1 << 10;
}
24+
25+
class MemcpyFixture : public ::benchmark::Fixture {
26+
void SetUp(const ::benchmark::State& state) override {
27+
size = state.range(0) * KiloByte();
28+
dest = new char[size];
29+
src = new char[size];
30+
31+
RandomData(src, size);
32+
}
33+
34+
void TearDown(const ::benchmark::State& state) override {
35+
delete[] dest;
36+
delete[] src;
37+
}
38+
39+
public:
40+
char* dest = nullptr;
41+
char* src = nullptr;
42+
int size = 0;
43+
};
44+
45+
// Times std::memcpy over the fixture's buffers and reports throughput.
BENCHMARK_DEFINE_F(MemcpyFixture, StdlibMemcpy)(benchmark::State& st) {
  for (auto _ : st) {
    std::memcpy(dest, src, size);
    // Treat memory as observed so the copy cannot be elided as a dead store.
    benchmark::ClobberMemory();
  }
  st.SetBytesProcessed(static_cast<int64_t>(st.iterations()) * size);
}
50+
51+
// Times valgo::memcpy over the fixture's buffers and reports throughput.
BENCHMARK_DEFINE_F(MemcpyFixture, VectorizedMemcpy)(benchmark::State& st) {
  for (auto _ : st) {
    valgo::memcpy(dest, src, size);
    // Treat memory as observed so the copy cannot be elided as a dead store.
    benchmark::ClobberMemory();
  }
  st.SetBytesProcessed(static_cast<int64_t>(st.iterations()) * size);
}
56+
57+
#define RUN_FOR_ALL_SIZE(f, t) BENCHMARK_REGISTER_F(f, t)->RangeMultiplier(2)->Range(1, 1 << 10);
58+
59+
//RUN_FOR_ALL_SIZE(MemcpyFixture, StdlibMemcpy);
60+
//RUN_FOR_ALL_SIZE(MemcpyFixture, VectorizedMemcpy);
61+
62+
BENCHMARK_REGISTER_F(MemcpyFixture, StdlibMemcpy)->Args({8 * 1024});
63+
64+
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)