Skip to content

Commit 800ee3d

Browse files
committed
[MicroBenchmarks] Add benchmark for control-flow-vectorization.
Benchmarks for loops whose bodies contain conditional code (control flow inside the for loop), comparing auto-vectorized kernels against the same kernels with vectorization disabled.
1 parent c053bed commit 800ee3d

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

MicroBenchmarks/LoopVectorization/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,12 @@ llvm_test_executable(LoopEpilogueVectorizationBenchmarks
3636
)
3737

3838
target_link_libraries(LoopEpilogueVectorizationBenchmarks benchmark)
39+
40+
# NOTE(review): presumably applies to the executable declared next — confirm
# against the test-suite's llvm_test_run() helper.
llvm_test_run()
41+
42+
# Benchmark executable built from main.cpp and ControlFlowVectorization.cpp.
llvm_test_executable(ControlFlowVectorizationBenchmarks
43+
main.cpp
44+
ControlFlowVectorization.cpp
45+
)
46+
47+
# Link the new executable against the `benchmark` library target.
target_link_libraries(ControlFlowVectorizationBenchmarks benchmark)
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <random>

#include "benchmark/benchmark.h"
6+
7+
// Number of array elements allocated, initialized and processed by every
// benchmark in this file. A typed constant is used instead of a macro so the
// value has a definite type (it is passed as the kernels' `unsigned` count).
static constexpr unsigned ITERATIONS = 100000;
8+
9+
// Pointer type shared by all kernels in this file: a function taking the array
// to update in place and the number of elements to process.
template <typename T> using CFVFunc = void (*)(T *, unsigned);
10+
11+
// Define a pair of conditional increment kernels, run_<name>_autovec and
// run_<name>_novec. Both add 1 to A[i] for every index i in [0, N) that is a
// multiple of `stride`. The loop bodies are identical; the _novec variant is
// preceded by a clang loop pragma that disables vectorization and
// interleaving. Both functions are marked noinline.
//
// `stride` is parenthesized in the expansion so that an expression argument
// (e.g. `2 + 2`) keeps its meaning as the right-hand operand of `%`.
#define DEF_COND_INC_LOOP(name, stride)                                        \
  template <typename T>                                                        \
  __attribute__((noinline)) static void run_##name##_autovec(T *A,             \
                                                             unsigned N) {     \
    for (unsigned i = 0; i < N; i++) {                                         \
      if (i % (stride) == 0) {                                                 \
        A[i] = A[i] + 1;                                                       \
      }                                                                        \
    }                                                                          \
  }                                                                            \
  template <typename T>                                                        \
  __attribute__((noinline)) static void run_##name##_novec(T *A, unsigned N) { \
    _Pragma("clang loop vectorize(disable) interleave(disable)")               \
    for (unsigned i = 0; i < N; i++) {                                         \
      if (i % (stride) == 0) {                                                 \
        A[i] = A[i] + 1;                                                       \
      }                                                                        \
    }                                                                          \
  }
31+
32+
// Define a pair of data-dependent conditional increment kernels,
// run_<name>_autovec and run_<name>_novec. Both add 1 to every element among
// the first N of A whose current value equals `marker`. The loop bodies are
// identical; the _novec variant is preceded by a clang loop pragma that
// disables vectorization and interleaving. Both functions are marked noinline.
//
// `marker` is parenthesized in the expansion so that an expression argument
// (e.g. a conditional expression) is compared as a whole.
#define DEF_COND_INC_VALUE_LOOP(name, marker)                                  \
  template <typename T>                                                        \
  __attribute__((noinline)) static void run_##name##_autovec(T *A,             \
                                                             unsigned N) {     \
    for (unsigned i = 0; i < N; i++) {                                         \
      if (A[i] == (marker)) {                                                  \
        A[i] = A[i] + 1;                                                       \
      }                                                                        \
    }                                                                          \
  }                                                                            \
  template <typename T>                                                        \
  __attribute__((noinline)) static void run_##name##_novec(T *A, unsigned N) { \
    _Pragma("clang loop vectorize(disable) interleave(disable)")               \
    for (unsigned i = 0; i < N; i++) {                                         \
      if (A[i] == (marker)) {                                                  \
        A[i] = A[i] + 1;                                                       \
      }                                                                        \
    }                                                                          \
  }
52+
53+
// Unconditional increment kernel: adds 1 to each of the first N elements of A.
// No loop pragma is attached to this variant.
template <typename T>
__attribute__((noinline)) static void run_uncond_inc_autovec(T *A, unsigned N) {
  for (unsigned idx = 0; idx < N; ++idx)
    A[idx] += 1;
}
60+
61+
// Unconditional increment kernel with vectorization and interleaving disabled
// through a clang loop pragma: adds 1 to each of the first N elements of A.
template <typename T>
__attribute__((noinline)) static void run_uncond_inc_novec(T *A, unsigned N) {
  _Pragma("clang loop vectorize(disable) interleave(disable)")
  for (unsigned idx = 0; idx < N; ++idx)
    A[idx] += 1;
}
68+
69+
// Instantiate the conditional increment kernels for a range of strides. The
// guard `i % stride == 0` is true for one out of every `stride` iterations:
70+
// stride=2: 50% active lanes
71+
// stride=4: 25% active lanes
72+
// stride=8: 12.5% active lanes
73+
// stride=16: 6.25% active lanes
74+
// stride=32: 3.125% active lanes
75+
// stride=64: 1.5625% active lanes
76+
// stride=128: 0.78125% active lanes
77+
DEF_COND_INC_LOOP(cond_inc_stride_2, 2)
78+
DEF_COND_INC_LOOP(cond_inc_stride_4, 4)
79+
DEF_COND_INC_LOOP(cond_inc_stride_8, 8)
80+
DEF_COND_INC_LOOP(cond_inc_stride_16, 16)
81+
DEF_COND_INC_LOOP(cond_inc_stride_32, 32)
82+
DEF_COND_INC_LOOP(cond_inc_stride_64, 64)
83+
DEF_COND_INC_LOOP(cond_inc_stride_128, 128)
84+
85+
// Conditional increment by value: only elements equal to 42 are incremented.
// init_data draws values uniformly from [0, 100], so the condition is sparse
// (roughly 1 element in 101).
86+
DEF_COND_INC_VALUE_LOOP(cond_inc_by_value, 42)
87+
88+
// Initialize array with random numbers.
89+
template <typename T> static void init_data(T *A) {
90+
std::uniform_int_distribution<T> dist(0, 100);
91+
std::mt19937 rng(12345);
92+
for (unsigned i = 0; i < ITERATIONS; i++) {
93+
A[i] = dist(rng);
94+
}
95+
}
96+
97+
// Benchmark vectorized version.
98+
template <typename T>
99+
static void __attribute__((always_inline))
100+
benchmark_cfv_autovec(benchmark::State &state, CFVFunc<T> VecFn,
101+
CFVFunc<T> NoVecFn) {
102+
std::unique_ptr<T[]> A(new T[ITERATIONS]);
103+
std::unique_ptr<T[]> A_vec(new T[ITERATIONS]);
104+
std::unique_ptr<T[]> A_novec(new T[ITERATIONS]);
105+
init_data(&A[0]);
106+
107+
#ifdef BENCH_AND_VERIFY
108+
// Verify the vectorized and scalar versions produce the same results.
109+
{
110+
std::copy(&A[0], &A[0] + ITERATIONS, &A_vec[0]);
111+
std::copy(&A[0], &A[0] + ITERATIONS, &A_novec[0]);
112+
VecFn(&A_vec[0], ITERATIONS);
113+
NoVecFn(&A_novec[0], ITERATIONS);
114+
for (unsigned i = 0; i < ITERATIONS; i++) {
115+
if (A_vec[i] != A_novec[i]) {
116+
std::cerr << "ERROR: vectorization result different at index " << i
117+
<< "; " << A_vec[i] << " != " << A_novec[i] << "\n";
118+
exit(1);
119+
}
120+
}
121+
}
122+
#endif
123+
124+
for (auto _ : state) {
125+
std::copy(&A[0], &A[0] + ITERATIONS, &A_vec[0]);
126+
VecFn(&A_vec[0], ITERATIONS);
127+
benchmark::DoNotOptimize(A_vec);
128+
benchmark::ClobberMemory();
129+
}
130+
}
131+
132+
// Benchmark version with vectorization disabled.
133+
template <typename T>
134+
static void __attribute__((always_inline))
135+
benchmark_cfv_novec(benchmark::State &state, CFVFunc<T> NoVecFn) {
136+
std::unique_ptr<T[]> A(new T[ITERATIONS]);
137+
std::unique_ptr<T[]> A_work(new T[ITERATIONS]);
138+
init_data(&A[0]);
139+
140+
for (auto _ : state) {
141+
std::copy(&A[0], &A[0] + ITERATIONS, &A_work[0]);
142+
NoVecFn(&A_work[0], ITERATIONS);
143+
benchmark::DoNotOptimize(A_work);
144+
benchmark::ClobberMemory();
145+
}
146+
}
147+
148+
// Register two benchmarks for kernel `name` with element type `ty`:
//  - BENCHMARK_<name>_autovec_<ty>_ runs run_<name>_autovec through
//    benchmark_cfv_autovec (which also receives run_<name>_novec for the
//    optional BENCH_AND_VERIFY check);
//  - BENCHMARK_<name>_novec_<ty>_ runs run_<name>_novec through
//    benchmark_cfv_novec.
// Both report time in nanoseconds. The generated entry points are `static`,
// like the other helpers in this file; they are only referenced by the
// BENCHMARK registration that immediately follows each of them.
#define BENCHMARK_CFV_CASE(name, ty)                                           \
  static void BENCHMARK_##name##_autovec_##ty##_(benchmark::State &state) {    \
    benchmark_cfv_autovec<ty>(state, run_##name##_autovec, run_##name##_novec);\
  }                                                                            \
  BENCHMARK(BENCHMARK_##name##_autovec_##ty##_)->Unit(benchmark::kNanosecond); \
                                                                               \
  static void BENCHMARK_##name##_novec_##ty##_(benchmark::State &state) {      \
    benchmark_cfv_novec<ty>(state, run_##name##_novec);                        \
  }                                                                            \
  BENCHMARK(BENCHMARK_##name##_novec_##ty##_)->Unit(benchmark::kNanosecond);
158+
159+
// Unconditional increment benchmark.
// The unconditional kernels follow the same run_<name>_autovec /
// run_<name>_novec naming scheme as the conditional ones, so this is exactly
// BENCHMARK_CFV_CASE with name = uncond_inc. Delegating keeps the two
// registration paths from drifting apart; the generated function names
// (BENCHMARK_uncond_inc_autovec_<ty>_ and BENCHMARK_uncond_inc_novec_<ty>_)
// are unchanged.
#define BENCHMARK_UNCOND_CASE(ty) BENCHMARK_CFV_CASE(uncond_inc, ty)
172+
173+
// Add benchmarks for all variants.
174+
#define ADD_CFV_BENCHMARKS(ty) \
175+
BENCHMARK_UNCOND_CASE(ty) \
176+
BENCHMARK_CFV_CASE(cond_inc_stride_2, ty) \
177+
BENCHMARK_CFV_CASE(cond_inc_stride_4, ty) \
178+
BENCHMARK_CFV_CASE(cond_inc_stride_8, ty) \
179+
BENCHMARK_CFV_CASE(cond_inc_stride_16, ty) \
180+
BENCHMARK_CFV_CASE(cond_inc_stride_32, ty) \
181+
BENCHMARK_CFV_CASE(cond_inc_stride_64, ty) \
182+
BENCHMARK_CFV_CASE(cond_inc_stride_128, ty) \
183+
BENCHMARK_CFV_CASE(cond_inc_by_value, ty)
184+
185+
ADD_CFV_BENCHMARKS(int64_t)
186+
ADD_CFV_BENCHMARKS(int32_t)
187+
ADD_CFV_BENCHMARKS(int16_t)
188+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Checking conditional_increment_i64_stride_4
2+
Checking conditional_increment_i32_stride_4
3+
Checking conditional_increment_i16_stride_4
4+
Checking conditional_increment_i64_by_value
5+
Checking conditional_increment_i32_by_value
6+
Checking conditional_increment_i16_by_value
7+
exit 0

0 commit comments

Comments
 (0)