I found that using std::variant + std::visit is inefficient (the performance is almost the same as virtual function calls), but when I replace std::visit with dispatch on x.index(), the performance is as good as CRTP.
My code:
#include <benchmark/benchmark.h>
#include <memory>
#include <vector>
#include <variant>
#include <cmath>
namespace variant_shapes {

// Circle implementation - no virtual functions
class Circle {
private:
    double radius_;

public:
    explicit Circle(double radius) : radius_(radius) {}

    double area() const {
        return M_PI * radius_ * radius_;
    }

    double getRadius() const { return radius_; }
};

// Rectangle implementation - no virtual functions
class Rectangle {
private:
    double width_;
    double height_;

public:
    Rectangle(double width, double height) : width_(width), height_(height) {}

    double area() const {
        return width_ * height_;
    }

    double getWidth() const { return width_; }
    double getHeight() const { return height_; }
};

} // namespace variant_shapes
// Benchmark: std::visit with an if constexpr visitor
static void BM_StaticDispatch(benchmark::State& state) {
    std::vector<std::variant<variant_shapes::Circle, variant_shapes::Rectangle>> shapes;
    shapes.reserve(1000);
    for (int i = 0; i < 1000; ++i) {
        if (i % 2 == 0) {
            shapes.emplace_back(variant_shapes::Circle(5.0));
        } else {
            shapes.emplace_back(variant_shapes::Rectangle(4.0, 6.0));
        }
    }
    auto visitor = [](auto&& arg) -> double {
        using T = std::decay_t<decltype(arg)>;
        if constexpr (std::is_same_v<T, variant_shapes::Circle>) {
            return arg.area();
        } else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>) {
            return arg.area();
        }
    };
    for (auto _ : state) {
        double totalArea = 0.0;
        for (const auto& s : shapes) {
            totalArea += std::visit(visitor, s);
        }
        benchmark::DoNotOptimize(totalArea);
    }
}
BENCHMARK(BM_StaticDispatch);
// Static dispatch helper using if constexpr
template <typename T>
double get_area(const T& shape) {
    if constexpr (std::is_same_v<T, variant_shapes::Circle>) {
        return shape.area();
    } else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>) {
        return shape.area();
    }
    return 0.0; // should never be reached
}

// Benchmark: manual dispatch on variant::index() instead of std::visit
static void BM_StaticDispatch2(benchmark::State& state) {
    std::vector<std::variant<variant_shapes::Circle, variant_shapes::Rectangle>> shapes;
    shapes.reserve(1000);
    for (int i = 0; i < 1000; ++i) {
        if (i % 2 == 0) {
            shapes.emplace_back(variant_shapes::Circle(5.0));
        } else {
            shapes.emplace_back(variant_shapes::Rectangle(4.0, 6.0));
        }
    }
    // Note: this visitor is unused in this benchmark; dispatch is done manually below.
    auto visitor = [](auto&& arg) -> double {
        using T = std::decay_t<decltype(arg)>;
        if constexpr (std::is_same_v<T, variant_shapes::Circle>) {
            return arg.area();
        } else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>) {
            return arg.area();
        }
    };
    for (auto _ : state) {
        double totalArea = 0.0;
        for (const auto& s : shapes) {
            totalArea += s.index() == 0
                             ? get_area(std::get<0>(s))
                             : get_area(std::get<1>(s));
        }
        benchmark::DoNotOptimize(totalArea);
    }
}
BENCHMARK(BM_StaticDispatch2);
Performance test link: Quick C++ Benchmark.
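For reference, a minimal sketch of the kind of virtual-function baseline being compared against (the actual baseline benchmark is not included above, so this is only an assumed reconstruction; the names are illustrative):

namespace virtual_shapes {
// Assumed virtual-function baseline for comparison (illustrative only).
class Shape {
public:
    virtual ~Shape() = default;
    virtual double area() const = 0;
};

class Circle : public Shape {
    double radius_;
public:
    explicit Circle(double radius) : radius_(radius) {}
    double area() const override { return M_PI * radius_ * radius_; }
};

class Rectangle : public Shape {
    double width_;
    double height_;
public:
    Rectangle(double width, double height) : width_(width), height_(height) {}
    double area() const override { return width_ * height_; }
};
} // namespace virtual_shapes

static void BM_VirtualDispatch(benchmark::State& state) {
    std::vector<std::unique_ptr<virtual_shapes::Shape>> shapes;
    shapes.reserve(1000);
    for (int i = 0; i < 1000; ++i) {
        if (i % 2 == 0) {
            shapes.emplace_back(std::make_unique<virtual_shapes::Circle>(5.0));
        } else {
            shapes.emplace_back(std::make_unique<virtual_shapes::Rectangle>(4.0, 6.0));
        }
    }
    for (auto _ : state) {
        double totalArea = 0.0;
        for (const auto& s : shapes) {
            totalArea += s->area(); // one virtual call per element
        }
        benchmark::DoNotOptimize(totalArea);
    }
}
BENCHMARK(BM_VirtualDispatch);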
What you are seeing is just a bad optimization clang performs on this particular code that makes the std::visit version worse; both gcc and clang are able to inline both versions without a jump table, and in gcc's case both functions take the same time.
Overall, this example is heavily influenced by the CPU and by how well the compiler optimizes this exact code, so it shouldn't be taken as a general statement about std::variant performance. It is nice to know, however, that both gcc and clang can inline std::visit over two alternatives.
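If you want to reproduce the codegen discussed below yourself, a stripped-down pair of loops like this (my reduction, reusing the Circle and Rectangle classes from the question; not necessarily the exact snippet behind the links) is enough to paste into an online compiler:

#include <variant>
#include <vector>

using Shape = std::variant<variant_shapes::Circle, variant_shapes::Rectangle>;

// std::visit version
double sum_visit(const std::vector<Shape>& shapes) {
    double total = 0.0;
    for (const auto& s : shapes)
        total += std::visit([](const auto& x) { return x.area(); }, s);
    return total;
}

// manual index() version
double sum_index(const std::vector<Shape>& shapes) {
    double total = 0.0;
    for (const auto& s : shapes)
        total += s.index() == 0 ? std::get<0>(s).area() : std::get<1>(s).area();
    return total;
}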
For clang, the std::visit version's assembly is as follows (online compiler):
.LBB0_27:
mov qword ptr [rsp + 8], 0
xorpd xmm1, xmm1
mov rax, rbx
jmp .LBB0_28
.LBB0_30:
mulsd xmm2, xmm3 // xmm2 *= xmm3
addsd xmm1, xmm2 // totalArea += xmm2
movsd qword ptr [rsp + 8], xmm1
add rax, 24
cmp rax, r12
je .LBB0_31
.LBB0_28:
cmp byte ptr [rax + 16], 0
movsd xmm2, qword ptr [rax] // xmm2 = radius
movapd xmm3, xmm2 // xmm3 = radius
mulsd xmm3, xmm0 // xmm3 *= M_PI
je .LBB0_30
movsd xmm3, qword ptr [rax + 8] // xmm3 = height
jmp .LBB0_30
which roughly translates to
test_flag = (index == 0);        // true for Circle
xmm2 = radius_or_width;
xmm3 = xmm2 * M_PI;              // computed even for a Rectangle
if (!test_flag)
{
    xmm3 = height_;
}
totalArea += xmm2 * xmm3;        // pi * r * r   or   width * height
Even if the CPU can predict that branch perfectly, it still evaluates radius * M_PI on every iteration, and the cost of that wasted work can differ drastically between CPUs.
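In plain C++, the per-element work clang emits is roughly equivalent to this hand-written approximation (my rendering of the assembly above, not source the compiler actually produced):

#include <cmath>

// Approximation of clang's std::visit loop body (illustrative only).
// first  = radius for a Circle, width  for a Rectangle
// second = unused for a Circle, height for a Rectangle
double element_area_clang_style(double first, double second, bool is_circle) {
    double b = first * M_PI;   // speculative Circle partial product, always computed
    if (!is_circle)
        b = second;            // Rectangle: throw the speculative value away
    return first * b;          // pi * r * r   or   width * height
}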
Whereas the ternary (index-based) version turned into the following:
.LBB1_28:
mov qword ptr [rsp + 8], 0
xorpd xmm1, xmm1
mov rax, rbx
jmp .LBB1_29
.LBB1_56:
movsd xmm2, qword ptr [rax] // xmm2 = width;
mulsd xmm2, qword ptr [rax + 8] // xmm2 *= height;
.LBB1_57:
addsd xmm1, xmm2 // totalArea += xmm2
movsd qword ptr [rsp + 8], xmm1
add rax, 24
cmp rax, r12
je .LBB1_52
.LBB1_29:
movzx ecx, byte ptr [rax + 16]
cmp ecx, 1
je .LBB1_56
test ecx, ecx
jne .LBB1_54 // bad_access
movsd xmm3, qword ptr [rax]
movapd xmm2, xmm3 // xmm2 = radius;
mulsd xmm2, xmm0 // xmm2 *= M_PI
mulsd xmm2, xmm3 // xmm2 *= radius;
jmp .LBB1_57
which roughly translates to
test_flag = index == 0;
if (test_flag)
{
xmm2 = M_PI * radius * radius
}
else
{
xmm2 = width * height;
}
totalArea += xmm2;
which the CPU can easily predict; modern CPUs can actually predict a 0101... alternating pattern.
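If you want the benchmark to actually stress the predictor instead of handing it a perfectly alternating sequence, one option (my suggestion, not something the original benchmark does) is to shuffle the container once after filling it:

#include <algorithm>
#include <random>

// Randomize the Circle/Rectangle order so the branch is no longer a 0101... pattern.
std::mt19937 rng(42);   // fixed seed keeps runs reproducible
std::shuffle(shapes.begin(), shapes.end(), rng);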
Lastly, gcc optimizes both versions to the following:
.L30:
movsd xmm3, QWORD PTR .LC4[rip]
mov rax, rbp
pxor xmm1, xmm1
jmp .L28
.L68:
movsd xmm2, QWORD PTR [rax]
add rax, 24
movapd xmm0, xmm2 // xmm0 = radius
mulsd xmm0, xmm3 // xmm0 *= M_PI
mulsd xmm0, xmm2 // xmm0 *= radius
addsd xmm1, xmm0 // result += xmm0
cmp r15, rax
je .L24
.L28:
movzx ecx, BYTE PTR [rax+16]
test cl, cl
je .L68
cmp cl, 1 // bad access
jne .L63
movsd xmm0, QWORD PTR [rax] // xmm0 = width
mulsd xmm0, QWORD PTR [rax+8] // xmm0 *= height
add rax, 24
addsd xmm1, xmm0 // totalArea += xmm0
cmp r15, rax
jne .L28
which roughly translates to
if (index == 0)
{
totalArea += M_PI * radius * radius;
}
else
{
totalArea += width * height;
}
which the CPU can also predict.
It is partially due to std::visit's complexity requirement:
When the number of variants is zero or one, the invocation of the callable object is implemented in constant time; i.e., it does not depend on the number of types that can be stored in the variant.
So you cannot chain
if (s.index() == 0) return get_area(std::get<0>(s));
else if (s.index() == 1) return get_area(std::get<1>(s));
// ...
as that would be O(n) in the number of alternatives.
So the possible implementations are a switch, or a table of function pointers similar to a vtable (sketched below).
Jump prediction (indirection through the "vtable"-like table) is generally worse than branch prediction.
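To make "a table similar to a vtable" concrete, here is a minimal sketch of that approach (an illustration only, not the actual libstdc++/libc++ implementation; visit_by_table is a made-up name, and it is specialized to double-returning visitors for brevity):

#include <array>
#include <cstddef>
#include <utility>
#include <variant>

// One thunk per alternative; entry I calls the visitor with alternative I.
template <std::size_t I, class Visitor, class Variant>
double visit_thunk(Visitor& vis, const Variant& v) {
    return vis(std::get<I>(v));
}

template <class Visitor, class Variant, std::size_t... Is>
double visit_by_table_impl(Visitor& vis, const Variant& v, std::index_sequence<Is...>) {
    using Thunk = double (*)(Visitor&, const Variant&);
    // vtable-like array of function pointers, one per alternative.
    static constexpr std::array<Thunk, sizeof...(Is)> table = {
        &visit_thunk<Is, Visitor, Variant>...
    };
    // O(1) dispatch regardless of how many alternatives the variant has:
    // an indirect call through the table, indexed by the runtime index.
    return table[v.index()](vis, v);
}

template <class Visitor, class... Ts>
double visit_by_table(Visitor& vis, const std::variant<Ts...>& v) {
    return visit_by_table_impl(vis, v, std::index_sequence_for<Ts...>{});
}

With the question's visitor this would be called as totalArea += visit_by_table(visitor, s); the indirect call through the table is exactly the kind of jump that predicts worse than the two-way branch discussed above.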