Here are things I believe are facts: an int16_t is a quarter the size of an int64_t, so four times as many of them fit in a cache line, and work that stays in cache is much faster than work that has to go out to main memory.
Hypothesis: sorting a vector of int16_t should therefore be noticeably faster than sorting the same values stored as int32_t or int64_t.
But when I run the code below I get these results:
i16 = 7.5168
i32 = 7.3762
i64 = 7.5758
Why am I not getting the results I expect?
C++:
#include <iostream>
#include <vector>
#include <cstdint>
#include <algorithm>
#include <random>
#include <chrono>

int main() {
    const int vlength = 100'000'000;
    const int maxI = 50'000;

    std::vector<int16_t> v16;
    for (int i = 0; i < vlength; ++i) {
        v16.push_back(int16_t(i % maxI));
    }
    // std::random_shuffle was deprecated in C++14 and removed in C++17,
    // so use std::shuffle with an explicit engine instead.
    std::mt19937 rng(std::random_device{}());
    std::shuffle(std::begin(v16), std::end(v16), rng);

    std::vector<int32_t> v32;
    std::vector<int64_t> v64;
    for (int i = 0; i < vlength; ++i) {
        v32.push_back(int32_t(v16[i]));
        v64.push_back(int64_t(v16[i]));
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    std::sort(std::begin(v16), std::end(v16));
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i16 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() << std::endl;

    t1 = std::chrono::high_resolution_clock::now();
    std::sort(std::begin(v32), std::end(v32));
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i32 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() << std::endl;

    t1 = std::chrono::high_resolution_clock::now();
    std::sort(std::begin(v64), std::end(v64));
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i64 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() << std::endl;
}
EDIT: In order to avoid the question of how cache-friendly std::sort is, I've also tried the following code:
#include <iostream>
#include <vector>
#include <cstdint>
#include <chrono>

template <typename T>
inline void function_speed(T& vec) {
    for (auto& i : vec) {
        ++i;
    }
}

int main() {
    const int nIter = 1000;
    std::vector<int16_t> v16(1000000);
    std::vector<int32_t> v32(1000000);
    std::vector<int64_t> v64(1000000);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < nIter; ++i) {
        function_speed(v16);
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i16 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() / double(nIter) << std::endl;

    t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < nIter; ++i) {
        function_speed(v32);
    }
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i32 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() / double(nIter) << std::endl;

    t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < nIter; ++i) {
        function_speed(v64);
    }
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << "i64 = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() / double(nIter) << std::endl;
}
Typical result:
i16 = 0.00618648
i32 = 0.00617911
i64 = 0.00606275
I know that proper benchmarking is a science in itself; perhaps I am doing it wrong.
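Two things I am aware of: nothing ever reads the incremented vectors, so in principle the whole timed loop is dead code, and std::chrono::high_resolution_clock is not guaranteed to be monotonic. A minimal harness sketch that addresses both (the helper name time_loop and its shape are just an illustration, not part of my original test) would be:

#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Sketch only: steady_clock is guaranteed monotonic, and accumulating a
// checksum after timing keeps the increments observable so the optimizer
// cannot discard the loop as dead code.
template <typename Vec, typename Fn>
double time_loop(Vec& vec, Fn&& fn, int nIter, long long& checksum) {
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < nIter; ++i) {
        fn(vec);
    }
    auto t2 = std::chrono::steady_clock::now();
    checksum += std::accumulate(vec.begin(), vec.end(), 0LL);
    return std::chrono::duration<double>(t2 - t1).count() / nIter;
}

int main() {
    std::vector<int16_t> v16(1000000);
    long long checksum = 0;
    std::cout << "i16 = "
              << time_loop(v16, [](auto& v) { for (auto& i : v) ++i; }, 1000, checksum)
              << " (checksum " << checksum << ")" << std::endl;
}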
EDIT2: By avoiding overflow I am now starting to get more interesting results:
template <typename T>
inline void function_speed(T& vec) {
    for (auto& i : vec) {
        ++i;
        i %= 1000;
    }
}
Gives results such as:
i16 = 0.0143789
i32 = 0.00958941
i64 = 0.019691
If I instead do:
template <typename T>
inline void function_speed(T& vec) {
    for (auto& i : vec) {
        i = (i + 1) % 1000;
    }
}
I get:
i16 = 0.00939448
i32 = 0.00913768
i64 = 0.019615
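My guess (unverified) is that the gaps come from how % is lowered for the different element widths rather than from memory traffic. One way to check would be a variant with the same wrap-around behaviour but no modulo operator at all, so the widths are compared on equal footing:

// Hypothetical variant: behaves like "++i; i %= 1000;" for values that start
// in [0, 999], but uses a compare-and-reset instead of the modulo operator,
// taking differences in how % is lowered per element width out of the picture.
template <typename T>
inline void function_speed(T& vec) {
    for (auto& i : vec) {
        ++i;
        if (i >= 1000) {
            i = 0;
        }
    }
}

If the three widths come out close with this version, the modulo code generation is the likely explanation for the gaps above; if not, something else is going on.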
Mistaken assumption: any O(N log N) comparison sort has to be cache-unfriendly for the vast majority of the N! possible inputs, so shrinking the element type does not buy you the cache behaviour you are hoping for.
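If you want to actually see the element width in the timings, a purely sequential, bandwidth-bound pass is a better probe than a sort. In a sketch like the following (names and sizes are illustrative, not taken from your code), the int16_t vector moves a quarter of the bytes of the int64_t one, so a difference should appear if memory traffic is the limiting factor:

#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Illustrative contrast experiment: one linear pass over each vector, timed separately.
template <typename T>
double time_sum(const std::vector<T>& v) {
    auto t1 = std::chrono::steady_clock::now();
    long long sum = std::accumulate(v.begin(), v.end(), 0LL);
    auto t2 = std::chrono::steady_clock::now();
    std::cout << "checksum = " << sum << '\n';  // keep the result observable
    return std::chrono::duration<double>(t2 - t1).count();
}

int main() {
    const int n = 100'000'000;
    std::vector<int16_t> v16(n, 1);   // about 200 MB of data
    std::vector<int64_t> v64(n, 1);   // about 800 MB of data
    std::cout << "i16 pass = " << time_sum(v16) << " s\n";
    std::cout << "i64 pass = " << time_sum(v64) << " s\n";
}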
Furthermore, I think an optimizing compiler could remove the sorts outright, since the sorted vectors are never read afterwards, and an unoptimized build is of course pointless to benchmark.
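If dead-code elimination is the worry, a GCC/Clang-style sink along these lines (a sketch modeled on the usual benchmark-library trick; the name do_not_optimize is mine, not a standard facility) can be applied to each vector after the timed call:

// Sketch of a GCC/Clang-only sink: the empty asm statement with a memory
// clobber tells the compiler the pointed-to data may be inspected, so the
// preceding sort cannot be removed as dead code.
template <typename T>
void do_not_optimize(const T& value) {
    asm volatile("" : : "g"(&value) : "memory");
}

// Usage after each timed sort, e.g.:
//   std::sort(std::begin(v16), std::end(v16));
//   do_not_optimize(v16);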