I have the following code to calculate a Hadamard transform. Right now, the hadamard function is the bottleneck of my program. Do you see any potential to speed it up? Maybe using AVX2 instructions? Typical input sizes are around 512 or 1024.
Best, Tom
#include <stdio.h>

void hadamard(double *p, size_t len) {
    double tmp = 0.0;
    if (len == 2) {
        tmp = p[0];
        p[0] = tmp + p[1];
        p[1] = tmp - p[1];
    } else {
        hadamard(p, len/2);
        hadamard(p + len/2, len/2);
        for (int i = 0; i < len/2; i++) {
            tmp = p[i];
            p[i] = tmp + p[i+len/2];
            p[i+len/2] = tmp - p[i+len/2];
        }
    }
}

int main(int argc, char* argv[]) {
    double a[] = {1.0, 2.0, 3.0, 4.0};
    hadamard(a, 4);
}
Here is a proof-of-concept implementation, based on the fast Walsh–Hadamard transform, with an optimized first pass. It compiles fine with clang-3.3 or later and gives nice results with clang-4.0 or later (requires -O2 to properly inline the relevant functions). If you don't have FMA, flip the sign bits of the elements that get multiplied by -1.0 (xor them with -0.0) and use a normal _mm256_add_pd instead; an AVX-only variant of hadamard4 is sketched below. All gcc versions I checked would require replacing the memcpy by manual load/store intrinsics to get similar results.

Also, I left handling of the cases len<16 as an exercise. And it may be worth implementing a hadamard32 and perhaps a hadamard64 similar to hadamard16, to make better use of the available registers and reduce memory accesses (see the sketch after the listing). In C++ this could be done with a recursive template implementation.
#include <immintrin.h> // avx+fma
#include <assert.h> // assert
#include <string.h> // memcpy
inline __m256d hadamard4(__m256d x0123)
{
    __m256d x1032 = _mm256_permute_pd(x0123, 5);             // [x1, x0, x3, x2]
    // butterfly within the pairs:
    __m256d hada2 = _mm256_fmadd_pd(x0123, _mm256_set_pd(-1.0, 1.0, -1.0, 1.0), x1032); // [x0+x1, x0-x1, x2+x3, x2-x3]
    __m256d hada2_= _mm256_permute2f128_pd(hada2, hada2, 1); // [x2+x3, x2-x3, x0+x1, x0-x1]
    // butterfly across the two 128-bit lanes
    // (if no FMA is available, this can be done with xoring and adding, see the variant below):
    __m256d res = _mm256_fmadd_pd(hada2, _mm256_set_pd(-1.0, -1.0, 1.0, 1.0), hada2_);  // [y0, y1, y2, y3]
    return res;
}
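// AVX-only variant of hadamard4 (no FMA), as mentioned above -- an untested
// sketch: each multiplication by +/-1.0 is replaced by flipping the sign bits
// with an XOR against -0.0, followed by a plain _mm256_add_pd.
inline __m256d hadamard4_avx(__m256d x0123)
{
    __m256d x1032 = _mm256_permute_pd(x0123, 5);                       // [x1, x0, x3, x2]
    __m256d neg13 = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);               // flips elements 1 and 3
    __m256d hada2 = _mm256_add_pd(_mm256_xor_pd(x0123, neg13), x1032); // [x0+x1, x0-x1, x2+x3, x2-x3]
    __m256d hada2_= _mm256_permute2f128_pd(hada2, hada2, 1);           // [x2+x3, x2-x3, x0+x1, x0-x1]
    __m256d neg23 = _mm256_set_pd(-0.0, -0.0, 0.0, 0.0);               // flips the upper two elements
    return _mm256_add_pd(_mm256_xor_pd(hada2, neg23), hada2_);
}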
inline void hadamard8(__m256d data[2])
{
    __m256d a = hadamard4(data[0]);
    __m256d b = hadamard4(data[1]);
    data[0] = _mm256_add_pd(a,b);
    data[1] = _mm256_sub_pd(a,b);
}
inline void hadamard16(__m256d data[4])
{
    hadamard8(data+0);
    hadamard8(data+2);
    for(int i=0; i<2; ++i)
    {
        __m256d tmp = data[i];
        data[i]   = _mm256_add_pd(tmp, data[i+2]);
        data[i+2] = _mm256_sub_pd(tmp, data[i+2]);
    }
}
void hadamard(double* p, size_t len)
{
    assert((len&(len-1))==0); // len must be power of 2
    assert(len>=16);          // TODO implement fallback for smaller sizes ...

    // first pass: hadamard of 16 values each
    for(size_t i=0; i<len; i+=16)
    {
        __m256d data[4];
        memcpy(data, p+i, sizeof(data)); // should get optimized to 4x vmovupd
        hadamard16(data);
        memcpy(p+i, data, sizeof(data)); // should get optimized to 4x vmovupd
    }
    // remaining passes: merge transformed blocks of h values into blocks of 2*h;
    // the first pass already produced blocks of 16, so h starts at 16
    for(size_t h=16; h<len; h*=2)
    {
        for(size_t i=0; i<len; i+=2*h)
        {
            for(size_t j=i; j<i+h; j+=4)
            {
                __m256d x = _mm256_loadu_pd(p+j);
                __m256d y = _mm256_loadu_pd(p+j+h);
                _mm256_storeu_pd(p+j,   _mm256_add_pd(x,y));
                _mm256_storeu_pd(p+j+h, _mm256_sub_pd(x,y));
            }
        }
    }
}
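For reference, a hadamard32 along the lines suggested above could look like the following untested sketch; it simply repeats the hadamard16 pattern one level up. To actually use it, the first pass in hadamard would need to step by 32 (and the merge loop would then start at h=32).

// 32-point transform built from two hadamard16 calls; 8 ymm registers of data
// still fit comfortably into the 16 ymm registers available with AVX2
inline void hadamard32(__m256d data[8])
{
    hadamard16(data+0);
    hadamard16(data+4);
    for(int i=0; i<4; ++i)
    {
        __m256d tmp = data[i];
        data[i]   = _mm256_add_pd(tmp, data[i+4]);
        data[i+4] = _mm256_sub_pd(tmp, data[i+4]);
    }
}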
Benchmarking is also left as an exercise ;-)
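As a rough starting point, something like the following main (appended to the listing above; LEN and REPS are arbitrary) would do. It uses the standard clock() and rescales the buffer after each call so the values stay bounded: two applications of the transform scale everything by len, so dividing by sqrt(len) = 32 per call keeps the data periodic.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
    enum { LEN = 1024, REPS = 100000 };
    double *p = aligned_alloc(32, LEN * sizeof(double)); // C11; plain malloc also works, the loads are unaligned
    if (!p) return 1;
    for (int i = 0; i < LEN; ++i) p[i] = (double)i;

    const double scale = 1.0 / 32.0; // 1/sqrt(LEN): H*H == LEN*I, so this keeps the values bounded

    clock_t t0 = clock();
    for (int r = 0; r < REPS; ++r) {
        hadamard(p, LEN);
        for (int i = 0; i < LEN; ++i) p[i] *= scale; // small measured overhead, prevents overflow
    }
    clock_t t1 = clock();

    printf("%.1f ns per transform (p[0] = %g)\n",
           1e9 * (double)(t1 - t0) / CLOCKS_PER_SEC / REPS, p[0]);
    free(p);
    return 0;
}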
Disclaimer: I did not test this, so double-check the sign patterns in hadamard4 against your scalar reference before relying on it.