As a follow-up to some previous questions on converting RGB to RGBA and ARGB to BGR, I would like to speed up an RGB to BGRA conversion with SSE. Assume a 32-bit machine; I would like to use intrinsics. I'm having difficulty aligning both the source and destination buffers to work with 128-bit registers, and I am looking for other savvy vectorization solutions.
The routine to be vectorized is as follows...
/*
 * Scalar reference conversion from packed 3-byte RGB pixels to 32-bit words.
 *
 * w   - number of pixels to convert; nothing is written when w <= 0
 * in  - source bytes, three per pixel, in R, G, B order
 * out - destination, one unsigned int per pixel
 *
 * Each output word is 0xff000000 | R << 16 | G << 8 | B.
 */
void RGB8ToBGRX8(int w, const void *in, void *out)
{
    const unsigned char *rgb = (const unsigned char *) in;
    unsigned int *pixel = (unsigned int *) out;
    int remaining;

    for (remaining = w; remaining > 0; --remaining) {
        /* Widen each channel before shifting so no bits are lost. */
        const unsigned int red = rgb[0];
        const unsigned int green = rgb[1];
        const unsigned int blue = rgb[2];

        *pixel++ = 0xff000000 | (red << 16) | (green << 8) | blue;
        rgb += 3;
    }
}
This routine is used primarily for large textures (512KB), so if I can parallelize some of the operations, it may be beneficial to process more pixels at a time. Of course, I'll need to profile. :)
Edit:
My compilation arguments...
gcc -O2 main.c
This is an example of using SSSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.
#include <tmmintrin.h>
/*
 * Convert packed 24-bit RGB pixels to 32-bit pixels with SSSE3 byte shuffles.
 *
 * w   - number of pixels to convert
 * in  - source bytes (R, G, B per pixel); must be 16-byte aligned
 * out - destination, 4 bytes per pixel stored as B, G, R, 0xff in memory
 *       (i.e. the little-endian word 0xffRRGGBB); must be 16-byte aligned
 *
 * Pixels are processed 16 at a time (48 input bytes -> 64 output bytes).
 * Any remaining w % 16 pixels are converted by a scalar loop, so the whole
 * request is honoured instead of being silently truncated.
 *
 * The target attribute lets GCC/Clang build this function even when the
 * translation unit is not compiled with -mssse3; the caller is still
 * responsible for only calling it on a CPU that supports SSSE3.
 */
#if defined(__GNUC__) && !defined(__SSSE3__) && (defined(__i386__) || defined(__x86_64__))
__attribute__((target("ssse3")))
#endif
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    /*
     * All-ones byte. In an AND/OR mask it selects/sets the whole byte; as a
     * shuffle index its high bit makes _mm_shuffle_epi8 write a zero byte.
     */
    enum { FF = -1 };

    const __m128i *in_vec = in;
    __m128i *out_vec = out;
    unsigned blocks = w / 16;
    unsigned tail = w % 16;

    /* Sets the top byte of every 32-bit lane to 0xff. */
    const __m128i alpha =
        _mm_set_epi8(FF, 0, 0, 0, FF, 0, 0, 0, FF, 0, 0, 0, FF, 0, 0, 0);
    /* Keep bytes 8..15 / bytes 0..7 of a vector (arguments run 15 -> 0). */
    const __m128i keep_hi =
        _mm_set_epi8(FF, FF, FF, FF, FF, FF, FF, FF, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i keep_lo =
        _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, FF, FF, FF, FF, FF, FF, FF, FF);
    /* Per-output-vector shuffles: reverse each RGB triple, zero the 4th byte. */
    const __m128i shuf_abcd =
        _mm_set_epi8(FF, 9, 10, 11, FF, 6, 7, 8, FF, 3, 4, 5, FF, 0, 1, 2);
    const __m128i shuf_efgh =
        _mm_set_epi8(FF, 5, 6, 7, FF, 2, 3, 4, FF, 15, 0, 1, FF, 12, 13, 14);
    const __m128i shuf_ijkl =
        _mm_set_epi8(FF, 1, 2, 3, FF, 14, 15, 0, FF, 11, 12, 13, FF, 8, 9, 10);
    const __m128i shuf_mnop =
        _mm_set_epi8(FF, 13, 14, 15, FF, 10, 11, 12, FF, 7, 8, 9, FF, 4, 5, 6);

    while (blocks-- > 0) {
        /*            0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
         * in_vec[0] Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1] Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2] Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        const __m128i in1 = in_vec[0];
        const __m128i in2 = in_vec[1];
        const __m128i in3 = in_vec[2];
        __m128i merged;

        /* Pixels a..d live entirely in in1. */
        out_vec[0] = _mm_or_si128(_mm_shuffle_epi8(in1, shuf_abcd), alpha);

        /* Pixels e..h: high half of in1 joined with low half of in2. */
        merged = _mm_or_si128(_mm_and_si128(in1, keep_hi),
                              _mm_and_si128(in2, keep_lo));
        out_vec[1] = _mm_or_si128(_mm_shuffle_epi8(merged, shuf_efgh), alpha);

        /* Pixels i..l: high half of in2 joined with low half of in3. */
        merged = _mm_or_si128(_mm_and_si128(in2, keep_hi),
                              _mm_and_si128(in3, keep_lo));
        out_vec[2] = _mm_or_si128(_mm_shuffle_epi8(merged, shuf_ijkl), alpha);

        /* Pixels m..p live entirely in in3. */
        out_vec[3] = _mm_or_si128(_mm_shuffle_epi8(in3, shuf_mnop), alpha);

        in_vec += 3;
        out_vec += 4;
    }

    if (tail > 0) {
        /* Scalar tail: same byte layout as the vector path (B, G, R, 0xff). */
        const unsigned char *src = (const unsigned char *) in_vec;
        unsigned char *dst = (unsigned char *) out_vec;

        for (; tail > 0; tail--, src += 3, dst += 4) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst[3] = 0xff;
        }
    }
}
I personally found that implementing the following gave me the best result for converting BGR-24 to ARGB-32.
This code runs at about 8.8ms on an image whereas the 128-bit vectorization code presented above came in at 14.5ms per image.
/*
 * Expand a fixed-size 1920x1080 image of packed 3-byte pixels into 32-bit
 * words.
 *
 * buff    - destination, must hold 1920 * 1080 words
 * diskmem - source, must hold 1920 * 1080 * 3 bytes
 *
 * For each pixel the output word is
 *     byte0 << 24 | byte1 << 16 | byte2 << 8 | 0xff.
 */
void PixelFix(u_int32_t *buff,unsigned char *diskmem)
{
    const int w = 1920;
    const int h = 1080;
    const int npixels = w * h;
    /* Both cursors must start at zero; they were previously uninitialized. */
    int picptr;
    int srcptr = 0;

    for (picptr = 0; picptr < npixels; picptr++, srcptr += 3) {
        /*
         * Widen to 32-bit unsigned before shifting: shifting a promoted int
         * left by 24 is undefined for byte values >= 0x80.
         */
        buff[picptr] = ((u_int32_t) diskmem[srcptr] << 24)
                     | ((u_int32_t) diskmem[srcptr + 1] << 16)
                     | ((u_int32_t) diskmem[srcptr + 2] << 8)
                     | 0xffu;
    }
}
Previously, I had been using this routine (about 13.2ms per image). Here, buff is an unsigned char*.
/*
 * For every output row, copy w three-byte source pixels into four-byte
 * destination pixels: the three source bytes land in reverse order at
 * offsets 3, 2, 1 and offset 0 is set to 255.
 */
for (j = 0; j < h; j++) {
    /* Start at source row h-j-1 so rows are read bottom-up (vertical flip);
     * remove if you don't want vertical flipping. */
    int srcptr = (h - j - 1) * w * 3;

    for (i = 0; i < w; i++, srcptr += 3, picptr += 4) {
        buff[picptr + 3] = diskmem[srcptr];     /* b */
        buff[picptr + 2] = diskmem[srcptr + 1]; /* g */
        buff[picptr + 1] = diskmem[srcptr + 2]; /* r */
        buff[picptr] = 255;                     /* a */
    }
}
Measured on a 2012 Mac mini with a 2.6 GHz i7.
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With