Latency is the biggest concern here. I have found that trying to render three 1920x1080 video feeds with RGBA overlays to individual windows via OpenGL has limits. I am able to render two windows with overlays or three windows without overlays just fine, but when the third window is introduced, rendering stalls are obvious. I believe that the issue is due to the overuse of glAlphaFunc() to overlay an RGBA-based texture on an RGB video texture. In order to reduce the overuse, my thought is to move some of the overlay work onto the CPU (as I have lots of CPU - dual hexcore Xeon). The ideal place to do this would be when copying the source RGB image to the mapped PBO, replacing the RGB values with the ones from the RGBA overlay where A > 0.
I have tried using Intel IPP methods, but every available approach involves multiple calls, which results in too much latency. I've tried straight C code, but this takes longer than the 33 ms that I am allowed. I need help creating an optimized assembly or SSE-based routine that will provide minimal latency.
Compile the code below with: g++ -fopenmp -O2 -mtune=native
Basic C function for clarity:
/*
 * Reference (scalar) overlay copy.
 *
 * For each of the aWidth * aHeight pixels, writes three bytes to
 * aDestinationRGB: the overlay's R, G, B when the overlay's alpha byte
 * (byte 3 of the 4-byte overlay pixel) is non-zero, otherwise the
 * source's R, G, B.  Source and destination use 3 bytes per pixel, the
 * overlay 4 bytes per pixel.  Iterations are independent, so the loop is
 * marked for OpenMP work sharing.
 */
void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
    const int pixelCount = aWidth * aHeight;
    int px;
#pragma omp parallel for
    for (px = 0; px < pixelCount; ++px) {
        const uint8_t *overlayPx = aOverlayRGBA + px * 4;
        /* Non-zero alpha means the overlay pixel wins; otherwise keep the source pixel. */
        const uint8_t *chosen = (overlayPx[3] != 0) ? overlayPx : aSourceRGB + px * 3;
        uint8_t *out = aDestinationRGB + px * 3;
        out[0] = chosen[0]; /* R */
        out[1] = chosen[1]; /* G */
        out[2] = chosen[2]; /* B */
    }
}
/*
 * Returns the current time reported by gettimeofday() as a single
 * microsecond count: seconds * 1,000,000 + microseconds.
 */
uint64_t getTime() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const uint64_t wholeSeconds = (uint64_t)now.tv_sec;
  const uint64_t extraMicros = (uint64_t)now.tv_usec;
  return wholeSeconds * 1000000 + extraMicros;
}
/*
 * Benchmark driver: allocates one source RGB frame, one RGBA overlay and one
 * destination RGB frame of _WIDTH_ x _HEIGHT_ pixels, calls copyAndOverlay()
 * 1000 times and prints the elapsed getTime() difference divided by the
 * iteration count (getTime() counts microseconds, so this is the average
 * microseconds per call).
 */
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;
  const int iterations = 1000;
  const int rgbBytes = _WIDTH_ * _HEIGHT_ * 3;
  // The trailing "()" value-initialises the arrays to zero so the benchmark
  // never reads (and branches on) indeterminate bytes.
  uint8_t *rgba = new uint8_t[_WIDTH_ * _HEIGHT_ * 4]();
  uint8_t *src = new uint8_t[rgbBytes]();
  uint8_t *dst = new uint8_t[rgbBytes]();
  uint64_t tStart = getTime();
  for (int t = 0; t < iterations; ++t) {
    copyAndOverlay(src, rgba, dst, _WIDTH_, _HEIGHT_);
  }
  // uint64_t is not "unsigned long" on every platform; cast so the argument
  // always matches the conversion specifier.
  printf("delta: %llu\n", (unsigned long long)((getTime() - tStart) / iterations));
  delete [] rgba;
  delete [] src;
  delete [] dst;
  return 0;
}
Here is an SSE4 implementation that is a little more than 5 times faster than the code you posted with the question (without parallelization of the loop). As written, it only works on RGBA buffers that are 16-byte aligned and sized in multiples of 64 bytes, and on RGB buffers that are 16-byte aligned and sized in multiples of 48 bytes. These size requirements fit your 1920x1080 resolution exactly, but you may need to add code to ensure your buffers are 16-byte aligned.
/*
 * SSE4.1 overlay copy.
 *
 * Writes aWidth * aHeight RGB pixels to aDestinationRGB.  A pixel takes the
 * overlay's R, G, B when the overlay's alpha byte (byte 3 of the 4-byte
 * overlay pixel) is non-zero, otherwise the source's R, G, B.
 *
 * aSourceRGB / aDestinationRGB: tightly packed, 3 bytes per pixel.
 * aOverlayRGBA:                 tightly packed, 4 bytes per pixel.
 *
 * Full groups of 16 pixels (48 RGB bytes / 64 RGBA bytes) take the vector
 * path; any remaining pixels are handled one at a time, so the pixel count
 * need not be a multiple of 16.  Loads and stores are unaligned, so the
 * buffers need no particular alignment.  Non-positive dimensions are a no-op.
 * The CPU executing this function must support SSE4.1.
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
/* Enable SSE4.1 for this function only, so it builds without -msse4.1. */
__attribute__((target("sse4.1")))
#endif
void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
  if (aWidth <= 0 || aHeight <= 0) {
    return;
  }
  // Size arithmetic in size_t so large frames cannot overflow int.
  size_t const pixelCount = (size_t)aWidth * (size_t)aHeight;
  size_t const simdPixels = pixelCount - (pixelCount % 16);

  __m128i const ocmp     = _mm_setzero_si128();
  // Shuffle controls that replicate each pixel's alpha-derived mask byte
  // (overlay byte 3, 7, 11, 15) three times, pre-rotated to the position the
  // pixel occupies in the packed RGB output.
  __m128i const omskshf1 = _mm_set_epi32(0x00000000, 0x0F0F0F0B, 0x0B0B0707, 0x07030303);
  __m128i const omskshf2 = _mm_set_epi32(0x07030303, 0x00000000, 0x0F0F0F0B, 0x0B0B0707);
  __m128i const omskshf3 = _mm_set_epi32(0x0B0B0707, 0x07030303, 0x00000000, 0x0F0F0F0B);
  __m128i const omskshf4 = _mm_set_epi32(0x0F0F0F0B, 0x0B0B0707, 0x07030303, 0x00000000);
  // Shuffle controls that drop the alpha byte of each overlay pixel (keep
  // bytes 0-2, 4-6, 8-10, 12-14), with the same pre-rotation.
  __m128i const ovalshf1 = _mm_set_epi32(0x00000000, 0x0E0D0C0A, 0x09080605, 0x04020100);
  __m128i const ovalshf2 = _mm_set_epi32(0x04020100, 0x00000000, 0x0E0D0C0A, 0x09080605);
  __m128i const ovalshf3 = _mm_set_epi32(0x09080605, 0x04020100, 0x00000000, 0x0E0D0C0A);
  __m128i const ovalshf4 = _mm_set_epi32(0x0E0D0C0A, 0x09080605, 0x04020100, 0x00000000);
  // Constant blend masks: take the top 4 / 8 / 12 bytes from the second operand.
  __m128i const blndmsk1 = _mm_set_epi32(-1,  0,  0, 0);
  __m128i const blndmsk2 = _mm_set_epi32(-1, -1,  0, 0);
  __m128i const blndmsk3 = _mm_set_epi32(-1, -1, -1, 0);

  uint8_t const *const simdEnd = aSourceRGB + 3 * simdPixels;
  while (aSourceRGB != simdEnd) {
    // source:
    //  aaabbbcccdddeeef
    //  ffggghhhiiijjjkk
    //  klllmmmnnnoooppp
    //
    // overlay:
    //  aaaabbbbccccdddd
    //  eeeeffffgggghhhh
    //  iiiijjjjkkkkllll
    //  mmmmnnnnoooopppp
    // load source
    __m128i const a = _mm_loadu_si128((__m128i const*)(aSourceRGB       ));
    __m128i const b = _mm_loadu_si128((__m128i const*)(aSourceRGB   + 16));
    __m128i const c = _mm_loadu_si128((__m128i const*)(aSourceRGB   + 32));
    // load overlay
    __m128i x = _mm_loadu_si128((__m128i const*)(aOverlayRGBA     ));
    __m128i y = _mm_loadu_si128((__m128i const*)(aOverlayRGBA + 16));
    __m128i z = _mm_loadu_si128((__m128i const*)(aOverlayRGBA + 32));
    __m128i w = _mm_loadu_si128((__m128i const*)(aOverlayRGBA + 48));
    // compute blend mask, put 0xFF in bytes equal to zero
    // (only the alpha positions are picked up by the mask shuffles below)
    __m128i p = _mm_cmpeq_epi8(x, ocmp);
    __m128i q = _mm_cmpeq_epi8(y, ocmp);
    __m128i r = _mm_cmpeq_epi8(z, ocmp);
    __m128i s = _mm_cmpeq_epi8(w, ocmp);
    // align overlay to be condensed to 3-byte color
    x = _mm_shuffle_epi8(x, ovalshf1);
    y = _mm_shuffle_epi8(y, ovalshf2);
    z = _mm_shuffle_epi8(z, ovalshf3);
    w = _mm_shuffle_epi8(w, ovalshf4);
    // condense overlay to 3-byte color
    x = _mm_blendv_epi8(x, y, blndmsk1);
    y = _mm_blendv_epi8(y, z, blndmsk2);
    z = _mm_blendv_epi8(z, w, blndmsk3);
    // align blend mask to be condensed to 3-byte color
    p = _mm_shuffle_epi8(p, omskshf1);
    q = _mm_shuffle_epi8(q, omskshf2);
    r = _mm_shuffle_epi8(r, omskshf3);
    s = _mm_shuffle_epi8(s, omskshf4);
    // condense blend mask to 3-byte color
    p = _mm_blendv_epi8(p, q, blndmsk1);
    q = _mm_blendv_epi8(q, r, blndmsk2);
    r = _mm_blendv_epi8(r, s, blndmsk3);
    // select source where the mask is set (alpha == 0), overlay elsewhere
    x = _mm_blendv_epi8(x, a, p);
    y = _mm_blendv_epi8(y, b, q);
    z = _mm_blendv_epi8(z, c, r);
    // write colors to destination
    _mm_storeu_si128((__m128i*)(aDestinationRGB     ), x);
    _mm_storeu_si128((__m128i*)(aDestinationRGB + 16), y);
    _mm_storeu_si128((__m128i*)(aDestinationRGB + 32), z);
    // update pointers
    aSourceRGB      += 48;
    aOverlayRGBA    += 64;
    aDestinationRGB += 48;
  }

  // Scalar tail for the (pixelCount % 16) pixels that do not fill a vector group.
  for (size_t remaining = pixelCount - simdPixels; remaining != 0; --remaining) {
    uint8_t const *const chosen = (aOverlayRGBA[3] != 0) ? aOverlayRGBA : aSourceRGB;
    aDestinationRGB[0] = chosen[0];
    aDestinationRGB[1] = chosen[1];
    aDestinationRGB[2] = chosen[2];
    aSourceRGB      += 3;
    aOverlayRGBA    += 4;
    aDestinationRGB += 3;
  }
}
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With