How do I clear the 16 - i upper bytes of a __m128i?
I've tried this; it works, but I'm wondering if there is a better (shorter, faster) way:
int i = ...  //  0 < i < 16
__m128i x = ...
__m128i mask = _mm_set_epi8(
    0,
    (i > 14) ? -1 : 0,
    (i > 13) ? -1 : 0,
    (i > 12) ? -1 : 0,
    (i > 11) ? -1 : 0,
    (i > 10) ? -1 : 0,
    (i >  9) ? -1 : 0,
    (i >  8) ? -1 : 0,
    (i >  7) ? -1 : 0,
    (i >  6) ? -1 : 0,
    (i >  5) ? -1 : 0,
    (i >  4) ? -1 : 0,
    (i >  3) ? -1 : 0,
    (i >  2) ? -1 : 0,
    (i >  1) ? -1 : 0,
    -1);
x = _mm_and_si128(x, mask);
I tried a few different ways of implementing this and benchmarked them with a couple of different compilers on an early Core i7 @ 2.67 GHz and a recent Haswell @ 3.6 GHz:
//
// mask_shift_0
//
// use PSHUFB (note: SSSE3 required)
//
inline __m128i mask_shift_0(uint32_t n)
{
  const __m128i vmask = _mm_set1_epi8(255);
  const __m128i vperm = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
  __m128i vp = _mm_add_epi8(vperm, _mm_set1_epi8(n));
  return _mm_shuffle_epi8(vmask, vp);
}
//
// mask_shift_1
//
// use 16 element LUT
//
inline __m128i mask_shift_1(uint32_t n)
{
  static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = {
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 }
  };
  return _mm_load_si128((__m128i *)&mask_lut[n]);
}
//
// mask_shift_2
//
// use misaligned load from 2 vector LUT
//
inline __m128i mask_shift_2(uint32_t n)
{
  static const int8_t mask_lut[32] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };
  return _mm_loadu_si128((__m128i *)(mask_lut + 16 - n));
}
//
// mask_shift_3
//
// use compare and single vector LUT
//
inline __m128i mask_shift_3(uint32_t n)
{
  const __m128i vm = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
  __m128i vn = _mm_set1_epi8(n);
  return _mm_cmpgt_epi8(vm, vn);
}
//
// mask_shift_4
//
// use jump table and immediate shifts
//
inline __m128i mask_shift_4(uint32_t n)
{
  const __m128i vmask = _mm_set1_epi8(-1);
  switch (n)
  {
    case 0:
      return vmask;
    case 1:
      return _mm_slli_si128(vmask, 1);
    case 2:
      return _mm_slli_si128(vmask, 2);
    case 3:
      return _mm_slli_si128(vmask, 3);
    case 4:
      return _mm_slli_si128(vmask, 4);
    case 5:
      return _mm_slli_si128(vmask, 5);
    case 6:
      return _mm_slli_si128(vmask, 6);
    case 7:
      return _mm_slli_si128(vmask, 7);
    case 8:
      return _mm_slli_si128(vmask, 8);
    case 9:
      return _mm_slli_si128(vmask, 9);
    case 10:
      return _mm_slli_si128(vmask, 10);
    case 11:
      return _mm_slli_si128(vmask, 11);
    case 12:
      return _mm_slli_si128(vmask, 12);
    case 13:
      return _mm_slli_si128(vmask, 13);
    case 14:
      return _mm_slli_si128(vmask, 14);
    case 15:
      return _mm_slli_si128(vmask, 15);
  }
}
//
// lsb_mask_0
//
// Contributed by by @Leeor/@dtb
//
// uses _mm_set_epi64x
//
inline __m128i lsb_mask_0(int n)
{
  if (n >= 8)
    return _mm_set_epi64x(~(-1LL << (n - 8) * 8), -1);
  else
    return _mm_set_epi64x(0, ~(-1LL << (n - 0) * 8));
}
//
// lsb_mask_1
//
// Contributed by by @Leeor/@dtb
//
// same as lsb_mask_0 but uses conditional operator instead of if/else
//
inline __m128i lsb_mask_1(int n)
{
  return _mm_set_epi64x(n >= 8 ? ~(-1LL << (n - 8) * 8) : 0, n >= 8 ? -1 : ~(-1LL << (n - 0) * 8));
}
Results were interesting:
Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc -O3)
mask_shift_0: 2.23377 ns
mask_shift_1: 2.14724 ns
mask_shift_2: 2.14270 ns
mask_shift_3: 2.15063 ns
mask_shift_4: 2.98304 ns
lsb_mask_0:   2.15782 ns
lsb_mask_1:   2.96628 ns
Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)
mask_shift_0: 1.35014 ns
mask_shift_1: 1.12789 ns
mask_shift_2: 1.04329 ns
mask_shift_3: 1.09258 ns
mask_shift_4: 2.01478 ns
lsb_mask_0:   1.70573 ns
lsb_mask_1:   1.84337 ns
Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc -O2)
mask_shift_0: 0.851416 ns
mask_shift_1: 0.575245 ns
mask_shift_2: 0.577746 ns
mask_shift_3: 0.850086 ns
mask_shift_4: 1.398270 ns
lsb_mask_0:   1.359660 ns
lsb_mask_1:   1.709720 ns
So mask_shift_4 (switch/case) seems to be the slowest method in all cases, whereas the others are pretty similar. The LUT-based methods seem to be consistently the fastest overall.
NB: I get some suspiciously fast numbers with clang -O3 and gcc -O3 (gcc 4.7.2 only) - I need to look at the generated assembly for these cases to see what the compiler is doing, and make sure it is not doing anything "clever", such as optimise away some part of the timing test harness.
If anyone else has any further ideas on this or has another mask_shift implementation they'd like to try I would be happy to add it to the test suite and update the results.
If it were normal 64bit values, i'd use something like -
    mask = (1 << (i * 8)) - 1;
But take care when generalizing this to 128, the internal shift operators aren't necessarily working at these ranges.
For 128b, you could either just build an upper and lower masks, for e.g -
    __m128i mask = _mm_set_epi64x( 
       i > 7 ? 0xffffffff : (1 << ((i) * 8)) - 1 
       i > 7 ? (1 << ((i-8) * 8)) - 1 : 0 
    );
(assuming I didn't swap the order, check me on this one, i'm not very familiar with these intrinsics) Alternatively, you can do this on a 2-wide uint64 array and load the 128b mask directly from memory using it's address.
However, both these methods don't seem natural like the original one, they just extend the elements from 1 to 8 bytes, but are still partial. It would be much preferable to do a proper shift with a single 128b variable.
I just came across this topic regarding 128b shifts -
Looking for sse 128 bit shift operation for non-immediate shift value
looks like it's possible but i've never used it. You could try the above one-liner with the appropriate SSE intrinsitc from there. I'd give this one a shot -
    mask = _mm_slli_si128(1, i); //emmintrin.h shows the second argument is in bytes already
And then just subtract one using your preferred way (I'd be surprised if this type supports a plain old operator-)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With