After some thought, I came up with the following code for multiplying two quaternions using SSE:
#include <pmmintrin.h> /* SSE3 intrinsics */
/* multiplication of two quaternions (x, y, z, w) x (a, b, c, d) */
__m128 _mm_cross4_ps(__m128 xyzw, __m128 abcd)
{
    /* The product of two quaternions is:                                 */
    /* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */
    __m128 wzyx = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(0,1,2,3));
    __m128 baba = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0,1,0,1));
    __m128 dcdc = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2,3,2,3));
    /* variable names below are for parts of componens of result (X,Y,Z,W) */
    /* nX stands for -X and similarly for the other components             */
    /* znxwy  = (xb - ya, zb - wa, wd - zc, yd - xc) */
    __m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw, baba), _mm_mul_ps(wzyx, dcdc));
    /* xzynw  = (xd + yc, zd + wc, wb + za, yb + xa) */
    __m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw, dcdc), _mm_mul_ps(wzyx, baba));
    /* _mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)) */
    /*      = (xd + yc, zd + wc, wd - zc, yd - xc)        */
    /* _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)) */
    /*      = (zb - wa, xb - ya, yb + xa, wb + za)        */
    /* _mm_addsub_ps adds elements 1 and 3 and subtracts elements 0 and 2, so we get: */
    /* _mm_addsub_ps(*, *) = (xd+yc-zb+wa, xb-ya+zd+wc, wd-zc+yb+xa, yd-xc+wb+za)     */
    __m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)),
                                _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)));
    /* now we only need to shuffle the components in place and return the result      */
    return _mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0));
    /* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions   */
}
I expected the assembly to have a minimal amount of instructions. However, when I compile it to assembly with gcc -msse3 -S, the resulting function has 67 instructions:
    .text   
    .globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB594:
    pushq   %rbp
LCFI0:
    movq    %rsp, %rbp
LCFI1:
    subq    $232, %rsp
    movaps  %xmm0, -336(%rbp)
    movaps  %xmm1, -352(%rbp)
    movaps  -336(%rbp), %xmm0
    movaps  -336(%rbp), %xmm1
    shufps  $27, %xmm1, %xmm0
    movaps  %xmm0, -16(%rbp)
    movaps  -352(%rbp), %xmm0
    movaps  -352(%rbp), %xmm1
    shufps  $17, %xmm1, %xmm0
    movaps  %xmm0, -32(%rbp)
    movaps  -352(%rbp), %xmm0
    movaps  -352(%rbp), %xmm1
    shufps  $187, %xmm1, %xmm0
    movaps  %xmm0, -48(%rbp)
    movaps  -16(%rbp), %xmm0
    movaps  %xmm0, -112(%rbp)
    movaps  -48(%rbp), %xmm0
    movaps  %xmm0, -128(%rbp)
    movaps  -128(%rbp), %xmm0
    movaps  -112(%rbp), %xmm1
    mulps   %xmm1, %xmm0
    movaps  -336(%rbp), %xmm1
    movaps  %xmm1, -144(%rbp)
    movaps  -32(%rbp), %xmm1
    movaps  %xmm1, -160(%rbp)
    movaps  -160(%rbp), %xmm1
    movaps  -144(%rbp), %xmm2
    mulps   %xmm2, %xmm1
    movaps  %xmm1, -176(%rbp)
    movaps  %xmm0, -192(%rbp)
    movaps  -176(%rbp), %xmm0
    hsubps  -192(%rbp), %xmm0
    movaps  %xmm0, -64(%rbp)
    movaps  -16(%rbp), %xmm0
    movaps  %xmm0, -208(%rbp)
    movaps  -32(%rbp), %xmm0
    movaps  %xmm0, -224(%rbp)
    movaps  -224(%rbp), %xmm0
    movaps  -208(%rbp), %xmm1
    mulps   %xmm1, %xmm0
    movaps  -336(%rbp), %xmm1
    movaps  %xmm1, -240(%rbp)
    movaps  -48(%rbp), %xmm1
    movaps  %xmm1, -256(%rbp)
    movaps  -256(%rbp), %xmm1
    movaps  -240(%rbp), %xmm2
    mulps   %xmm2, %xmm1
    movaps  %xmm1, -272(%rbp)
    movaps  %xmm0, -288(%rbp)
    movaps  -272(%rbp), %xmm0
    haddps  -288(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    movaps  -64(%rbp), %xmm0
    movaps  -80(%rbp), %xmm1
    shufps  $177, %xmm1, %xmm0
    movaps  -80(%rbp), %xmm1
    movaps  -64(%rbp), %xmm2
    shufps  $228, %xmm2, %xmm1
    movaps  %xmm1, -304(%rbp)
    movaps  %xmm0, -320(%rbp)
    movaps  -304(%rbp), %xmm0
    addsubps        -320(%rbp), %xmm0
    movaps  %xmm0, -96(%rbp)
    movaps  -96(%rbp), %xmm0
    movaps  -96(%rbp), %xmm1
    shufps  $156, %xmm1, %xmm0
    leave
LCFI2:
    ret
What am I doing wrong? There must a better way to shuffle the elements around without using so many movaps instructions.
Multiplying two quaternions together has the effect of performing one rotation around an axis and then performing another rotation about around an axis. Multiplies two quaternions.
Multiply(Quaternion, Single)Returns the quaternion that results from scaling all the components of a specified quaternion by a scalar factor.
The multiplication of quaternions represents composing the two rotations: perform one rotation and then perform the other one. It's clear that this should represent a rotation (imagine rotating, say, a bowling ball in place).
Never mind. If I compile the code with gcc -msse3 -O1 -S instead, I get the following:
    .text
    .align 4,0x90
    .globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB644:
    movaps  %xmm0, %xmm5
    movaps  %xmm1, %xmm3
    movaps  %xmm0, %xmm2
    shufps  $27, %xmm0, %xmm5
    movaps  %xmm5, %xmm4
    shufps  $17, %xmm1, %xmm3
    shufps  $187, %xmm1, %xmm1
    mulps   %xmm3, %xmm2
    mulps   %xmm1, %xmm4
    mulps   %xmm5, %xmm3
    mulps   %xmm1, %xmm0
    hsubps  %xmm4, %xmm2
    haddps  %xmm3, %xmm0
    movaps  %xmm2, %xmm1
    shufps  $177, %xmm0, %xmm1
    shufps  $228, %xmm2, %xmm0
    addsubps        %xmm1, %xmm0
    shufps  $156, %xmm0, %xmm0
    ret
That's only 18 instructions now. That's what I expected in the beginning. Oops.
You may be interested in the Agner Fog's C++ vector class library. It provides a Quaternion4f and Quaternion4d classes (including * and *= operators, of course), implemented by using SSE2 and AVX instruction sets respectively. The library is an Open Source project, so you may dig into the code and find a good implementation example to build your function on.
Later on, you may consult the "optimizing subroutines in assembly language" manual and provide an optimized, pure assembly implementation of the function or, while being aware of some low-level tricks, try to redesign the intrinsics approach in C.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With