This function multiplies two 4x4 matrices stored in column-major format (as used by OpenGL), computing m = mA * mB.
It doesn't require any special alignment on its parameters, but it could be optimized to use the aligned intrinsics (the ones without the 'u' in the name) if all three matrices are guaranteed to sit on 16-byte boundaries; a sketch of that variant follows the function below.
If the destination matrix is also passed as one of the source matrices, the result is computed into an aligned local temporary and copied back at the end.
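For reference, column-major storage puts the element at row r, column c at index c*4 + r, so a KbMatrix4 is laid out like this (the translation part of an OpenGL modelview matrix lives in m[12]..m[14]):

/* KbMatrix4 layout (column-major, index = column*4 + row):
 *
 *   | m[0]  m[4]  m[ 8]  m[12] |
 *   | m[1]  m[5]  m[ 9]  m[13] |
 *   | m[2]  m[6]  m[10]  m[14] |
 *   | m[3]  m[7]  m[11]  m[15] |
 */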
#include <stdint.h>
#include <xmmintrin.h>
typedef float KbMatrix4[16];
void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 _Alignas(16) t; // __declspec(align(16)) if you don't want C11
    float *mT;
    if ((m == mA) || (m == mB)) {
        mT = t;
    } else {
        mT = m;
    }
    // The four columns of mA only need to be loaded once.
    __m128 a_col0 = _mm_loadu_ps(&mA[0]);  // unaligned loads; the aligned forms are
    __m128 a_col1 = _mm_loadu_ps(&mA[4]);  // cheaper on older CPUs but require
    __m128 a_col2 = _mm_loadu_ps(&mA[8]);  // 16-byte alignment
    __m128 a_col3 = _mm_loadu_ps(&mA[12]);
    for (uint32_t i = 0; i < 4; i++) {
        // Column i of the result is a linear combination of the columns of mA,
        // weighted by the four elements of column i of mB.
        __m128 b_col = _mm_loadu_ps(&mB[i << 2]); // unaligned
        __m128 result = _mm_setzero_ps();
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x00), a_col0));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x55), a_col1));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xaa), a_col2));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xff), a_col3));
        _mm_storeu_ps(&mT[i << 2], result); // unaligned
    }
    if (mT != m) {
        // t is 16-byte aligned, so we can pair the aligned load
        // with the unaligned store into the caller's matrix
        __m128 result = _mm_load_ps(&mT[0]);
        _mm_storeu_ps(&m[0], result);
        result = _mm_load_ps(&mT[4]);
        _mm_storeu_ps(&m[4], result);
        result = _mm_load_ps(&mT[8]);
        _mm_storeu_ps(&m[8], result);
        result = _mm_load_ps(&mT[12]);
        _mm_storeu_ps(&m[12], result);
    }
}
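If you can guarantee that all three matrices sit on 16-byte boundaries (for example by declaring them with _Alignas(16)), the unaligned loads and stores above can be swapped for the aligned ones. Here's a minimal sketch of that variant under that alignment assumption; kbMatrix4MultiplyAligned is just an illustrative name, not part of the original interface.

void kbMatrix4MultiplyAligned(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 _Alignas(16) t; // local copy target for the aliased case
    float *mT = ((m == mA) || (m == mB)) ? t : m;
    __m128 a_col0 = _mm_load_ps(&mA[0]); // aligned loads of the columns of mA
    __m128 a_col1 = _mm_load_ps(&mA[4]);
    __m128 a_col2 = _mm_load_ps(&mA[8]);
    __m128 a_col3 = _mm_load_ps(&mA[12]);
    for (uint32_t i = 0; i < 4; i++) {
        __m128 b_col = _mm_load_ps(&mB[i << 2]); // column i of mB
        __m128 result = _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x00), a_col0);
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x55), a_col1));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xaa), a_col2));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xff), a_col3));
        _mm_store_ps(&mT[i << 2], result); // aligned store into t or the caller's m
    }
    if (mT != m) {
        for (int i = 0; i < 16; i += 4) {
            _mm_store_ps(&m[i], _mm_load_ps(&mT[i]));
        }
    }
}

Note that on most recent x86 CPUs the unaligned intrinsics are just as fast when the data happens to be aligned, so it's worth measuring before taking on the alignment requirement.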
Here's the slow, non-intrinsic version for comparison:
void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 t;
    float *mT;
    if ((m == mA) || (m == mB)) {
        mT = t;
    } else {
        mT = m;
    }
    mT[ 0] = (mA[ 0] * mB[ 0]) + (mA[ 4] * mB[ 1]) + (mA[ 8] * mB[ 2]) + (mA[12] * mB[ 3]);
    mT[ 1] = (mA[ 1] * mB[ 0]) + (mA[ 5] * mB[ 1]) + (mA[ 9] * mB[ 2]) + (mA[13] * mB[ 3]);
    mT[ 2] = (mA[ 2] * mB[ 0]) + (mA[ 6] * mB[ 1]) + (mA[10] * mB[ 2]) + (mA[14] * mB[ 3]);
    mT[ 3] = (mA[ 3] * mB[ 0]) + (mA[ 7] * mB[ 1]) + (mA[11] * mB[ 2]) + (mA[15] * mB[ 3]);
    mT[ 4] = (mA[ 0] * mB[ 4]) + (mA[ 4] * mB[ 5]) + (mA[ 8] * mB[ 6]) + (mA[12] * mB[ 7]);
    mT[ 5] = (mA[ 1] * mB[ 4]) + (mA[ 5] * mB[ 5]) + (mA[ 9] * mB[ 6]) + (mA[13] * mB[ 7]);
    mT[ 6] = (mA[ 2] * mB[ 4]) + (mA[ 6] * mB[ 5]) + (mA[10] * mB[ 6]) + (mA[14] * mB[ 7]);
    mT[ 7] = (mA[ 3] * mB[ 4]) + (mA[ 7] * mB[ 5]) + (mA[11] * mB[ 6]) + (mA[15] * mB[ 7]);
    mT[ 8] = (mA[ 0] * mB[ 8]) + (mA[ 4] * mB[ 9]) + (mA[ 8] * mB[10]) + (mA[12] * mB[11]);
    mT[ 9] = (mA[ 1] * mB[ 8]) + (mA[ 5] * mB[ 9]) + (mA[ 9] * mB[10]) + (mA[13] * mB[11]);
    mT[10] = (mA[ 2] * mB[ 8]) + (mA[ 6] * mB[ 9]) + (mA[10] * mB[10]) + (mA[14] * mB[11]);
    mT[11] = (mA[ 3] * mB[ 8]) + (mA[ 7] * mB[ 9]) + (mA[11] * mB[10]) + (mA[15] * mB[11]);
    mT[12] = (mA[ 0] * mB[12]) + (mA[ 4] * mB[13]) + (mA[ 8] * mB[14]) + (mA[12] * mB[15]);
    mT[13] = (mA[ 1] * mB[12]) + (mA[ 5] * mB[13]) + (mA[ 9] * mB[14]) + (mA[13] * mB[15]);
    mT[14] = (mA[ 2] * mB[12]) + (mA[ 6] * mB[13]) + (mA[10] * mB[14]) + (mA[14] * mB[15]);
    mT[15] = (mA[ 3] * mB[12]) + (mA[ 7] * mB[13]) + (mA[11] * mB[14]) + (mA[15] * mB[15]);
    if (mT != m) {
        for (int i = 0; i < 16; i++) {
            m[i] = mT[i];
        }
    }
}
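As a quick sanity check, here's a hypothetical main() (not part of the original code) that multiplies a translation by a scale in column-major order and also exercises the in-place (aliased destination) path; either version of kbMatrix4Multiply above should pass it when dropped into the same file.

#include <assert.h>
#include <stdio.h>

int main(void)
{
    // Column-major: translation by (2, 3, 4) and scale by (5, 6, 7).
    KbMatrix4 trans = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  2,3,4,1 };
    KbMatrix4 scale = { 5,0,0,0,  0,6,0,0,  0,0,7,0,  0,0,0,1 };
    KbMatrix4 result;

    kbMatrix4Multiply(result, trans, scale); // result = trans * scale
    assert(result[0] == 5.0f && result[5] == 6.0f && result[10] == 7.0f);
    assert(result[12] == 2.0f && result[13] == 3.0f && result[14] == 4.0f);

    kbMatrix4Multiply(trans, trans, scale); // destination aliases a source
    assert(trans[0] == 5.0f && trans[12] == 2.0f);

    printf("ok\n");
    return 0;
}

(The test values are small integers, so the float comparisons are exact.)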