Multiply matrices using SSE intrinsics

Started by Kobold, July 27, 2023, 04:58:53 PM


Kobold

This function multiplies two 4x4 matrices stored in column-major format (as used by OpenGL), computing m = mA * mB.
It doesn't require any special alignment of its parameters, but it could be optimized to use the aligned intrinsics (the ones without the 'u' in the name) if all matrices are guaranteed to be aligned on 16-byte boundaries; a sketch of that variant follows the code below.
If the destination matrix is also passed as one of the source matrices, the result is built in a local copy first, so in-place multiplication works as expected.

#include <stdint.h>
#include <xmmintrin.h>

typedef float KbMatrix4[16]; /* column-major: element (row r, column c) is m[c*4 + r] */

void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 _Alignas(16) t; /* __declspec(align(16)) if you don't want C11 */
    float *mT;

    /* If the destination aliases a source, build the result in the local copy. */
    if ((m == mA) || (m == mB)) {
        mT = t;
    } else {
        mT = m;
    }

    /* The columns of A are loop-invariant, so load them once up front. */
    __m128 a_col0 = _mm_loadu_ps(&mA[0]);  /* unaligned load (slower than _mm_load_ps on older CPUs) */
    __m128 a_col1 = _mm_loadu_ps(&mA[4]);
    __m128 a_col2 = _mm_loadu_ps(&mA[8]);
    __m128 a_col3 = _mm_loadu_ps(&mA[12]);

    for (uint32_t i = 0; i < 4; i++) {
        /* Column i of A*B is a linear combination of A's columns,
           weighted by the four entries of column i of B. */
        __m128 b_col = _mm_loadu_ps(&mB[i << 2]);
        __m128 result = _mm_setzero_ps(); /* portable; .m128_i64 initializers are MSVC-only */
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x00), a_col0)); /* broadcast element 0 */
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x55), a_col1)); /* broadcast element 1 */
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xaa), a_col2)); /* broadcast element 2 */
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xff), a_col3)); /* broadcast element 3 */
        _mm_storeu_ps(&mT[i << 2], result); /* unaligned store */
    }

    if (mT != m) {
        /* t is 16-byte aligned, so the aligned load is safe here; the store stays unaligned. */
        __m128 result = _mm_load_ps(&mT[0]);
        _mm_storeu_ps(&m[0], result);
        result = _mm_load_ps(&mT[4]);
        _mm_storeu_ps(&m[4], result);
        result = _mm_load_ps(&mT[8]);
        _mm_storeu_ps(&m[8], result);
        result = _mm_load_ps(&mT[12]);
        _mm_storeu_ps(&m[12], result);
    }
}
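
And here's a minimal sketch of the aligned variant mentioned above. The name kbMatrix4MultiplyAligned is just a placeholder; it reuses the includes and the KbMatrix4 typedef from the listing, and it assumes the caller guarantees that all three matrices are 16-byte aligned (e.g. declared with _Alignas(16)), otherwise the aligned loads and stores will fault:

/* Hypothetical aligned variant: m, mA and mB MUST all be 16-byte aligned. */
void kbMatrix4MultiplyAligned(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 _Alignas(16) t;
    float *mT = ((m == mA) || (m == mB)) ? t : m;

    __m128 a_col0 = _mm_load_ps(&mA[0]); /* aligned load: faults on a misaligned pointer */
    __m128 a_col1 = _mm_load_ps(&mA[4]);
    __m128 a_col2 = _mm_load_ps(&mA[8]);
    __m128 a_col3 = _mm_load_ps(&mA[12]);

    for (int i = 0; i < 4; i++) {
        __m128 b_col = _mm_load_ps(&mB[i << 2]);
        __m128 result = _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x00), a_col0);
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0x55), a_col1));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xaa), a_col2));
        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(b_col, b_col, 0xff), a_col3));
        _mm_store_ps(&mT[i << 2], result);
    }

    if (mT != m) {
        for (int i = 0; i < 16; i += 4) {
            _mm_store_ps(&m[i], _mm_load_ps(&mT[i])); /* t is aligned, and so is m by contract */
        }
    }
}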


Here's the slow non-intrinsic version for comparison...

void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
    KbMatrix4 t;
    float *mT;
    if ((m == mA) || (m == mB)) {
        mT = t;
    } else {
        mT = m;
    }
    mT[ 0] = (mA[ 0] * mB[ 0]) + (mA[ 4] * mB[ 1]) + (mA[ 8] * mB[ 2]) + (mA[12] * mB[ 3]);
    mT[ 1] = (mA[ 1] * mB[ 0]) + (mA[ 5] * mB[ 1]) + (mA[ 9] * mB[ 2]) + (mA[13] * mB[ 3]);
    mT[ 2] = (mA[ 2] * mB[ 0]) + (mA[ 6] * mB[ 1]) + (mA[10] * mB[ 2]) + (mA[14] * mB[ 3]);
    mT[ 3] = (mA[ 3] * mB[ 0]) + (mA[ 7] * mB[ 1]) + (mA[11] * mB[ 2]) + (mA[15] * mB[ 3]);

    mT[ 4] = (mA[ 0] * mB[ 4]) + (mA[ 4] * mB[ 5]) + (mA[ 8] * mB[ 6]) + (mA[12] * mB[ 7]);
    mT[ 5] = (mA[ 1] * mB[ 4]) + (mA[ 5] * mB[ 5]) + (mA[ 9] * mB[ 6]) + (mA[13] * mB[ 7]);
    mT[ 6] = (mA[ 2] * mB[ 4]) + (mA[ 6] * mB[ 5]) + (mA[10] * mB[ 6]) + (mA[14] * mB[ 7]);
    mT[ 7] = (mA[ 3] * mB[ 4]) + (mA[ 7] * mB[ 5]) + (mA[11] * mB[ 6]) + (mA[15] * mB[ 7]);

    mT[ 8] = (mA[ 0] * mB[ 8]) + (mA[ 4] * mB[ 9]) + (mA[ 8] * mB[10]) + (mA[12] * mB[11]);
    mT[ 9] = (mA[ 1] * mB[ 8]) + (mA[ 5] * mB[ 9]) + (mA[ 9] * mB[10]) + (mA[13] * mB[11]);
    mT[10] = (mA[ 2] * mB[ 8]) + (mA[ 6] * mB[ 9]) + (mA[10] * mB[10]) + (mA[14] * mB[11]);
    mT[11] = (mA[ 3] * mB[ 8]) + (mA[ 7] * mB[ 9]) + (mA[11] * mB[10]) + (mA[15] * mB[11]);

    mT[12] = (mA[ 0] * mB[12]) + (mA[ 4] * mB[13]) + (mA[ 8] * mB[14]) + (mA[12] * mB[15]);
    mT[13] = (mA[ 1] * mB[12]) + (mA[ 5] * mB[13]) + (mA[ 9] * mB[14]) + (mA[13] * mB[15]);
    mT[14] = (mA[ 2] * mB[12]) + (mA[ 6] * mB[13]) + (mA[10] * mB[14]) + (mA[14] * mB[15]);
    mT[15] = (mA[ 3] * mB[12]) + (mA[ 7] * mB[13]) + (mA[11] * mB[14]) + (mA[15] * mB[15]);
    if (mT != m) {
        for (int i = 0; i < 16; i++) {
            m[i] = mT[i];
        }
    }
}
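
To sanity-check the two versions against each other, a quick test like the one below works. It assumes the two functions are given distinct names (kbMatrix4MultiplySSE and kbMatrix4MultiplyRef here, both placeholders, since the listings above use the same name) and reuses the KbMatrix4 typedef; it also exercises the in-place case where the destination aliases a source:

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* A translation by (1,2,3) and a uniform scale by 2, in column-major order. */
    KbMatrix4 trans = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  1,2,3,1 };
    KbMatrix4 scale = { 2,0,0,0,  0,2,0,0,  0,0,2,0,  0,0,0,1 };
    KbMatrix4 r1, r2;

    kbMatrix4MultiplySSE(r1, trans, scale); /* placeholder names, see above */
    kbMatrix4MultiplyRef(r2, trans, scale);
    for (int i = 0; i < 16; i++) {
        if (fabsf(r1[i] - r2[i]) > 1e-6f) {
            printf("mismatch at %d: %f vs %f\n", i, r1[i], r2[i]);
            return 1;
        }
    }

    /* In-place: the destination aliases the first source. */
    kbMatrix4MultiplySSE(r1, r1, scale);
    printf("ok\n");
    return 0;
}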

frankie

Good job.  :)

"It is better to be hated for what you are than to be loved for what you are not." - Andre Gide

Kobold

Quote from: frankie on July 28, 2023, 09:59:50 AM
Good job.  :)
Thanks  :-*
I'll post some more SIMD-powered functions while I'm implementing and testing them. It's quite hard to find good (working) examples of that stuff. Everyone seems to be using GLM (C++) or CGLM (C99 preprocessor macros from hell). I prefer code that is actually readable and adaptable.