Author Topic: Multiply matrices using SSE intrinsics (Read 712 times)

Kobold · « **on:** July 27, 2023, 04:58:53 PM »

This function multiplies two 4x4 matrices in column-major format (as used with OpenGL).
It doesn't require special alignment on its parameters, but could be optimized to use the aligned intrinsics (without the 'u' in the name) if all matrices are aligned on 16 byte boundaries.
If the destination matrix is also used as one of the source matrices, a local copy is operated on.

Code: [Select]

#include <stdint.h>
#include <xmmintrin.h>

typedef float KbMatrix4[16];

void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
	KbMatrix4 _Alignas(16) t;	// __declspec(align(16)) if you don't want C11
	float *mT;
	if ((m == mA) || (m == mB)) {
		mT = t;
	} else {
		mT = m;
	}

	for (uint32_t i = 0; i < 4; i++) {
		__m128 result = { .m128_i64 = {0,0} };
		__m128 a_line  = _mm_loadu_ps(&mA[i<<2]);	// unaligned (~4x slower!)
		__m128 b_line0 = _mm_loadu_ps(&mB[0]);		// unaligned
		__m128 b_line1 = _mm_loadu_ps(&mB[4]);
		__m128 b_line2 = _mm_loadu_ps(&mB[8]);
		__m128 b_line3 = _mm_loadu_ps(&mB[12]);
		result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0x00), b_line0));
		result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0x55), b_line1));
		result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0xaa), b_line2));
		result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0xff), b_line3));
		_mm_storeu_ps(&mT[i<<2], result);	// unaligned
	}

	if (mT != m) {
		// we can use the aligned load with the unaligned store here
		__m128 result = _mm_load_ps(&mT[0]);
		_mm_storeu_ps(&m[0], result);
		result = _mm_load_ps(&mT[4]);
		_mm_storeu_ps(&m[4], result);
		result = _mm_load_ps(&mT[8]);
		_mm_storeu_ps(&m[8], result);
		result = _mm_load_ps(&mT[12]);
		_mm_storeu_ps(&m[12], result);
	}
}

Here's the slow non-intrinsic version for comparison...

Code: [Select]

void kbMatrix4Multiply(KbMatrix4 m, const KbMatrix4 mA, const KbMatrix4 mB)
{
	KbMatrix4 t;
	float *mT;
	if ((m == mA) || (m == mB)) {
		mT = t;
	} else {
		mT = m;
	}
	mT[ 0] = (mA[ 0] * mB[ 0]) + (mA[ 4] * mB[ 1]) + (mA[ 8] * mB[ 2]) + (mA[12] * mB[ 3]);
	mT[ 1] = (mA[ 1] * mB[ 0]) + (mA[ 5] * mB[ 1]) + (mA[ 9] * mB[ 2]) + (mA[13] * mB[ 3]);
	mT[ 2] = (mA[ 2] * mB[ 0]) + (mA[ 6] * mB[ 1]) + (mA[10] * mB[ 2]) + (mA[14] * mB[ 3]);
	mT[ 3] = (mA[ 3] * mB[ 0]) + (mA[ 7] * mB[ 1]) + (mA[11] * mB[ 2]) + (mA[15] * mB[ 3]);

	mT[ 4] = (mA[ 0] * mB[ 4]) + (mA[ 4] * mB[ 5]) + (mA[ 8] * mB[ 6]) + (mA[12] * mB[ 7]);
	mT[ 5] = (mA[ 1] * mB[ 4]) + (mA[ 5] * mB[ 5]) + (mA[ 9] * mB[ 6]) + (mA[13] * mB[ 7]);
	mT[ 6] = (mA[ 2] * mB[ 4]) + (mA[ 6] * mB[ 5]) + (mA[10] * mB[ 6]) + (mA[14] * mB[ 7]);
	mT[ 7] = (mA[ 3] * mB[ 4]) + (mA[ 7] * mB[ 5]) + (mA[11] * mB[ 6]) + (mA[15] * mB[ 7]);

	mT[ 8] = (mA[ 0] * mB[ 8]) + (mA[ 4] * mB[ 9]) + (mA[ 8] * mB[10]) + (mA[12] * mB[11]);
	mT[ 9] = (mA[ 1] * mB[ 8]) + (mA[ 5] * mB[ 9]) + (mA[ 9] * mB[10]) + (mA[13] * mB[11]);
	mT[10] = (mA[ 2] * mB[ 8]) + (mA[ 6] * mB[ 9]) + (mA[10] * mB[10]) + (mA[14] * mB[11]);
	mT[11] = (mA[ 3] * mB[ 8]) + (mA[ 7] * mB[ 9]) + (mA[11] * mB[10]) + (mA[15] * mB[11]);

	mT[12] = (mA[ 0] * mB[12]) + (mA[ 4] * mB[13]) + (mA[ 8] * mB[14]) + (mA[12] * mB[15]);
	mT[13] = (mA[ 1] * mB[12]) + (mA[ 5] * mB[13]) + (mA[ 9] * mB[14]) + (mA[13] * mB[15]);
	mT[14] = (mA[ 2] * mB[12]) + (mA[ 6] * mB[13]) + (mA[10] * mB[14]) + (mA[14] * mB[15]);
	mT[15] = (mA[ 3] * mB[12]) + (mA[ 7] * mB[13]) + (mA[11] * mB[14]) + (mA[15] * mB[15]);
	if (mT != m) {
		for (int i = 0; i < 16; i++) {
			m[i] = mT[i];
		}
	}
}

frankie · « **Reply #1 on:** July 28, 2023, 09:59:50 AM »

Good job.

Kobold · « **Reply #2 on:** July 28, 2023, 04:19:09 PM »

Quote from: frankie on July 28, 2023, 09:59:50 AM

Good job.

Thanks

I'll post some more SIMD-powered functions while I'm implementing and testing them. It's quite hard to find good (working) examples of that stuff. Everyone seems to be using GLM (C++) or CGLM (C99 preprocessor macros from hell). I prefer code that is actually readable and adaptable.

Pelles C forum

News:

Author Topic: Multiply matrices using SSE intrinsics (Read 712 times)

Kobold

Multiply matrices using SSE intrinsics

frankie

Re: Multiply matrices using SSE intrinsics

Kobold

Re: Multiply matrices using SSE intrinsics