Not sure if this is helpful:
#include <immintrin.h>
/*
 * First variant: fused multiply-add where the accumulator is also the
 * destination, i.e. c1 = a1 * b1 + c1 on 256-bit packed single-precision
 * vectors. All inputs are zero and the result is never read; the program
 * exists only to show which instruction each statement maps to. The
 * trailing comment on each statement is the author's instruction annotation.
 */
int __cdecl main(void)
{
__m256 a1, b1, c1;
a1 = _mm256_setzero_ps(); // vxorps ymm0, ymm0, ymm0
b1 = _mm256_setzero_ps(); // vxorps ymm1, ymm1, ymm1
c1 = _mm256_setzero_ps(); // vxorps ymm2, ymm2, ymm2
// NOTE(review): per the VFMADD213PS description quoted after this function,
// the result is written to the FIRST operand. In the annotation below that is
// ymm0 (annotated above as holding a1), not ymm2 (annotated as holding c1).
// Confirm against the actual compiler listing.
c1 = _mm256_fmadd_ps(a1,b1,c1); // vfmadd213ps ymm0, ymm1, ymm2
return 0;
}/*
VEX.256.66.0F38.W0 A8 /r VFMADD213PS ymm1, ymm2, ymm3/m256 A V/V FMA
Multiply packed single-precision floating-point values from ymm1 and ymm2, add to ymm3/mem and put result in ymm1.
*/
#include <immintrin.h>
/*
 * Second variant: same fused multiply-add, but the result goes to a separate
 * variable, c2 = a1 * b1 + c1, instead of back into the accumulator c1.
 * All inputs are zero and the result is never read; only the instruction
 * mapping is of interest. The trailing comment on each statement is the
 * author's instruction annotation.
 */
int __cdecl main(void)
{
__m256 a1, b1, c1, c2;
a1 = _mm256_setzero_ps(); // vxorps ymm0, ymm0, ymm0
b1 = _mm256_setzero_ps(); // vxorps ymm1, ymm1, ymm1
c1 = _mm256_setzero_ps(); // vxorps ymm2, ymm2, ymm2
// This zero value of c2 is overwritten by the next statement before it is
// read. The "ymm?" annotation marks the register for c2 as unknown to the
// author.
c2 = _mm256_setzero_ps(); // vxorps ymm?, ymm?, ymm?
// NOTE(review): the annotated instruction is identical to the first variant's.
// Per the VFMADD213PS description quoted in the comment above this program,
// the result is written to the first operand (ymm0 here), so c2 would end up
// in the register annotated as holding a1. Confirm against the actual
// compiler listing.
c2 = _mm256_fmadd_ps(a1,b1,c1); // vfmadd213ps ymm0, ymm1, ymm2
return 0;
}