[clang] c728bd5 - [X86] AVX512FP16 instructions enabling 5/6
Wang, Pengfei via cfe-commits
cfe-commits at lists.llvm.org
Mon Aug 23 18:40:39 PDT 2021
Author: Wang, Pengfei
Date: 2021-08-24T09:07:19+08:00
New Revision: c728bd5bbaab5dad3bf0703f1a85d65cd1237e79
URL: https://github.com/llvm/llvm-project/commit/c728bd5bbaab5dad3bf0703f1a85d65cd1237e79
DIFF: https://github.com/llvm/llvm-project/commit/c728bd5bbaab5dad3bf0703f1a85d65cd1237e79.diff
LOG: [X86] AVX512FP16 instructions enabling 5/6
Enable FP16 FMA instructions. This adds the Clang builtins and the packed, packed add/sub, and scalar FP16 FMA intrinsics (including rounding-control variants), the corresponding LLVM IR intrinsics, X86 ISel/MC support, and codegen, assembler and disassembler tests.
Ref.: https://software.intel.com/content/www/us/en/develop/download/intel-avx512-fp16-architecture-specification.html
Reviewed By: LuoYuanke
Differential Revision: https://reviews.llvm.org/D105268
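For readers of the archive, here is a small illustrative sketch (not part of the patch) of how the newly added intrinsics might be used, assuming a toolchain and CPU with AVX512FP16 support (e.g. compiling with -mavx512fp16 -mavx512vl); the function names below are hypothetical.

#include <immintrin.h>

/* Packed _Float16 FMA: plain, merge-masked, and explicit-rounding forms. */
__m512h fma_demo(__m512h a, __m512h b, __m512h c, __mmask32 m) {
  /* a * b + c on all 32 half-precision lanes. */
  __m512h r = _mm512_fmadd_ph(a, b, c);
  /* Merge-masked form: lanes whose mask bit is 0 keep the value of r. */
  r = _mm512_mask_fmadd_ph(r, m, b, c);
  /* Rounding-control form added by this patch: round toward zero,
     exceptions suppressed. */
  return _mm512_fmadd_round_ph(r, b, c,
                               _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

/* Scalar _Float16 FMA on element 0; the upper elements pass through from w. */
__m128h scalar_fma_demo(__m128h w, __m128h a, __m128h b) {
  return _mm_fmadd_sh(w, a, b);
}

As with the existing float/double forms, the fnmadd/fmsub/fnmsub variants in the headers below are expressed by negating operands of the same vfmadd builtins, so no separate builtins are needed for them.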
Added:
llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll
llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
Modified:
clang/include/clang/Basic/BuiltinsX86.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/avx512fp16intrin.h
clang/lib/Headers/avx512vlfp16intrin.h
clang/lib/Sema/SemaChecking.cpp
clang/test/CodeGen/X86/avx512fp16-builtins.c
clang/test/CodeGen/X86/avx512vlfp16-builtins.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFMA3Info.cpp
llvm/lib/Target/X86/X86InstrFoldTables.cpp
llvm/lib/Target/X86/X86InstrFormats.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86IntrinsicsInfo.h
llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
llvm/test/MC/Disassembler/X86/avx512fp16.txt
llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
llvm/test/MC/X86/avx512fp16.s
llvm/test/MC/X86/avx512fp16vl.s
llvm/test/MC/X86/intel-syntax-avx512fp16.s
llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f21c17ee0ebe9..0ab1444e7120a 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1995,6 +1995,25 @@ TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "av
TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+
// generic select intrinsics
TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a1007da6e5472..ca6987b378a85 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12050,6 +12050,22 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
Intrinsic::ID IID = Intrinsic::not_intrinsic;
switch (BuiltinID) {
default: break;
+ case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
+ IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+ IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
+ break;
case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
Subtract = true;
LLVM_FALLTHROUGH;
@@ -12113,22 +12129,30 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
// Handle any required masking.
Value *MaskFalseVal = nullptr;
switch (BuiltinID) {
+ case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
MaskFalseVal = Ops[0];
break;
+ case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
break;
+ case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
@@ -12159,9 +12183,21 @@ static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
Value *Res;
if (Rnd != 4) {
- Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ?
- Intrinsic::x86_avx512_vfmadd_f32 :
- Intrinsic::x86_avx512_vfmadd_f64;
+ Intrinsic::ID IID;
+
+ switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
+ case 16:
+ IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
+ break;
+ case 32:
+ IID = Intrinsic::x86_avx512_vfmadd_f32;
+ break;
+ case 64:
+ IID = Intrinsic::x86_avx512_vfmadd_f64;
+ break;
+ default:
+ llvm_unreachable("Unexpected size");
+ }
Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
{Ops[0], Ops[1], Ops[2], Ops[4]});
} else if (CGF.Builder.getIsFPConstrained()) {
@@ -12764,6 +12800,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmaddss3:
case X86::BI__builtin_ia32_vfmaddsd3:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask:
case X86::BI__builtin_ia32_vfmaddss3_mask:
case X86::BI__builtin_ia32_vfmaddsd3_mask:
return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
@@ -12771,20 +12808,28 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmaddsd:
return EmitScalarFMAExpr(*this, E, Ops,
Constant::getNullValue(Ops[0]->getType()));
+ case X86::BI__builtin_ia32_vfmaddsh3_maskz:
case X86::BI__builtin_ia32_vfmaddss3_maskz:
case X86::BI__builtin_ia32_vfmaddsd3_maskz:
return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
+ case X86::BI__builtin_ia32_vfmaddsh3_mask3:
case X86::BI__builtin_ia32_vfmaddss3_mask3:
case X86::BI__builtin_ia32_vfmaddsd3_mask3:
return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
+ case X86::BI__builtin_ia32_vfmsubsh3_mask3:
case X86::BI__builtin_ia32_vfmsubss3_mask3:
case X86::BI__builtin_ia32_vfmsubsd3_mask3:
return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
/*NegAcc*/ true);
+ case X86::BI__builtin_ia32_vfmaddph:
case X86::BI__builtin_ia32_vfmaddps:
case X86::BI__builtin_ia32_vfmaddpd:
+ case X86::BI__builtin_ia32_vfmaddph256:
case X86::BI__builtin_ia32_vfmaddps256:
case X86::BI__builtin_ia32_vfmaddpd256:
+ case X86::BI__builtin_ia32_vfmaddph512_mask:
+ case X86::BI__builtin_ia32_vfmaddph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddph512_mask3:
case X86::BI__builtin_ia32_vfmaddps512_mask:
case X86::BI__builtin_ia32_vfmaddps512_maskz:
case X86::BI__builtin_ia32_vfmaddps512_mask3:
@@ -12793,7 +12838,12 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmaddpd512_maskz:
case X86::BI__builtin_ia32_vfmaddpd512_mask3:
case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubph512_mask3:
return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
case X86::BI__builtin_ia32_vfmaddsubps512_mask:
case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 48370d0bf0ee0..6440be3799df8 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -2423,6 +2423,492 @@ _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
_MM_FROUND_CUR_DIRECTION);
}
+#define _mm512_fmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fnmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fnmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(
+ -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fmsubadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
+ -(__v8hf)__B, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
+ -(__v8hf)__B, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmsub_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+      (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmadd_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmsub_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 1809211fd4066..8f48b0156cd69 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -1371,6 +1371,378 @@ _mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
(__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
+ __mmask8 __U,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
+ __mmask8 __U,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
__m128h __A,
__m128h __W) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 4e7d5b66bca7f..ee3efc14c1ab8 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4084,6 +4084,9 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vfmaddss3_mask:
case X86::BI__builtin_ia32_vfmaddss3_maskz:
case X86::BI__builtin_ia32_vfmaddss3_mask3:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask:
+ case X86::BI__builtin_ia32_vfmaddsh3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask3:
case X86::BI__builtin_ia32_vfmaddpd512_mask:
case X86::BI__builtin_ia32_vfmaddpd512_maskz:
case X86::BI__builtin_ia32_vfmaddpd512_mask3:
@@ -4092,6 +4095,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vfmaddps512_maskz:
case X86::BI__builtin_ia32_vfmaddps512_mask3:
case X86::BI__builtin_ia32_vfmsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddph512_mask:
+ case X86::BI__builtin_ia32_vfmaddph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubph512_mask3:
case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
@@ -4100,6 +4107,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
ArgNum = 4;
HasRC = true;
break;
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
index 42591662606eb..1a6ddeea15fca 100644
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -3163,6 +3163,839 @@ __m256h test_mm512_maskz_cvtxps_ph(__mmask16 A, __m512 B) {
return _mm512_maskz_cvtxps_ph(A, B);
}
+__m512h test_mm512_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmadd_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ return _mm512_fmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmadd_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmsub_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ return _mm512_fmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmsub_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fnmadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ return _mm512_fnmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fnmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fnmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fnmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fnmsub_round_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ return _mm512_fnmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fnmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fnmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmadd_ph
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ return _mm512_fmadd_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmadd_ph
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ return _mm512_mask_fmadd_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmadd_ph
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmadd_ph(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmadd_ph
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmadd_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ return _mm512_fmsub_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmsub_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmsub_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ return _mm512_fnmadd_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fnmadd_ph(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fnmadd_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ return _mm512_fnmsub_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fnmsub_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmaddsub_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ return _mm512_fmaddsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmaddsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmaddsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmaddsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmaddsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ph
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmaddsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmsubadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ return _mm512_fmsubadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmsubadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmsubadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmsubadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmsubadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
+ return _mm512_fmaddsub_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmaddsub_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmaddsub_ph(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmaddsub_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
+ return _mm512_fmsubadd_ph(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fmsubadd_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
+ return _mm512_maskz_fmsubadd_ph(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_mask3_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmsub_ph(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_mask3_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmsubadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fmsubadd_ph(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_mask_fnmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ph
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fnmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fnmadd_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask_fnmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fnmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fnmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_fnmsub_ph(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask3_fnmsub_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_fmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fmadd_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ return _mm_fmadd_sh(__W, __A, __B);
+}
+
+__m128h test_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fmadd_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_fmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmadd_round_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
+ return _mm_fmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_round_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fmadd_sh(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_round_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fmadd_sh(__W, __X, __Y, __U);
+}
+
+__m128h test_mm_mask3_fmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_round_sh
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fmsub_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_fmsub_sh(__W, __A, __B);
+}
+
+__m128h test_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmsub_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
+ // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_mask_fmsub_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_fmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmsub_round_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_fmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmsub_round_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
+ // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_mask_fmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmsub_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
+ // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_maskz_fmsub_sh(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmsub_round_sh
+ // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
+ // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
+ // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000
+ // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
+ // CHECK-NEXT: ret <8 x half> %{{.*}}
+ return _mm_maskz_fmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmsub_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fmsub_sh(__W, __X, __Y, __U);
+}
+
+__m128h test_mm_mask3_fmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmsub_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fnmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fnmadd_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ return _mm_fnmadd_sh(__W, __A, __B);
+}
+
+__m128h test_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fnmadd_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fnmadd_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_fnmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fnmadd_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
+ return _mm_fnmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fnmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fnmadd_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fnmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmadd_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fnmadd_sh(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fnmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fnmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmadd_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fnmadd_sh(__W, __X, __Y, __U);
+}
+
+__m128h test_mm_mask3_fnmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fnmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fnmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fnmsub_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ return _mm_fnmsub_sh(__W, __A, __B);
+}
+
+__m128h test_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fnmsub_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fnmsub_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_fnmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fnmsub_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
+ return _mm_fnmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fnmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fnmsub_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_mask_fnmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmsub_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fnmsub_sh(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fnmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
+ return _mm_maskz_fnmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmsub_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fnmsub_sh(__W, __X, __Y, __U);
+}
+
+__m128h test_mm_mask3_fnmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sh
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: [[NEG2:%.+]] = fneg
+ // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
+ // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
+ return _mm_mask3_fnmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
_Float16 test_mm512_reduce_add_ph(__m512h __W) {
// CHECK-LABEL: @test_mm512_reduce_add_ph
// CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}})
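(For context only, not part of the patch: a minimal standalone sketch of how the 512-bit and scalar FP16 FMA intrinsics exercised by the tests above might be used. It assumes <immintrin.h> and an AVX512-FP16-enabled compile, e.g. clang -mavx512fp16; the wrapper names below are illustrative, everything else follows the intrinsic signatures shown in the tests.)

  #include <immintrin.h>

  /* Zero-masked negated multiply-subtract on 32 fp16 lanes: lanes whose mask
     bit is set get -(a*b) - c, the remaining lanes become +0.0. */
  __m512h nfms512_z(__mmask32 m, __m512h a, __m512h b, __m512h c) {
    return _mm512_maskz_fnmsub_ph(m, a, b, c);
  }

  /* Scalar fp16 FMA: element 0 becomes w0*a0 + b0 when the low mask bit is
     set, otherwise it keeps w0; elements 1..7 are copied unchanged from w. */
  __m128h scalar_masked_fma(__m128h w, __mmask8 m, __m128h a, __m128h b) {
    return _mm_mask_fmadd_sh(w, m, a, b);
  }
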
diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
index cb99d655f21c6..8644309b63224 100644
--- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
@@ -2321,6 +2321,411 @@ __m128h test_mm256_maskz_cvtxps_ph(__mmask8 A, __m256 B) {
return _mm256_maskz_cvtxps_ph(A, B);
}
+__m128h test_mm_fmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmadd_ph
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ return _mm_fmadd_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_ph
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fmadd_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_fmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ return _mm_fmsub_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fmsub_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_ph
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fmadd_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fnmadd_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_ph
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fmadd_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fmsub_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fnmadd_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fnmsub_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_fmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fmadd_ph
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_fmadd_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fmadd_ph(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fmadd_ph
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fmadd_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_fmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_fmsub_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fmsub_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fmadd_ph
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask3_fmadd_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask3_fnmadd_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fmadd_ph
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fmadd_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fmsub_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fnmadd_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fnmsub_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ return _mm_fmaddsub_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fmaddsub_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]])
+ return _mm_fmsubadd_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]])
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fmsubadd_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fmaddsub_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fmaddsub_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]])
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_fmsubadd_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_fmaddsub_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fmaddsub_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]])
+ return _mm256_fmsubadd_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]])
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fmsubadd_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask3_fmaddsub_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fmaddsub_ph
+ // CHECK-NOT: fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fmaddsub_ph(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]])
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_fmsubadd_ph(__U, __A, __B, __C);
+}
+
+__m128h test_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fmsub_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fmsub_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask3_fmsub_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]])
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fmsubadd_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fmsubadd_ph
+ // CHECK: [[NEG:%.+]] = fneg
+ // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]])
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask3_fmsubadd_ph(__A, __B, __C, __U);
+}
+
+__m128h test_mm_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ return _mm_fnmadd_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fnmadd_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_fnmadd_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fnmadd_ph
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fnmadd_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ return _mm_fnmsub_ph(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_fnmsub_ph(__A, __U, __B, __C);
+}
+
+__m128h test_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask3_fnmsub_ph(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_fnmsub_ph(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_fnmsub_ph(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fnmsub_ph
+ // CHECK: fneg
+ // CHECK: fneg
+ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
+ return _mm256_mask3_fnmsub_ph(__A, __B, __C, __U);
+}
__m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) {
// CHECK-LABEL: @test_mm_mask_blend_ph
// CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
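(Again for context, not part of the patch: a short sketch of the 128-/256-bit VL variants added above. Header and compile flags as before; the function names are illustrative.)

  #include <immintrin.h>

  /* 128-bit negated multiply-subtract with zero masking: lanes whose mask bit
     is clear become +0.0, the rest compute -(a*b) - c. */
  __m128h nfms128_z(__mmask8 m, __m128h a, __m128h b, __m128h c) {
    return _mm_maskz_fnmsub_ph(m, a, b, c);
  }

  /* 256-bit fused multiply-subtract/add: even lanes compute a*b + c, odd
     lanes a*b - c, i.e. the mirror image of fmaddsub. */
  __m256h fmsubadd256(__m256h a, __m256h b, __m256h c) {
    return _mm256_fmsubadd_ph(a, b, c);
  }
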
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index c79c6118db680..680e649290653 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5709,4 +5709,27 @@ let TargetPrefix = "x86" in {
[ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
llvm_i32_ty ],
[ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+
+ def int_x86_avx512fp16_vfmadd_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddsubph">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddsubph256">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_vfmadd_f16
+ : Intrinsic<[ llvm_half_ty ],
+ [ llvm_half_ty, llvm_half_ty, llvm_half_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
}
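(Note on the new IR intrinsics: the trailing i32 operand of the *.512 and f16 variants is the embedded rounding control, which is why it is marked ImmArg. In the clang tests above, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC shows up as the literal i32 11. A small, purely illustrative sketch of the C surface that reaches these intrinsics:)

  #include <immintrin.h>

  /* Scalar fp16 FMA with explicit rounding; per the tests this lowers to a
     call to llvm.x86.avx512fp16.vfmadd.f16 with i32 11 as the last operand
     (11 == _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC). */
  __m128h fma_sh_rz(__m128h a, __m128h b, __m128h c) {
    return _mm_fmadd_round_sh(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  }

  /* 512-bit fmaddsub with explicit rounding; lowers to a call to
     llvm.x86.avx512fp16.vfmaddsub.ph.512 with the same immediate. */
  __m512h fmaddsub_rz512(__m512h a, __m512h b, __m512h c) {
    return _mm512_fmaddsub_round_ph(a, b, c,
                                    _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  }
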
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c70715033cc03..9f8e76c786f9a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1934,6 +1934,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LOAD, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -32720,6 +32722,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
return false;
switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ return Subtarget.hasFP16();
case MVT::f32:
case MVT::f64:
return true;
@@ -49021,7 +49025,9 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
}
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+ if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ !Subtarget.hasAnyFMA()) &&
+ !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
return SDValue();
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
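(Sketch of what the lowering changes enable, not part of the patch: with FMA now legal for the fp16 vector types and isFMAFasterThanFMulAndFAdd returning true for MVT::f16 when the subtarget has FP16, separate fp16 multiplies and adds become eligible for fusion, and combineFMA can fold operand negations into the FMA opcode. Assuming something like clang -O2 -mavx512fp16 with FP contraction allowed (e.g. -ffp-contract=fast); whether fusion actually happens still depends on the usual FMA-formation rules.)

  /* Separate multiply and add on _Float16: with contraction allowed, the
     backend may now fuse these into a single fp16 FMA. */
  _Float16 mul_add(_Float16 a, _Float16 b, _Float16 c) {
    return a * b + c;
  }

  /* Negated operand: combineFMA can fold the negation into a negated FMA
     form instead of emitting a separate negate of a. */
  _Float16 neg_mul_add(_Float16 a, _Float16 b, _Float16 c) {
    return -a * b + c;
  }
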
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index c92abc7e8c95d..df3e1554320ef 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6760,14 +6760,14 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6777,7 +6777,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
(MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6791,21 +6791,22 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _, string Suff,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, _.info512, Suff>,
avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
@@ -6817,12 +6818,15 @@ multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOpera
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd> {
+ defm PH : avx512_fma3p_213_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, "PH", HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info, "PS">, T8PD;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info, "PD">, T8PD, VEX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
@@ -6849,14 +6853,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6867,7 +6871,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6882,21 +6886,22 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _, string Suff,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, _.info512, Suff>,
avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
@@ -6908,12 +6913,15 @@ multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOpera
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PH : avx512_fma3p_231_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, "PH", HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info, "PS">, T8PD;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info, "PD">, T8PD, VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
@@ -6939,7 +6947,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
@@ -6948,7 +6956,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
@@ -6960,7 +6968,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6974,21 +6982,22 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _, string Suff,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, _.info512, Suff>,
avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
@@ -7000,12 +7009,15 @@ multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOpera
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PH : avx512_fma3p_132_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, "PH", HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info, "PS">, T8PD;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info, "PD">, T8PD, VEX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
@@ -7028,39 +7040,39 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+ EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
- def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ def r : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
- def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC;
+ def m : AVX512<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC;
let Uses = [MXCSR] in
- def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ def rb : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
!strconcat(OpcodeStr,
"\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
- Sched<[SchedWriteFMA.Scl]>;
+ Sched<[SchedWriteFMA.Scl]>, EVEX_4V;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
@@ -7104,10 +7116,15 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f32x_info, "SS">,
- EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f64x_info, "SD">,
- EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD;
+ }
+ let Predicates = [HasFP16] in {
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f16x_info, "SH">,
+ EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6PD;
}
}
@@ -7119,8 +7136,9 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86Fnmsu
multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
- X86VectorVTInfo _, PatLeaf ZeroFP> {
- let Predicates = [HasAVX512] in {
+ X86VectorVTInfo _, PatLeaf ZeroFP,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(Op _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
@@ -7318,6 +7336,14 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
}
}
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SS", X86Movss, v4f32x_info, fp32imm0>;
@@ -7350,13 +7376,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ T8PD, EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -7365,7 +7391,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
} // Constraints = "$src1 = $dst"
@@ -12355,13 +12381,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
- AVX512FMA3Base, Sched<[sched]>;
+ T8PD, EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- AVX512FMA3Base,
+ T8PD, EVEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -12377,7 +12403,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- AVX512FMA3Base, EVEX_B,
+ T8PD, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
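
A hedged sketch of how the masked PH forms defined in the tablegen changes above are typically reached from generic IR (not part of the patch): a plain llvm.fma wrapped in a vector select on the mask, with one source doubling as the merge value.

declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

define <32 x half> @masked_fmadd_ph(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 %m) {
  ; select-on-fma is the usual shape matched by the k-masked packed FMA forms;
  ; here %a serves as the pass-through value for masked-off lanes.
  %k = bitcast i32 %m to <32 x i1>
  %f = call <32 x half> @llvm.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c)
  %r = select <32 x i1> %k, <32 x half> %f, <32 x half> %a
  ret <32 x half> %r
}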
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 6d803e931b685..52b2a62316cde 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -28,35 +28,43 @@ using namespace llvm;
FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
-#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \
- FMA3GROUP(Name, Suf##Ym, Attrs) \
- FMA3GROUP(Name, Suf##Yr, Attrs) \
+#define FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \
+
+#define FMA3GROUP_PACKED_WIDTHS_ALL(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Ym, Attrs) \
+ FMA3GROUP(Name, Suf##Yr, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##m, Attrs) \
FMA3GROUP(Name, Suf##r, Attrs)
#define FMA3GROUP_PACKED(Name, Attrs) \
- FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \
- FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs)
+ FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_ALL(Name, PS, Attrs)
-#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \
+#define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##Zm, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
FMA3GROUP(Name, Suf##Zr, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+
+#define FMA3GROUP_SCALAR_WIDTHS_ALL(Name, Suf, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##m, Attrs) \
FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
FMA3GROUP(Name, Suf##r, Attrs) \
FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic)
#define FMA3GROUP_SCALAR(Name, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs)
+ FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SD, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_Z(Name, SH, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SS, Attrs)
#define FMA3GROUP_FULL(Name, Attrs) \
FMA3GROUP_PACKED(Name, Attrs) \
@@ -78,15 +86,19 @@ static const X86InstrFMA3Group Groups[] = {
#define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PHZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs)
#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
FMA3GROUP(Name, SDZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+ FMA3GROUP(Name, SHZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SHZ##Suf##_Int, Attrs) \
FMA3GROUP(Name, SSZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
@@ -130,14 +142,16 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
// FMA3 instructions have a well defined encoding pattern we can exploit.
uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
- bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX ||
- (TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
- (TSFlags & X86II::OpMapMask) == X86II::T8 &&
- (TSFlags & X86II::OpPrefixMask) == X86II::PD &&
- ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
- (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
- (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
- if (!IsFMA3)
+ bool IsFMA3Opcode = ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
+ (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
+ (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
+ bool IsFMA3Encoding = ((TSFlags & X86II::EncodingMask) == X86II::VEX &&
+ (TSFlags & X86II::OpMapMask) == X86II::T8) ||
+ ((TSFlags & X86II::EncodingMask) == X86II::EVEX &&
+ ((TSFlags & X86II::OpMapMask) == X86II::T8 ||
+ (TSFlags & X86II::OpMapMask) == X86II::T_MAP6));
+ bool IsFMA3Prefix = (TSFlags & X86II::OpPrefixMask) == X86II::PD;
+ if (!IsFMA3Opcode || !IsFMA3Encoding || !IsFMA3Prefix)
return nullptr;
verifyTables();
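
For context, a hedged example of the flavor of IR exercised by the new fma-commute test: getFMA3Group now recognizes the MAP6-encoded FP16 opcodes, so the commute machinery in X86InstrInfo can reorder FMA operands across the 132/213/231 forms when it needs to, for example to satisfy the tied-destination constraint or to fold an operand from memory.

declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)

define <8 x half> @commute_fold(<8 x half> %a, <8 x half> %b, <8 x half>* %p) {
  ; The loaded multiplicand is operand 0 here; picking a different
  ; 132/213/231 form lets it sit in the instruction's memory-operand slot
  ; (expected behaviour, not verified in this sketch).
  %c = load <8 x half>, <8 x half>* %p
  %r = call <8 x half> @llvm.fma.v8f16(<8 x half> %c, <8 x half> %a, <8 x half> %b)
  ret <8 x half> %r
}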
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 959c8d4a2d886..235f0d4b92613 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -3288,6 +3288,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 },
{ X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 },
{ X86::VFMADD132PDr, X86::VFMADD132PDm, 0 },
+ { X86::VFMADD132PHZ128r, X86::VFMADD132PHZ128m, 0 },
+ { X86::VFMADD132PHZ256r, X86::VFMADD132PHZ256m, 0 },
+ { X86::VFMADD132PHZr, X86::VFMADD132PHZm, 0 },
{ X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 },
{ X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 },
{ X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 },
@@ -3297,6 +3300,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SDr, X86::VFMADD132SDm, 0 },
{ X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr, X86::VFMADD132SHZm, 0 },
+ { X86::VFMADD132SHZr_Int, X86::VFMADD132SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 },
{ X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SSr, X86::VFMADD132SSm, 0 },
@@ -3306,6 +3311,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 },
{ X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 },
{ X86::VFMADD213PDr, X86::VFMADD213PDm, 0 },
+ { X86::VFMADD213PHZ128r, X86::VFMADD213PHZ128m, 0 },
+ { X86::VFMADD213PHZ256r, X86::VFMADD213PHZ256m, 0 },
+ { X86::VFMADD213PHZr, X86::VFMADD213PHZm, 0 },
{ X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 },
{ X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 },
{ X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 },
@@ -3315,6 +3323,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SDr, X86::VFMADD213SDm, 0 },
{ X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr, X86::VFMADD213SHZm, 0 },
+ { X86::VFMADD213SHZr_Int, X86::VFMADD213SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 },
{ X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SSr, X86::VFMADD213SSm, 0 },
@@ -3324,6 +3334,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 },
{ X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 },
{ X86::VFMADD231PDr, X86::VFMADD231PDm, 0 },
+ { X86::VFMADD231PHZ128r, X86::VFMADD231PHZ128m, 0 },
+ { X86::VFMADD231PHZ256r, X86::VFMADD231PHZ256m, 0 },
+ { X86::VFMADD231PHZr, X86::VFMADD231PHZm, 0 },
{ X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 },
{ X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 },
{ X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 },
@@ -3333,6 +3346,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SDr, X86::VFMADD231SDm, 0 },
{ X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr, X86::VFMADD231SHZm, 0 },
+ { X86::VFMADD231SHZr_Int, X86::VFMADD231SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 },
{ X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
@@ -3350,6 +3365,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 },
{ X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 },
{ X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 },
+ { X86::VFMADDSUB132PHZ128r, X86::VFMADDSUB132PHZ128m, 0 },
+ { X86::VFMADDSUB132PHZ256r, X86::VFMADDSUB132PHZ256m, 0 },
+ { X86::VFMADDSUB132PHZr, X86::VFMADDSUB132PHZm, 0 },
{ X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 },
{ X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 },
{ X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 },
@@ -3360,6 +3378,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 },
{ X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 },
{ X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 },
+ { X86::VFMADDSUB213PHZ128r, X86::VFMADDSUB213PHZ128m, 0 },
+ { X86::VFMADDSUB213PHZ256r, X86::VFMADDSUB213PHZ256m, 0 },
+ { X86::VFMADDSUB213PHZr, X86::VFMADDSUB213PHZm, 0 },
{ X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 },
{ X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 },
{ X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 },
@@ -3370,6 +3391,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 },
{ X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 },
{ X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 },
+ { X86::VFMADDSUB231PHZ128r, X86::VFMADDSUB231PHZ128m, 0 },
+ { X86::VFMADDSUB231PHZ256r, X86::VFMADDSUB231PHZ256m, 0 },
+ { X86::VFMADDSUB231PHZr, X86::VFMADDSUB231PHZm, 0 },
{ X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 },
{ X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 },
{ X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 },
@@ -3384,6 +3408,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 },
{ X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 },
{ X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 },
+ { X86::VFMSUB132PHZ128r, X86::VFMSUB132PHZ128m, 0 },
+ { X86::VFMSUB132PHZ256r, X86::VFMSUB132PHZ256m, 0 },
+ { X86::VFMSUB132PHZr, X86::VFMSUB132PHZm, 0 },
{ X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 },
{ X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 },
{ X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 },
@@ -3393,6 +3420,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 },
{ X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr, X86::VFMSUB132SHZm, 0 },
+ { X86::VFMSUB132SHZr_Int, X86::VFMSUB132SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 },
{ X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 },
@@ -3402,6 +3431,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 },
{ X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 },
{ X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 },
+ { X86::VFMSUB213PHZ128r, X86::VFMSUB213PHZ128m, 0 },
+ { X86::VFMSUB213PHZ256r, X86::VFMSUB213PHZ256m, 0 },
+ { X86::VFMSUB213PHZr, X86::VFMSUB213PHZm, 0 },
{ X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 },
{ X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 },
{ X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 },
@@ -3411,6 +3443,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 },
{ X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr, X86::VFMSUB213SHZm, 0 },
+ { X86::VFMSUB213SHZr_Int, X86::VFMSUB213SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 },
{ X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 },
@@ -3420,6 +3454,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 },
{ X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 },
{ X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 },
+ { X86::VFMSUB231PHZ128r, X86::VFMSUB231PHZ128m, 0 },
+ { X86::VFMSUB231PHZ256r, X86::VFMSUB231PHZ256m, 0 },
+ { X86::VFMSUB231PHZr, X86::VFMSUB231PHZm, 0 },
{ X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 },
{ X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 },
{ X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 },
@@ -3429,6 +3466,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 },
{ X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr, X86::VFMSUB231SHZm, 0 },
+ { X86::VFMSUB231SHZr_Int, X86::VFMSUB231SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 },
{ X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 },
@@ -3438,6 +3477,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 },
{ X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 },
{ X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 },
+ { X86::VFMSUBADD132PHZ128r, X86::VFMSUBADD132PHZ128m, 0 },
+ { X86::VFMSUBADD132PHZ256r, X86::VFMSUBADD132PHZ256m, 0 },
+ { X86::VFMSUBADD132PHZr, X86::VFMSUBADD132PHZm, 0 },
{ X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 },
{ X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 },
{ X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 },
@@ -3448,6 +3490,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 },
{ X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 },
{ X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 },
+ { X86::VFMSUBADD213PHZ128r, X86::VFMSUBADD213PHZ128m, 0 },
+ { X86::VFMSUBADD213PHZ256r, X86::VFMSUBADD213PHZ256m, 0 },
+ { X86::VFMSUBADD213PHZr, X86::VFMSUBADD213PHZm, 0 },
{ X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 },
{ X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 },
{ X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 },
@@ -3458,6 +3503,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 },
{ X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 },
{ X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 },
+ { X86::VFMSUBADD231PHZ128r, X86::VFMSUBADD231PHZ128m, 0 },
+ { X86::VFMSUBADD231PHZ256r, X86::VFMSUBADD231PHZ256m, 0 },
+ { X86::VFMSUBADD231PHZr, X86::VFMSUBADD231PHZm, 0 },
{ X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 },
{ X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 },
{ X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 },
@@ -3480,6 +3528,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
{ X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 },
{ X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 },
+ { X86::VFNMADD132PHZ128r, X86::VFNMADD132PHZ128m, 0 },
+ { X86::VFNMADD132PHZ256r, X86::VFNMADD132PHZ256m, 0 },
+ { X86::VFNMADD132PHZr, X86::VFNMADD132PHZm, 0 },
{ X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 },
{ X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 },
{ X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 },
@@ -3489,6 +3540,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 },
{ X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr, X86::VFNMADD132SHZm, 0 },
+ { X86::VFNMADD132SHZr_Int, X86::VFNMADD132SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 },
{ X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 },
@@ -3498,6 +3551,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 },
{ X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 },
{ X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 },
+ { X86::VFNMADD213PHZ128r, X86::VFNMADD213PHZ128m, 0 },
+ { X86::VFNMADD213PHZ256r, X86::VFNMADD213PHZ256m, 0 },
+ { X86::VFNMADD213PHZr, X86::VFNMADD213PHZm, 0 },
{ X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 },
{ X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 },
{ X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 },
@@ -3507,6 +3563,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 },
{ X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr, X86::VFNMADD213SHZm, 0 },
+ { X86::VFNMADD213SHZr_Int, X86::VFNMADD213SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 },
{ X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 },
@@ -3516,6 +3574,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 },
{ X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 },
{ X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 },
+ { X86::VFNMADD231PHZ128r, X86::VFNMADD231PHZ128m, 0 },
+ { X86::VFNMADD231PHZ256r, X86::VFNMADD231PHZ256m, 0 },
+ { X86::VFNMADD231PHZr, X86::VFNMADD231PHZm, 0 },
{ X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 },
{ X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 },
{ X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 },
@@ -3525,6 +3586,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 },
{ X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr, X86::VFNMADD231SHZm, 0 },
+ { X86::VFNMADD231SHZr_Int, X86::VFNMADD231SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 },
{ X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 },
@@ -3542,6 +3605,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 },
{ X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 },
{ X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 },
+ { X86::VFNMSUB132PHZ128r, X86::VFNMSUB132PHZ128m, 0 },
+ { X86::VFNMSUB132PHZ256r, X86::VFNMSUB132PHZ256m, 0 },
+ { X86::VFNMSUB132PHZr, X86::VFNMSUB132PHZm, 0 },
{ X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 },
{ X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 },
{ X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 },
@@ -3551,6 +3617,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 },
{ X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr, X86::VFNMSUB132SHZm, 0 },
+ { X86::VFNMSUB132SHZr_Int, X86::VFNMSUB132SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 },
{ X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 },
@@ -3560,6 +3628,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 },
{ X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 },
{ X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 },
+ { X86::VFNMSUB213PHZ128r, X86::VFNMSUB213PHZ128m, 0 },
+ { X86::VFNMSUB213PHZ256r, X86::VFNMSUB213PHZ256m, 0 },
+ { X86::VFNMSUB213PHZr, X86::VFNMSUB213PHZm, 0 },
{ X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 },
{ X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 },
{ X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 },
@@ -3569,6 +3640,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 },
{ X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr, X86::VFNMSUB213SHZm, 0 },
+ { X86::VFNMSUB213SHZr_Int, X86::VFNMSUB213SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 },
{ X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 },
@@ -3578,6 +3651,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 },
{ X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 },
{ X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 },
+ { X86::VFNMSUB231PHZ128r, X86::VFNMSUB231PHZ128m, 0 },
+ { X86::VFNMSUB231PHZ256r, X86::VFNMSUB231PHZ256m, 0 },
+ { X86::VFNMSUB231PHZr, X86::VFNMSUB231PHZm, 0 },
{ X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 },
{ X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 },
{ X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 },
@@ -3587,6 +3663,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 },
{ X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr, X86::VFNMSUB231SHZm, 0 },
+ { X86::VFNMSUB231SHZr_Int, X86::VFNMSUB231SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 },
{ X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 },
@@ -4599,6 +4677,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 },
{ X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 },
{ X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 },
+ { X86::VFMADD132PHZ128rk, X86::VFMADD132PHZ128mk, 0 },
+ { X86::VFMADD132PHZ128rkz, X86::VFMADD132PHZ128mkz, 0 },
+ { X86::VFMADD132PHZ256rk, X86::VFMADD132PHZ256mk, 0 },
+ { X86::VFMADD132PHZ256rkz, X86::VFMADD132PHZ256mkz, 0 },
+ { X86::VFMADD132PHZrk, X86::VFMADD132PHZmk, 0 },
+ { X86::VFMADD132PHZrkz, X86::VFMADD132PHZmkz, 0 },
{ X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 },
{ X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 },
{ X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 },
@@ -4607,6 +4691,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 },
{ X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 },
@@ -4615,6 +4701,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 },
{ X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 },
{ X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 },
+ { X86::VFMADD213PHZ128rk, X86::VFMADD213PHZ128mk, 0 },
+ { X86::VFMADD213PHZ128rkz, X86::VFMADD213PHZ128mkz, 0 },
+ { X86::VFMADD213PHZ256rk, X86::VFMADD213PHZ256mk, 0 },
+ { X86::VFMADD213PHZ256rkz, X86::VFMADD213PHZ256mkz, 0 },
+ { X86::VFMADD213PHZrk, X86::VFMADD213PHZmk, 0 },
+ { X86::VFMADD213PHZrkz, X86::VFMADD213PHZmkz, 0 },
{ X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 },
{ X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 },
{ X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 },
@@ -4623,6 +4715,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 },
{ X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 },
@@ -4631,6 +4725,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 },
{ X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 },
{ X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 },
+ { X86::VFMADD231PHZ128rk, X86::VFMADD231PHZ128mk, 0 },
+ { X86::VFMADD231PHZ128rkz, X86::VFMADD231PHZ128mkz, 0 },
+ { X86::VFMADD231PHZ256rk, X86::VFMADD231PHZ256mk, 0 },
+ { X86::VFMADD231PHZ256rkz, X86::VFMADD231PHZ256mkz, 0 },
+ { X86::VFMADD231PHZrk, X86::VFMADD231PHZmk, 0 },
+ { X86::VFMADD231PHZrkz, X86::VFMADD231PHZmkz, 0 },
{ X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 },
{ X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 },
{ X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 },
@@ -4639,6 +4739,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 },
{ X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
@@ -4647,6 +4749,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 },
{ X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 },
{ X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 },
+ { X86::VFMADDSUB132PHZ128rk, X86::VFMADDSUB132PHZ128mk, 0 },
+ { X86::VFMADDSUB132PHZ128rkz, X86::VFMADDSUB132PHZ128mkz, 0 },
+ { X86::VFMADDSUB132PHZ256rk, X86::VFMADDSUB132PHZ256mk, 0 },
+ { X86::VFMADDSUB132PHZ256rkz, X86::VFMADDSUB132PHZ256mkz, 0 },
+ { X86::VFMADDSUB132PHZrk, X86::VFMADDSUB132PHZmk, 0 },
+ { X86::VFMADDSUB132PHZrkz, X86::VFMADDSUB132PHZmkz, 0 },
{ X86::VFMADDSUB132PSZ128rk, X86::VFMADDSUB132PSZ128mk, 0 },
{ X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 },
{ X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 },
@@ -4659,6 +4767,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 },
{ X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 },
{ X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 },
+ { X86::VFMADDSUB213PHZ128rk, X86::VFMADDSUB213PHZ128mk, 0 },
+ { X86::VFMADDSUB213PHZ128rkz, X86::VFMADDSUB213PHZ128mkz, 0 },
+ { X86::VFMADDSUB213PHZ256rk, X86::VFMADDSUB213PHZ256mk, 0 },
+ { X86::VFMADDSUB213PHZ256rkz, X86::VFMADDSUB213PHZ256mkz, 0 },
+ { X86::VFMADDSUB213PHZrk, X86::VFMADDSUB213PHZmk, 0 },
+ { X86::VFMADDSUB213PHZrkz, X86::VFMADDSUB213PHZmkz, 0 },
{ X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 },
{ X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 },
{ X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 },
@@ -4671,6 +4785,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 },
{ X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 },
{ X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 },
+ { X86::VFMADDSUB231PHZ128rk, X86::VFMADDSUB231PHZ128mk, 0 },
+ { X86::VFMADDSUB231PHZ128rkz, X86::VFMADDSUB231PHZ128mkz, 0 },
+ { X86::VFMADDSUB231PHZ256rk, X86::VFMADDSUB231PHZ256mk, 0 },
+ { X86::VFMADDSUB231PHZ256rkz, X86::VFMADDSUB231PHZ256mkz, 0 },
+ { X86::VFMADDSUB231PHZrk, X86::VFMADDSUB231PHZmk, 0 },
+ { X86::VFMADDSUB231PHZrkz, X86::VFMADDSUB231PHZmkz, 0 },
{ X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 },
{ X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 },
{ X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 },
@@ -4683,6 +4803,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 },
{ X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 },
{ X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 },
+ { X86::VFMSUB132PHZ128rk, X86::VFMSUB132PHZ128mk, 0 },
+ { X86::VFMSUB132PHZ128rkz, X86::VFMSUB132PHZ128mkz, 0 },
+ { X86::VFMSUB132PHZ256rk, X86::VFMSUB132PHZ256mk, 0 },
+ { X86::VFMSUB132PHZ256rkz, X86::VFMSUB132PHZ256mkz, 0 },
+ { X86::VFMSUB132PHZrk, X86::VFMSUB132PHZmk, 0 },
+ { X86::VFMSUB132PHZrkz, X86::VFMSUB132PHZmkz, 0 },
{ X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 },
{ X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 },
{ X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 },
@@ -4691,6 +4817,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 },
{ X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 },
@@ -4699,6 +4827,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 },
{ X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 },
{ X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 },
+ { X86::VFMSUB213PHZ128rk, X86::VFMSUB213PHZ128mk, 0 },
+ { X86::VFMSUB213PHZ128rkz, X86::VFMSUB213PHZ128mkz, 0 },
+ { X86::VFMSUB213PHZ256rk, X86::VFMSUB213PHZ256mk, 0 },
+ { X86::VFMSUB213PHZ256rkz, X86::VFMSUB213PHZ256mkz, 0 },
+ { X86::VFMSUB213PHZrk, X86::VFMSUB213PHZmk, 0 },
+ { X86::VFMSUB213PHZrkz, X86::VFMSUB213PHZmkz, 0 },
{ X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 },
{ X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 },
{ X86::VFMSUB213PSZ256rk, X86::VFMSUB213PSZ256mk, 0 },
@@ -4707,6 +4841,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 },
{ X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 },
@@ -4715,6 +4851,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 },
{ X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 },
{ X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 },
+ { X86::VFMSUB231PHZ128rk, X86::VFMSUB231PHZ128mk, 0 },
+ { X86::VFMSUB231PHZ128rkz, X86::VFMSUB231PHZ128mkz, 0 },
+ { X86::VFMSUB231PHZ256rk, X86::VFMSUB231PHZ256mk, 0 },
+ { X86::VFMSUB231PHZ256rkz, X86::VFMSUB231PHZ256mkz, 0 },
+ { X86::VFMSUB231PHZrk, X86::VFMSUB231PHZmk, 0 },
+ { X86::VFMSUB231PHZrkz, X86::VFMSUB231PHZmkz, 0 },
{ X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 },
{ X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 },
{ X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 },
@@ -4723,6 +4865,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 },
{ X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 },
@@ -4731,6 +4875,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 },
{ X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 },
{ X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 },
+ { X86::VFMSUBADD132PHZ128rk, X86::VFMSUBADD132PHZ128mk, 0 },
+ { X86::VFMSUBADD132PHZ128rkz, X86::VFMSUBADD132PHZ128mkz, 0 },
+ { X86::VFMSUBADD132PHZ256rk, X86::VFMSUBADD132PHZ256mk, 0 },
+ { X86::VFMSUBADD132PHZ256rkz, X86::VFMSUBADD132PHZ256mkz, 0 },
+ { X86::VFMSUBADD132PHZrk, X86::VFMSUBADD132PHZmk, 0 },
+ { X86::VFMSUBADD132PHZrkz, X86::VFMSUBADD132PHZmkz, 0 },
{ X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 },
{ X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 },
{ X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 },
@@ -4743,6 +4893,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 },
{ X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 },
{ X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 },
+ { X86::VFMSUBADD213PHZ128rk, X86::VFMSUBADD213PHZ128mk, 0 },
+ { X86::VFMSUBADD213PHZ128rkz, X86::VFMSUBADD213PHZ128mkz, 0 },
+ { X86::VFMSUBADD213PHZ256rk, X86::VFMSUBADD213PHZ256mk, 0 },
+ { X86::VFMSUBADD213PHZ256rkz, X86::VFMSUBADD213PHZ256mkz, 0 },
+ { X86::VFMSUBADD213PHZrk, X86::VFMSUBADD213PHZmk, 0 },
+ { X86::VFMSUBADD213PHZrkz, X86::VFMSUBADD213PHZmkz, 0 },
{ X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 },
{ X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 },
{ X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 },
@@ -4755,6 +4911,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 },
{ X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 },
{ X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 },
+ { X86::VFMSUBADD231PHZ128rk, X86::VFMSUBADD231PHZ128mk, 0 },
+ { X86::VFMSUBADD231PHZ128rkz, X86::VFMSUBADD231PHZ128mkz, 0 },
+ { X86::VFMSUBADD231PHZ256rk, X86::VFMSUBADD231PHZ256mk, 0 },
+ { X86::VFMSUBADD231PHZ256rkz, X86::VFMSUBADD231PHZ256mkz, 0 },
+ { X86::VFMSUBADD231PHZrk, X86::VFMSUBADD231PHZmk, 0 },
+ { X86::VFMSUBADD231PHZrkz, X86::VFMSUBADD231PHZmkz, 0 },
{ X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 },
{ X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 },
{ X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 },
@@ -4767,6 +4929,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 },
{ X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 },
{ X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 },
+ { X86::VFNMADD132PHZ128rk, X86::VFNMADD132PHZ128mk, 0 },
+ { X86::VFNMADD132PHZ128rkz, X86::VFNMADD132PHZ128mkz, 0 },
+ { X86::VFNMADD132PHZ256rk, X86::VFNMADD132PHZ256mk, 0 },
+ { X86::VFNMADD132PHZ256rkz, X86::VFNMADD132PHZ256mkz, 0 },
+ { X86::VFNMADD132PHZrk, X86::VFNMADD132PHZmk, 0 },
+ { X86::VFNMADD132PHZrkz, X86::VFNMADD132PHZmkz, 0 },
{ X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 },
{ X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 },
{ X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 },
@@ -4775,6 +4943,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 },
{ X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 },
@@ -4783,6 +4953,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 },
{ X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 },
{ X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 },
+ { X86::VFNMADD213PHZ128rk, X86::VFNMADD213PHZ128mk, 0 },
+ { X86::VFNMADD213PHZ128rkz, X86::VFNMADD213PHZ128mkz, 0 },
+ { X86::VFNMADD213PHZ256rk, X86::VFNMADD213PHZ256mk, 0 },
+ { X86::VFNMADD213PHZ256rkz, X86::VFNMADD213PHZ256mkz, 0 },
+ { X86::VFNMADD213PHZrk, X86::VFNMADD213PHZmk, 0 },
+ { X86::VFNMADD213PHZrkz, X86::VFNMADD213PHZmkz, 0 },
{ X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 },
{ X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 },
{ X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 },
@@ -4791,6 +4967,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 },
{ X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 },
@@ -4799,6 +4977,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 },
{ X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 },
{ X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 },
+ { X86::VFNMADD231PHZ128rk, X86::VFNMADD231PHZ128mk, 0 },
+ { X86::VFNMADD231PHZ128rkz, X86::VFNMADD231PHZ128mkz, 0 },
+ { X86::VFNMADD231PHZ256rk, X86::VFNMADD231PHZ256mk, 0 },
+ { X86::VFNMADD231PHZ256rkz, X86::VFNMADD231PHZ256mkz, 0 },
+ { X86::VFNMADD231PHZrk, X86::VFNMADD231PHZmk, 0 },
+ { X86::VFNMADD231PHZrkz, X86::VFNMADD231PHZmkz, 0 },
{ X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 },
{ X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 },
{ X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 },
@@ -4807,6 +4991,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 },
{ X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 },
@@ -4815,6 +5001,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 },
{ X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 },
{ X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 },
+ { X86::VFNMSUB132PHZ128rk, X86::VFNMSUB132PHZ128mk, 0 },
+ { X86::VFNMSUB132PHZ128rkz, X86::VFNMSUB132PHZ128mkz, 0 },
+ { X86::VFNMSUB132PHZ256rk, X86::VFNMSUB132PHZ256mk, 0 },
+ { X86::VFNMSUB132PHZ256rkz, X86::VFNMSUB132PHZ256mkz, 0 },
+ { X86::VFNMSUB132PHZrk, X86::VFNMSUB132PHZmk, 0 },
+ { X86::VFNMSUB132PHZrkz, X86::VFNMSUB132PHZmkz, 0 },
{ X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 },
{ X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 },
{ X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 },
@@ -4823,6 +5015,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 },
{ X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 },
@@ -4831,6 +5025,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 },
{ X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 },
{ X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 },
+ { X86::VFNMSUB213PHZ128rk, X86::VFNMSUB213PHZ128mk, 0 },
+ { X86::VFNMSUB213PHZ128rkz, X86::VFNMSUB213PHZ128mkz, 0 },
+ { X86::VFNMSUB213PHZ256rk, X86::VFNMSUB213PHZ256mk, 0 },
+ { X86::VFNMSUB213PHZ256rkz, X86::VFNMSUB213PHZ256mkz, 0 },
+ { X86::VFNMSUB213PHZrk, X86::VFNMSUB213PHZmk, 0 },
+ { X86::VFNMSUB213PHZrkz, X86::VFNMSUB213PHZmkz, 0 },
{ X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 },
{ X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 },
{ X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 },
@@ -4839,6 +5039,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 },
{ X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 },
@@ -4847,6 +5049,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 },
{ X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 },
{ X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 0 },
+ { X86::VFNMSUB231PHZ128rk, X86::VFNMSUB231PHZ128mk, 0 },
+ { X86::VFNMSUB231PHZ128rkz, X86::VFNMSUB231PHZ128mkz, 0 },
+ { X86::VFNMSUB231PHZ256rk, X86::VFNMSUB231PHZ256mk, 0 },
+ { X86::VFNMSUB231PHZ256rkz, X86::VFNMSUB231PHZ256mkz, 0 },
+ { X86::VFNMSUB231PHZrk, X86::VFNMSUB231PHZmk, 0 },
+ { X86::VFNMSUB231PHZrkz, X86::VFNMSUB231PHZmkz, 0 },
{ X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
{ X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
{ X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
@@ -4855,6 +5063,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
{ X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr_Intkz, X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
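
A hedged sketch of what the new fold-table entries make possible (not from the patch): when a later pass needs to fold a reload or memory operand, the register forms added above can be rewritten into their memory forms, e.g. a scalar half FMA taking one operand straight from memory.

declare half @llvm.fma.f16(half, half, half)

define half @fold_half_load(half %a, half %b, half* %p) {
  ; If %c is spilled or only available in memory, the fold tables let the
  ; load be folded into the memory form of the FMA (illustrative; the exact
  ; opcode chosen depends on register allocation).
  %c = load half, half* %p
  %r = call half @llvm.fma.f16(half %a, half %b, half %c)
  ret half %r
}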
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 6d6a2d6ad099a..1949c9c1a4fb6 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -882,7 +882,6 @@ class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
EVEX_4V, Requires<[HasAVX512]>;
-class AVX512FMA3Base : T8PD, EVEX_4V;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7f0e151b9eba2..baa8f5d7222d9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6137,6 +6137,24 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz:
case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz:
case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz:
+ case X86::VFMADD132SHZr_Int: case X86::VFNMADD132SHZr_Int:
+ case X86::VFMADD213SHZr_Int: case X86::VFNMADD213SHZr_Int:
+ case X86::VFMADD231SHZr_Int: case X86::VFNMADD231SHZr_Int:
+ case X86::VFMSUB132SHZr_Int: case X86::VFNMSUB132SHZr_Int:
+ case X86::VFMSUB213SHZr_Int: case X86::VFNMSUB213SHZr_Int:
+ case X86::VFMSUB231SHZr_Int: case X86::VFNMSUB231SHZr_Int:
+ case X86::VFMADD132SHZr_Intk: case X86::VFNMADD132SHZr_Intk:
+ case X86::VFMADD213SHZr_Intk: case X86::VFNMADD213SHZr_Intk:
+ case X86::VFMADD231SHZr_Intk: case X86::VFNMADD231SHZr_Intk:
+ case X86::VFMSUB132SHZr_Intk: case X86::VFNMSUB132SHZr_Intk:
+ case X86::VFMSUB213SHZr_Intk: case X86::VFNMSUB213SHZr_Intk:
+ case X86::VFMSUB231SHZr_Intk: case X86::VFNMSUB231SHZr_Intk:
+ case X86::VFMADD132SHZr_Intkz: case X86::VFNMADD132SHZr_Intkz:
+ case X86::VFMADD213SHZr_Intkz: case X86::VFNMADD213SHZr_Intkz:
+ case X86::VFMADD231SHZr_Intkz: case X86::VFNMADD231SHZr_Intkz:
+ case X86::VFMSUB132SHZr_Intkz: case X86::VFNMSUB132SHZr_Intkz:
+ case X86::VFMSUB213SHZr_Intkz: case X86::VFNMSUB213SHZr_Intkz:
+ case X86::VFMSUB231SHZr_Intkz: case X86::VFNMSUB231SHZr_Intkz:
return false;
default:
return true;
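The new cases teach isNonFoldablePartialRegisterLoad that the scalar FP16 FMA intrinsic forms only read the low half element, so a 16-bit vmovsh-style load feeding them may still be folded even though the destination register is wider. A hedged IR sketch of the kind of pattern this covers (assuming the usual extract/insert lowering of the scalar _mm_fmadd_sh-style intrinsics; the expected assembly is illustrative, not taken from the patch):

declare half @llvm.fma.f16(half, half, half)

; Sketch: the low elements of %a and %b are combined with a half loaded from
; memory, and the upper elements of %a pass through unchanged. With the
; whitelist above, the 2-byte load should fold into the scalar FMA, giving
; roughly: vfmadd213sh (%rdi), %xmm1, %xmm0.
define <8 x half> @sketch_fmadd_sh_mem(<8 x half> %a, <8 x half> %b, half* %cp) {
  %c0 = load half, half* %cp
  %a0 = extractelement <8 x half> %a, i32 0
  %b0 = extractelement <8 x half> %b, i32 0
  %f  = call half @llvm.fma.f16(half %a0, half %b0, half %c0)
  %r  = insertelement <8 x half> %a, half %f, i32 0
  ret <8 x half> %r
}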
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index efc4811084f94..b0e1e808a5369 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1187,6 +1187,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512fp16_vcvtusi642sh, INTR_TYPE_2OP,
X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmadd_f16, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmadd_ph_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_128, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
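The new IntrinsicData rows map the FP16 FMA intrinsics onto ISD nodes: the 512-bit forms carry an explicit rounding operand and switch to FMADD_RND / FMADDSUB_RND when it is not _MM_FROUND_CUR_DIRECTION, while the 128/256-bit fmaddsub forms have no rounding variant. A small IR sketch, reusing the declaration introduced by the new intrinsics test below (rounding values 4 and 11 are assumed to be the usual CUR_DIRECTION and {rz-sae} encodings):

declare <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)

; Rounding operand 4 (_MM_FROUND_CUR_DIRECTION) should lower to a plain
; vfmadd213ph; an embedded-rounding value such as 11 should select the
; {rz-sae} form instead.
define <32 x half> @sketch_fmadd_round(<32 x half> %a, <32 x half> %b, <32 x half> %c) {
  %plain = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 4)
  %rz    = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %plain, <32 x half> %b, <32 x half> %c, i32 11)
  ret <32 x half> %rz
}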
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll
new file mode 100644
index 0000000000000..3d03494273742
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll
@@ -0,0 +1,1363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
+
+declare half @llvm.fma.f16(half, half, half)
+declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
+declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
+declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
+
+define half @fma_123_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_123_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %x, half %y, half %z)
+ ret half %a
+}
+
+define half @fma_213_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_213_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %y, half %x, half %z)
+ ret half %a
+}
+
+define half @fma_231_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_231_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %y, half %z, half %x)
+ ret half %a
+}
+
+define half @fma_321_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_321_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %z, half %y, half %x)
+ ret half %a
+}
+
+define half @fma_132_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_132_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %x, half %z, half %y)
+ ret half %a
+}
+
+define half @fma_312_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_312_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fma.f16(half %z, half %x, half %y)
+ ret half %a
+}
+
+define half @fma_load_123_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_123_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %x, half %y, half %z)
+ ret half %a
+}
+
+define half @fma_load_213_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_213_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %y, half %x, half %z)
+ ret half %a
+}
+
+define half @fma_load_231_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_231_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %y, half %z, half %x)
+ ret half %a
+}
+
+define half @fma_load_321_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_321_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %z, half %y, half %x)
+ ret half %a
+}
+
+define half @fma_load_132_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_132_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %x, half %z, half %y)
+ ret half %a
+}
+
+define half @fma_load_312_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_312_f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load half, half* %zp
+ %a = call half @llvm.fma.f16(half %z, half %x, half %y)
+ ret half %a
+}
+
+define <8 x half> @fma_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ ret <8 x half> %a
+}
+
+define <8 x half> @fma_mask_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_mask_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
+; CHECK-LABEL: fma_maskz_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_mask_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_mask_load_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_123_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_213_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_231_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_321_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_132_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <8 x half> @fma_maskz_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) {
+; CHECK-LABEL: fma_maskz_load_312_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <8 x half>, <8 x half>* %zp
+ %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+ %b = bitcast i8 %mask to <8 x i1>
+ %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
+ ret <8 x half> %c
+}
+
+define <16 x half> @fma_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
+; CHECK-LABEL: fma_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) {
+; CHECK-LABEL: fma_load_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ ret <16 x half> %a
+}
+
+define <16 x half> @fma_mask_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %ymm1, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_mask_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
+; CHECK-LABEL: fma_maskz_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_mask_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_mask_load_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_123_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_213_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_231_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_321_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_132_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <16 x half> @fma_maskz_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) {
+; CHECK-LABEL: fma_maskz_load_312_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <16 x half>, <16 x half>* %zp
+ %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
+ %b = bitcast i16 %mask to <16 x i1>
+ %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
+ ret <16 x half> %c
+}
+
+define <32 x half> @fma_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
+; CHECK-LABEL: fma_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) {
+; CHECK-LABEL: fma_load_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ ret <32 x half> %a
+}
+
+define <32 x half> @fma_mask_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd132ph %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_mask_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
+; CHECK-LABEL: fma_maskz_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_mask_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_mask_load_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_123_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_213_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_231_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_321_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_132_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
+
+define <32 x half> @fma_maskz_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) {
+; CHECK-LABEL: fma_maskz_load_312_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %z = load <32 x half>, <32 x half>* %zp
+ %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
+ %b = bitcast i32 %mask to <32 x i1>
+ %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
+ ret <32 x half> %c
+}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
new file mode 100644
index 0000000000000..0729a9ee40857
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
@@ -0,0 +1,585 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+
+
+declare <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)
+
+define <32 x half> @test_x86_vfnmadd_ph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmadd_ph_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %a2)
+ ret <32 x half> %2
+}
+
+define <32 x half> @test_mask_vfnmadd_ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_vfnmadd_ph:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmadd_ph:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %a2)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @test_x86_vfnmsubph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmsubph_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xae,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @test_mask_vfnmsub_ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_vfnmsub_ph:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmsub_ph:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %2)
+ %4 = bitcast i32 %mask to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %a0
+ ret <32 x half> %5
+}
+
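The vfnmadd/vfnmsub tests above spell negation as an fsub from a vector of -0.0: negating one multiplicand of a plain @llvm.fma gives the fnmadd form, and negating the addend as well gives fnmsub. A minimal sketch of the same shape, written with fneg for brevity and assuming the backend folds that spelling of the negation the same way (names are illustrative):

define <32 x half> @fnmsub_sketch(<32 x half> %a, <32 x half> %b, <32 x half> %c) {
  ; negate one multiplicand (fnmadd shape) and also the addend (making it fnmsub)
  %nb = fneg <32 x half> %b
  %nc = fneg <32 x half> %c
  %r = call <32 x half> @llvm.fma.v32f16(<32 x half> %a, <32 x half> %nb, <32 x half> %nc)
  ret <32 x half> %r
}
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)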
+define <32 x half> @test_x86_vfmaddsubph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_x86_vfmaddsubph_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xa6,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) #2
+ ret <32 x half> %res
+}
+
+define <32 x half> @test_mask_fmaddsub_ph(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 %mask) {
+; X86-LABEL: test_mask_fmaddsub_ph:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_fmaddsub_ph:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 4)
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a
+ ret <32 x half> %sel
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32) nounwind readnone
+
+define <32 x half>@test_int_x86_avx512_mask_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfmaddsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfmaddsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4)
+ %bc = bitcast i32 %x3 to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x0
+ ret <32 x half> %sel
+}
+
+define <32 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb6,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb6,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4)
+ %bc = bitcast i32 %x3 to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x2
+ ret <32 x half> %sel
+}
+
+define <32 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa6,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa6,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4)
+ %bc = bitcast i32 %x3 to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> zeroinitializer
+ ret <32 x half> %sel
+}
+
+define <32 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsubadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb7,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsubadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb7,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %neg = fneg <32 x half> %x2
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %neg, i32 4)
+ %bc = bitcast i32 %x3 to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x2
+ ret <32 x half> %sel
+}
+
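In these tests the fmsubadd form is not a separate intrinsic: test_int_x86_avx512_mask3_vfmsubadd_ph_512 above reaches vfmsubadd231ph by feeding a negated addend to the fmaddsub intrinsic. A minimal sketch of that pattern (the function name is illustrative):

define <32 x half> @fmsubadd_sketch(<32 x half> %a, <32 x half> %b, <32 x half> %c) {
  ; negating the addend of the fmaddsub intrinsic yields the fmsubadd form
  %negc = fneg <32 x half> %c
  ; i32 4 requests the current (MXCSR) rounding mode, as in the tests above
  %r = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a, <32 x half> %b, <32 x half> %negc, i32 4)
  ret <32 x half> %r
}
declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)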
+define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rne(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rne:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x19,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rne:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x19,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 8) nounwind
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0
+ ret <32 x half> %sel
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtn(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtn:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x39,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtn:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x39,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 9) nounwind
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0
+ ret <32 x half> %sel
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtp(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtp:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x59,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtp:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x59,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 10) nounwind
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0
+ ret <32 x half> %sel
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtz:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x79,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtz:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x79,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 11) nounwind
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0
+ ret <32 x half> %sel
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrb_current(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_current:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_current:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) nounwind
+ %bc = bitcast i32 %mask to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0
+ ret <32 x half> %sel
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rne(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rne:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph {rn-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x18,0xa8,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 8) nounwind
+ ret <32 x half> %res
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtn(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rtn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph {rd-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x38,0xa8,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 9) nounwind
+ ret <32 x half> %res
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtp(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rtp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph {ru-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x58,0xa8,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 10) nounwind
+ ret <32 x half> %res
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rtz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph {rz-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x78,0xa8,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 11) nounwind
+ ret <32 x half> %res
+}
+
+define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_current(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_current:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xa8,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) nounwind
+ ret <32 x half> %res
+}
+
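The rounding-mode variants above differ only in the trailing i32 operand of the intrinsic. Judging from the encodings they check, 8 selects {rn-sae}, 9 {rd-sae}, 10 {ru-sae}, 11 {rz-sae}, and 4 the current (MXCSR) rounding mode with no embedded rounding. A minimal sketch (the function name is illustrative):

define <32 x half> @fmadd_rz_sketch(<32 x half> %a, <32 x half> %b, <32 x half> %c) {
  ; i32 11 requests {rz-sae}: round toward zero, suppress all exceptions
  %r = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 11)
  ret <32 x half> %r
}
declare <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)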
+define <32 x half>@test_int_x86_avx512_mask3_vfmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xba,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xba,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %1)
+ %3 = bitcast i32 %x3 to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %x2
+ ret <32 x half> %4
+}
+
+define <32 x half>@test_int_x86_avx512_mask_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4)
+ %bc = bitcast i32 %x3 to <32 x i1>
+ %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x0
+ ret <32 x half> %sel
+}
+
+define <32 x half>@test_int_x86_avx512_mask3_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb8,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb8,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2)
+ %2 = bitcast i32 %x3 to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %x2
+ ret <32 x half> %3
+}
+
+define <32 x half> @test_int_x86_avx512_maskz_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa8,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa8,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2)
+ %2 = bitcast i32 %x3 to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer
+ ret <32 x half> %3
+}
+
+define <32 x half>@test_int_x86_avx512_mask_vfnmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %1, <32 x half> %2)
+ %4 = bitcast i32 %x3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %x0
+ ret <32 x half> %5
+}
+
+define <32 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xbe,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xbe,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x0
+ %2 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %x1, <32 x half> %2)
+ %4 = bitcast i32 %x3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %x2
+ ret <32 x half> %5
+}
+
+define <32 x half>@test_int_x86_avx512_mask_vfnmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_512:
+; X86: # %bb.0:
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %1, <32 x half> %x2)
+ %3 = bitcast i32 %x3 to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %x0
+ ret <32 x half> %4
+}
+
+define <32 x half> @test_x86_fma_vfnmadd_ph_512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %a1, <32 x half> %a2)
+ ret <32 x half> %2
+}
+
+define <32 x half> @test_x86_fma_vfnmsub_ph_512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xae,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = fsub <32 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %a1, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, half *%ptr_b, i8 %x3, i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT: vfmadd231sh (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x08]
+; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT: vfmadd231sh (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x0f]
+; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %q = load half, half* %ptr_b
+ %vecinit.i = insertelement <8 x half> undef, half %q, i32 0
+ %1 = extractelement <8 x half> %x0, i64 0
+ %2 = extractelement <8 x half> %vecinit.i, i64 0
+ %3 = extractelement <8 x half> %x1, i64 0
+ %4 = call half @llvm.fma.f16(half %1, half %2, half %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, half %4, half %3
+ %8 = insertelement <8 x half> %x1, half %7, i64 0
+ ret <8 x half> %8
+}
+
+define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3, i32 %x4 ){
+; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sh:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sh:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = extractelement <8 x half> %x0, i64 0
+ %2 = extractelement <8 x half> %x1, i64 0
+ %3 = extractelement <8 x half> %x2, i64 0
+ %4 = call half @llvm.fma.f16(half %1, half %2, half %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, half %4, half 0.000000e+00
+ %8 = insertelement <8 x half> %x0, half %7, i64 0
+ %9 = extractelement <8 x half> %x0, i64 0
+ %10 = extractelement <8 x half> %x1, i64 0
+ %11 = extractelement <8 x half> %x2, i64 0
+ %12 = call half @llvm.x86.avx512fp16.vfmadd.f16(half %9, half %10, half %11, i32 3)
+ %13 = bitcast i8 %x3 to <8 x i1>
+ %14 = extractelement <8 x i1> %13, i64 0
+ %15 = select i1 %14, half %12, half 0.000000e+00
+ %16 = insertelement <8 x half> %x0, half %15, i64 0
+ %res2 = fadd <8 x half> %8, %16
+ ret <8 x half> %8
+}
+
+define void @fmadd_sh_mask_memfold(half* %a, half* %b, i8 %c) {
+; X86-LABEL: fmadd_sh_mask_memfold:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT: vmovsh (%ecx), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01]
+; X86-NEXT: vmovsh (%eax), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08]
+; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8]
+; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1]
+; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: fmadd_sh_mask_memfold:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh (%rdi), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x07]
+; X64-NEXT: vmovsh (%rsi), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x0e]
+; X64-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8]
+; X64-NEXT: kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
+; X64-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1]
+; X64-NEXT: vmovsh %xmm0, (%rdi) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %a.val = load half, half* %a
+ %av0 = insertelement <8 x half> undef, half %a.val, i32 0
+ %av1 = insertelement <8 x half> %av0, half 0.000000e+00, i32 1
+ %av2 = insertelement <8 x half> %av1, half 0.000000e+00, i32 2
+ %av3 = insertelement <8 x half> %av2, half 0.000000e+00, i32 3
+ %av4 = insertelement <8 x half> %av3, half 0.000000e+00, i32 4
+ %av5 = insertelement <8 x half> %av4, half 0.000000e+00, i32 5
+ %av6 = insertelement <8 x half> %av5, half 0.000000e+00, i32 6
+ %av = insertelement <8 x half> %av6, half 0.000000e+00, i32 7
+
+ %b.val = load half, half* %b
+ %bv0 = insertelement <8 x half> undef, half %b.val, i32 0
+ %bv1 = insertelement <8 x half> %bv0, half 0.000000e+00, i32 1
+ %bv2 = insertelement <8 x half> %bv1, half 0.000000e+00, i32 2
+ %bv3 = insertelement <8 x half> %bv2, half 0.000000e+00, i32 3
+ %bv4 = insertelement <8 x half> %bv3, half 0.000000e+00, i32 4
+ %bv5 = insertelement <8 x half> %bv4, half 0.000000e+00, i32 5
+ %bv6 = insertelement <8 x half> %bv5, half 0.000000e+00, i32 6
+ %bv = insertelement <8 x half> %bv6, half 0.000000e+00, i32 7
+ %1 = extractelement <8 x half> %av, i64 0
+ %2 = extractelement <8 x half> %bv, i64 0
+ %3 = extractelement <8 x half> %av, i64 0
+ %4 = call half @llvm.fma.f16(half %1, half %2, half %3)
+ %5 = bitcast i8 %c to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, half %4, half %1
+ %8 = insertelement <8 x half> %av, half %7, i64 0
+ %sr = extractelement <8 x half> %8, i32 0
+ store half %sr, half* %a
+ ret void
+}
+
+declare half @llvm.fma.f16(half, half, half)
+declare half @llvm.x86.avx512fp16.vfmadd.f16(half, half, half, i32)
+
+declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
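The scalar _sh tests above all follow one shape: compute the FMA on lane 0 only, use bit 0 of the i8 mask to choose between the result and the fallback value, and insert the chosen value back into lane 0 of the destination. A minimal sketch of the zero-masked variant (the function name is illustrative):

define <8 x half> @fmadd_sh_maskz_sketch(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %mask) {
  ; scalar FMA touches only lane 0; the upper lanes pass through from %x0
  %a = extractelement <8 x half> %x0, i64 0
  %b = extractelement <8 x half> %x1, i64 0
  %c = extractelement <8 x half> %x2, i64 0
  %r = call half @llvm.fma.f16(half %a, half %b, half %c)
  ; only bit 0 of the i8 mask matters for the scalar form
  %mv = bitcast i8 %mask to <8 x i1>
  %m0 = extractelement <8 x i1> %mv, i64 0
  ; zero-masking: lane 0 becomes 0.0 when the bit is clear
  %sel = select i1 %m0, half %r, half 0.000000e+00
  %res = insertelement <8 x half> %x0, half %sel, i64 0
  ret <8 x half> %res
}
declare half @llvm.fma.f16(half, half, half)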
diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll
new file mode 100644
index 0000000000000..237c9aa0309a5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll
@@ -0,0 +1,530 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+
+
+define <16 x half> @test_x86_vfnmadd_ph_z_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmadd_ph_z_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %a2)
+ ret <16 x half> %2
+}
+
+define <16 x half> @test_mask_vfnmadd_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+; X86-LABEL: test_mask_vfnmadd_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmadd_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %a2)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @test_x86_vfnmsubph_z_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmsubph_z_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xae,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @test_mask_vfnmsub_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+; X86-LABEL: test_mask_vfnmsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %2)
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %a0
+ ret <16 x half> %5
+}
+
+define <16 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb6,0xd1]
+; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb6,0xd1]
+; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2)
+ %bc = bitcast i16 %x3 to <16 x i1>
+ %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> %x2
+ ret <16 x half> %sel
+}
+declare <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half>, <16 x half>, <16 x half>)
+
+define <16 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa6,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa6,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2)
+ %bc = bitcast i16 %x3 to <16 x i1>
+ %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> zeroinitializer
+ ret <16 x half> %sel
+}
+
+define <16 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsubadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb7,0xd1]
+; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsubadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb7,0xd1]
+; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %neg = fneg <16 x half> %x2
+ %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %neg)
+ %bc = bitcast i16 %x3 to <16 x i1>
+ %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> %x2
+ ret <16 x half> %sel
+}
+
+define <16 x half>@test_int_x86_avx512_mask3_vfmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xba,0xd1]
+; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xba,0xd1]
+; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %1)
+ %3 = bitcast i16 %x3 to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %x2
+ ret <16 x half> %4
+}
+
+define <16 x half>@test_int_x86_avx512_mask3_vfmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb8,0xd1]
+; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb8,0xd1]
+; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> %x2
+ ret <16 x half> %3
+}
+
+define <16 x half> @test_int_x86_avx512_maskz_vfmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa8,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa8,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> zeroinitializer
+ ret <16 x half> %3
+}
+
+define <16 x half>@test_int_x86_avx512_mask_vfnmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %1, <16 x half> %2)
+ %4 = bitcast i16 %x3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %x0
+ ret <16 x half> %5
+}
+
+define <16 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xbe,0xd1]
+; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xbe,0xd1]
+; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x0
+ %2 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %x1, <16 x half> %2)
+ %4 = bitcast i16 %x3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %x2
+ ret <16 x half> %5
+}
+
+define <16 x half>@test_int_x86_avx512_mask_vfnmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_256:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %1, <16 x half> %x2)
+ %3 = bitcast i16 %x3 to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %x0
+ ret <16 x half> %4
+}
+
+define <16 x half> @test_x86_fma_vfnmadd_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %a1, <16 x half> %a2)
+ ret <16 x half> %2
+}
+
+define <16 x half> @test_x86_fma_vfnmsub_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xae,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = fsub <16 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %a1, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
+
+define <8 x half> @test_x86_vfnmadd_ph_z_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmadd_ph_z_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %a2)
+ ret <8 x half> %2
+}
+
+define <8 x half> @test_mask_vfnmadd_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+; X86-LABEL: test_mask_vfnmadd_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmadd_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @test_x86_vfnmsubph_z_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+; CHECK-LABEL: test_x86_vfnmsubph_z_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xae,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @test_mask_vfnmsub_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+; X86-LABEL: test_mask_vfnmsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vfnmsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a1
+ %2 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %a0
+ ret <8 x half> %5
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb6,0xd1]
+; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb6,0xd1]
+; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2)
+ %bc = bitcast i8 %x3 to <8 x i1>
+ %sel = select <8 x i1> %bc, <8 x half> %res, <8 x half> %x2
+ ret <8 x half> %sel
+}
+declare <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half>, <8 x half>, <8 x half>)
+
+define <8 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmaddsub213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa6,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmaddsub213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa6,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2)
+ %bc = bitcast i8 %x3 to <8 x i1>
+ %sel = select <8 x i1> %bc, <8 x half> %res, <8 x half> zeroinitializer
+ ret <8 x half> %sel
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsubadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb7,0xd1]
+; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsubadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb7,0xd1]
+; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %neg = fneg <8 x half> %x2
+ %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %neg)
+ %bc = bitcast i8 %x3 to <8 x i1>
+ %sel = select <8 x i1> %bc, <8 x half> %res, <8 x half> %x2
+ ret <8 x half> %sel
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xba,0xd1]
+; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xba,0xd1]
+; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %1)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %x2
+ ret <8 x half> %4
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb8,0xd1]
+; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb8,0xd1]
+; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> %x2
+ ret <8 x half> %3
+}
+
+define <8 x half> @test_int_x86_avx512_maskz_vfmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa8,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa8,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> zeroinitializer
+ ret <8 x half> %3
+}
+
+define <8 x half>@test_int_x86_avx512_mask_vfnmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %1, <8 x half> %2)
+ %4 = bitcast i8 %x3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %x0
+ ret <8 x half> %5
+}
+
+define <8 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xbe,0xd1]
+; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xbe,0xd1]
+; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x0
+ %2 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %x1, <8 x half> %2)
+ %4 = bitcast i8 %x3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %x2
+ ret <8 x half> %5
+}
+
+define <8 x half>@test_int_x86_avx512_mask_vfnmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){
+; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_128:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %x1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %1, <8 x half> %x2)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %x0
+ ret <8 x half> %4
+}
+
+define <8 x half> @test_x86_fma_vfnmadd_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmadd213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xac,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %a1, <8 x half> %a2)
+ ret <8 x half> %2
+}
+
+define <8 x half> @test_x86_fma_vfnmsub_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsub213ph %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xae,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %1 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %2 = fsub <8 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %a0
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %a1, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index e7e52f153bc35..cb334aba458f7 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -11,6 +11,7 @@ declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
define half @fadd_f16(half %a, half %b) nounwind strictfp {
; X86-LABEL: fadd_f16:
@@ -197,4 +198,22 @@ define void @fsqrt_f16(half* %a) nounwind strictfp {
ret void
}
+define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
+; X86-LABEL: fma_f16:
+; X86: # %bb.0:
+; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: fma_f16:
+; X64: # %bb.0:
+; X64-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %c,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
new file mode 100644
index 0000000000000..a0cc87f87db8f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
@@ -0,0 +1,2526 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with side effects, we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
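Each function below instantiates the same pattern: the inline asm nop with its long clobber list forces one FMA operand to be spilled and reloaded from the stack, and the masked variants then express AVX-512 merge- or zero-masking as a bitcast of the integer mask followed by a select. A minimal sketch of that masking idiom (illustration only, not part of the patch; the function name is hypothetical):

declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

define <32 x half> @sketch_masked_fmadd(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
  ; plain FMA on all 32 half elements
  %fma = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  ; reinterpret the 32-bit integer mask as one predicate bit per element
  %m = bitcast i32 %mask to <32 x i1>
  ; merge-masking: disabled lanes keep the value of the first source operand
  %res = select <32 x i1> %m, <32 x half> %fma, <32 x half> %a0
  ret <32 x half> %res
}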
+define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
+
+define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_mask:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_mask:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_mask:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_mask:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_mask:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_mask:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_maskz:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_maskz:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_maskz:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_maskz:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_maskz:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_maskz:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_mask:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_mask:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_mask:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_mask:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_mask:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_mask:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_maskz:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_maskz:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_maskz:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_maskz:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_maskz:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_maskz:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fnmadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_mask:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_mask:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_mask:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_mask:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_mask:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_mask:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = fneg <32 x half> %a2
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = fneg <32 x half> %a2
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = fneg <32 x half> %a0
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = fneg <32 x half> %a0
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = fneg <32 x half> %a1
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = fneg <32 x half> %a1
+ %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_mask:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %neg1 = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_mask:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %neg1 = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_mask:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %neg1 = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_mask:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %neg1 = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_mask:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %neg1 = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_mask:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %neg1 = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %neg1 = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %neg1 = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %neg1 = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %neg1 = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %neg1 = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %neg1 = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
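+; Scalar half-precision (sh) variants: the same fmadd/fmsub/fnmadd/fnmsub
+; operand-order permutations as the packed ph tests above, but on a single
+; half value, so the folded spill reload is 2 bytes.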
+define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd123sh:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
+ ret half %2
+}
+declare half @llvm.fma.f16(half, half, half)
+
+define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd213sh:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
+ ret half %2
+}
+
+define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd231sh:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
+ ret half %2
+}
+
+define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd321sh:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
+ ret half %2
+}
+
+define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd132sh:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
+ ret half %2
+}
+
+define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd312sh:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
+ ret half %2
+}
+
+define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub123sh:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub213sh:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub231sh:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub321sh:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub132sh:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub312sh:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd123sh:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd213sh:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd231sh:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd321sh:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd132sh:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
+ ret half %3
+}
+
+define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd312sh:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
+ ret half %3
+}
+
+define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub123sh:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = fneg half %a2
+ %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
+ ret half %4
+}
+
+define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub213sh:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = fneg half %a2
+ %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
+ ret half %4
+}
+
+define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub231sh:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a1
+ %3 = fneg half %a0
+ %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
+ ret half %4
+}
+
+define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub321sh:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = fneg half %a0
+ %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
+ ret half %4
+}
+
+define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub132sh:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a0
+ %3 = fneg half %a1
+ %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
+ ret half %4
+}
+
+define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub312sh:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg half %a2
+ %3 = fneg half %a1
+ %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
+ ret half %4
+}
+
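+; _int variants: the scalar FMA is applied to element 0 of the <8 x half>
+; arguments and the result is reinserted into %a0v, so the folded reload
+; covers the full 16-byte vector.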
+define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd123sh_int:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd213sh_int:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd231sh_int:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd321sh_int:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd132sh_int:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmadd312sh_int:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub123sh_int:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub213sh_int:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub231sh_int:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub321sh_int:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub132sh_int:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fmsub312sh_int:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd123sh_int:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd213sh_int:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd231sh_int:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd321sh_int:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd132sh_int:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmadd312sh_int:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub123sh_int:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub213sh_int:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub231sh_int:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub321sh_int:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub132sh_int:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+ ;CHECK-LABEL: stack_fold_fnmsub312sh_int:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %res = insertelement <8 x half> %a0v, half %2, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123sh_intk:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213sh_intk:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231sh_intk:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321sh_intk:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132sh_intk:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312sh_intk:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123sh_intk:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213sh_intk:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231sh_intk:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321sh_intk:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132sh_intk:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312sh_intk:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123sh_intk:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213sh_intk:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231sh_intk:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321sh_intk:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132sh_intk:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312sh_intk:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123sh_intk:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213sh_intk:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231sh_intk:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321sh_intk:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132sh_intk:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312sh_intk:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half %a0
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123sh_intkz:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213sh_intkz:
+ ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231sh_intkz:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321sh_intkz:
+ ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132sh_intkz:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312sh_intkz:
+ ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123sh_intkz:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213sh_intkz:
+ ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231sh_intkz:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321sh_intkz:
+ ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132sh_intkz:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312sh_intkz:
+ ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
+ ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
+ ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
+ ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
+ ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a2
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a1
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
+ ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a0
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a0
+ %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
+ ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = extractelement <8 x half> %a0v, i64 0
+ %a1 = extractelement <8 x half> %a1v, i64 0
+ %a2 = extractelement <8 x half> %a2v, i64 0
+ %neg = fneg half %a1
+ %neg1 = fneg half %a2
+ %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = extractelement <8 x i1> %4, i64 0
+ %6 = select i1 %5, half %2, half zeroinitializer
+ %res = insertelement <8 x half> %a0v, half %6, i64 0
+ ret <8 x half> %res
+}
+
+define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub123ph:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)
+
+define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub213ph:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub231ph:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub321ph:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub132ph:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddsub312ph:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
+ ret <32 x half> %2
+}
+
+define <32 x half> @stack_fold_fmaddsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
+ ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
+ ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
+ ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd123ph:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd213ph:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a2
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd231ph:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd321ph:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a0
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd132ph:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsubadd312ph:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <32 x half> %a1
+ %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_fmsubadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <32 x half>, <32 x half>* %p
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
+ ret <32 x half> %4
+}
+
+define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
+ ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a2
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
+ ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a0
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
+ ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <32 x half> %a1
+ %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
+ %3 = load i32, i32* %mask
+ %4 = bitcast i32 %3 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
new file mode 100644
index 0000000000000..0415d47009d5a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
@@ -0,0 +1,1595 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+ ; By including a nop call with side effects, we can force a partial register spill of the
+ ; relevant registers and check that the reload is correctly folded into the instruction.
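+ ;
+ ; Concretely, each test issues an inline asm "nop" whose "=x" result and clobbers of
+ ; xmm3-xmm31 (plus xmm2 in the masked variants) leave too few registers for all of the
+ ; FMA operands to stay live across the call, so one operand is spilled; the CHECK lines
+ ; then verify that its reload is folded into the memory form of the FMA instruction.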
+
+define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
+
+define <8 x half> @stack_fold_fmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
+ ret <8 x half> %2
+}
+
+define <8 x half> @stack_fold_fmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
+ ret <8 x half> %2
+}
+
+define <8 x half> @stack_fold_fmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
+ ret <8 x half> %2
+}
+
+define <8 x half> @stack_fold_fmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
+ ret <8 x half> %2
+}
+
+define <8 x half> @stack_fold_fmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
+ ret <8 x half> %2
+}
+
+define <8 x half> @stack_fold_fmadd123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_mask:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_mask:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_mask:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_mask:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_mask:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_mask:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_maskz:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_maskz:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_maskz:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_maskz:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_maskz:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_maskz:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fmsub123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_mask:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_mask:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_mask:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_mask:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_mask:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_mask:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_maskz:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_maskz:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_maskz:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_maskz:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_maskz:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_maskz:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a2)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a0)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a0)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a1)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a1)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_fnmadd123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_mask:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_mask:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_mask:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_mask:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_mask:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_mask:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = fneg <8 x half> %a2
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = fneg <8 x half> %a2
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a1
+ %3 = fneg <8 x half> %a0
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = fneg <8 x half> %a0
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a0
+ %3 = fneg <8 x half> %a1
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <8 x half> %a2
+ %3 = fneg <8 x half> %a1
+ %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3)
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_mask:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %neg1 = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_mask:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a2
+ %neg1 = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_mask:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %neg1 = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_mask:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a0
+ %neg1 = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_mask:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %neg1 = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_mask:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x half>, <8 x half>* %p
+ %neg = fneg <8 x half> %a1
+ %neg1 = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
+ ret <8 x half> %4
+}
+
+define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %neg1 = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a2
+ %neg1 = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %neg1 = fneg <8 x half> %a1
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a0
+ %neg1 = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %neg1 = fneg <8 x half> %a0
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <8 x half> %a1
+ %neg1 = fneg <8 x half> %a2
+ %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
+ %3 = load i8, i8* %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
+
+define <16 x half> @stack_fold_fmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
+ ret <16 x half> %2
+}
+
+define <16 x half> @stack_fold_fmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
+ ret <16 x half> %2
+}
+
+define <16 x half> @stack_fold_fmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
+ ret <16 x half> %2
+}
+
+define <16 x half> @stack_fold_fmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
+ ret <16 x half> %2
+}
+
+define <16 x half> @stack_fold_fmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
+ ret <16 x half> %2
+}
+
+define <16 x half> @stack_fold_fmadd123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm:
+ ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm:
+ ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm:
+ ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fmsub123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm:
+ ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm:
+ ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm:
+ ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a2)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a0)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a0)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a1)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a1)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm:
+ ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm:
+ ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm:
+ ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = fneg <16 x half> %a2
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = fneg <16 x half> %a2
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a1
+ %3 = fneg <16 x half> %a0
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = fneg <16 x half> %a0
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a0
+ %3 = fneg <16 x half> %a1
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fneg <16 x half> %a2
+ %3 = fneg <16 x half> %a1
+ %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3)
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %neg1 = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a2
+ %neg1 = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %neg1 = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a0
+ %neg1 = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %neg1 = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x half>, <16 x half>* %p
+ %neg = fneg <16 x half> %a1
+ %neg1 = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
+ ret <16 x half> %4
+}
+
+define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %neg1 = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm:
+ ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a2
+ %neg1 = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %neg1 = fneg <16 x half> %a1
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm:
+ ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a0
+ %neg1 = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %neg1 = fneg <16 x half> %a0
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_fnmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz_ymm:
+ ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %neg = fneg <16 x half> %a1
+ %neg1 = fneg <16 x half> %a2
+ %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
+ %3 = load i16, i16* %mask
+ %4 = bitcast i16 %3 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index f73742947b5cc..2c3d7ceb37d03 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -17,6 +17,7 @@ declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata)
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata)
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata)
+declare <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half>, <8 x half>, <8 x half>, metadata, metadata)
define <8 x half> @f2(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-LABEL: f2:
@@ -101,6 +102,17 @@ define <2 x double> @f12(<2 x double> %a0, <8 x half> %a1) #0 {
ret <2 x double> %res
}
+define <8 x half> @f13(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <8 x half> %res
+}
+
define <2 x double> @f15(<2 x half> %a) #0 {
; CHECK-LABEL: f15:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
index d5868287823fb..a2e02508327c8 100644
--- a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
@@ -7,6 +7,7 @@ declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16
declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.sqrt.v16f16(<16 x half>, metadata, metadata)
+declare <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half>, <16 x half>, <16 x half>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata)
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata)
declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64(<4 x double>, metadata, metadata)
@@ -98,6 +99,17 @@ define <4 x half> @f12(<4 x double> %a) #0 {
ret <4 x half> %ret
}
+define <16 x half> @f13(<16 x half> %a, <16 x half> %b, <16 x half> %c) #0 {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
define <8 x float> @f14(<8 x half> %a) #0 {
; CHECK-LABEL: f14:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
index 6273a525b15d6..dfbc11a43d3d7 100644
--- a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
@@ -11,6 +11,7 @@ declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata)
declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64(<8 x double>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32(<16 x float>, metadata, metadata)
+declare <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half>, <32 x half>, <32 x half>, metadata, metadata)
declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, metadata)
declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, metadata)
declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, metadata)
@@ -97,6 +98,17 @@ define <8 x half> @f12(<8 x double> %a) #0 {
ret <8 x half> %ret
}
+define <32 x half> @f13(<32 x half> %a, <32 x half> %b, <32 x half> %c) #0 {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
define <16 x float> @f14(<16 x half> %a) #0 {
; CHECK-LABEL: f14:
; CHECK: # %bb.0:
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
index 67514e50b1e12..ca1772175f95d 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
@@ -1764,3 +1764,723 @@
# ATT: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
# INTEL: vsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
0x62,0x65,0x16,0x87,0x51,0x72,0x80
+
+# ATT: vfmadd132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x98,0xf4
+
+# ATT: vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x98,0xf4
+
+# ATT: vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmadd132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x98,0x31
+
+# ATT: vfmadd132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x98,0x71,0x7f
+
+# ATT: vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x98,0x72,0x80
+
+# ATT: vfmadd132sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd132sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x99,0xf4
+
+# ATT: vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd132sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x99,0xf4
+
+# ATT: vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmadd132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x99,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd132sh (%r9), %xmm29, %xmm30
+# INTEL: vfmadd132sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x99,0x31
+
+# ATT: vfmadd132sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmadd132sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x99,0x71,0x7f
+
+# ATT: vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmadd132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x99,0x72,0x80
+
+# ATT: vfmadd213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xa8,0xf4
+
+# ATT: vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xa8,0xf4
+
+# ATT: vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmadd213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xa8,0x31
+
+# ATT: vfmadd213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xa8,0x71,0x7f
+
+# ATT: vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xa8,0x72,0x80
+
+# ATT: vfmadd213sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd213sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xa9,0xf4
+
+# ATT: vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd213sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xa9,0xf4
+
+# ATT: vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmadd213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xa9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd213sh (%r9), %xmm29, %xmm30
+# INTEL: vfmadd213sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xa9,0x31
+
+# ATT: vfmadd213sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmadd213sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xa9,0x71,0x7f
+
+# ATT: vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmadd213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xa9,0x72,0x80
+
+# ATT: vfmadd231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xb8,0xf4
+
+# ATT: vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmadd231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xb8,0xf4
+
+# ATT: vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmadd231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xb8,0x31
+
+# ATT: vfmadd231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xb8,0x71,0x7f
+
+# ATT: vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xb8,0x72,0x80
+
+# ATT: vfmadd231sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd231sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xb9,0xf4
+
+# ATT: vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmadd231sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xb9,0xf4
+
+# ATT: vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmadd231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xb9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd231sh (%r9), %xmm29, %xmm30
+# INTEL: vfmadd231sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xb9,0x31
+
+# ATT: vfmadd231sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmadd231sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xb9,0x71,0x7f
+
+# ATT: vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmadd231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xb9,0x72,0x80
+
+# ATT: vfmaddsub132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x96,0xf4
+
+# ATT: vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x96,0xf4
+
+# ATT: vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmaddsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmaddsub132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x96,0x31
+
+# ATT: vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmaddsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x96,0x71,0x7f
+
+# ATT: vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmaddsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x96,0x72,0x80
+
+# ATT: vfmaddsub213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xa6,0xf4
+
+# ATT: vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xa6,0xf4
+
+# ATT: vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmaddsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmaddsub213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xa6,0x31
+
+# ATT: vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmaddsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xa6,0x71,0x7f
+
+# ATT: vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmaddsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xa6,0x72,0x80
+
+# ATT: vfmaddsub231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xb6,0xf4
+
+# ATT: vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddsub231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xb6,0xf4
+
+# ATT: vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmaddsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmaddsub231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xb6,0x31
+
+# ATT: vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmaddsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xb6,0x71,0x7f
+
+# ATT: vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmaddsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xb6,0x72,0x80
+
+# ATT: vfmsub132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x9a,0xf4
+
+# ATT: vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9a,0xf4
+
+# ATT: vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsub132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x9a,0x31
+
+# ATT: vfmsub132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x9a,0x71,0x7f
+
+# ATT: vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x9a,0x72,0x80
+
+# ATT: vfmsub132sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub132sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x9b,0xf4
+
+# ATT: vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub132sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9b,0xf4
+
+# ATT: vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmsub132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x9b,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub132sh (%r9), %xmm29, %xmm30
+# INTEL: vfmsub132sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x9b,0x31
+
+# ATT: vfmsub132sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmsub132sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x9b,0x71,0x7f
+
+# ATT: vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmsub132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x9b,0x72,0x80
+
+# ATT: vfmsub213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xaa,0xf4
+
+# ATT: vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xaa,0xf4
+
+# ATT: vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsub213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xaa,0x31
+
+# ATT: vfmsub213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xaa,0x71,0x7f
+
+# ATT: vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xaa,0x72,0x80
+
+# ATT: vfmsub213sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub213sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xab,0xf4
+
+# ATT: vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub213sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xab,0xf4
+
+# ATT: vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmsub213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xab,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub213sh (%r9), %xmm29, %xmm30
+# INTEL: vfmsub213sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xab,0x31
+
+# ATT: vfmsub213sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmsub213sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xab,0x71,0x7f
+
+# ATT: vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmsub213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xab,0x72,0x80
+
+# ATT: vfmsub231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xba,0xf4
+
+# ATT: vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsub231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xba,0xf4
+
+# ATT: vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsub231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xba,0x31
+
+# ATT: vfmsub231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xba,0x71,0x7f
+
+# ATT: vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xba,0x72,0x80
+
+# ATT: vfmsub231sh %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub231sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xbb,0xf4
+
+# ATT: vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmsub231sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xbb,0xf4
+
+# ATT: vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmsub231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xbb,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub231sh (%r9), %xmm29, %xmm30
+# INTEL: vfmsub231sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xbb,0x31
+
+# ATT: vfmsub231sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfmsub231sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xbb,0x71,0x7f
+
+# ATT: vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xbb,0x72,0x80
+
+# ATT: vfmsubadd132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x97,0xf4
+
+# ATT: vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x97,0xf4
+
+# ATT: vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsubadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsubadd132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x97,0x31
+
+# ATT: vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsubadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x97,0x71,0x7f
+
+# ATT: vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsubadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x97,0x72,0x80
+
+# ATT: vfmsubadd213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xa7,0xf4
+
+# ATT: vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xa7,0xf4
+
+# ATT: vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsubadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsubadd213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xa7,0x31
+
+# ATT: vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsubadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xa7,0x71,0x7f
+
+# ATT: vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsubadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xa7,0x72,0x80
+
+# ATT: vfmsubadd231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xb7,0xf4
+
+# ATT: vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmsubadd231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xb7,0xf4
+
+# ATT: vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmsubadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfmsubadd231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xb7,0x31
+
+# ATT: vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmsubadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xb7,0x71,0x7f
+
+# ATT: vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmsubadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xb7,0x72,0x80
+
+# ATT: vfnmadd132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x9c,0xf4
+
+# ATT: vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9c,0xf4
+
+# ATT: vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmadd132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x9c,0x31
+
+# ATT: vfnmadd132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x9c,0x71,0x7f
+
+# ATT: vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x9c,0x72,0x80
+
+# ATT: vfnmadd132sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd132sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x9d,0xf4
+
+# ATT: vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd132sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9d,0xf4
+
+# ATT: vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmadd132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x9d,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd132sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmadd132sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x9d,0x31
+
+# ATT: vfnmadd132sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmadd132sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x9d,0x71,0x7f
+
+# ATT: vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmadd132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x9d,0x72,0x80
+
+# ATT: vfnmadd213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xac,0xf4
+
+# ATT: vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xac,0xf4
+
+# ATT: vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmadd213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xac,0x31
+
+# ATT: vfnmadd213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xac,0x71,0x7f
+
+# ATT: vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xac,0x72,0x80
+
+# ATT: vfnmadd213sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd213sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xad,0xf4
+
+# ATT: vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd213sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xad,0xf4
+
+# ATT: vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmadd213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xad,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd213sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmadd213sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xad,0x31
+
+# ATT: vfnmadd213sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmadd213sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xad,0x71,0x7f
+
+# ATT: vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmadd213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xad,0x72,0x80
+
+# ATT: vfnmadd231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xbc,0xf4
+
+# ATT: vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmadd231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xbc,0xf4
+
+# ATT: vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmadd231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xbc,0x31
+
+# ATT: vfnmadd231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xbc,0x71,0x7f
+
+# ATT: vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xbc,0x72,0x80
+
+# ATT: vfnmadd231sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd231sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xbd,0xf4
+
+# ATT: vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmadd231sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xbd,0xf4
+
+# ATT: vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmadd231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xbd,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd231sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmadd231sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xbd,0x31
+
+# ATT: vfnmadd231sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmadd231sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xbd,0x71,0x7f
+
+# ATT: vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmadd231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xbd,0x72,0x80
+
+# ATT: vfnmsub132ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub132ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x9e,0xf4
+
+# ATT: vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub132ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9e,0xf4
+
+# ATT: vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmsub132ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x9e,0x31
+
+# ATT: vfnmsub132ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x9e,0x71,0x7f
+
+# ATT: vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x9e,0x72,0x80
+
+# ATT: vfnmsub132sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub132sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x9f,0xf4
+
+# ATT: vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub132sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x9f,0xf4
+
+# ATT: vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmsub132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x9f,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub132sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmsub132sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x9f,0x31
+
+# ATT: vfnmsub132sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmsub132sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x9f,0x71,0x7f
+
+# ATT: vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmsub132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x9f,0x72,0x80
+
+# ATT: vfnmsub213ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub213ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xae,0xf4
+
+# ATT: vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub213ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xae,0xf4
+
+# ATT: vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmsub213ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xae,0x31
+
+# ATT: vfnmsub213ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xae,0x71,0x7f
+
+# ATT: vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xae,0x72,0x80
+
+# ATT: vfnmsub213sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub213sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xaf,0xf4
+
+# ATT: vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub213sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xaf,0xf4
+
+# ATT: vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmsub213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xaf,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub213sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmsub213sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xaf,0x31
+
+# ATT: vfnmsub213sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmsub213sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xaf,0x71,0x7f
+
+# ATT: vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmsub213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xaf,0x72,0x80
+
+# ATT: vfnmsub231ph %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub231ph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0xbe,0xf4
+
+# ATT: vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfnmsub231ph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xbe,0xf4
+
+# ATT: vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfnmsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vfnmsub231ph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0xbe,0x31
+
+# ATT: vfnmsub231ph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfnmsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0xbe,0x71,0x7f
+
+# ATT: vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfnmsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0xbe,0x72,0x80
+
+# ATT: vfnmsub231sh %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub231sh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0xbf,0xf4
+
+# ATT: vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfnmsub231sh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0xbf,0xf4
+
+# ATT: vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfnmsub231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0xbf,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub231sh (%r9), %xmm29, %xmm30
+# INTEL: vfnmsub231sh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0xbf,0x31
+
+# ATT: vfnmsub231sh 254(%rcx), %xmm29, %xmm30
+# INTEL: vfnmsub231sh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0xbf,0x71,0x7f
+
+# ATT: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfnmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0xbf,0x72,0x80
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
index 8f480fc13d82f..390622b2d4824 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
@@ -1492,3 +1492,723 @@
# ATT: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
# INTEL: vsqrtph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80
+
+# ATT: vfmadd132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmadd132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x98,0xf4
+
+# ATT: vfmadd132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmadd132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x98,0xf4
+
+# ATT: vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmadd132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x98,0x31
+
+# ATT: vfmadd132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x98,0x71,0x7f
+
+# ATT: vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x98,0x72,0x80
+
+# ATT: vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmadd132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x98,0x31
+
+# ATT: vfmadd132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x98,0x71,0x7f
+
+# ATT: vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x98,0x72,0x80
+
+# ATT: vfmadd213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmadd213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xa8,0xf4
+
+# ATT: vfmadd213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmadd213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xa8,0xf4
+
+# ATT: vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmadd213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xa8,0x31
+
+# ATT: vfmadd213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xa8,0x71,0x7f
+
+# ATT: vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmadd213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xa8,0x72,0x80
+
+# ATT: vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmadd213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xa8,0x31
+
+# ATT: vfmadd213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xa8,0x71,0x7f
+
+# ATT: vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xa8,0x72,0x80
+
+# ATT: vfmadd231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmadd231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xb8,0xf4
+
+# ATT: vfmadd231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmadd231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xb8,0xf4
+
+# ATT: vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmadd231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xb8,0x31
+
+# ATT: vfmadd231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xb8,0x71,0x7f
+
+# ATT: vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xb8,0x72,0x80
+
+# ATT: vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmadd231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xb8,0x31
+
+# ATT: vfmadd231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xb8,0x71,0x7f
+
+# ATT: vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xb8,0x72,0x80
+
+# ATT: vfmaddsub132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmaddsub132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x96,0xf4
+
+# ATT: vfmaddsub132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmaddsub132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x96,0xf4
+
+# ATT: vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmaddsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmaddsub132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x96,0x31
+
+# ATT: vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmaddsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x96,0x71,0x7f
+
+# ATT: vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmaddsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x96,0x72,0x80
+
+# ATT: vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmaddsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmaddsub132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x96,0x31
+
+# ATT: vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmaddsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x96,0x71,0x7f
+
+# ATT: vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmaddsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x96,0x72,0x80
+
+# ATT: vfmaddsub213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmaddsub213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xa6,0xf4
+
+# ATT: vfmaddsub213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmaddsub213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xa6,0xf4
+
+# ATT: vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmaddsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmaddsub213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xa6,0x31
+
+# ATT: vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmaddsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xa6,0x71,0x7f
+
+# ATT: vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmaddsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xa6,0x72,0x80
+
+# ATT: vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmaddsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmaddsub213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xa6,0x31
+
+# ATT: vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmaddsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xa6,0x71,0x7f
+
+# ATT: vfmaddsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmaddsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xa6,0x72,0x80
+
+# ATT: vfmaddsub231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmaddsub231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xb6,0xf4
+
+# ATT: vfmaddsub231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmaddsub231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xb6,0xf4
+
+# ATT: vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmaddsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmaddsub231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xb6,0x31
+
+# ATT: vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmaddsub231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xb6,0x71,0x7f
+
+# ATT: vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmaddsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xb6,0x72,0x80
+
+# ATT: vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmaddsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmaddsub231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xb6,0x31
+
+# ATT: vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmaddsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xb6,0x71,0x7f
+
+# ATT: vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmaddsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xb6,0x72,0x80
+
+# ATT: vfmsub132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsub132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x9a,0xf4
+
+# ATT: vfmsub132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsub132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x9a,0xf4
+
+# ATT: vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsub132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x9a,0x31
+
+# ATT: vfmsub132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x9a,0x71,0x7f
+
+# ATT: vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x9a,0x72,0x80
+
+# ATT: vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsub132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x9a,0x31
+
+# ATT: vfmsub132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x9a,0x71,0x7f
+
+# ATT: vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x9a,0x72,0x80
+
+# ATT: vfmsub213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsub213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xaa,0xf4
+
+# ATT: vfmsub213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsub213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xaa,0xf4
+
+# ATT: vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsub213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xaa,0x31
+
+# ATT: vfmsub213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xaa,0x71,0x7f
+
+# ATT: vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xaa,0x72,0x80
+
+# ATT: vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsub213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xaa,0x31
+
+# ATT: vfmsub213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xaa,0x71,0x7f
+
+# ATT: vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xaa,0x72,0x80
+
+# ATT: vfmsub231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsub231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xba,0xf4
+
+# ATT: vfmsub231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsub231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xba,0xf4
+
+# ATT: vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsub231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xba,0x31
+
+# ATT: vfmsub231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsub231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xba,0x71,0x7f
+
+# ATT: vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xba,0x72,0x80
+
+# ATT: vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsub231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xba,0x31
+
+# ATT: vfmsub231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xba,0x71,0x7f
+
+# ATT: vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xba,0x72,0x80
+
+# ATT: vfmsubadd132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsubadd132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x97,0xf4
+
+# ATT: vfmsubadd132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsubadd132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x97,0xf4
+
+# ATT: vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsubadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsubadd132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x97,0x31
+
+# ATT: vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsubadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x97,0x71,0x7f
+
+# ATT: vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsubadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x97,0x72,0x80
+
+# ATT: vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsubadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsubadd132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x97,0x31
+
+# ATT: vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsubadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x97,0x71,0x7f
+
+# ATT: vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsubadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x97,0x72,0x80
+
+# ATT: vfmsubadd213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsubadd213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xa7,0xf4
+
+# ATT: vfmsubadd213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsubadd213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xa7,0xf4
+
+# ATT: vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsubadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsubadd213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xa7,0x31
+
+# ATT: vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsubadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xa7,0x71,0x7f
+
+# ATT: vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsubadd213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xa7,0x72,0x80
+
+# ATT: vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsubadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsubadd213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xa7,0x31
+
+# ATT: vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsubadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xa7,0x71,0x7f
+
+# ATT: vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsubadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xa7,0x72,0x80
+
+# ATT: vfmsubadd231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfmsubadd231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xb7,0xf4
+
+# ATT: vfmsubadd231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfmsubadd231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xb7,0xf4
+
+# ATT: vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmsubadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfmsubadd231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xb7,0x31
+
+# ATT: vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmsubadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xb7,0x71,0x7f
+
+# ATT: vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmsubadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xb7,0x72,0x80
+
+# ATT: vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmsubadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfmsubadd231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xb7,0x31
+
+# ATT: vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmsubadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xb7,0x71,0x7f
+
+# ATT: vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmsubadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xb7,0x72,0x80
+
+# ATT: vfnmadd132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmadd132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x9c,0xf4
+
+# ATT: vfnmadd132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmadd132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x9c,0xf4
+
+# ATT: vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmadd132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x9c,0x31
+
+# ATT: vfnmadd132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x9c,0x71,0x7f
+
+# ATT: vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x9c,0x72,0x80
+
+# ATT: vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmadd132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x9c,0x31
+
+# ATT: vfnmadd132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x9c,0x71,0x7f
+
+# ATT: vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x9c,0x72,0x80
+
+# ATT: vfnmadd213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmadd213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xac,0xf4
+
+# ATT: vfnmadd213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmadd213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xac,0xf4
+
+# ATT: vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmadd213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xac,0x31
+
+# ATT: vfnmadd213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xac,0x71,0x7f
+
+# ATT: vfnmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmadd213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xac,0x72,0x80
+
+# ATT: vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmadd213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xac,0x31
+
+# ATT: vfnmadd213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xac,0x71,0x7f
+
+# ATT: vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xac,0x72,0x80
+
+# ATT: vfnmadd231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmadd231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xbc,0xf4
+
+# ATT: vfnmadd231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmadd231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xbc,0xf4
+
+# ATT: vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmadd231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xbc,0x31
+
+# ATT: vfnmadd231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xbc,0x71,0x7f
+
+# ATT: vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xbc,0x72,0x80
+
+# ATT: vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmadd231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xbc,0x31
+
+# ATT: vfnmadd231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xbc,0x71,0x7f
+
+# ATT: vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xbc,0x72,0x80
+
+# ATT: vfnmsub132ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmsub132ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x9e,0xf4
+
+# ATT: vfnmsub132ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmsub132ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x9e,0xf4
+
+# ATT: vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmsub132ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x9e,0x31
+
+# ATT: vfnmsub132ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x9e,0x71,0x7f
+
+# ATT: vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x9e,0x72,0x80
+
+# ATT: vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmsub132ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x9e,0x31
+
+# ATT: vfnmsub132ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x9e,0x71,0x7f
+
+# ATT: vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x9e,0x72,0x80
+
+# ATT: vfnmsub213ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmsub213ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xae,0xf4
+
+# ATT: vfnmsub213ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmsub213ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xae,0xf4
+
+# ATT: vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmsub213ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xae,0x31
+
+# ATT: vfnmsub213ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xae,0x71,0x7f
+
+# ATT: vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xae,0x72,0x80
+
+# ATT: vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmsub213ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xae,0x31
+
+# ATT: vfnmsub213ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xae,0x71,0x7f
+
+# ATT: vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xae,0x72,0x80
+
+# ATT: vfnmsub231ph %ymm4, %ymm5, %ymm6
+# INTEL: vfnmsub231ph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0xbe,0xf4
+
+# ATT: vfnmsub231ph %xmm4, %xmm5, %xmm6
+# INTEL: vfnmsub231ph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0xbe,0xf4
+
+# ATT: vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfnmsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vfnmsub231ph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0xbe,0x31
+
+# ATT: vfnmsub231ph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfnmsub231ph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0xbe,0x71,0x7f
+
+# ATT: vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfnmsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0xbe,0x72,0x80
+
+# ATT: vfnmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfnmsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vfnmsub231ph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0xbe,0x31
+
+# ATT: vfnmsub231ph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfnmsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0xbe,0x71,0x7f
+
+# ATT: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfnmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80
diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s
index b358705fbedc8..6f3165d5994ad 100644
--- a/llvm/test/MC/X86/avx512fp16.s
+++ b/llvm/test/MC/X86/avx512fp16.s
@@ -1763,3 +1763,723 @@
// CHECK: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
// CHECK: encoding: [0x62,0x65,0x16,0x87,0x51,0x72,0x80]
vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmadd132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x98,0xf4]
+ vfmadd132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x98,0xf4]
+ vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmadd132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x98,0x31]
+ vfmadd132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmadd132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x98,0x71,0x7f]
+ vfmadd132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x98,0x72,0x80]
+ vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmadd132sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x99,0xf4]
+ vfmadd132sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x99,0xf4]
+ vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x99,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmadd132sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x99,0x31]
+ vfmadd132sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmadd132sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x99,0x71,0x7f]
+ vfmadd132sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x99,0x72,0x80]
+ vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmadd213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa8,0xf4]
+ vfmadd213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa8,0xf4]
+ vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmadd213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa8,0x31]
+ vfmadd213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmadd213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa8,0x71,0x7f]
+ vfmadd213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa8,0x72,0x80]
+ vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmadd213sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa9,0xf4]
+ vfmadd213sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa9,0xf4]
+ vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmadd213sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xa9,0x31]
+ vfmadd213sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmadd213sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa9,0x71,0x7f]
+ vfmadd213sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xa9,0x72,0x80]
+ vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmadd231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb8,0xf4]
+ vfmadd231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb8,0xf4]
+ vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmadd231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb8,0x31]
+ vfmadd231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmadd231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb8,0x71,0x7f]
+ vfmadd231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb8,0x72,0x80]
+ vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmadd231sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb9,0xf4]
+ vfmadd231sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb9,0xf4]
+ vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmadd231sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xb9,0x31]
+ vfmadd231sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmadd231sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb9,0x71,0x7f]
+ vfmadd231sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xb9,0x72,0x80]
+ vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmaddsub132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x96,0xf4]
+ vfmaddsub132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x96,0xf4]
+ vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x96,0x31]
+ vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x96,0x71,0x7f]
+ vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x96,0x72,0x80]
+ vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmaddsub213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa6,0xf4]
+ vfmaddsub213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa6,0xf4]
+ vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa6,0x31]
+ vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa6,0x71,0x7f]
+ vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa6,0x72,0x80]
+ vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmaddsub231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb6,0xf4]
+ vfmaddsub231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb6,0xf4]
+ vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb6,0x31]
+ vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb6,0x71,0x7f]
+ vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb6,0x72,0x80]
+ vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsub132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9a,0xf4]
+ vfmsub132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9a,0xf4]
+ vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsub132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9a,0x31]
+ vfmsub132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsub132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9a,0x71,0x7f]
+ vfmsub132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9a,0x72,0x80]
+ vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsub132sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9b,0xf4]
+ vfmsub132sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9b,0xf4]
+ vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9b,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmsub132sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9b,0x31]
+ vfmsub132sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmsub132sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9b,0x71,0x7f]
+ vfmsub132sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9b,0x72,0x80]
+ vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmsub213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xaa,0xf4]
+ vfmsub213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xaa,0xf4]
+ vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsub213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xaa,0x31]
+ vfmsub213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsub213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xaa,0x71,0x7f]
+ vfmsub213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xaa,0x72,0x80]
+ vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsub213sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xab,0xf4]
+ vfmsub213sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xab,0xf4]
+ vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xab,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmsub213sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xab,0x31]
+ vfmsub213sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmsub213sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xab,0x71,0x7f]
+ vfmsub213sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xab,0x72,0x80]
+ vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmsub231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xba,0xf4]
+ vfmsub231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xba,0xf4]
+ vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsub231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xba,0x31]
+ vfmsub231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsub231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xba,0x71,0x7f]
+ vfmsub231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xba,0x72,0x80]
+ vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsub231sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbb,0xf4]
+ vfmsub231sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbb,0xf4]
+ vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbb,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmsub231sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbb,0x31]
+ vfmsub231sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmsub231sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbb,0x71,0x7f]
+ vfmsub231sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbb,0x72,0x80]
+ vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmsubadd132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x97,0xf4]
+ vfmsubadd132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x97,0xf4]
+ vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x97,0x31]
+ vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x97,0x71,0x7f]
+ vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x97,0x72,0x80]
+ vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsubadd213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa7,0xf4]
+ vfmsubadd213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa7,0xf4]
+ vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa7,0x31]
+ vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa7,0x71,0x7f]
+ vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa7,0x72,0x80]
+ vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmsubadd231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb7,0xf4]
+ vfmsubadd231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb7,0xf4]
+ vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb7,0x31]
+ vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb7,0x71,0x7f]
+ vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb7,0x72,0x80]
+ vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmadd132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9c,0xf4]
+ vfnmadd132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9c,0xf4]
+ vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9c,0x31]
+ vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmadd132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9c,0x71,0x7f]
+ vfnmadd132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9c,0x72,0x80]
+ vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmadd132sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9d,0xf4]
+ vfnmadd132sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9d,0xf4]
+ vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmadd132sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9d,0x31]
+ vfnmadd132sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmadd132sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9d,0x71,0x7f]
+ vfnmadd132sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9d,0x72,0x80]
+ vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfnmadd213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xac,0xf4]
+ vfnmadd213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xac,0xf4]
+ vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xac,0x31]
+ vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmadd213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xac,0x71,0x7f]
+ vfnmadd213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xac,0x72,0x80]
+ vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmadd213sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xad,0xf4]
+ vfnmadd213sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xad,0xf4]
+ vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xad,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmadd213sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xad,0x31]
+ vfnmadd213sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmadd213sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xad,0x71,0x7f]
+ vfnmadd213sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xad,0x72,0x80]
+ vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfnmadd231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xbc,0xf4]
+ vfnmadd231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbc,0xf4]
+ vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xbc,0x31]
+ vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmadd231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xbc,0x71,0x7f]
+ vfnmadd231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xbc,0x72,0x80]
+ vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmadd231sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbd,0xf4]
+ vfnmadd231sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbd,0xf4]
+ vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbd,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmadd231sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbd,0x31]
+ vfnmadd231sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmadd231sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbd,0x71,0x7f]
+ vfnmadd231sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbd,0x72,0x80]
+ vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfnmsub132ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9e,0xf4]
+ vfnmsub132ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9e,0xf4]
+ vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9e,0x31]
+ vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmsub132ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9e,0x71,0x7f]
+ vfnmsub132ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9e,0x72,0x80]
+ vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmsub132sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9f,0xf4]
+ vfnmsub132sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9f,0xf4]
+ vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmsub132sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9f,0x31]
+ vfnmsub132sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmsub132sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9f,0x71,0x7f]
+ vfnmsub132sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9f,0x72,0x80]
+ vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfnmsub213ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xae,0xf4]
+ vfnmsub213ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xae,0xf4]
+ vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xae,0x31]
+ vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmsub213ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xae,0x71,0x7f]
+ vfnmsub213ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xae,0x72,0x80]
+ vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmsub213sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xaf,0xf4]
+ vfnmsub213sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xaf,0xf4]
+ vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xaf,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmsub213sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xaf,0x31]
+ vfnmsub213sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmsub213sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xaf,0x71,0x7f]
+ vfnmsub213sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xaf,0x72,0x80]
+ vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfnmsub231ph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0xbe,0xf4]
+ vfnmsub231ph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbe,0xf4]
+ vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0xbe,0x31]
+ vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vfnmsub231ph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0xbe,0x71,0x7f]
+ vfnmsub231ph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xbe,0x72,0x80]
+ vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfnmsub231sh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbf,0xf4]
+ vfnmsub231sh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbf,0xf4]
+ vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbf,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfnmsub231sh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbf,0x31]
+ vfnmsub231sh (%r9), %xmm29, %xmm30
+
+// CHECK: vfnmsub231sh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbf,0x71,0x7f]
+ vfnmsub231sh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbf,0x72,0x80]
+ vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s
index 91c45a56a2e8a..a3f888e045393 100644
--- a/llvm/test/MC/X86/avx512fp16vl.s
+++ b/llvm/test/MC/X86/avx512fp16vl.s
@@ -1491,3 +1491,723 @@
// CHECK: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80]
vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vfmadd132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x98,0xf4]
+ vfmadd132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmadd132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x98,0xf4]
+ vfmadd132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x98,0x31]
+ vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmadd132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x98,0x71,0x7f]
+ vfmadd132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x98,0x72,0x80]
+ vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x98,0x31]
+ vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmadd132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x98,0x71,0x7f]
+ vfmadd132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x98,0x72,0x80]
+ vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmadd213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa8,0xf4]
+ vfmadd213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmadd213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa8,0xf4]
+ vfmadd213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa8,0x31]
+ vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmadd213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa8,0x71,0x7f]
+ vfmadd213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa8,0x72,0x80]
+ vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa8,0x31]
+ vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmadd213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa8,0x71,0x7f]
+ vfmadd213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa8,0x72,0x80]
+ vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmadd231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb8,0xf4]
+ vfmadd231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmadd231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb8,0xf4]
+ vfmadd231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xb8,0x31]
+ vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmadd231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb8,0x71,0x7f]
+ vfmadd231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb8,0x72,0x80]
+ vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb8,0x31]
+ vfmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmadd231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb8,0x71,0x7f]
+ vfmadd231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb8,0x72,0x80]
+ vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmaddsub132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x96,0xf4]
+ vfmaddsub132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmaddsub132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x96,0xf4]
+ vfmaddsub132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x96,0x31]
+ vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x96,0x71,0x7f]
+ vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x96,0x72,0x80]
+ vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x96,0x31]
+ vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x96,0x71,0x7f]
+ vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x96,0x72,0x80]
+ vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmaddsub213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa6,0xf4]
+ vfmaddsub213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmaddsub213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa6,0xf4]
+ vfmaddsub213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa6,0x31]
+ vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa6,0x71,0x7f]
+ vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa6,0x72,0x80]
+ vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa6,0x31]
+ vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa6,0x71,0x7f]
+ vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmaddsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa6,0x72,0x80]
+ vfmaddsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmaddsub231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb6,0xf4]
+ vfmaddsub231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmaddsub231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb6,0xf4]
+ vfmaddsub231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xb6,0x31]
+ vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb6,0x71,0x7f]
+ vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb6,0x72,0x80]
+ vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb6,0x31]
+ vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb6,0x71,0x7f]
+ vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb6,0x72,0x80]
+ vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsub132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9a,0xf4]
+ vfmsub132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsub132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9a,0xf4]
+ vfmsub132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9a,0x31]
+ vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsub132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9a,0x71,0x7f]
+ vfmsub132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9a,0x72,0x80]
+ vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9a,0x31]
+ vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsub132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9a,0x71,0x7f]
+ vfmsub132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9a,0x72,0x80]
+ vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsub213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xaa,0xf4]
+ vfmsub213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsub213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaa,0xf4]
+ vfmsub213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xaa,0x31]
+ vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsub213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xaa,0x71,0x7f]
+ vfmsub213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xaa,0x72,0x80]
+ vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaa,0x31]
+ vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsub213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaa,0x71,0x7f]
+ vfmsub213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xaa,0x72,0x80]
+ vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsub231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xba,0xf4]
+ vfmsub231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsub231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xba,0xf4]
+ vfmsub231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xba,0x31]
+ vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsub231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xba,0x71,0x7f]
+ vfmsub231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xba,0x72,0x80]
+ vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xba,0x31]
+ vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsub231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xba,0x71,0x7f]
+ vfmsub231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xba,0x72,0x80]
+ vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsubadd132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x97,0xf4]
+ vfmsubadd132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsubadd132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x97,0xf4]
+ vfmsubadd132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x97,0x31]
+ vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x97,0x71,0x7f]
+ vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x97,0x72,0x80]
+ vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x97,0x31]
+ vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x97,0x71,0x7f]
+ vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x97,0x72,0x80]
+ vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsubadd213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa7,0xf4]
+ vfmsubadd213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsubadd213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa7,0xf4]
+ vfmsubadd213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa7,0x31]
+ vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa7,0x71,0x7f]
+ vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa7,0x72,0x80]
+ vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa7,0x31]
+ vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa7,0x71,0x7f]
+ vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa7,0x72,0x80]
+ vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmsubadd231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb7,0xf4]
+ vfmsubadd231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmsubadd231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb7,0xf4]
+ vfmsubadd231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xb7,0x31]
+ vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb7,0x71,0x7f]
+ vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb7,0x72,0x80]
+ vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb7,0x31]
+ vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb7,0x71,0x7f]
+ vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb7,0x72,0x80]
+ vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmadd132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9c,0xf4]
+ vfnmadd132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmadd132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9c,0xf4]
+ vfnmadd132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9c,0x31]
+ vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmadd132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9c,0x71,0x7f]
+ vfnmadd132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9c,0x72,0x80]
+ vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9c,0x31]
+ vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmadd132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9c,0x71,0x7f]
+ vfnmadd132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9c,0x72,0x80]
+ vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmadd213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xac,0xf4]
+ vfnmadd213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmadd213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xac,0xf4]
+ vfnmadd213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xac,0x31]
+ vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmadd213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xac,0x71,0x7f]
+ vfnmadd213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xac,0x72,0x80]
+ vfnmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xac,0x31]
+ vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmadd213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xac,0x71,0x7f]
+ vfnmadd213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xac,0x72,0x80]
+ vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmadd231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbc,0xf4]
+ vfnmadd231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmadd231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbc,0xf4]
+ vfnmadd231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xbc,0x31]
+ vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmadd231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbc,0x71,0x7f]
+ vfnmadd231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xbc,0x72,0x80]
+ vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbc,0x31]
+ vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmadd231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbc,0x71,0x7f]
+ vfnmadd231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbc,0x72,0x80]
+ vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmsub132ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9e,0xf4]
+ vfnmsub132ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmsub132ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9e,0xf4]
+ vfnmsub132ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9e,0x31]
+ vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmsub132ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9e,0x71,0x7f]
+ vfnmsub132ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9e,0x72,0x80]
+ vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9e,0x31]
+ vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmsub132ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9e,0x71,0x7f]
+ vfnmsub132ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9e,0x72,0x80]
+ vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmsub213ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xae,0xf4]
+ vfnmsub213ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmsub213ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xae,0xf4]
+ vfnmsub213ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xae,0x31]
+ vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmsub213ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xae,0x71,0x7f]
+ vfnmsub213ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xae,0x72,0x80]
+ vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xae,0x31]
+ vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmsub213ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xae,0x71,0x7f]
+ vfnmsub213ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xae,0x72,0x80]
+ vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfnmsub231ph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbe,0xf4]
+ vfnmsub231ph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfnmsub231ph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbe,0xf4]
+ vfnmsub231ph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xbe,0x31]
+ vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vfnmsub231ph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbe,0x71,0x7f]
+ vfnmsub231ph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xbe,0x72,0x80]
+ vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfnmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbe,0x31]
+ vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vfnmsub231ph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbe,0x71,0x7f]
+ vfnmsub231ph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80]
+ vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
index 36ca110e12e6e..e2fb2e4ddde2e 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
@@ -1635,3 +1635,723 @@
// CHECK: vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x51,0x72,0x80]
vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmadd132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x98,0xf4]
+ vfmadd132ph zmm6, zmm5, zmm4
+
+// CHECK: vfmadd132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x98,0xf4]
+ vfmadd132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x98,0x31]
+ vfmadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x98,0x71,0x7f]
+ vfmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x98,0x72,0x80]
+ vfmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmadd132sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0xf4]
+ vfmadd132sh xmm6, xmm5, xmm4
+
+// CHECK: vfmadd132sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x99,0xf4]
+ vfmadd132sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x99,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd132sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0x31]
+ vfmadd132sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmadd132sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0x71,0x7f]
+ vfmadd132sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x99,0x72,0x80]
+ vfmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmadd213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa8,0xf4]
+ vfmadd213ph zmm6, zmm5, zmm4
+
+// CHECK: vfmadd213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa8,0xf4]
+ vfmadd213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa8,0x31]
+ vfmadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa8,0x71,0x7f]
+ vfmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa8,0x72,0x80]
+ vfmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmadd213sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0xf4]
+ vfmadd213sh xmm6, xmm5, xmm4
+
+// CHECK: vfmadd213sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa9,0xf4]
+ vfmadd213sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa9,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd213sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0x31]
+ vfmadd213sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmadd213sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0x71,0x7f]
+ vfmadd213sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xa9,0x72,0x80]
+ vfmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmadd231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb8,0xf4]
+ vfmadd231ph zmm6, zmm5, zmm4
+
+// CHECK: vfmadd231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb8,0xf4]
+ vfmadd231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb8,0x31]
+ vfmadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb8,0x71,0x7f]
+ vfmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb8,0x72,0x80]
+ vfmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmadd231sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0xf4]
+ vfmadd231sh xmm6, xmm5, xmm4
+
+// CHECK: vfmadd231sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb9,0xf4]
+ vfmadd231sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb9,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmadd231sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0x31]
+ vfmadd231sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmadd231sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0x71,0x7f]
+ vfmadd231sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xb9,0x72,0x80]
+ vfmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmaddsub132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x96,0xf4]
+ vfmaddsub132ph zmm6, zmm5, zmm4
+
+// CHECK: vfmaddsub132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x96,0xf4]
+ vfmaddsub132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmaddsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmaddsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x96,0x31]
+ vfmaddsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmaddsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x96,0x71,0x7f]
+ vfmaddsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmaddsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x96,0x72,0x80]
+ vfmaddsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmaddsub213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa6,0xf4]
+ vfmaddsub213ph zmm6, zmm5, zmm4
+
+// CHECK: vfmaddsub213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa6,0xf4]
+ vfmaddsub213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmaddsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmaddsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa6,0x31]
+ vfmaddsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmaddsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa6,0x71,0x7f]
+ vfmaddsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmaddsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa6,0x72,0x80]
+ vfmaddsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmaddsub231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb6,0xf4]
+ vfmaddsub231ph zmm6, zmm5, zmm4
+
+// CHECK: vfmaddsub231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb6,0xf4]
+ vfmaddsub231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmaddsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmaddsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb6,0x31]
+ vfmaddsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmaddsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb6,0x71,0x7f]
+ vfmaddsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmaddsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb6,0x72,0x80]
+ vfmaddsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsub132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9a,0xf4]
+ vfmsub132ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsub132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9a,0xf4]
+ vfmsub132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9a,0x31]
+ vfmsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9a,0x71,0x7f]
+ vfmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9a,0x72,0x80]
+ vfmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsub132sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0xf4]
+ vfmsub132sh xmm6, xmm5, xmm4
+
+// CHECK: vfmsub132sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9b,0xf4]
+ vfmsub132sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9b,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub132sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0x31]
+ vfmsub132sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmsub132sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0x71,0x7f]
+ vfmsub132sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9b,0x72,0x80]
+ vfmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmsub213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xaa,0xf4]
+ vfmsub213ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsub213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaa,0xf4]
+ vfmsub213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xaa,0x31]
+ vfmsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xaa,0x71,0x7f]
+ vfmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xaa,0x72,0x80]
+ vfmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsub213sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0xf4]
+ vfmsub213sh xmm6, xmm5, xmm4
+
+// CHECK: vfmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xab,0xf4]
+ vfmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xab,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub213sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0x31]
+ vfmsub213sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0x71,0x7f]
+ vfmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xab,0x72,0x80]
+ vfmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmsub231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xba,0xf4]
+ vfmsub231ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xba,0xf4]
+ vfmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xba,0x31]
+ vfmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xba,0x71,0x7f]
+ vfmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xba,0x72,0x80]
+ vfmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsub231sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0xf4]
+ vfmsub231sh xmm6, xmm5, xmm4
+
+// CHECK: vfmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbb,0xf4]
+ vfmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbb,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsub231sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0x31]
+ vfmsub231sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0x71,0x7f]
+ vfmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbb,0x72,0x80]
+ vfmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfmsubadd132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x97,0xf4]
+ vfmsubadd132ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsubadd132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x97,0xf4]
+ vfmsubadd132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsubadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsubadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x97,0x31]
+ vfmsubadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsubadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x97,0x71,0x7f]
+ vfmsubadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsubadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x97,0x72,0x80]
+ vfmsubadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsubadd213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa7,0xf4]
+ vfmsubadd213ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsubadd213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa7,0xf4]
+ vfmsubadd213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsubadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsubadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa7,0x31]
+ vfmsubadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsubadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa7,0x71,0x7f]
+ vfmsubadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsubadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa7,0x72,0x80]
+ vfmsubadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfmsubadd231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb7,0xf4]
+ vfmsubadd231ph zmm6, zmm5, zmm4
+
+// CHECK: vfmsubadd231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb7,0xf4]
+ vfmsubadd231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmsubadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmsubadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb7,0x31]
+ vfmsubadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfmsubadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb7,0x71,0x7f]
+ vfmsubadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmsubadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb7,0x72,0x80]
+ vfmsubadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmadd132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9c,0xf4]
+ vfnmadd132ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmadd132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9c,0xf4]
+ vfnmadd132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9c,0x31]
+ vfnmadd132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9c,0x71,0x7f]
+ vfnmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9c,0x72,0x80]
+ vfnmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmadd132sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0xf4]
+ vfnmadd132sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmadd132sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9d,0xf4]
+ vfnmadd132sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9d,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd132sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0x31]
+ vfnmadd132sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmadd132sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0x71,0x7f]
+ vfnmadd132sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9d,0x72,0x80]
+ vfnmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmadd213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xac,0xf4]
+ vfnmadd213ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmadd213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xac,0xf4]
+ vfnmadd213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xac,0x31]
+ vfnmadd213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xac,0x71,0x7f]
+ vfnmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xac,0x72,0x80]
+ vfnmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmadd213sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0xf4]
+ vfnmadd213sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmadd213sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xad,0xf4]
+ vfnmadd213sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xad,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd213sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0x31]
+ vfnmadd213sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmadd213sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0x71,0x7f]
+ vfnmadd213sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xad,0x72,0x80]
+ vfnmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmadd231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbc,0xf4]
+ vfnmadd231ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmadd231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbc,0xf4]
+ vfnmadd231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xbc,0x31]
+ vfnmadd231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbc,0x71,0x7f]
+ vfnmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xbc,0x72,0x80]
+ vfnmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmadd231sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0xf4]
+ vfnmadd231sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmadd231sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbd,0xf4]
+ vfnmadd231sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbd,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmadd231sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0x31]
+ vfnmadd231sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmadd231sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0x71,0x7f]
+ vfnmadd231sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbd,0x72,0x80]
+ vfnmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmsub132ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9e,0xf4]
+ vfnmsub132ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmsub132ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9e,0xf4]
+ vfnmsub132ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9e,0x31]
+ vfnmsub132ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9e,0x71,0x7f]
+ vfnmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9e,0x72,0x80]
+ vfnmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmsub132sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0xf4]
+ vfnmsub132sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmsub132sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9f,0xf4]
+ vfnmsub132sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9f,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub132sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0x31]
+ vfnmsub132sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmsub132sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0x71,0x7f]
+ vfnmsub132sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9f,0x72,0x80]
+ vfnmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmsub213ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xae,0xf4]
+ vfnmsub213ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmsub213ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xae,0xf4]
+ vfnmsub213ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xae,0x31]
+ vfnmsub213ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xae,0x71,0x7f]
+ vfnmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xae,0x72,0x80]
+ vfnmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmsub213sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0xf4]
+ vfnmsub213sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaf,0xf4]
+ vfnmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xaf,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub213sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0x31]
+ vfnmsub213sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0x71,0x7f]
+ vfnmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xaf,0x72,0x80]
+ vfnmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbe,0xf4]
+ vfnmsub231ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbe,0xf4]
+ vfnmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xbe,0x31]
+ vfnmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbe,0x71,0x7f]
+ vfnmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xbe,0x72,0x80]
+ vfnmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmsub231sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0xf4]
+ vfnmsub231sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbf,0xf4]
+ vfnmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbf,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub231sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0x31]
+ vfnmsub231sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0x71,0x7f]
+ vfnmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbf,0x72,0x80]
+ vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
index 6091599b87d66..427cd2f2eaa04 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
@@ -1491,3 +1491,723 @@
// CHECK: vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x51,0x72,0x80]
vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmadd132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x98,0xf4]
+ vfmadd132ph ymm30, ymm29, ymm28
+
+// CHECK: vfmadd132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x98,0xf4]
+ vfmadd132ph xmm30, xmm29, xmm28
+
+// CHECK: vfmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x98,0x31]
+ vfmadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x98,0x71,0x7f]
+ vfmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x98,0x72,0x80]
+ vfmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x98,0x31]
+ vfmadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x98,0x71,0x7f]
+ vfmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x98,0x72,0x80]
+ vfmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmadd213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa8,0xf4]
+ vfmadd213ph ymm30, ymm29, ymm28
+
+// CHECK: vfmadd213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa8,0xf4]
+ vfmadd213ph xmm30, xmm29, xmm28
+
+// CHECK: vfmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa8,0x31]
+ vfmadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa8,0x71,0x7f]
+ vfmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa8,0x72,0x80]
+ vfmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa8,0x31]
+ vfmadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa8,0x71,0x7f]
+ vfmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa8,0x72,0x80]
+ vfmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmadd231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb8,0xf4]
+ vfmadd231ph ymm30, ymm29, ymm28
+
+// CHECK: vfmadd231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb8,0xf4]
+ vfmadd231ph xmm30, xmm29, xmm28
+
+// CHECK: vfmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb8,0x31]
+ vfmadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb8,0x71,0x7f]
+ vfmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb8,0x72,0x80]
+ vfmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb8,0x31]
+ vfmadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb8,0x71,0x7f]
+ vfmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb8,0x72,0x80]
+ vfmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmaddsub132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x96,0xf4]
+ vfmaddsub132ph ymm30, ymm29, ymm28
+
+// CHECK: vfmaddsub132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x96,0xf4]
+ vfmaddsub132ph xmm30, xmm29, xmm28
+
+// CHECK: vfmaddsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x96,0x31]
+ vfmaddsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmaddsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x96,0x71,0x7f]
+ vfmaddsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmaddsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x96,0x72,0x80]
+ vfmaddsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmaddsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x96,0x31]
+ vfmaddsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmaddsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x96,0x71,0x7f]
+ vfmaddsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmaddsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x96,0x72,0x80]
+ vfmaddsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmaddsub213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa6,0xf4]
+ vfmaddsub213ph ymm30, ymm29, ymm28
+
+// CHECK: vfmaddsub213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa6,0xf4]
+ vfmaddsub213ph xmm30, xmm29, xmm28
+
+// CHECK: vfmaddsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa6,0x31]
+ vfmaddsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmaddsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa6,0x71,0x7f]
+ vfmaddsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmaddsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa6,0x72,0x80]
+ vfmaddsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmaddsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa6,0x31]
+ vfmaddsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmaddsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa6,0x71,0x7f]
+ vfmaddsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmaddsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa6,0x72,0x80]
+ vfmaddsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmaddsub231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb6,0xf4]
+ vfmaddsub231ph ymm30, ymm29, ymm28
+
+// CHECK: vfmaddsub231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb6,0xf4]
+ vfmaddsub231ph xmm30, xmm29, xmm28
+
+// CHECK: vfmaddsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb6,0x31]
+ vfmaddsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmaddsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb6,0x71,0x7f]
+ vfmaddsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmaddsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb6,0x72,0x80]
+ vfmaddsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmaddsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb6,0x31]
+ vfmaddsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmaddsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb6,0x71,0x7f]
+ vfmaddsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmaddsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb6,0x72,0x80]
+ vfmaddsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsub132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9a,0xf4]
+ vfmsub132ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsub132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9a,0xf4]
+ vfmsub132ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9a,0x31]
+ vfmsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9a,0x71,0x7f]
+ vfmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9a,0x72,0x80]
+ vfmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9a,0x31]
+ vfmsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9a,0x71,0x7f]
+ vfmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9a,0x72,0x80]
+ vfmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsub213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xaa,0xf4]
+ vfmsub213ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsub213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xaa,0xf4]
+ vfmsub213ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xaa,0x31]
+ vfmsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xaa,0x71,0x7f]
+ vfmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xaa,0x72,0x80]
+ vfmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xaa,0x31]
+ vfmsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xaa,0x71,0x7f]
+ vfmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xaa,0x72,0x80]
+ vfmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsub231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xba,0xf4]
+ vfmsub231ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsub231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xba,0xf4]
+ vfmsub231ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xba,0x31]
+ vfmsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xba,0x71,0x7f]
+ vfmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xba,0x72,0x80]
+ vfmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xba,0x31]
+ vfmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xba,0x71,0x7f]
+ vfmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xba,0x72,0x80]
+ vfmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsubadd132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x97,0xf4]
+ vfmsubadd132ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsubadd132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x97,0xf4]
+ vfmsubadd132ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsubadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x97,0x31]
+ vfmsubadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsubadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x97,0x71,0x7f]
+ vfmsubadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsubadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x97,0x72,0x80]
+ vfmsubadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsubadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x97,0x31]
+ vfmsubadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsubadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x97,0x71,0x7f]
+ vfmsubadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsubadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x97,0x72,0x80]
+ vfmsubadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsubadd213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa7,0xf4]
+ vfmsubadd213ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsubadd213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa7,0xf4]
+ vfmsubadd213ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsubadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa7,0x31]
+ vfmsubadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsubadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa7,0x71,0x7f]
+ vfmsubadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsubadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa7,0x72,0x80]
+ vfmsubadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsubadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa7,0x31]
+ vfmsubadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsubadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa7,0x71,0x7f]
+ vfmsubadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsubadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa7,0x72,0x80]
+ vfmsubadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfmsubadd231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb7,0xf4]
+ vfmsubadd231ph ymm30, ymm29, ymm28
+
+// CHECK: vfmsubadd231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb7,0xf4]
+ vfmsubadd231ph xmm30, xmm29, xmm28
+
+// CHECK: vfmsubadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb7,0x31]
+ vfmsubadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfmsubadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb7,0x71,0x7f]
+ vfmsubadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmsubadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb7,0x72,0x80]
+ vfmsubadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmsubadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmsubadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmsubadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb7,0x31]
+ vfmsubadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfmsubadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb7,0x71,0x7f]
+ vfmsubadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmsubadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb7,0x72,0x80]
+ vfmsubadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmadd132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9c,0xf4]
+ vfnmadd132ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmadd132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9c,0xf4]
+ vfnmadd132ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9c,0x31]
+ vfnmadd132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9c,0x71,0x7f]
+ vfnmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9c,0x72,0x80]
+ vfnmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9c,0x31]
+ vfnmadd132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9c,0x71,0x7f]
+ vfnmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9c,0x72,0x80]
+ vfnmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmadd213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xac,0xf4]
+ vfnmadd213ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmadd213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xac,0xf4]
+ vfnmadd213ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xac,0x31]
+ vfnmadd213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xac,0x71,0x7f]
+ vfnmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xac,0x72,0x80]
+ vfnmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xac,0x31]
+ vfnmadd213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xac,0x71,0x7f]
+ vfnmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xac,0x72,0x80]
+ vfnmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmadd231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xbc,0xf4]
+ vfnmadd231ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmadd231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbc,0xf4]
+ vfnmadd231ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xbc,0x31]
+ vfnmadd231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xbc,0x71,0x7f]
+ vfnmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xbc,0x72,0x80]
+ vfnmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xbc,0x31]
+ vfnmadd231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbc,0x71,0x7f]
+ vfnmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xbc,0x72,0x80]
+ vfnmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmsub132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9e,0xf4]
+ vfnmsub132ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmsub132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9e,0xf4]
+ vfnmsub132ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9e,0x31]
+ vfnmsub132ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9e,0x71,0x7f]
+ vfnmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9e,0x72,0x80]
+ vfnmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9e,0x31]
+ vfnmsub132ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9e,0x71,0x7f]
+ vfnmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9e,0x72,0x80]
+ vfnmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmsub213ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xae,0xf4]
+ vfnmsub213ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmsub213ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xae,0xf4]
+ vfnmsub213ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xae,0x31]
+ vfnmsub213ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xae,0x71,0x7f]
+ vfnmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xae,0x72,0x80]
+ vfnmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xae,0x31]
+ vfnmsub213ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xae,0x71,0x7f]
+ vfnmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xae,0x72,0x80]
+ vfnmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfnmsub231ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0xbe,0xf4]
+ vfnmsub231ph ymm30, ymm29, ymm28
+
+// CHECK: vfnmsub231ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbe,0xf4]
+ vfnmsub231ph xmm30, xmm29, xmm28
+
+// CHECK: vfnmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0xbe,0x31]
+ vfnmsub231ph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vfnmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0xbe,0x71,0x7f]
+ vfnmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfnmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xbe,0x72,0x80]
+ vfnmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfnmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfnmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xbe,0x31]
+ vfnmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbe,0x71,0x7f]
+ vfnmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xbe,0x72,0x80]
+ vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}