[clang] ab40dbf - [X86] AVX512FP16 instructions enabling 6/6
via cfe-commits
cfe-commits at lists.llvm.org
Sun Aug 29 23:58:01 PDT 2021
Author: Wang, Pengfei
Date: 2021-08-30T13:08:45+08:00
New Revision: ab40dbfe03d50a2e4a12168beb0ad9ae660916cf
URL: https://github.com/llvm/llvm-project/commit/ab40dbfe03d50a2e4a12168beb0ad9ae660916cf
DIFF: https://github.com/llvm/llvm-project/commit/ab40dbfe03d50a2e4a12168beb0ad9ae660916cf.diff
LOG: [X86] AVX512FP16 instructions enabling 6/6
Enable the FP16 complex FMA and complex multiplication instructions (VFMADDCPH/VFCMADDCPH, VFMADDCSH/VFCMADDCSH, VFMULCPH/VFCMULCPH and VFMULCSH/VFCMULCSH).
Ref.: https://software.intel.com/content/www/us/en/develop/download/intel-avx512-fp16-architecture-specification.html
Reviewed By: LuoYuanke
Differential Revision: https://reviews.llvm.org/D105269
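
Each of these instructions treats a pair of adjacent FP16 elements as one complex number (real part in the even-indexed element, imaginary part in the odd-indexed one), and per the referenced spec the "c" variants (VFCMADDC*/VFCMULC*) use the complex conjugate of the second operand. A minimal usage sketch of the new 512-bit intrinsics, assuming a compiler with this patch and the -mavx512fp16 flag (the helper names are illustrative and not part of the patch):

#include <immintrin.h>

/* Each __m512h below holds 16 interleaved FP16 complex values
   (re0, im0, re1, im1, ...). */

/* acc += a * b, complex multiply-accumulate per lane. */
static __m512h cmla(__m512h acc, __m512h a, __m512h b) {
  return _mm512_fmadd_pch(a, b, acc);
}

/* acc += a * conj(b), the conjugate form of the same operation. */
static __m512h cmla_conj(__m512h acc, __m512h a, __m512h b) {
  return _mm512_fcmadd_pch(a, b, acc);
}

/* a * b without accumulation; _mm512_fcmul_pch is the conjugate form. */
static __m512h cmul(__m512h a, __m512h b) {
  return _mm512_fmul_pch(a, b);
}

Note that masking is at complex-number (32-bit) granularity, which is why the masked forms below are built from __builtin_ia32_selectps_128/256/512 over float vectors rather than per-half-element selects; the bitcasts to <4 x float>/<8 x float>/<16 x float> in the builtin tests reflect that.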
Added:
llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll
llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll
llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
llvm/test/MC/X86/avx512fp16-complex-fma.s
llvm/test/MC/X86/avx512fp16-complex-fma_vl.s
Modified:
clang/include/clang/Basic/BuiltinsX86.def
clang/lib/Headers/avx512fp16intrin.h
clang/lib/Headers/avx512vlfp16intrin.h
clang/lib/Sema/SemaChecking.cpp
clang/test/CodeGen/X86/avx512fp16-builtins.c
clang/test/CodeGen/X86/avx512vlfp16-builtins.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFoldTables.cpp
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/lib/Target/X86/X86IntrinsicsInfo.h
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
llvm/test/MC/Disassembler/X86/avx512fp16.txt
llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
llvm/test/MC/X86/avx512fp16.s
llvm/test/MC/X86/avx512fp16vl.s
llvm/test/MC/X86/intel-syntax-avx512fp16.s
llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 0ab1444e7120a..4bab7ca3010be 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2014,6 +2014,32 @@ TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "
TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmulcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmulcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmulcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmulcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+
// generic select intrinsics
TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 6440be3799df8..8911b4e2ff42f 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -545,6 +545,23 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
+ return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_conj_pch(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
__m128h __B) {
__A[0] += __B[0];
@@ -2909,6 +2926,359 @@ _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
(__mmask8)(U), (int)(R)))
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)_mm_move_ss((__m128)__C,
+ (__m128)__builtin_ia32_vfcmaddcsh_mask(
+ (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, __U,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm_fcmadd_round_sch(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
+ ((__m128h)__builtin_ia32_selectps_128( \
+ (__mmask8)(U & 1), \
+ __builtin_ia32_vfcmaddcsh_mask( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)), \
+ (__v4sf)(__m128h)(A)))
+
+#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
+ ((__m128h)_mm_move_ss((__m128)(C), \
+ (__m128)__builtin_ia32_vfcmaddcsh_mask( \
+ (__v4sf)(C), (__v4sf)(A), (__v4sf)(B), (U), (R))))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_sch(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
+ ((__m128h)__builtin_ia32_selectps_128( \
+ (__mmask8)(U & 1), \
+ __builtin_ia32_vfmaddcsh_mask( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)), \
+ (__v4sf)(__m128h)(A)))
+
+#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
+ (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fcmul_round_sch(A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmul_round_sch(A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fmul_round_sch(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__W, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fcmul_round_pch(A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__W, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmul_round_pch(A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
+ (__v16sf)__B, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_selectps_512(
+ __U,
+ __builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
+ (__v16sf)__B, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v16sf)__A);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+ (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
+ (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fcmadd_round_pch(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_selectps_512( \
+ (__mmask16)(U), \
+ __builtin_ia32_vfcmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)), \
+ (__v16sf)(__m512h)(A)))
+
+#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
+ (__v16sf)__B, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_selectps_512(
+ __U,
+ __builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, (__v16sf)__B,
+ (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v16sf)__A);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
+ (__v16sf)__B, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
+ (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_pch(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_selectps_512( \
+ (__mmask16)(U), \
+ __builtin_ia32_vfmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)), \
+ (__v16sf)(__m512h)(A)))
+
+#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
+ (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__mmask16)(U), (int)(R)))
+
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 8f48b0156cd69..b8cd554717a03 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -311,6 +311,39 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
}
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
+ return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
+ return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
+}
+
#define _mm256_cmp_ph_mask(a, b, p) \
((__mmask16)__builtin_ia32_cmpph256_mask( \
(__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))
@@ -1743,6 +1776,192 @@ _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
(__v16hf)__C);
}
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmul_pch(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)(__m128h)__A,
+ (__v4sf)__B, (__mmask8)__U),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
+ (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
+ (__v8sf)__B, (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ __U,
+ __builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B,
+ (__mmask8)__U),
+ (__v8sf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
+ (__v8sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
+ (__v8sf)__C, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
+ (__mmask8)__U),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__C, (__v4sf)__A,
+ (__v4sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
+ (__v8sf)__B, (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ __U,
+ __builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B,
+ (__mmask8)__U),
+ (__v8sf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
+ (__v8sf)__B, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__C, (__v8sf)__A,
+ (__v8sf)__B, (__mmask8)__U);
+}
+
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
__m128h __A,
__m128h __W) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ee3efc14c1ab8..99cd2b2278f16 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4111,6 +4111,16 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfcmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfcmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfmulcsh_mask:
+ case X86::BI__builtin_ia32_vfmulcph512_mask:
+ case X86::BI__builtin_ia32_vfcmulcsh_mask:
+ case X86::BI__builtin_ia32_vfcmulcph512_mask:
ArgNum = 4;
HasRC = true;
break;
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
index 1a6ddeea15fca..ee4bed552f9ac 100644
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -658,6 +658,48 @@ __m512h test_mm512_abs_ph(__m512h a) {
return _mm512_abs_ph(a);
}
+__m512h test_mm512_conj_pch(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_conj_pch
+ // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+ return _mm512_conj_pch(__A);
+}
+
+__m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
+ // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+ // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+ return _mm512_mask_conj_pch(__W, __U, __A);
+}
+
+__m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
+ // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
+ // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+ // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+ return _mm512_maskz_conj_pch(__U, __A);
+}
+
__m128h test_mm_add_round_sh(__m128h __A, __m128h __B) {
// CHECK-LABEL: @test_mm_add_round_sh
// CHECK: @llvm.x86.avx512fp16.mask.add.sh.round
@@ -3996,6 +4038,360 @@ __m128h test_mm_mask3_fnmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __m
return _mm_mask3_fnmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
+__m128h test_mm_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fcmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
+ return _mm_fcmadd_sch(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fcmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fcmadd_sch(__A, __U, __B, __C);
+}
+
+__m128h test_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fcmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.csh
+ return _mm_maskz_fcmadd_sch(__U, __A, __B, __C);
+}
+
+__m128h test_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fcmadd_sch
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_mask3_fcmadd_sch(__A, __B, __C, __U);
+}
+
+__m128h test_mm_fcmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fcmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
+ return _mm_fcmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fcmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fcmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fcmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fcmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fcmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.csh
+ return _mm_maskz_fcmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask3_fcmadd_round_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fcmadd_round_sch
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
+ // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_mask3_fcmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
+ return _mm_fmadd_sch(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fmadd_sch(__A, __U, __B, __C);
+}
+
+__m128h test_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_sch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.csh
+ return _mm_maskz_fmadd_sch(__U, __A, __B, __C);
+}
+
+__m128h test_mm_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
+ return _mm_fmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_round_sch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.csh
+ return _mm_maskz_fmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fcmul_sch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fcmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_fcmul_sch(__A, __B);
+}
+
+__m128h test_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fcmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_mask_fcmul_sch(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fcmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_maskz_fcmul_sch(__U, __A, __B);
+}
+
+__m128h test_mm_fcmul_round_sch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fcmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_fcmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fcmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fcmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_mask_fcmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fcmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fcmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
+ return _mm_maskz_fcmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fcmul_pch(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_fcmul_pch(__A, __B);
+}
+
+__m512h test_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_mask_fcmul_pch(__W, __U, __A, __B);
+}
+
+__m512h test_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_maskz_fcmul_pch(__U, __A, __B);
+}
+
+__m512h test_mm512_fcmul_round_pch(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_fcmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_fcmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fcmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_fcmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_mask_fcmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fcmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_fcmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
+ return _mm512_maskz_fcmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ return _mm512_fcmadd_pch(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_fcmadd_pch(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask3_fcmadd_pch(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512
+ return _mm512_maskz_fcmadd_pch(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fcmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ return _mm512_fcmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fcmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fcmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_fcmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fcmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
+ // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask3_fcmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fcmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fcmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512
+ return _mm512_maskz_fcmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmul_pch(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_fmul_pch(__A, __B);
+}
+
+__m512h test_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_mask_fmul_pch(__W, __U, __A, __B);
+}
+
+__m512h test_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_maskz_fmul_pch(__U, __A, __B);
+}
+
+__m512h test_mm512_fmul_round_pch(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_fmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_fmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_fmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_mask_fmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_fmul_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
+ return _mm512_maskz_fmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ return _mm512_fmadd_pch(__A, __B, __C);
+}
+
+__m512h test_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_fmadd_pch(__A, __U, __B, __C);
+}
+
+__m512h test_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask3_fmadd_pch(__A, __B, __C, __U);
+}
+
+__m512h test_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.512
+ return _mm512_maskz_fmadd_pch(__U, __A, __B, __C);
+}
+
+__m512h test_mm512_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_fmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ return _mm512_fmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_fmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_fmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_fmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask3_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
+ // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask3_fmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_fmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.512
+ return _mm512_maskz_fmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_fmul_sch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_fmul_sch(__A, __B);
+}
+
+__m128h test_mm_mask_fmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_mask_fmul_sch(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fmul_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_maskz_fmul_sch(__U, __A, __B);
+}
+
+__m128h test_mm_fmul_round_sch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_fmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_fmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_mask_fmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_fmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fmul_round_sch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
+ return _mm_maskz_fmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
_Float16 test_mm512_reduce_add_ph(__m512h __W) {
// CHECK-LABEL: @test_mm512_reduce_add_ph
// CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}})
diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
index 8644309b63224..948e6ad13567c 100644
--- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
@@ -383,6 +383,92 @@ __m256h test_mm256_abs_ph(__m256h a) {
return _mm256_abs_ph(a);
}
+__m256h test_mm256_conj_pch(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_conj_pch
+ // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
+ return _mm256_conj_pch(__A);
+}
+
+__m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+ // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
+ // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
+ return _mm256_mask_conj_pch(__W, __U, __A);
+}
+
+__m256h test_mm256_maskz_conj_pch(__mmask32 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+ // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
+ // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
+ // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
+ return _mm256_maskz_conj_pch(__U, __A);
+}
+
+__m128h test_mm_conj_pch(__m128h __A) {
+ // CHECK-LABEL: @test_mm_conj_pch
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_conj_pch(__A);
+}
+
+__m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_mask_conj_pch(__W, __U, __A);
+}
+
+__m128h test_mm_maskz_conj_pch(__mmask32 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_conj_pch
+ // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
+ // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}}
+ // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_maskz_conj_pch(__U, __A);
+}
+
__mmask16 test_mm256_cmp_ph_mask_eq_oq(__m256h a, __m256h b) {
// CHECK-LABEL: @test_mm256_cmp_ph_mask_eq_oq
// CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}}
@@ -2726,6 +2812,183 @@ __m256h test_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmas
// CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}})
return _mm256_mask3_fnmsub_ph(__A, __B, __C, __U);
}
+
+__m128h test_mm_fcmul_pch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.128
+ return _mm_fcmul_pch(__A, __B);
+}
+
+__m128h test_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.128
+ return _mm_mask_fcmul_pch(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.128
+ return _mm_maskz_fcmul_pch(__U, __A, __B);
+}
+
+__m256h test_mm256_fcmul_pch(__m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.256
+ return _mm256_fcmul_pch(__A, __B);
+}
+
+__m256h test_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_mask_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.256
+ return _mm256_mask_fcmul_pch(__W, __U, __A, __B);
+}
+
+__m256h test_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_maskz_fcmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.256
+ return _mm256_maskz_fcmul_pch(__U, __A, __B);
+}
+
+__m128h test_mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.128
+ return _mm_fcmadd_pch(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.128
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fcmadd_pch(__A, __U, __B, __C);
+}
+
+__m128h test_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.128
+ // CHECK-NOT: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask3_fcmadd_pch(__A, __B, __C, __U);
+}
+
+__m128h test_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128
+ return _mm_maskz_fcmadd_pch(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.256
+ return _mm256_fcmadd_pch(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.256
+ // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_fcmadd_pch(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.256
+ // CHECK-NOT: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask3_fcmadd_pch(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fcmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256
+ return _mm256_maskz_fcmadd_pch(__U, __A, __B, __C);
+}
+
+__m128h test_mm_fmul_pch(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.128
+ return _mm_fmul_pch(__A, __B);
+}
+
+__m128h test_mm_mask_fmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.128
+ return _mm_mask_fmul_pch(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.128
+ return _mm_maskz_fmul_pch(__U, __A, __B);
+}
+
+__m256h test_mm256_fmul_pch(__m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.256
+ return _mm256_fmul_pch(__A, __B);
+}
+
+__m256h test_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_mask_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.256
+ return _mm256_mask_fmul_pch(__W, __U, __A, __B);
+}
+
+__m256h test_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_maskz_fmul_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.256
+ return _mm256_maskz_fmul_pch(__U, __A, __B);
+}
+
+__m128h test_mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.128
+ return _mm_fmadd_pch(__A, __B, __C);
+}
+
+__m128h test_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_mask_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.128
+ // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_fmadd_pch(__A, __U, __B, __C);
+}
+
+__m128h test_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.128
+ return _mm_mask3_fmadd_pch(__A, __B, __C, __U);
+}
+
+__m128h test_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ // CHECK-LABEL: @test_mm_maskz_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.128
+ return _mm_maskz_fmadd_pch(__U, __A, __B, __C);
+}
+
+__m256h test_mm256_fmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.256
+ return _mm256_fmadd_pch(__A, __B, __C);
+}
+
+__m256h test_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_mask_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.256
+ // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_fmadd_pch(__A, __U, __B, __C);
+}
+
+__m256h test_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm256_mask3_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.256
+ return _mm256_mask3_fmadd_pch(__A, __B, __C, __U);
+}
+
+__m256h test_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ // CHECK-LABEL: @test_mm256_maskz_fmadd_pch
+ // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.256
+ return _mm256_maskz_fmadd_pch(__U, __A, __B, __C);
+}
+
__m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) {
// CHECK-LABEL: @test_mm_mask_blend_ph
// CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
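
[Editor's note, not part of the patch] As a quick orientation for the builtin tests above, here is a minimal usage sketch of the new complex FP16 intrinsics. It assumes a compiler that contains this patch, built with -mavx512fp16 -mavx512vl; the precise conjugation and masking semantics are those of the Intel AVX512-FP16 specification linked in the log, not restated authoritatively here.

  #include <immintrin.h>

  /* Each 32-bit lane of a __m128h is treated as one FP16 complex value
     (real part in the low half, imaginary part in the high half), so an
     8-bit mask covers the four complex elements of a 128-bit vector.   */
  __m128h complex_fma(__m128h a, __m128h b, __m128h c) {
    return _mm_fmadd_pch(a, b, c);      /* a*b + c, per complex element */
  }

  /* The "fcmul" flavor multiplies by the complex conjugate of the
     second operand.                                                    */
  __m128h complex_conj_mul(__m128h a, __m128h b) {
    return _mm_fcmul_pch(a, b);
  }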
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 680e649290653..2601f96696ac2 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5732,4 +5732,137 @@ let TargetPrefix = "x86" in {
: Intrinsic<[ llvm_half_ty ],
[ llvm_half_ty, llvm_half_ty, llvm_half_ty, llvm_i32_ty ],
[ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_mask_vfcmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_maskz">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_maskz">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph128_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph256_maskz">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph512_maskz">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfmaddcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfmaddcsh_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmulcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmulcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmulcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmul_csh
+ : GCCBuiltin<"__builtin_ia32_vfmulcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmul_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmulcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
}
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b9d8c148f5fbf..04c0e239ae5f8 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3866,6 +3866,176 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
}
break;
}
+ case X86::VFCMADDCPHZ128m:
+ case X86::VFCMADDCPHZ256m:
+ case X86::VFCMADDCPHZm:
+ case X86::VFCMADDCPHZ128mb:
+ case X86::VFCMADDCPHZ256mb:
+ case X86::VFCMADDCPHZmb:
+ case X86::VFCMADDCPHZ128mbk:
+ case X86::VFCMADDCPHZ256mbk:
+ case X86::VFCMADDCPHZmbk:
+ case X86::VFCMADDCPHZ128mbkz:
+ case X86::VFCMADDCPHZ256mbkz:
+ case X86::VFCMADDCPHZmbkz:
+ case X86::VFCMADDCPHZ128mk:
+ case X86::VFCMADDCPHZ256mk:
+ case X86::VFCMADDCPHZmk:
+ case X86::VFCMADDCPHZ128mkz:
+ case X86::VFCMADDCPHZ256mkz:
+ case X86::VFCMADDCPHZmkz:
+ case X86::VFCMADDCPHZ128r:
+ case X86::VFCMADDCPHZ256r:
+ case X86::VFCMADDCPHZr:
+ case X86::VFCMADDCPHZ128rk:
+ case X86::VFCMADDCPHZ256rk:
+ case X86::VFCMADDCPHZrk:
+ case X86::VFCMADDCPHZ128rkz:
+ case X86::VFCMADDCPHZ256rkz:
+ case X86::VFCMADDCPHZrkz:
+ case X86::VFCMADDCPHZrb:
+ case X86::VFCMADDCPHZrbk:
+ case X86::VFCMADDCPHZrbkz:
+ case X86::VFCMADDCSHZm:
+ case X86::VFCMADDCSHZmk:
+ case X86::VFCMADDCSHZmkz:
+ case X86::VFCMADDCSHZr:
+ case X86::VFCMADDCSHZrb:
+ case X86::VFCMADDCSHZrbk:
+ case X86::VFCMADDCSHZrbkz:
+ case X86::VFCMADDCSHZrk:
+ case X86::VFCMADDCSHZrkz:
+ case X86::VFMADDCPHZ128m:
+ case X86::VFMADDCPHZ256m:
+ case X86::VFMADDCPHZm:
+ case X86::VFMADDCPHZ128mb:
+ case X86::VFMADDCPHZ256mb:
+ case X86::VFMADDCPHZmb:
+ case X86::VFMADDCPHZ128mbk:
+ case X86::VFMADDCPHZ256mbk:
+ case X86::VFMADDCPHZmbk:
+ case X86::VFMADDCPHZ128mbkz:
+ case X86::VFMADDCPHZ256mbkz:
+ case X86::VFMADDCPHZmbkz:
+ case X86::VFMADDCPHZ128mk:
+ case X86::VFMADDCPHZ256mk:
+ case X86::VFMADDCPHZmk:
+ case X86::VFMADDCPHZ128mkz:
+ case X86::VFMADDCPHZ256mkz:
+ case X86::VFMADDCPHZmkz:
+ case X86::VFMADDCPHZ128r:
+ case X86::VFMADDCPHZ256r:
+ case X86::VFMADDCPHZr:
+ case X86::VFMADDCPHZ128rk:
+ case X86::VFMADDCPHZ256rk:
+ case X86::VFMADDCPHZrk:
+ case X86::VFMADDCPHZ128rkz:
+ case X86::VFMADDCPHZ256rkz:
+ case X86::VFMADDCPHZrkz:
+ case X86::VFMADDCPHZrb:
+ case X86::VFMADDCPHZrbk:
+ case X86::VFMADDCPHZrbkz:
+ case X86::VFMADDCSHZm:
+ case X86::VFMADDCSHZmk:
+ case X86::VFMADDCSHZmkz:
+ case X86::VFMADDCSHZr:
+ case X86::VFMADDCSHZrb:
+ case X86::VFMADDCSHZrbk:
+ case X86::VFMADDCSHZrbkz:
+ case X86::VFMADDCSHZrk:
+ case X86::VFMADDCSHZrkz: {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 2; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ break;
+ }
+ case X86::VFCMULCPHZ128rm:
+ case X86::VFCMULCPHZ256rm:
+ case X86::VFCMULCPHZrm:
+ case X86::VFCMULCPHZ128rmb:
+ case X86::VFCMULCPHZ256rmb:
+ case X86::VFCMULCPHZrmb:
+ case X86::VFCMULCPHZ128rmbk:
+ case X86::VFCMULCPHZ256rmbk:
+ case X86::VFCMULCPHZrmbk:
+ case X86::VFCMULCPHZ128rmbkz:
+ case X86::VFCMULCPHZ256rmbkz:
+ case X86::VFCMULCPHZrmbkz:
+ case X86::VFCMULCPHZ128rmk:
+ case X86::VFCMULCPHZ256rmk:
+ case X86::VFCMULCPHZrmk:
+ case X86::VFCMULCPHZ128rmkz:
+ case X86::VFCMULCPHZ256rmkz:
+ case X86::VFCMULCPHZrmkz:
+ case X86::VFCMULCPHZ128rr:
+ case X86::VFCMULCPHZ256rr:
+ case X86::VFCMULCPHZrr:
+ case X86::VFCMULCPHZ128rrk:
+ case X86::VFCMULCPHZ256rrk:
+ case X86::VFCMULCPHZrrk:
+ case X86::VFCMULCPHZ128rrkz:
+ case X86::VFCMULCPHZ256rrkz:
+ case X86::VFCMULCPHZrrkz:
+ case X86::VFCMULCPHZrrb:
+ case X86::VFCMULCPHZrrbk:
+ case X86::VFCMULCPHZrrbkz:
+ case X86::VFCMULCSHZrm:
+ case X86::VFCMULCSHZrmk:
+ case X86::VFCMULCSHZrmkz:
+ case X86::VFCMULCSHZrr:
+ case X86::VFCMULCSHZrrb:
+ case X86::VFCMULCSHZrrbk:
+ case X86::VFCMULCSHZrrbkz:
+ case X86::VFCMULCSHZrrk:
+ case X86::VFCMULCSHZrrkz:
+ case X86::VFMULCPHZ128rm:
+ case X86::VFMULCPHZ256rm:
+ case X86::VFMULCPHZrm:
+ case X86::VFMULCPHZ128rmb:
+ case X86::VFMULCPHZ256rmb:
+ case X86::VFMULCPHZrmb:
+ case X86::VFMULCPHZ128rmbk:
+ case X86::VFMULCPHZ256rmbk:
+ case X86::VFMULCPHZrmbk:
+ case X86::VFMULCPHZ128rmbkz:
+ case X86::VFMULCPHZ256rmbkz:
+ case X86::VFMULCPHZrmbkz:
+ case X86::VFMULCPHZ128rmk:
+ case X86::VFMULCPHZ256rmk:
+ case X86::VFMULCPHZrmk:
+ case X86::VFMULCPHZ128rmkz:
+ case X86::VFMULCPHZ256rmkz:
+ case X86::VFMULCPHZrmkz:
+ case X86::VFMULCPHZ128rr:
+ case X86::VFMULCPHZ256rr:
+ case X86::VFMULCPHZrr:
+ case X86::VFMULCPHZ128rrk:
+ case X86::VFMULCPHZ256rrk:
+ case X86::VFMULCPHZrrk:
+ case X86::VFMULCPHZ128rrkz:
+ case X86::VFMULCPHZ256rrkz:
+ case X86::VFMULCPHZrrkz:
+ case X86::VFMULCPHZrrb:
+ case X86::VFMULCPHZrrbk:
+ case X86::VFMULCPHZrrbkz:
+ case X86::VFMULCSHZrm:
+ case X86::VFMULCSHZrmk:
+ case X86::VFMULCSHZrmkz:
+ case X86::VFMULCSHZrr:
+ case X86::VFMULCSHZrrb:
+ case X86::VFMULCSHZrrbk:
+ case X86::VFMULCSHZrrbkz:
+ case X86::VFMULCSHZrrk:
+ case X86::VFMULCSHZrrkz: {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 1; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ break;
+ }
}
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 64436da7c0a07..a97fc71969c7e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26063,6 +26063,35 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
+ case FMA_OP_MASKZ:
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+
+ SDValue PassThru = Src1;
+ if (IntrData->Type == FMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ // We add rounding mode to the Node when
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
+ }
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
@@ -32341,6 +32370,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FNMSUB_RND)
NODE_NAME_CASE(FMADDSUB_RND)
NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VFMADDC)
+ NODE_NAME_CASE(VFMADDC_RND)
+ NODE_NAME_CASE(VFCMADDC)
+ NODE_NAME_CASE(VFCMADDC_RND)
+ NODE_NAME_CASE(VFMULC)
+ NODE_NAME_CASE(VFMULC_RND)
+ NODE_NAME_CASE(VFCMULC)
+ NODE_NAME_CASE(VFCMULC_RND)
+ NODE_NAME_CASE(VFMULCSH)
+ NODE_NAME_CASE(VFMULCSH_RND)
+ NODE_NAME_CASE(VFCMULCSH)
+ NODE_NAME_CASE(VFCMULCSH_RND)
+ NODE_NAME_CASE(VFMADDCSH)
+ NODE_NAME_CASE(VFMADDCSH_RND)
+ NODE_NAME_CASE(VFCMADDCSH)
+ NODE_NAME_CASE(VFCMADDCSH_RND)
NODE_NAME_CASE(VPMADD52H)
NODE_NAME_CASE(VPMADD52L)
NODE_NAME_CASE(VRNDSCALE)
@@ -47377,11 +47422,141 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to combine the following nodes
+// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
+// <i32 -2147483648[float -0.000000e+00]> 0
+// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
+// <(load 4 from constant-pool)> t0, t29
+// [t30: v16i32 = bitcast t27]
+// t6: v16i32 = xor t7, t27[t30]
+// t11: v16f32 = bitcast t6
+// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
+// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
+// t22: v16f32 = bitcast t7
+// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
+// t24: v32f16 = bitcast t23
+static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ int CombineOpcode =
+ N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
+ auto isConjugationConstant = [](const Constant *c) {
+ if (const auto *CI = dyn_cast<ConstantInt>(c)) {
+ APInt ConjugationInt32 = APInt(32, 0x80000000, true);
+ APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
+ switch (CI->getBitWidth()) {
+ case 16:
+ return false;
+ case 32:
+ return CI->getValue() == ConjugationInt32;
+ case 64:
+ return CI->getValue() == ConjugationInt64;
+ default:
+ llvm_unreachable("Unexpected bit width");
+ }
+ }
+ if (const auto *CF = dyn_cast<ConstantFP>(c))
+ return CF->isNegativeZeroValue();
+ return false;
+ };
+ auto combineConjugation = [&](SDValue &r) {
+ if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+ SDValue XOR = LHS.getOperand(0);
+ if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+ SDValue XORRHS = XOR.getOperand(1);
+ if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
+ XORRHS = XORRHS.getOperand(0);
+ if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ XORRHS.getOperand(1).getNumOperands()) {
+ ConstantPoolSDNode *CP =
+ dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
+ if (CP && isConjugationConstant(CP->getConstVal())) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+ SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
+ SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
+ r = DAG.getBitcast(VT, FCMulC);
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ };
+ SDValue Res;
+ if (combineConjugation(Res))
+ return Res;
+ std::swap(LHS, RHS);
+ if (combineConjugation(Res))
+ return Res;
+ return Res;
+}
+
+// Try to combine the following nodes
+// t21: v16f32 = X86ISD::VFMULC/VFCMULC t7, t8
+// t15: v32f16 = bitcast t21
+// t16: v32f16 = fadd nnan ninf nsz arcp contract afn reassoc t15, t2
+// into X86ISD::VFMADDC/VFCMADDC if possible:
+// t22: v16f32 = bitcast t2
+// t23: v16f32 = nnan ninf nsz arcp contract afn reassoc
+// X86ISD::VFMADDC/VFCMADDC t7, t8, t22
+// t24: v32f16 = bitcast t23
+static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto AllowContract = [&DAG](SDNode *N) {
+ return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ N->getFlags().hasAllowContract();
+ };
+ if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() || !AllowContract(N))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue CFmul, FAddOp1;
+ auto GetCFmulFrom = [&CFmul, &AllowContract](SDValue N) -> bool {
+ if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
+ return false;
+ SDValue Op0 = N.getOperand(0);
+ unsigned Opcode = Op0.getOpcode();
+ if (Op0.hasOneUse() && AllowContract(Op0.getNode()) &&
+ (Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC))
+ CFmul = Op0;
+ return !!CFmul;
+ };
+
+ if (GetCFmulFrom(LHS))
+ FAddOp1 = RHS;
+ else if (GetCFmulFrom(RHS))
+ FAddOp1 = LHS;
+ else
+ return SDValue();
+
+ MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
+ assert(CFmul->getValueType(0) == CVT && "Complex type mismatch");
+ FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
+ unsigned newOp = CFmul.getOpcode() == X86ISD::VFMULC ? X86ISD::VFMADDC
+ : X86ISD::VFCMADDC;
+  // FIXME: How do we handle when fast math flags of FADD are different from
+  // CFMUL's?
+ CFmul = DAG.getNode(newOp, SDLoc(N), CVT, FAddOp1, CFmul.getOperand(0),
+ CFmul.getOperand(1), N->getFlags());
+ return DAG.getBitcast(VT, CFmul);
+}
+
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
return HOp;
+
+ if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
+ return COp;
+
return SDValue();
}
@@ -51985,6 +52160,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case X86ISD::VFCMULC:
+ case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
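
[Editor's note, not part of the patch] A rough source-level view of the two DAG combines added above: combineFMulcFCMulc recognizes a complex multiply whose operand was conjugated by XORing the sign bit of the imaginary half (a broadcast -0.0 constant) and flips VFMULC into VFCMULC (or vice versa); combineFaddCFmul folds a complex multiply feeding a plain FP16 add into a single complex FMA when contraction is allowed (the FADD carries the contract fast-math flag, or the target uses fp-contract=fast). The sketch below is illustrative only; whether the fold actually fires also depends on the one-use and bitcast checks in the combine.

  #include <immintrin.h>

  __m128h mul_then_add(__m128h a, __m128h b, __m128h acc) {
    __m128h prod = _mm_fmul_pch(a, b);  /* lowers to X86ISD::VFMULC            */
    return _mm_add_ph(prod, acc);       /* the FADD that combineFaddCFmul sees */
  }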
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index be2d5db64b04a..2e043b83149e3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -566,6 +566,27 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // AVX512-FP16 complex addition and multiplication.
+ VFMADDC,
+ VFMADDC_RND,
+ VFCMADDC,
+ VFCMADDC_RND,
+
+ VFMULC,
+ VFMULC_RND,
+ VFCMULC,
+ VFCMULC_RND,
+
+ VFMADDCSH,
+ VFMADDCSH_RND,
+ VFCMADDCSH,
+ VFCMADDCSH_RND,
+
+ VFMULCSH,
+ VFMULCSH_RND,
+ VFCMULCSH,
+ VFCMULCSH_RND,
+
// Compress and expand.
COMPRESS,
EXPAND,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index df3e1554320ef..b4f1730cbdd10 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -205,8 +205,9 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
- bit IsKZCommutable = IsCommutable> {
- let isCommutable = IsCommutable in
+ bit IsKZCommutable = IsCommutable,
+ string ClobberConstraint = ""> {
+ let isCommutable = IsCommutable, Constraints = ClobberConstraint in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
@@ -220,12 +221,15 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
MaskingPattern>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
- string Constraints = MaskingConstraint;
+ string Constraints = !if(!eq(ClobberConstraint, ""), MaskingConstraint,
+ !if(!eq(MaskingConstraint, ""), ClobberConstraint,
+ !strconcat(ClobberConstraint, ", ", MaskingConstraint)));
}
// Zero mask does not add any restrictions to commute operands transformation.
// So, it is Ok to use IsCommutable instead of IsKCommutable.
- let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsKZCommutable, // Prefer over VMOV*rrkz Pat<>
+ Constraints = ClobberConstraint in
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -245,7 +249,8 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
- bit IsKZCommutable = IsCommutable> :
+ bit IsKZCommutable = IsCommutable,
+ string ClobberConstraint = ""> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
@@ -253,7 +258,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
MaskingConstraint, IsCommutable,
- IsKCommutable, IsKZCommutable>;
+ IsKCommutable, IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -263,6 +268,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
+ string ClobberConstraint = "",
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins,
@@ -275,7 +281,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
"$src0 = $dst", IsCommutable, IsKCommutable,
- IsKZCommutable>;
+ IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -286,14 +292,15 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag RHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable,
- SDPatternOperator Select = vselect_mask> :
+ SDPatternOperator Select = vselect_mask,
+ string ClobberConstraint = ""> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(Select _.KRCWM:$mask, RHS, _.RC:$src0),
Select, "$src0 = $dst", IsCommutable, IsKCommutable,
- IsKZCommutable>;
+ IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -5742,43 +5749,47 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
SDPatternOperator MaskOpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
- bit IsKCommutable = IsCommutable> {
+ bit IsKCommutable = IsCommutable,
+ string suffix = _.Suffix,
+ string ClobberConstraint = "",
+ bit MayRaiseFPException = 1> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0,
- Uses = [MXCSR], mayRaiseFPException = 1 in {
+ Uses = [MXCSR], mayRaiseFPException = MayRaiseFPException in {
defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
- IsKCommutable, IsKCommutable>,
- EVEX_4V, Sched<[sched]>;
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), ClobberConstraint,
+ IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
- (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
- "${src2}"#_.BroadcastStr#", $src1",
- "$src1, ${src2}"#_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
- (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
- EVEX_4V, EVEX_B,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
}
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNodeRnd,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string suffix = _.Suffix,
+ string ClobberConstraint = ""> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))),
+ 0, 0, 0, vselect_mask, ClobberConstraint>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -13510,3 +13521,132 @@ let Predicates = [HasFP16, HasVLX] in {
v8f16x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTUQQ2PHZ128rmbkz VK2WM:$mask, addr:$src)>;
}
+
+let Constraints = "@earlyclobber $dst, $src1 = $dst" in {
+ multiclass avx512_cfmop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, EVEX_4V;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V;
+ }
+} // Constraints = "@earlyclobber $dst, $src1 = $dst"
+
+multiclass avx512_cfmop_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let Constraints = "@earlyclobber $dst, $src1 = $dst" in
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 timm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+
+multiclass avx512_cfmaop_common<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_cfmop_rm<opc, OpcodeStr, OpNode, v16f32_info>,
+ avx512_cfmop_round<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, Sched<[WriteFMAZ]>;
+ }
+ let Predicates = [HasVLX, HasFP16] in {
+ defm Z256 : avx512_cfmop_rm<opc, OpcodeStr, OpNode, v8f32x_info>, EVEX_V256, Sched<[WriteFMAY]>;
+ defm Z128 : avx512_cfmop_rm<opc, OpcodeStr, OpNode, v4f32x_info>, EVEX_V128, Sched<[WriteFMAX]>;
+ }
+}
+
+multiclass avx512_cfmulop_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched = SchedWriteFMA> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
+ sched.ZMM, 0, 0, "", "@earlyclobber $dst", 0>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.ZMM, v16f32_info,
+ "", "@earlyclobber $dst">, EVEX_V512;
+ }
+ let Predicates = [HasVLX, HasFP16] in {
+ defm Z256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
+ sched.YMM, 0, 0, "", "@earlyclobber $dst", 0>, EVEX_V256;
+ defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
+ sched.XMM, 0, 0, "", "@earlyclobber $dst", 0>, EVEX_V128;
+ }
+}
+
+
+let Uses = [MXCSR] in {
+ defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd>,
+ T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd>,
+ T_MAP6XD, EVEX_CD8<32, CD8VF>;
+
+ defm VFMULCPH : avx512_cfmulop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc,
+ x86vfmulcRnd>, T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ defm VFCMULCPH : avx512_cfmulop_common<0xD6, "vfcmulcph", x86vfcmulc,
+ x86vfcmulc, x86vfcmulcRnd>, T_MAP6XD, EVEX_CD8<32, CD8VF>;
+}
+
+
+multiclass avx512_cfmop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched = SchedWriteFMA> {
+ let Predicates = [HasFP16], Constraints = "@earlyclobber $dst, $src1 = $dst" in {
+ defm r : AVX512_maskable_3src<opc, MRMSrcReg, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, VR128X:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3",
+ (v4f32 (OpNode VR128X:$src1, VR128X:$src2, VR128X:$src3))>,
+ Sched<[sched.XMM]>;
+ defm m : AVX512_maskable_3src<opc, MRMSrcMem, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, ssmem:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3",
+ (v4f32 (OpNode VR128X:$src1, VR128X:$src2, (sse_load_f32 addr:$src3)))>,
+ Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>;
+ defm rb : AVX512_maskable_3src<opc, MRMSrcReg, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, VR128X:$src3, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (v4f32 (OpNodeRnd VR128X:$src1, VR128X:$src2, VR128X:$src3, (i32 timm:$rc)))>,
+ EVEX_B, EVEX_RC, Sched<[sched.XMM]>;
+ }
+}
+
+multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86SchedWriteWidths sched = SchedWriteFMA> {
+ let Predicates = [HasFP16] in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (v4f32 (OpNode VR128X:$src1, VR128X:$src2)),
+ 0, 0, 0, X86selects, "@earlyclobber $dst">, Sched<[sched.XMM]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, ssmem:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (v4f32 (OpNode VR128X:$src1, (sse_load_f32 addr:$src2))),
+ 0, 0, 0, X86selects, "@earlyclobber $dst">,
+ Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>;
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (OpNodeRnd (v4f32 VR128X:$src1), (v4f32 VR128X:$src2), (i32 timm:$rc)),
+ 0, 0, 0, X86selects, "@earlyclobber $dst">,
+ EVEX_B, EVEX_RC, Sched<[sched.XMM]>;
+ }
+}
+
+let Uses = [MXCSR] in {
+ defm VFMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd>,
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ defm VFCMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd>,
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+
+ defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd>,
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd>,
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+}
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 235f0d4b92613..6d4ad08842c76 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -1846,6 +1846,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE },
{ X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE },
{ X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rr, X86::VFCMULCPHZ128rm, 0 },
+ { X86::VFCMULCPHZ256rr, X86::VFCMULCPHZ256rm, 0 },
+ { X86::VFCMULCPHZrr, X86::VFCMULCPHZrm, 0 },
+ { X86::VFCMULCSHZrr, X86::VFCMULCSHZrm, TB_NO_REVERSE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 },
@@ -1870,6 +1874,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rm, 0 },
+ { X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rm, 0 },
+ { X86::VFMULCPHZrr, X86::VFMULCPHZrm, 0 },
+ { X86::VFMULCSHZrr, X86::VFMULCSHZrm, TB_NO_REVERSE },
{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 },
@@ -3275,6 +3283,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE },
{ X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE },
{ X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE },
+ { X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128m, 0 },
+ { X86::VFCMADDCPHZ256r, X86::VFCMADDCPHZ256m, 0 },
+ { X86::VFCMADDCPHZr, X86::VFCMADDCPHZm, 0 },
+ { X86::VFCMADDCSHZr, X86::VFCMADDCSHZm, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rrkz, X86::VFCMULCPHZ128rmkz, 0 },
+ { X86::VFCMULCPHZ256rrkz, X86::VFCMULCPHZ256rmkz, 0 },
+ { X86::VFCMULCPHZrrkz, X86::VFCMULCPHZrmkz, 0 },
+ { X86::VFCMULCSHZrrkz, X86::VFCMULCSHZrmkz, TB_NO_REVERSE },
{ X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 },
{ X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 },
{ X86::VFIXUPIMMPDZrri, X86::VFIXUPIMMPDZrmi, 0 },
@@ -3352,6 +3368,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
{ X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADDCPHZ128r, X86::VFMADDCPHZ128m, 0 },
+ { X86::VFMADDCPHZ256r, X86::VFMADDCPHZ256m, 0 },
+ { X86::VFMADDCPHZr, X86::VFMADDCPHZm, 0 },
+ { X86::VFMADDCSHZr, X86::VFMADDCSHZm, TB_NO_REVERSE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 },
@@ -3523,6 +3543,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMULCPHZ128rrkz, X86::VFMULCPHZ128rmkz, 0 },
+ { X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmkz, 0 },
+ { X86::VFMULCPHZrrkz, X86::VFMULCPHZrmkz, 0 },
+ { X86::VFMULCSHZrrkz, X86::VFMULCSHZrmkz, TB_NO_REVERSE },
{ X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 },
{ X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 },
{ X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
@@ -4655,6 +4679,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 },
{ X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 },
{ X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 },
+ { X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0 },
+ { X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0 },
+ { X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0 },
+ { X86::VFCMADDCPHZ256rkz, X86::VFCMADDCPHZ256mkz, 0 },
+ { X86::VFCMADDCPHZrk, X86::VFCMADDCPHZmk, 0 },
+ { X86::VFCMADDCPHZrkz, X86::VFCMADDCPHZmkz, 0 },
+ { X86::VFCMADDCSHZrk, X86::VFCMADDCSHZmk, TB_NO_REVERSE },
+ { X86::VFCMADDCSHZrkz, X86::VFCMADDCSHZmkz, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rrk, X86::VFCMULCPHZ128rmk, 0 },
+ { X86::VFCMULCPHZ256rrk, X86::VFCMULCPHZ256rmk, 0 },
+ { X86::VFCMULCPHZrrk, X86::VFCMULCPHZrmk, 0 },
+ { X86::VFCMULCSHZrrk, X86::VFCMULCSHZrmk, TB_NO_REVERSE },
{ X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
{ X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
{ X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
@@ -4743,6 +4779,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0 },
+ { X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0 },
+ { X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0 },
+ { X86::VFMADDCPHZ256rkz, X86::VFMADDCPHZ256mkz, 0 },
+ { X86::VFMADDCPHZrk, X86::VFMADDCPHZmk, 0 },
+ { X86::VFMADDCPHZrkz, X86::VFMADDCPHZmkz, 0 },
+ { X86::VFMADDCSHZrk, X86::VFMADDCSHZmk, TB_NO_REVERSE },
+ { X86::VFMADDCSHZrkz, X86::VFMADDCSHZmkz, TB_NO_REVERSE },
{ X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
{ X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 },
{ X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 },
@@ -4923,6 +4967,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 },
{ X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 },
{ X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 },
+ { X86::VFMULCPHZ128rrk, X86::VFMULCPHZ128rmk, 0 },
+ { X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmk, 0 },
+ { X86::VFMULCPHZrrk, X86::VFMULCPHZrmk, 0 },
+ { X86::VFMULCSHZrrk, X86::VFMULCSHZrmk, TB_NO_REVERSE },
{ X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 },
{ X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 },
{ X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 },
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index b562c360c359e..bb323bfabdf47 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -572,6 +572,24 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+def x86vfmaddc : SDNode<"X86ISD::VFMADDC", SDTFPTernaryOp>;
+def x86vfmaddcRnd : SDNode<"X86ISD::VFMADDC_RND", SDTFmaRound>;
+def x86vfcmaddc : SDNode<"X86ISD::VFCMADDC", SDTFPTernaryOp>;
+def x86vfcmaddcRnd : SDNode<"X86ISD::VFCMADDC_RND", SDTFmaRound>;
+def x86vfmulc : SDNode<"X86ISD::VFMULC", SDTFPBinOp>;
+def x86vfmulcRnd : SDNode<"X86ISD::VFMULC_RND", SDTFPBinOpRound>;
+def x86vfcmulc : SDNode<"X86ISD::VFCMULC", SDTFPBinOp>;
+def x86vfcmulcRnd : SDNode<"X86ISD::VFCMULC_RND", SDTFPBinOpRound>;
+
+def x86vfmaddcSh : SDNode<"X86ISD::VFMADDCSH", SDTFPTernaryOp>;
+def x86vfcmaddcSh : SDNode<"X86ISD::VFCMADDCSH", SDTFPTernaryOp>;
+def x86vfmulcSh : SDNode<"X86ISD::VFMULCSH", SDTFPBinOp>;
+def x86vfcmulcSh : SDNode<"X86ISD::VFCMULCSH", SDTFPBinOp>;
+def x86vfmaddcShRnd : SDNode<"X86ISD::VFMADDCSH_RND", SDTFmaRound>;
+def x86vfcmaddcShRnd : SDNode<"X86ISD::VFCMADDCSH_RND",SDTFmaRound>;
+def x86vfmulcShRnd : SDNode<"X86ISD::VFMULCSH_RND", SDTFPBinOpRound>;
+def x86vfcmulcShRnd : SDNode<"X86ISD::VFCMULCSH_RND", SDTFPBinOpRound>;
+
def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index b0e1e808a5369..869753ec84deb 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -24,6 +24,7 @@ enum IntrinsicType : uint16_t {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
INTR_TYPE_3OP_IMM8,
+ FMA_OP_MASK, FMA_OP_MASKZ,
CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
@@ -1160,6 +1161,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_256, TRUNCATE_TO_REG,
X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_128, FMA_OP_MASK, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_256, FMA_OP_MASK, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_512, FMA_OP_MASK, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_csh, FMA_OP_MASK, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, X86ISD::VFCMULC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFCMULCSH, X86ISD::VFCMULCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_128, FMA_OP_MASK, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_256, FMA_OP_MASK, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_512, FMA_OP_MASK, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_csh, FMA_OP_MASK, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, X86ISD::VFMULC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFMULCSH, X86ISD::VFMULCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_128, FMA_OP_MASKZ, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_256, FMA_OP_MASKZ, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_512, FMA_OP_MASKZ, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_csh, FMA_OP_MASKZ, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_128, FMA_OP_MASKZ, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_256, FMA_OP_MASKZ, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_512, FMA_OP_MASKZ, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_csh, FMA_OP_MASKZ, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
diff --git a/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll
new file mode 100644
index 0000000000000..4302b03b7dbd2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_bst(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_bst:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_maskz_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp8_cfmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcph %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1)
+ ret <4 x float> %res
+}
+
+
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @test_int_x86_avx512fp16_mask_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_maskz_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcph %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1)
+ ret <8 x float> %res
+}
+
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_int_x86_avx512fp16_mask_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_maskz_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcph %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_maskz_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfcmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp8_cfcmadd_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcph %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1)
+ ret <4 x float> %res
+}
+
+
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @test_int_x86_avx512fp16_mask_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcph %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1)
+ ret <8 x float> %res
+}
+
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_int_x86_avx512fp16_mask_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcph %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4)
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll
new file mode 100644
index 0000000000000..2b98e429ed3f5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfmul_ph_bst(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmul_ph_bst:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %x0, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_maskz_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp8_cfmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1)
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @test_int_x86_avx512fp16_mask_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_maskz_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> zeroinitializer, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1)
+ ret <8 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_int_x86_avx512fp16_mask_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_maskz_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> zeroinitializer, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfmul_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_512_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcph {rz-sae}, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @test_int_x86_avx512fp8_mask_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_maskz_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfcmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp8_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp8_cfcmul_ph_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1)
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @test_int_x86_avx512fp16_mask_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_maskz_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> zeroinitializer, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_int_x86_avx512fp16_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1)
+ ret <8 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_int_x86_avx512fp16_mask_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_maskz_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> zeroinitializer, i16 %x3, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfcmul_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_512_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcph {rz-sae}, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_int_x86_avx512fp16_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4)
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
new file mode 100644
index 0000000000000..8b273b6d8006c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+;; no mask, no rounding
+
+define <4 x float> @test_nm_nr_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1) {
+; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_mask_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_nr_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1) {
+; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_mask_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+;; no mask, rounding
+
+define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1) {
+; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1) {
+; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9)
+ ret <4 x float> %res
+}
+
+;; mask, no rounding
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+;; mask, rounding
+
+define <4 x float> @test_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+;; maskz, no rounding
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ ret <4 x float> %res
+}
+
+;; maskz, rounding
+
+define <4 x float> @test_int_x86_avx512fp16_maskz_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_maskz_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_sh:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9)
+ ret <4 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
new file mode 100644
index 0000000000000..9afe46e9e7c63
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
+
+define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <32 x half> %lhs.coerce to <16 x float>
+ %1 = bitcast <32 x half> %rhs.coerce to <16 x float>
+ %2 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+ %3 = bitcast <16 x float> %2 to <32 x half>
+ %add.i.i = fadd fast <32 x half> %3, %acc.coerce
+ ret <32 x half> %add.i.i
+}
+
+define dso_local <16 x half> @test2(<16 x half> %acc.coerce, <16 x half> %lhs.coerce, <16 x half> %rhs.coerce) {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <16 x half> %lhs.coerce to <8 x float>
+ %1 = bitcast <16 x half> %rhs.coerce to <8 x float>
+ %2 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
+ %3 = bitcast <8 x float> %2 to <16 x half>
+ %add.i.i = fadd fast <16 x half> %3, %acc.coerce
+ ret <16 x half> %add.i.i
+}
+
+define dso_local <8 x half> @test3(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce to <4 x float>
+ %1 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %2 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
+ %3 = bitcast <4 x float> %2 to <8 x half>
+ %add.i.i = fadd fast <8 x half> %3, %acc.coerce
+ ret <8 x half> %add.i.i
+}
+
+
+define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce to <4 x float>
+ %1 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %2 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
+ %3 = bitcast <4 x float> %2 to <8 x half>
+ %add.i.i = fadd fast <8 x half> %acc.coerce, %3
+ ret <8 x half> %add.i.i
+}
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
new file mode 100644
index 0000000000000..1d413ad0c1065
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
+
+define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
+ %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <16 x i32> %xor.i.i to <16 x float>
+ %2 = bitcast <32 x half> %rhs.coerce to <16 x float>
+ %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %2, <16 x float> zeroinitializer, i16 -1, i32 4) #2
+ %4 = bitcast <16 x float> %3 to <32 x half>
+ %add = fadd fast <32 x half> %4, %acc.coerce
+ ret <32 x half> %add
+}
+
+define dso_local <32 x half> @test2(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
+ %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <16 x i32> %xor.i.i to <16 x float>
+ %2 = bitcast <32 x half> %rhs.coerce to <16 x float>
+ %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %2, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2
+ %4 = bitcast <16 x float> %3 to <32 x half>
+ %add = fadd fast <32 x half> %4, %acc.coerce
+ ret <32 x half> %add
+}
+
+define dso_local <16 x half> @test3(<16 x half> %acc.coerce, <16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32>
+ %xor.i.i = xor <8 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <8 x i32> %xor.i.i to <8 x float>
+ %2 = bitcast <16 x half> %rhs.coerce to <8 x float>
+ %3 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %1, <8 x float> %2, <8 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <8 x float> %3 to <16 x half>
+ %add = fadd fast <16 x half> %4, %acc.coerce
+ ret <16 x half> %add
+}
+
+define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ %add = fadd fast <8 x half> %4, %acc.coerce
+ ret <8 x half> %add
+}
+
+define dso_local <8 x half> @test5(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ %add = fadd fast <8 x half> %4, %acc.coerce
+ ret <8 x half> %add
+}
+
+define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %0
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ %add = fadd fast <8 x half> %4, %acc.coerce
+ ret <8 x half> %add
+}
+
+define dso_local <8 x half> @test7(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ %add = fadd fast <8 x half> %acc.coerce, %4
+ ret <8 x half> %add
+}
+
+define dso_local <8 x half> @test8(<8 x half> %acc.coerce, <4 x float> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x float> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ %add = fadd fast <8 x half> %acc.coerce, %4
+ ret <8 x half> %add
+}
+
+define dso_local <32 x half> @test9(<32 x half> %acc.coerce, <8 x i64> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %xor1.i = xor <8 x i64> %lhs.coerce.conj, <i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160>
+ %0 = bitcast <8 x i64> %xor1.i to <16 x float>
+ %1 = bitcast <32 x half> %rhs.coerce to <16 x float>
+ %2 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2
+ %3 = bitcast <16 x float> %2 to <32 x half>
+ %add = fadd fast <32 x half> %3, %acc.coerce
+ ret <32 x half> %add
+}
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
new file mode 100644
index 0000000000000..29d6fcb037a69
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
+
+define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmulcph %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
+ %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <16 x i32> %xor.i.i to <16 x float>
+ %2 = bitcast <32 x half> %rhs.coerce to <16 x float>
+ %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %2, <16 x float> zeroinitializer, i16 -1, i32 4) #2
+ %4 = bitcast <16 x float> %3 to <32 x half>
+ ret <32 x half> %4
+}
+
+; Function Attrs: nounwind readnone uwtable
+define dso_local <16 x half> @test2(<16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmulcph %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32>
+ %xor.i.i = xor <8 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <8 x i32> %xor.i.i to <8 x float>
+ %2 = bitcast <16 x half> %rhs.coerce to <8 x float>
+ %3 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %1, <8 x float> %2, <8 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <8 x float> %3 to <16 x half>
+ ret <16 x half> %4
+}
+
+define dso_local <8 x half> @test3(<8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfcmulcph %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ ret <8 x half> %4
+}
+
+define dso_local <8 x half> @test4(<8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfmulcph %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
+ %xor.i.i = xor <4 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
+ %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
+ %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
+ %4 = bitcast <4 x float> %3 to <8 x half>
+ ret <8 x half> %4
+}
+
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index ef84bf32619e0..495058c08b895 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -989,5 +989,225 @@ define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) {
ret <8 x half> %5
}
+define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulcph:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmulcph_mask:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x float>, <16 x float>* %passthru
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
+ ret <16 x float> %3
+}
+
+define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmulcph_maskz:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+
+define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulcph:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulcph_mask:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x float>, <16 x float>* %passthru
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
+ ret <16 x float> %3
+}
+
+define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulcph_maskz:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+
+define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddcph:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @stack_fold_fmaddcph_mask(<16 x float>* %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddcph_mask:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x float>, <16 x float>* %p
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddcph_maskz:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> zeroinitializer, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddcph:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @stack_fold_fcmaddcph_mask(<16 x float>* %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddcph_mask:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x float>, <16 x float>* %p
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddcph_maskz:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> zeroinitializer, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulcsh:
+ ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmulcsh_mask:
+ ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <4 x float>, <4 x float>* %passthru
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmulcsh_maskz:
+ ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulcsh:
+ ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulcsh_mask:
+ ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <4 x float>, <4 x float>* %passthru
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulcsh_maskz:
+ ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddcsh:
+ ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @stack_fold_fmaddcsh_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddcsh_mask:
+ ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x float>, <4 x float>* %p
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddcsh_maskz:
+ ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddcsh:
+ ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @stack_fold_fcmaddcsh_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddcsh_mask:
+ ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x float>, <4 x float>* %p
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddcsh_maskz:
+ ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index 92cb57f27b9ab..d988f04e34d4a 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -588,5 +588,225 @@ define <16 x half> @stack_fold_subph_ymm(<16 x half> %a0, <16 x half> %a1) {
ret <16 x half> %2
}
+define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulc:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmulc_mask:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <4 x float>, <4 x float>* %passthru
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmulc_maskz:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulc:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulc_mask:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <4 x float>, <4 x float>* %passthru
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulc_maskz:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
+ ret <4 x float> %3
+}
+
+define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddc:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_fmaddc_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddc_mask:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x float>, <4 x float>* %p
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddc_maskz:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddc:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_fcmaddc_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddc_mask:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x float>, <4 x float>* %p
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddc_maskz:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulc_ymm:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmulc_mask_ymm:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x float>, <8 x float>* %passthru
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
+ ret <8 x float> %3
+}
+
+define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmulc_maskz_ymm:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
+ ret <8 x float> %3
+}
+
+define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulc_ymm:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulc_mask_ymm:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x float>, <8 x float>* %passthru
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
+ ret <8 x float> %3
+}
+
+define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmulc_maskz_ymm:
+ ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
+ ret <8 x float> %3
+}
+
+define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddc_ymm:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1)
+ ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_fmaddc_mask_ymm(<8 x float>* %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddc_mask_ymm:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x float>, <8 x float>* %p
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask)
+ ret <8 x float> %2
+}
+
+define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fmaddc_maskz_ymm:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> zeroinitializer, <8 x float> %a1, <8 x float> %a2, i8 %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddc_ymm:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1)
+ ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_fcmaddc_mask_ymm(<8 x float>* %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddc_mask_ymm:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x float>, <8 x float>* %p
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask)
+ ret <8 x float> %2
+}
+
+define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm:
+ ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> zeroinitializer, <8 x float> %a1, <8 x float> %a2, i8 %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
index ca1772175f95d..a30683e2f177f 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
@@ -2484,3 +2484,195 @@
# ATT: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
# INTEL: vfnmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
0x62,0x66,0x15,0x87,0xbf,0x72,0x80
+
+# ATT: vfcmaddcph %zmm28, %zmm29, %zmm30
+# INTEL: vfcmaddcph zmm30, zmm29, zmm28
+0x62,0x06,0x17,0x40,0x56,0xf4
+
+# ATT: vfcmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfcmaddcph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x17,0x10,0x56,0xf4
+
+# ATT: vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfcmaddcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x17,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfcmaddcph (%r9){1to16}, %zmm29, %zmm30
+# INTEL: vfcmaddcph zmm30, zmm29, dword ptr [r9]{1to16}
+0x62,0x46,0x17,0x50,0x56,0x31
+
+# ATT: vfcmaddcph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfcmaddcph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x17,0x40,0x56,0x71,0x7f
+
+# ATT: vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfcmaddcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16}
+0x62,0x66,0x17,0xd7,0x56,0x72,0x80
+
+# ATT: vfcmaddcsh %xmm28, %xmm29, %xmm30
+# INTEL: vfcmaddcsh xmm30, xmm29, xmm28
+0x62,0x06,0x17,0x00,0x57,0xf4
+
+# ATT: vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfcmaddcsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x17,0x10,0x57,0xf4
+
+# ATT: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfcmaddcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x17,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfcmaddcsh (%r9), %xmm29, %xmm30
+# INTEL: vfcmaddcsh xmm30, xmm29, dword ptr [r9]
+0x62,0x46,0x17,0x00,0x57,0x31
+
+# ATT: vfcmaddcsh 508(%rcx), %xmm29, %xmm30
+# INTEL: vfcmaddcsh xmm30, xmm29, dword ptr [rcx + 508]
+0x62,0x66,0x17,0x00,0x57,0x71,0x7f
+
+# ATT: vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfcmaddcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]
+0x62,0x66,0x17,0x87,0x57,0x72,0x80
+
+# ATT: vfcmulcph %zmm28, %zmm29, %zmm30
+# INTEL: vfcmulcph zmm30, zmm29, zmm28
+0x62,0x06,0x17,0x40,0xd6,0xf4
+
+# ATT: vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfcmulcph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x17,0x10,0xd6,0xf4
+
+# ATT: vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfcmulcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x17,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfcmulcph (%r9){1to16}, %zmm29, %zmm30
+# INTEL: vfcmulcph zmm30, zmm29, dword ptr [r9]{1to16}
+0x62,0x46,0x17,0x50,0xd6,0x31
+
+# ATT: vfcmulcph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfcmulcph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x17,0x40,0xd6,0x71,0x7f
+
+# ATT: vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfcmulcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16}
+0x62,0x66,0x17,0xd7,0xd6,0x72,0x80
+
+# ATT: vfcmulcsh %xmm28, %xmm29, %xmm30
+# INTEL: vfcmulcsh xmm30, xmm29, xmm28
+0x62,0x06,0x17,0x00,0xd7,0xf4
+
+# ATT: vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfcmulcsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x17,0x10,0xd7,0xf4
+
+# ATT: vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfcmulcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x17,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfcmulcsh (%r9), %xmm29, %xmm30
+# INTEL: vfcmulcsh xmm30, xmm29, dword ptr [r9]
+0x62,0x46,0x17,0x00,0xd7,0x31
+
+# ATT: vfcmulcsh 508(%rcx), %xmm29, %xmm30
+# INTEL: vfcmulcsh xmm30, xmm29, dword ptr [rcx + 508]
+0x62,0x66,0x17,0x00,0xd7,0x71,0x7f
+
+# ATT: vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfcmulcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]
+0x62,0x66,0x17,0x87,0xd7,0x72,0x80
+
+# ATT: vfmaddcph %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddcph zmm30, zmm29, zmm28
+0x62,0x06,0x16,0x40,0x56,0xf4
+
+# ATT: vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmaddcph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x16,0x10,0x56,0xf4
+
+# ATT: vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmaddcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x16,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddcph (%r9){1to16}, %zmm29, %zmm30
+# INTEL: vfmaddcph zmm30, zmm29, dword ptr [r9]{1to16}
+0x62,0x46,0x16,0x50,0x56,0x31
+
+# ATT: vfmaddcph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmaddcph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x16,0x40,0x56,0x71,0x7f
+
+# ATT: vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmaddcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16}
+0x62,0x66,0x16,0xd7,0x56,0x72,0x80
+
+# ATT: vfmaddcsh %xmm28, %xmm29, %xmm30
+# INTEL: vfmaddcsh xmm30, xmm29, xmm28
+0x62,0x06,0x16,0x00,0x57,0xf4
+
+# ATT: vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmaddcsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x16,0x10,0x57,0xf4
+
+# ATT: vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmaddcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x16,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddcsh (%r9), %xmm29, %xmm30
+# INTEL: vfmaddcsh xmm30, xmm29, dword ptr [r9]
+0x62,0x46,0x16,0x00,0x57,0x31
+
+# ATT: vfmaddcsh 508(%rcx), %xmm29, %xmm30
+# INTEL: vfmaddcsh xmm30, xmm29, dword ptr [rcx + 508]
+0x62,0x66,0x16,0x00,0x57,0x71,0x7f
+
+# ATT: vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmaddcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]
+0x62,0x66,0x16,0x87,0x57,0x72,0x80
+
+# ATT: vfmulcph %zmm28, %zmm29, %zmm30
+# INTEL: vfmulcph zmm30, zmm29, zmm28
+0x62,0x06,0x16,0x40,0xd6,0xf4
+
+# ATT: vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vfmulcph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x16,0x10,0xd6,0xf4
+
+# ATT: vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vfmulcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x16,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmulcph (%r9){1to16}, %zmm29, %zmm30
+# INTEL: vfmulcph zmm30, zmm29, dword ptr [r9]{1to16}
+0x62,0x46,0x16,0x50,0xd6,0x31
+
+# ATT: vfmulcph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vfmulcph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x16,0x40,0xd6,0x71,0x7f
+
+# ATT: vfmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vfmulcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16}
+0x62,0x66,0x16,0xd7,0xd6,0x72,0x80
+
+# ATT: vfmulcsh %xmm28, %xmm29, %xmm30
+# INTEL: vfmulcsh xmm30, xmm29, xmm28
+0x62,0x06,0x16,0x00,0xd7,0xf4
+
+# ATT: vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vfmulcsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x16,0x10,0xd7,0xf4
+
+# ATT: vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vfmulcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x16,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vfmulcsh (%r9), %xmm29, %xmm30
+# INTEL: vfmulcsh xmm30, xmm29, dword ptr [r9]
+0x62,0x46,0x16,0x00,0xd7,0x31
+
+# ATT: vfmulcsh 508(%rcx), %xmm29, %xmm30
+# INTEL: vfmulcsh xmm30, xmm29, dword ptr [rcx + 508]
+0x62,0x66,0x16,0x00,0xd7,0x71,0x7f
+
+# ATT: vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vfmulcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]
+0x62,0x66,0x16,0x87,0xd7,0x72,0x80
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
index 390622b2d4824..726d674ade8d8 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
@@ -2212,3 +2212,163 @@
# ATT: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
# INTEL: vfnmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80
+
+# ATT: vfcmaddcph %ymm4, %ymm5, %ymm6
+# INTEL: vfcmaddcph ymm6, ymm5, ymm4
+0x62,0xf6,0x57,0x28,0x56,0xf4
+
+# ATT: vfcmaddcph %xmm4, %xmm5, %xmm6
+# INTEL: vfcmaddcph xmm6, xmm5, xmm4
+0x62,0xf6,0x57,0x08,0x56,0xf4
+
+# ATT: vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfcmaddcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x57,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6
+# INTEL: vfcmaddcph ymm6, ymm5, dword ptr [ecx]{1to8}
+0x62,0xf6,0x57,0x38,0x56,0x31
+
+# ATT: vfcmaddcph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfcmaddcph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x57,0x28,0x56,0x71,0x7f
+
+# ATT: vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfcmaddcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8}
+0x62,0xf6,0x57,0xbf,0x56,0x72,0x80
+
+# ATT: vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfcmaddcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x57,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6
+# INTEL: vfcmaddcph xmm6, xmm5, dword ptr [ecx]{1to4}
+0x62,0xf6,0x57,0x18,0x56,0x31
+
+# ATT: vfcmaddcph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfcmaddcph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x57,0x08,0x56,0x71,0x7f
+
+# ATT: vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfcmaddcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4}
+0x62,0xf6,0x57,0x9f,0x56,0x72,0x80
+
+# ATT: vfcmulcph %ymm4, %ymm5, %ymm6
+# INTEL: vfcmulcph ymm6, ymm5, ymm4
+0x62,0xf6,0x57,0x28,0xd6,0xf4
+
+# ATT: vfcmulcph %xmm4, %xmm5, %xmm6
+# INTEL: vfcmulcph xmm6, xmm5, xmm4
+0x62,0xf6,0x57,0x08,0xd6,0xf4
+
+# ATT: vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfcmulcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x57,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfcmulcph (%ecx){1to8}, %ymm5, %ymm6
+# INTEL: vfcmulcph ymm6, ymm5, dword ptr [ecx]{1to8}
+0x62,0xf6,0x57,0x38,0xd6,0x31
+
+# ATT: vfcmulcph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfcmulcph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x57,0x28,0xd6,0x71,0x7f
+
+# ATT: vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfcmulcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8}
+0x62,0xf6,0x57,0xbf,0xd6,0x72,0x80
+
+# ATT: vfcmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfcmulcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x57,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfcmulcph (%ecx){1to4}, %xmm5, %xmm6
+# INTEL: vfcmulcph xmm6, xmm5, dword ptr [ecx]{1to4}
+0x62,0xf6,0x57,0x18,0xd6,0x31
+
+# ATT: vfcmulcph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfcmulcph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x57,0x08,0xd6,0x71,0x7f
+
+# ATT: vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfcmulcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4}
+0x62,0xf6,0x57,0x9f,0xd6,0x72,0x80
+
+# ATT: vfmaddcph %ymm4, %ymm5, %ymm6
+# INTEL: vfmaddcph ymm6, ymm5, ymm4
+0x62,0xf6,0x56,0x28,0x56,0xf4
+
+# ATT: vfmaddcph %xmm4, %xmm5, %xmm6
+# INTEL: vfmaddcph xmm6, xmm5, xmm4
+0x62,0xf6,0x56,0x08,0x56,0xf4
+
+# ATT: vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmaddcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x56,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddcph (%ecx){1to8}, %ymm5, %ymm6
+# INTEL: vfmaddcph ymm6, ymm5, dword ptr [ecx]{1to8}
+0x62,0xf6,0x56,0x38,0x56,0x31
+
+# ATT: vfmaddcph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmaddcph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x56,0x28,0x56,0x71,0x7f
+
+# ATT: vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmaddcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8}
+0x62,0xf6,0x56,0xbf,0x56,0x72,0x80
+
+# ATT: vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmaddcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x56,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmaddcph (%ecx){1to4}, %xmm5, %xmm6
+# INTEL: vfmaddcph xmm6, xmm5, dword ptr [ecx]{1to4}
+0x62,0xf6,0x56,0x18,0x56,0x31
+
+# ATT: vfmaddcph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmaddcph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x56,0x08,0x56,0x71,0x7f
+
+# ATT: vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmaddcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4}
+0x62,0xf6,0x56,0x9f,0x56,0x72,0x80
+
+# ATT: vfmulcph %ymm4, %ymm5, %ymm6
+# INTEL: vfmulcph ymm6, ymm5, ymm4
+0x62,0xf6,0x56,0x28,0xd6,0xf4
+
+# ATT: vfmulcph %xmm4, %xmm5, %xmm6
+# INTEL: vfmulcph xmm6, xmm5, xmm4
+0x62,0xf6,0x56,0x08,0xd6,0xf4
+
+# ATT: vfmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vfmulcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x56,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmulcph (%ecx){1to8}, %ymm5, %ymm6
+# INTEL: vfmulcph ymm6, ymm5, dword ptr [ecx]{1to8}
+0x62,0xf6,0x56,0x38,0xd6,0x31
+
+# ATT: vfmulcph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vfmulcph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x56,0x28,0xd6,0x71,0x7f
+
+# ATT: vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vfmulcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8}
+0x62,0xf6,0x56,0xbf,0xd6,0x72,0x80
+
+# ATT: vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vfmulcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x56,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vfmulcph (%ecx){1to4}, %xmm5, %xmm6
+# INTEL: vfmulcph xmm6, xmm5, dword ptr [ecx]{1to4}
+0x62,0xf6,0x56,0x18,0xd6,0x31
+
+# ATT: vfmulcph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vfmulcph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x56,0x08,0xd6,0x71,0x7f
+
+# ATT: vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vfmulcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4}
+0x62,0xf6,0x56,0x9f,0xd6,0x72,0x80
diff --git a/llvm/test/MC/X86/avx512fp16-complex-fma.s b/llvm/test/MC/X86/avx512fp16-complex-fma.s
new file mode 100644
index 0000000000000..bb02f12c1e40a
--- /dev/null
+++ b/llvm/test/MC/X86/avx512fp16-complex-fma.s
@@ -0,0 +1,324 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s > %t 2> %t.err
+// RUN: FileCheck < %t %s
+// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s
+
+// CHECK: vfcmaddcph %zmm24, %zmm23, %zmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %zmm24, %zmm23, %zmm24
+
+// CHECK: vfcmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23
+
+// CHECK: vfcmaddcph %zmm24, %zmm23, %zmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %zmm24, %zmm23, %zmm24 {%k7}
+
+// CHECK: vfcmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+
+// CHECK: vfcmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+
+// CHECK: vfcmaddcph (%rip){1to16}, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph (%rip){1to16}, %zmm23, %zmm23
+
+// CHECK: vfcmaddcph -2048(,%rbp,2), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -2048(,%rbp,2), %zmm23, %zmm23
+
+// CHECK: vfcmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmaddcsh %xmm24, %xmm23, %xmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh %xmm24, %xmm23, %xmm24
+
+// CHECK: vfcmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+
+// CHECK: vfcmaddcsh %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfcmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfcmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfcmaddcsh (%rip), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh (%rip), %xmm23, %xmm23
+
+// CHECK: vfcmaddcsh -128(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh -128(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfcmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcph %zmm24, %zmm23, %zmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %zmm24, %zmm23, %zmm24
+
+// CHECK: vfcmulcph {rn-sae}, %zmm24, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph {rn-sae}, %zmm24, %zmm23, %zmm23
+
+// CHECK: vfcmulcph %zmm24, %zmm23, %zmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %zmm24, %zmm23, %zmm24 {%k7}
+
+// CHECK: vfcmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+
+// CHECK: vfcmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+
+// CHECK: vfcmulcph (%rip){1to16}, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph (%rip){1to16}, %zmm23, %zmm23
+
+// CHECK: vfcmulcph -2048(,%rbp,2), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -2048(,%rbp,2), %zmm23, %zmm23
+
+// CHECK: vfcmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfcmulcsh %xmm24, %xmm23, %xmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh %xmm24, %xmm23, %xmm24
+
+// CHECK: vfcmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+
+// CHECK: vfcmulcsh %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfcmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfcmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfcmulcsh (%rip), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh (%rip), %xmm23, %xmm23
+
+// CHECK: vfcmulcsh -128(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh -128(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfcmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcph %zmm24, %zmm23, %zmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %zmm24, %zmm23, %zmm24
+
+// CHECK: vfmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23
+
+// CHECK: vfmaddcph %zmm24, %zmm23, %zmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %zmm24, %zmm23, %zmm24 {%k7}
+
+// CHECK: vfmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+
+// CHECK: vfmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+
+// CHECK: vfmaddcph (%rip){1to16}, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph (%rip){1to16}, %zmm23, %zmm23
+
+// CHECK: vfmaddcph -2048(,%rbp,2), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -2048(,%rbp,2), %zmm23, %zmm23
+
+// CHECK: vfmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmaddcsh %xmm24, %xmm23, %xmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh %xmm24, %xmm23, %xmm24
+
+// CHECK: vfmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+
+// CHECK: vfmaddcsh %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfmaddcsh (%rip), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh (%rip), %xmm23, %xmm23
+
+// CHECK: vfmaddcsh -128(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh -128(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcph %zmm24, %zmm23, %zmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %zmm24, %zmm23, %zmm24
+
+// CHECK: vfmulcph {rn-sae}, %zmm24, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph {rn-sae}, %zmm24, %zmm23, %zmm23
+
+// CHECK: vfmulcph %zmm24, %zmm23, %zmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %zmm24, %zmm23, %zmm24 {%k7}
+
+// CHECK: vfmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23
+
+// CHECK: vfmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7}
+
+// CHECK: vfmulcph (%rip){1to16}, %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph (%rip){1to16}, %zmm23, %zmm23
+
+// CHECK: vfmulcph -2048(,%rbp,2), %zmm23, %zmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -2048(,%rbp,2), %zmm23, %zmm23
+
+// CHECK: vfmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z}
+
+// CHECK: vfmulcsh %xmm24, %xmm23, %xmm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh %xmm24, %xmm23, %xmm24
+
+// CHECK: vfmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23
+
+// CHECK: vfmulcsh %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfmulcsh (%rip), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh (%rip), %xmm23, %xmm23
+
+// CHECK: vfmulcsh -128(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh -128(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z}
+
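(Editor's note on the MC test above: every case in avx512fp16-complex-fma.s deliberately reuses one of the source registers as the destination, so the assembler's new diagnostic "Destination register should be distinct from source registers" fires on each line. As a minimal sketch that is not part of the committed test, the same complex FMA/multiply instructions should assemble without that warning once the destination register is distinct from both sources; the register choices below are illustrative only.)

// Hypothetical clean variants: %zmm25/%xmm25 do not overlap the sources
// %zmm23/%zmm24 (or %xmm23/%xmm24), so llvm-mc is expected to emit no
// "Destination register should be distinct from source registers" warning.
  vfcmaddcph %zmm24, %zmm23, %zmm25
  vfmaddcph  %zmm24, %zmm23, %zmm25
  vfcmulcsh  %xmm24, %xmm23, %xmm25
  vfmulcsh   %xmm24, %xmm23, %xmm25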
diff --git a/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s b/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s
new file mode 100644
index 0000000000000..a7eb1c9074f89
--- /dev/null
+++ b/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s
@@ -0,0 +1,292 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s > %t 2> %t.err
+// RUN: FileCheck < %t %s
+// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s
+
+// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %ymm24, %ymm23, %ymm24
+
+// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %ymm24, %ymm23, %ymm23 {%k7}
+
+// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+
+// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %xmm24, %xmm23, %xmm23
+
+// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+
+// CHECK: vfcmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+
+// CHECK: vfcmaddcph (%rip){1to8}, %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph (%rip){1to8}, %ymm23, %ymm23
+
+// CHECK: vfcmaddcph -1024(,%rbp,2), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -1024(,%rbp,2), %ymm23, %ymm23
+
+// CHECK: vfcmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfcmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfcmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfcmaddcph (%rip){1to4}, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph (%rip){1to4}, %xmm23, %xmm23
+
+// CHECK: vfcmaddcph -512(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -512(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfcmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcph %ymm24, %ymm23, %ymm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %ymm24, %ymm23, %ymm24
+
+// CHECK: vfcmulcph %ymm24, %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %ymm24, %ymm23, %ymm23 {%k7}
+
+// CHECK: vfcmulcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+
+// CHECK: vfcmulcph %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %xmm24, %xmm23, %xmm23
+
+// CHECK: vfcmulcph %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfcmulcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+
+// CHECK: vfcmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+
+// CHECK: vfcmulcph (%rip){1to8}, %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph (%rip){1to8}, %ymm23, %ymm23
+
+// CHECK: vfcmulcph -1024(,%rbp,2), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -1024(,%rbp,2), %ymm23, %ymm23
+
+// CHECK: vfcmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfcmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfcmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfcmulcph (%rip){1to4}, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph (%rip){1to4}, %xmm23, %xmm23
+
+// CHECK: vfcmulcph -512(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -512(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfcmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfcmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfcmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcph %ymm24, %ymm23, %ymm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %ymm24, %ymm23, %ymm24
+
+// CHECK: vfmaddcph %ymm24, %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %ymm24, %ymm23, %ymm23 {%k7}
+
+// CHECK: vfmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+
+// CHECK: vfmaddcph %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %xmm24, %xmm23, %xmm23
+
+// CHECK: vfmaddcph %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+
+// CHECK: vfmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+
+// CHECK: vfmaddcph (%rip){1to8}, %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph (%rip){1to8}, %ymm23, %ymm23
+
+// CHECK: vfmaddcph -1024(,%rbp,2), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -1024(,%rbp,2), %ymm23, %ymm23
+
+// CHECK: vfmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfmaddcph (%rip){1to4}, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph (%rip){1to4}, %xmm23, %xmm23
+
+// CHECK: vfmaddcph -512(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -512(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcph %ymm24, %ymm23, %ymm24
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %ymm24, %ymm23, %ymm24
+
+// CHECK: vfmulcph %ymm24, %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %ymm24, %ymm23, %ymm23 {%k7}
+
+// CHECK: vfmulcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %ymm24, %ymm23, %ymm24 {%k7} {z}
+
+// CHECK: vfmulcph %xmm24, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %xmm24, %xmm23, %xmm23
+
+// CHECK: vfmulcph %xmm24, %xmm23, %xmm24 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %xmm24, %xmm23, %xmm24 {%k7}
+
+// CHECK: vfmulcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph %xmm24, %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23
+
+// CHECK: vfmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7}
+
+// CHECK: vfmulcph (%rip){1to8}, %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph (%rip){1to8}, %ymm23, %ymm23
+
+// CHECK: vfmulcph -1024(,%rbp,2), %ymm23, %ymm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -1024(,%rbp,2), %ymm23, %ymm23
+
+// CHECK: vfmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z}
+
+// CHECK: vfmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23
+
+// CHECK: vfmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7}
+
+// CHECK: vfmulcph (%rip){1to4}, %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph (%rip){1to4}, %xmm23, %xmm23
+
+// CHECK: vfmulcph -512(,%rbp,2), %xmm23, %xmm23
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -512(,%rbp,2), %xmm23, %xmm23
+
+// CHECK: vfmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z}
+
+// CHECK: vfmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+// CHECK-STDERR: warning: Destination register should be distinct from source registers
+ vfmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z}
+
diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s
index 6f3165d5994ad..cc619d877d8a1 100644
--- a/llvm/test/MC/X86/avx512fp16.s
+++ b/llvm/test/MC/X86/avx512fp16.s
@@ -2483,3 +2483,195 @@
// CHECK: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbf,0x72,0x80]
vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfcmaddcph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x40,0x56,0xf4]
+ vfcmaddcph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfcmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x10,0x56,0xf4]
+ vfcmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x17,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfcmaddcph (%r9){1to16}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x17,0x50,0x56,0x31]
+ vfcmaddcph (%r9){1to16}, %zmm29, %zmm30
+
+// CHECK: vfcmaddcph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x17,0x40,0x56,0x71,0x7f]
+ vfcmaddcph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x17,0xd7,0x56,0x72,0x80]
+ vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfcmaddcsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x00,0x57,0xf4]
+ vfcmaddcsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x10,0x57,0xf4]
+ vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x17,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfcmaddcsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x17,0x00,0x57,0x31]
+ vfcmaddcsh (%r9), %xmm29, %xmm30
+
+// CHECK: vfcmaddcsh 508(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x17,0x00,0x57,0x71,0x7f]
+ vfcmaddcsh 508(%rcx), %xmm29, %xmm30
+
+// CHECK: vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x17,0x87,0x57,0x72,0x80]
+ vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfcmulcph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x40,0xd6,0xf4]
+ vfcmulcph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x10,0xd6,0xf4]
+ vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x17,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfcmulcph (%r9){1to16}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x17,0x50,0xd6,0x31]
+ vfcmulcph (%r9){1to16}, %zmm29, %zmm30
+
+// CHECK: vfcmulcph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x17,0x40,0xd6,0x71,0x7f]
+ vfcmulcph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x17,0xd7,0xd6,0x72,0x80]
+ vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfcmulcsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x00,0xd7,0xf4]
+ vfcmulcsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x17,0x10,0xd7,0xf4]
+ vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x17,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfcmulcsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x17,0x00,0xd7,0x31]
+ vfcmulcsh (%r9), %xmm29, %xmm30
+
+// CHECK: vfcmulcsh 508(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x17,0x00,0xd7,0x71,0x7f]
+ vfcmulcsh 508(%rcx), %xmm29, %xmm30
+
+// CHECK: vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x17,0x87,0xd7,0x72,0x80]
+ vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmaddcph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x40,0x56,0xf4]
+ vfmaddcph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x10,0x56,0xf4]
+ vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x16,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmaddcph (%r9){1to16}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x16,0x50,0x56,0x31]
+ vfmaddcph (%r9){1to16}, %zmm29, %zmm30
+
+// CHECK: vfmaddcph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x16,0x40,0x56,0x71,0x7f]
+ vfmaddcph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x16,0xd7,0x56,0x72,0x80]
+ vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmaddcsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x00,0x57,0xf4]
+ vfmaddcsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x10,0x57,0xf4]
+ vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x16,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmaddcsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x16,0x00,0x57,0x31]
+ vfmaddcsh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmaddcsh 508(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x16,0x00,0x57,0x71,0x7f]
+ vfmaddcsh 508(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x16,0x87,0x57,0x72,0x80]
+ vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vfmulcph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x40,0xd6,0xf4]
+ vfmulcph %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x10,0xd6,0xf4]
+ vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x16,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vfmulcph (%r9){1to16}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x16,0x50,0xd6,0x31]
+ vfmulcph (%r9){1to16}, %zmm29, %zmm30
+
+// CHECK: vfmulcph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x16,0x40,0xd6,0x71,0x7f]
+ vfmulcph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vfmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x16,0xd7,0xd6,0x72,0x80]
+ vfmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vfmulcsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x00,0xd7,0xf4]
+ vfmulcsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x16,0x10,0xd7,0xf4]
+ vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x16,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vfmulcsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x16,0x00,0xd7,0x31]
+ vfmulcsh (%r9), %xmm29, %xmm30
+
+// CHECK: vfmulcsh 508(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x16,0x00,0xd7,0x71,0x7f]
+ vfmulcsh 508(%rcx), %xmm29, %xmm30
+
+// CHECK: vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x16,0x87,0xd7,0x72,0x80]
+ vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z}
diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s
index a3f888e045393..989effae40def 100644
--- a/llvm/test/MC/X86/avx512fp16vl.s
+++ b/llvm/test/MC/X86/avx512fp16vl.s
@@ -2211,3 +2211,163 @@
// CHECK: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80]
vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfcmaddcph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x28,0x56,0xf4]
+ vfcmaddcph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfcmaddcph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x56,0xf4]
+ vfcmaddcph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x57,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x38,0x56,0x31]
+ vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6
+
+// CHECK: vfcmaddcph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x28,0x56,0x71,0x7f]
+ vfcmaddcph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x57,0xbf,0x56,0x72,0x80]
+ vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x56,0x31]
+ vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6
+
+// CHECK: vfcmaddcph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x56,0x71,0x7f]
+ vfcmaddcph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x57,0x9f,0x56,0x72,0x80]
+ vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfcmulcph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x28,0xd6,0xf4]
+ vfcmulcph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfcmulcph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd6,0xf4]
+ vfcmulcph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x57,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfcmulcph (%ecx){1to8}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x38,0xd6,0x31]
+ vfcmulcph (%ecx){1to8}, %ymm5, %ymm6
+
+// CHECK: vfcmulcph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x28,0xd6,0x71,0x7f]
+ vfcmulcph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x57,0xbf,0xd6,0x72,0x80]
+ vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfcmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfcmulcph (%ecx){1to4}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd6,0x31]
+ vfcmulcph (%ecx){1to4}, %xmm5, %xmm6
+
+// CHECK: vfcmulcph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd6,0x71,0x7f]
+ vfcmulcph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x57,0x9f,0xd6,0x72,0x80]
+ vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmaddcph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x28,0x56,0xf4]
+ vfmaddcph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmaddcph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x56,0xf4]
+ vfmaddcph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x56,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmaddcph (%ecx){1to8}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x38,0x56,0x31]
+ vfmaddcph (%ecx){1to8}, %ymm5, %ymm6
+
+// CHECK: vfmaddcph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x28,0x56,0x71,0x7f]
+ vfmaddcph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x56,0xbf,0x56,0x72,0x80]
+ vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmaddcph (%ecx){1to4}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x56,0x31]
+ vfmaddcph (%ecx){1to4}, %xmm5, %xmm6
+
+// CHECK: vfmaddcph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x56,0x71,0x7f]
+ vfmaddcph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x56,0x9f,0x56,0x72,0x80]
+ vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vfmulcph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x28,0xd6,0xf4]
+ vfmulcph %ymm4, %ymm5, %ymm6
+
+// CHECK: vfmulcph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd6,0xf4]
+ vfmulcph %xmm4, %xmm5, %xmm6
+
+// CHECK: vfmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x56,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vfmulcph (%ecx){1to8}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x38,0xd6,0x31]
+ vfmulcph (%ecx){1to8}, %ymm5, %ymm6
+
+// CHECK: vfmulcph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x28,0xd6,0x71,0x7f]
+ vfmulcph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x56,0xbf,0xd6,0x72,0x80]
+ vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vfmulcph (%ecx){1to4}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd6,0x31]
+ vfmulcph (%ecx){1to4}, %xmm5, %xmm6
+
+// CHECK: vfmulcph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd6,0x71,0x7f]
+ vfmulcph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x56,0x9f,0xd6,0x72,0x80]
+ vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z}
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
index e2fb2e4ddde2e..8c48fa3116d68 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
@@ -2355,3 +2355,195 @@
// CHECK: vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbf,0x72,0x80]
vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfcmaddcph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x57,0x48,0x56,0xf4]
+ vfcmaddcph zmm6, zmm5, zmm4
+
+// CHECK: vfcmaddcph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x56,0xf4]
+ vfcmaddcph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfcmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x57,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfcmaddcph zmm6, zmm5, dword ptr [ecx]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x57,0x58,0x56,0x31]
+ vfcmaddcph zmm6, zmm5, dword ptr [ecx]{1to16}
+
+// CHECK: vfcmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x57,0x48,0x56,0x71,0x7f]
+ vfcmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfcmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x57,0xdf,0x56,0x72,0x80]
+ vfcmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+
+// CHECK: vfcmaddcsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0xf4]
+ vfcmaddcsh xmm6, xmm5, xmm4
+
+// CHECK: vfcmaddcsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x57,0xf4]
+ vfcmaddcsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfcmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfcmaddcsh xmm6, xmm5, dword ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0x31]
+ vfcmaddcsh xmm6, xmm5, dword ptr [ecx]
+
+// CHECK: vfcmaddcsh xmm6, xmm5, dword ptr [ecx + 508]
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0x71,0x7f]
+ vfcmaddcsh xmm6, xmm5, dword ptr [ecx + 508]
+
+// CHECK: vfcmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+// CHECK: encoding: [0x62,0xf6,0x57,0x8f,0x57,0x72,0x80]
+ vfcmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+
+// CHECK: vfcmulcph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x57,0x48,0xd6,0xf4]
+ vfcmulcph zmm6, zmm5, zmm4
+
+// CHECK: vfcmulcph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd6,0xf4]
+ vfcmulcph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfcmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x57,0x4f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfcmulcph zmm6, zmm5, dword ptr [ecx]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x57,0x58,0xd6,0x31]
+ vfcmulcph zmm6, zmm5, dword ptr [ecx]{1to16}
+
+// CHECK: vfcmulcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x57,0x48,0xd6,0x71,0x7f]
+ vfcmulcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfcmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x57,0xdf,0xd6,0x72,0x80]
+ vfcmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+
+// CHECK: vfcmulcsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0xf4]
+ vfcmulcsh xmm6, xmm5, xmm4
+
+// CHECK: vfcmulcsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd7,0xf4]
+ vfcmulcsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfcmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0xd7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfcmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfcmulcsh xmm6, xmm5, dword ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0x31]
+ vfcmulcsh xmm6, xmm5, dword ptr [ecx]
+
+// CHECK: vfcmulcsh xmm6, xmm5, dword ptr [ecx + 508]
+// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0x71,0x7f]
+ vfcmulcsh xmm6, xmm5, dword ptr [ecx + 508]
+
+// CHECK: vfcmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+// CHECK: encoding: [0x62,0xf6,0x57,0x8f,0xd7,0x72,0x80]
+ vfcmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+
+// CHECK: vfmaddcph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x56,0x48,0x56,0xf4]
+ vfmaddcph zmm6, zmm5, zmm4
+
+// CHECK: vfmaddcph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x56,0xf4]
+ vfmaddcph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x56,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmaddcph zmm6, zmm5, dword ptr [ecx]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x56,0x58,0x56,0x31]
+ vfmaddcph zmm6, zmm5, dword ptr [ecx]{1to16}
+
+// CHECK: vfmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x56,0x48,0x56,0x71,0x7f]
+ vfmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x56,0xdf,0x56,0x72,0x80]
+ vfmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+
+// CHECK: vfmaddcsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0xf4]
+ vfmaddcsh xmm6, xmm5, xmm4
+
+// CHECK: vfmaddcsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x57,0xf4]
+ vfmaddcsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmaddcsh xmm6, xmm5, dword ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0x31]
+ vfmaddcsh xmm6, xmm5, dword ptr [ecx]
+
+// CHECK: vfmaddcsh xmm6, xmm5, dword ptr [ecx + 508]
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0x71,0x7f]
+ vfmaddcsh xmm6, xmm5, dword ptr [ecx + 508]
+
+// CHECK: vfmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+// CHECK: encoding: [0x62,0xf6,0x56,0x8f,0x57,0x72,0x80]
+ vfmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+
+// CHECK: vfmulcph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x56,0x48,0xd6,0xf4]
+ vfmulcph zmm6, zmm5, zmm4
+
+// CHECK: vfmulcph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd6,0xf4]
+ vfmulcph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x56,0x4f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmulcph zmm6, zmm5, dword ptr [ecx]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x56,0x58,0xd6,0x31]
+ vfmulcph zmm6, zmm5, dword ptr [ecx]{1to16}
+
+// CHECK: vfmulcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x56,0x48,0xd6,0x71,0x7f]
+ vfmulcph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf6,0x56,0xdf,0xd6,0x72,0x80]
+ vfmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16}
+
+// CHECK: vfmulcsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0xf4]
+ vfmulcsh xmm6, xmm5, xmm4
+
+// CHECK: vfmulcsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd7,0xf4]
+ vfmulcsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0xd7,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfmulcsh xmm6, xmm5, dword ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0x31]
+ vfmulcsh xmm6, xmm5, dword ptr [ecx]
+
+// CHECK: vfmulcsh xmm6, xmm5, dword ptr [ecx + 508]
+// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0x71,0x7f]
+ vfmulcsh xmm6, xmm5, dword ptr [ecx + 508]
+
+// CHECK: vfmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
+// CHECK: encoding: [0x62,0xf6,0x56,0x8f,0xd7,0x72,0x80]
+ vfmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
index 427cd2f2eaa04..39392f0b6e4bc 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
@@ -2211,3 +2211,163 @@
// CHECK: vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
// CHECK: encoding: [0x62,0x66,0x15,0x97,0xbe,0x72,0x80]
vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vfcmaddcph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x17,0x20,0x56,0xf4]
+ vfcmaddcph ymm30, ymm29, ymm28
+
+// CHECK: vfcmaddcph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x17,0x00,0x56,0xf4]
+ vfcmaddcph xmm30, xmm29, xmm28
+
+// CHECK: vfcmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x17,0x27,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfcmaddcph ymm30, ymm29, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x17,0x30,0x56,0x31]
+ vfcmaddcph ymm30, ymm29, dword ptr [r9]{1to8}
+
+// CHECK: vfcmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x17,0x20,0x56,0x71,0x7f]
+ vfcmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfcmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x66,0x17,0xb7,0x56,0x72,0x80]
+ vfcmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vfcmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x17,0x07,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfcmaddcph xmm30, xmm29, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x46,0x17,0x10,0x56,0x31]
+ vfcmaddcph xmm30, xmm29, dword ptr [r9]{1to4}
+
+// CHECK: vfcmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x17,0x00,0x56,0x71,0x7f]
+ vfcmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfcmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x66,0x17,0x97,0x56,0x72,0x80]
+ vfcmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+
+// CHECK: vfcmulcph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x17,0x20,0xd6,0xf4]
+ vfcmulcph ymm30, ymm29, ymm28
+
+// CHECK: vfcmulcph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x17,0x00,0xd6,0xf4]
+ vfcmulcph xmm30, xmm29, xmm28
+
+// CHECK: vfcmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x17,0x27,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfcmulcph ymm30, ymm29, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x17,0x30,0xd6,0x31]
+ vfcmulcph ymm30, ymm29, dword ptr [r9]{1to8}
+
+// CHECK: vfcmulcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x17,0x20,0xd6,0x71,0x7f]
+ vfcmulcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfcmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x66,0x17,0xb7,0xd6,0x72,0x80]
+ vfcmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vfcmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x17,0x07,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfcmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfcmulcph xmm30, xmm29, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x46,0x17,0x10,0xd6,0x31]
+ vfcmulcph xmm30, xmm29, dword ptr [r9]{1to4}
+
+// CHECK: vfcmulcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x17,0x00,0xd6,0x71,0x7f]
+ vfcmulcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfcmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x66,0x17,0x97,0xd6,0x72,0x80]
+ vfcmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+
+// CHECK: vfmaddcph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x16,0x20,0x56,0xf4]
+ vfmaddcph ymm30, ymm29, ymm28
+
+// CHECK: vfmaddcph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x16,0x00,0x56,0xf4]
+ vfmaddcph xmm30, xmm29, xmm28
+
+// CHECK: vfmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x16,0x27,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddcph ymm30, ymm29, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x16,0x30,0x56,0x31]
+ vfmaddcph ymm30, ymm29, dword ptr [r9]{1to8}
+
+// CHECK: vfmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x16,0x20,0x56,0x71,0x7f]
+ vfmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x66,0x16,0xb7,0x56,0x72,0x80]
+ vfmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vfmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x16,0x07,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmaddcph xmm30, xmm29, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x46,0x16,0x10,0x56,0x31]
+ vfmaddcph xmm30, xmm29, dword ptr [r9]{1to4}
+
+// CHECK: vfmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x16,0x00,0x56,0x71,0x7f]
+ vfmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x66,0x16,0x97,0x56,0x72,0x80]
+ vfmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+
+// CHECK: vfmulcph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x16,0x20,0xd6,0xf4]
+ vfmulcph ymm30, ymm29, ymm28
+
+// CHECK: vfmulcph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x16,0x00,0xd6,0xf4]
+ vfmulcph xmm30, xmm29, xmm28
+
+// CHECK: vfmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x16,0x27,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmulcph ymm30, ymm29, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x16,0x30,0xd6,0x31]
+ vfmulcph ymm30, ymm29, dword ptr [r9]{1to8}
+
+// CHECK: vfmulcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x16,0x20,0xd6,0x71,0x7f]
+ vfmulcph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vfmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x66,0x16,0xb7,0xd6,0x72,0x80]
+ vfmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vfmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x16,0x07,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vfmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfmulcph xmm30, xmm29, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x46,0x16,0x10,0xd6,0x31]
+ vfmulcph xmm30, xmm29, dword ptr [r9]{1to4}
+
+// CHECK: vfmulcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x16,0x00,0xd6,0x71,0x7f]
+ vfmulcph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x66,0x16,0x97,0xd6,0x72,0x80]
+ vfmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}