[clang] b088536 - [X86] AVX512FP16 instructions enabling 4/6
Wang, Pengfei via cfe-commits
cfe-commits at lists.llvm.org
Sat Aug 21 17:59:50 PDT 2021
Author: Wang, Pengfei
Date: 2021-08-22T08:59:35+08:00
New Revision: b088536ce9e0473d6ab63c24ad69ca7ea2339a46
URL: https://github.com/llvm/llvm-project/commit/b088536ce9e0473d6ab63c24ad69ca7ea2339a46
DIFF: https://github.com/llvm/llvm-project/commit/b088536ce9e0473d6ab63c24ad69ca7ea2339a46.diff
LOG: [X86] AVX512FP16 instructions enabling 4/6
Enable FP16 unary operator instructions.
Ref.: https://software.intel.com/content/www/us/en/develop/download/intel-avx512-fp16-architecture-specification.html
Reviewed By: LuoYuanke
Differential Revision: https://reviews.llvm.org/D105267
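
For reference, a minimal usage sketch (not part of the commit) exercising a few of the intrinsics added in the headers below; it assumes a clang build with this patch applied and -mavx512fp16 (plus -mavx512vl for the 128/256-bit forms):

  #include <immintrin.h>

  // Uses the new FP16 sqrt/rcp/scalef intrinsics; merge-masking shown for rcp.
  __m512h fp16_unary_demo(__m512h a, __m512h b, __mmask32 m) {
    __m512h s = _mm512_sqrt_ph(a);           // vsqrtph
    __m512h r = _mm512_mask_rcp_ph(b, m, a); // vrcpph, merged with b under mask m
    return _mm512_scalef_ph(s, r);           // vscalefph
  }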
Added:
llvm/test/CodeGen/X86/avx512fp16-rndscale.ll
llvm/test/CodeGen/X86/avx512fp16-scalar.ll
llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
Modified:
clang/include/clang/Basic/BuiltinsX86.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/avx512fp16intrin.h
clang/lib/Headers/avx512vlfp16intrin.h
clang/lib/Sema/SemaChecking.cpp
clang/test/CodeGen/X86/avx512fp16-builtins.c
clang/test/CodeGen/X86/avx512vlfp16-builtins.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFoldTables.cpp
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86IntrinsicsInfo.h
llvm/test/CodeGen/X86/avx512fp16-arith.ll
llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
llvm/test/MC/Disassembler/X86/avx512fp16.txt
llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
llvm/test/MC/X86/avx512fp16.s
llvm/test/MC/X86/avx512fp16vl.s
llvm/test/MC/X86/intel-syntax-avx512fp16.s
llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index a0926f230d46f..f21c17ee0ebe9 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1876,6 +1876,47 @@ TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512f
TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16")
+
TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 89b773fc5f97b..789c446940ce8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13959,15 +13959,28 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
+ case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
// otherwise keep the intrinsic.
if (CC != 4) {
- Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ?
- Intrinsic::x86_avx512_mask_sqrt_sd :
- Intrinsic::x86_avx512_mask_sqrt_ss;
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtsh_round_mask:
+ IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
+ break;
+ case X86::BI__builtin_ia32_sqrtsd_round_mask:
+ IID = Intrinsic::x86_avx512_mask_sqrt_sd;
+ break;
+ case X86::BI__builtin_ia32_sqrtss_round_mask:
+ IID = Intrinsic::x86_avx512_mask_sqrt_ss;
+ break;
+ }
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
@@ -13989,6 +14002,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_sqrtpd:
case X86::BI__builtin_ia32_sqrtps256:
case X86::BI__builtin_ia32_sqrtps:
+ case X86::BI__builtin_ia32_sqrtph256:
+ case X86::BI__builtin_ia32_sqrtph:
+ case X86::BI__builtin_ia32_sqrtph512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
if (Ops.size() == 2) {
@@ -13996,9 +14012,21 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
// otherwise keep the intrinsic.
if (CC != 4) {
- Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ?
- Intrinsic::x86_avx512_sqrt_ps_512 :
- Intrinsic::x86_avx512_sqrt_pd_512;
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtph512:
+ IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtps512:
+ IID = Intrinsic::x86_avx512_sqrt_ps_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtpd512:
+ IID = Intrinsic::x86_avx512_sqrt_pd_512;
+ break;
+ }
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
}
@@ -14315,6 +14343,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_fpclassps128_mask:
case X86::BI__builtin_ia32_fpclassps256_mask:
case X86::BI__builtin_ia32_fpclassps512_mask:
+ case X86::BI__builtin_ia32_fpclassph128_mask:
+ case X86::BI__builtin_ia32_fpclassph256_mask:
+ case X86::BI__builtin_ia32_fpclassph512_mask:
case X86::BI__builtin_ia32_fpclasspd128_mask:
case X86::BI__builtin_ia32_fpclasspd256_mask:
case X86::BI__builtin_ia32_fpclasspd512_mask: {
@@ -14326,6 +14357,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Intrinsic::ID ID;
switch (BuiltinID) {
default: llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_fpclassph128_mask:
+ ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
+ break;
+ case X86::BI__builtin_ia32_fpclassph256_mask:
+ ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
+ break;
+ case X86::BI__builtin_ia32_fpclassph512_mask:
+ ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
+ break;
case X86::BI__builtin_ia32_fpclassps128_mask:
ID = Intrinsic::x86_avx512_fpclass_ps_128;
break;
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 6a4a9d4a6c7eb..48370d0bf0ee0 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -947,6 +947,492 @@ static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
return __b[0];
}
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
+ (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
+ (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+#define _mm512_getmant_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ph(W, U, A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ph(U, A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ph(A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_getexp_round_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_getexp_round_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__W, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+#define _mm512_roundscale_ph(A, B) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ph(A, B, C, imm) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
+ (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_ph(A, B, imm) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
+ (__v32hf)(__m512h)(A), \
+ (__mmask32)(B), (int)(R)))
+
+#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(A), (int)(R)))
+
+#define _mm512_roundscale_round_ph(A, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_reduce_ph(A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ph(W, U, A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ph(U, A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_reduce_round_ph(A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+#define _mm_getmant_round_sh(A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_getmant_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_getmant_sh(U, A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+#define _mm_getexp_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_scalef_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask(
+ (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_roundscale_round_sh(A, B, imm, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(imm), (int)(R)))
+
+#define _mm_roundscale_sh(A, B, imm) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sh(W, U, A, B, I) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(I), (int)(R)))
+
+#define _mm_maskz_roundscale_sh(U, A, B, I) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(I), (int)(R)))
+
+#define _mm_reduce_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(C), (int)(R)))
+
+#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
+#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
+#define _mm512_sqrt_round_ph(A, R) \
+ ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
+
+#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_sqrt_round_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)(__U),
+ (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
+ (__v32hf)(__m512h)(__W));
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)(__U),
+ (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm_sqrt_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
+ __mmask32 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
+ (int)(imm), (__mmask32)(U)))
+
+#define _mm512_fpclass_ph_mask(A, imm) \
+ ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
+ (int)(imm), (__mmask32)-1))
+
+#define _mm_fpclass_sh_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fpclass_sh_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__mmask8)(U)))
+
#define _mm512_cvt_roundpd_ph(A, R) \
((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
(__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index ab2cf436ee16d..1809211fd4066 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -327,6 +327,284 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
((__mmask8)__builtin_ia32_cmpph128_mask( \
(__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+#define _mm_getmant_ph(A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ph(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ph(U, A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_getmant_ph(A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
+ (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
+
+#define _mm256_mask_getmant_ph(W, U, A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_getmant_ph(U, A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
+ (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask(
+ (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__W, (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask(
+ (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+#define _mm_roundscale_ph(A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_roundscale_ph(W, U, A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_ph(U, A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_roundscale_ph(A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)-1))
+
+#define _mm256_mask_roundscale_ph(W, U, A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_roundscale_ph(U, A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)(U)))
+
+#define _mm_reduce_ph(A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_ph(W, U, A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ph(U, A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_reduce_ph(A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)-1))
+
+#define _mm256_mask_reduce_ph(W, U, A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_reduce_ph(U, A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)(U)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
+ return __builtin_ia32_sqrtph((__v8hf)__a);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
+ return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
+ (__v16hf)_mm256_sqrt_ph(__A),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+#define _mm_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
+ (int)(imm), (__mmask8)(U)))
+
+#define _mm_fpclass_ph_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
+ (int)(imm), (__mmask8)-1))
+
+#define _mm256_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
+ (int)(imm), (__mmask16)(U)))
+
+#define _mm256_fpclass_ph_mask(A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
+ (int)(imm), (__mmask16)-1))
+
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
(__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 69560027f330a..4e7d5b66bca7f 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3913,6 +3913,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_exp2ps_mask:
case X86::BI__builtin_ia32_getexppd512_mask:
case X86::BI__builtin_ia32_getexpps512_mask:
+ case X86::BI__builtin_ia32_getexpph512_mask:
case X86::BI__builtin_ia32_rcp28pd_mask:
case X86::BI__builtin_ia32_rcp28ps_mask:
case X86::BI__builtin_ia32_rsqrt28pd_mask:
@@ -3933,8 +3934,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_cvtss2sd_round_mask:
case X86::BI__builtin_ia32_getexpsd128_round_mask:
case X86::BI__builtin_ia32_getexpss128_round_mask:
+ case X86::BI__builtin_ia32_getexpsh128_round_mask:
case X86::BI__builtin_ia32_getmantpd512_mask:
case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
case X86::BI__builtin_ia32_maxsd_round_mask:
case X86::BI__builtin_ia32_maxss_round_mask:
case X86::BI__builtin_ia32_maxsh_round_mask:
@@ -3945,8 +3948,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_rcp28ss_round_mask:
case X86::BI__builtin_ia32_reducepd512_mask:
case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
case X86::BI__builtin_ia32_rndscalepd_mask:
case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
case X86::BI__builtin_ia32_rsqrt28sd_round_mask:
case X86::BI__builtin_ia32_rsqrt28ss_round_mask:
ArgNum = 4;
@@ -3961,14 +3966,17 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_fixupimmss_maskz:
case X86::BI__builtin_ia32_getmantsd_round_mask:
case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
case X86::BI__builtin_ia32_rangepd512_mask:
case X86::BI__builtin_ia32_rangeps512_mask:
case X86::BI__builtin_ia32_rangesd128_round_mask:
case X86::BI__builtin_ia32_rangess128_round_mask:
case X86::BI__builtin_ia32_reducesd_mask:
case X86::BI__builtin_ia32_reducess_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
case X86::BI__builtin_ia32_rndscalesd_round_mask:
case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
ArgNum = 5;
break;
case X86::BI__builtin_ia32_vcvtsd2si64:
@@ -3985,6 +3993,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vcvtsh2usi64:
case X86::BI__builtin_ia32_sqrtpd512:
case X86::BI__builtin_ia32_sqrtps512:
+ case X86::BI__builtin_ia32_sqrtph512:
ArgNum = 1;
HasRC = true;
break;
@@ -4057,15 +4066,18 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_subsh_round_mask:
case X86::BI__builtin_ia32_subss_round_mask:
case X86::BI__builtin_ia32_subsd_round_mask:
+ case X86::BI__builtin_ia32_scalefph512_mask:
case X86::BI__builtin_ia32_scalefpd512_mask:
case X86::BI__builtin_ia32_scalefps512_mask:
case X86::BI__builtin_ia32_scalefsd_round_mask:
case X86::BI__builtin_ia32_scalefss_round_mask:
+ case X86::BI__builtin_ia32_scalefsh_round_mask:
case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
case X86::BI__builtin_ia32_vcvtss2sh_round_mask:
case X86::BI__builtin_ia32_vcvtsd2sh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask:
+ case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_vfmaddsd3_mask:
case X86::BI__builtin_ia32_vfmaddsd3_maskz:
case X86::BI__builtin_ia32_vfmaddsd3_mask3:
@@ -4439,6 +4451,9 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_getmantps256_mask:
case X86::BI__builtin_ia32_getmantpd512_mask:
case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph128_mask:
+ case X86::BI__builtin_ia32_getmantph256_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v16hi:
i = 1; l = 0; u = 15;
@@ -4457,6 +4472,7 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_rangeps512_mask:
case X86::BI__builtin_ia32_getmantsd_round_mask:
case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
case X86::BI__builtin_ia32_vec_set_v16qi:
case X86::BI__builtin_ia32_vec_set_v16hi:
i = 2; l = 0; u = 15;
@@ -4509,12 +4525,16 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_rndscalepd_256_mask:
case X86::BI__builtin_ia32_rndscaleps_mask:
case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
case X86::BI__builtin_ia32_reducepd128_mask:
case X86::BI__builtin_ia32_reducepd256_mask:
case X86::BI__builtin_ia32_reducepd512_mask:
case X86::BI__builtin_ia32_reduceps128_mask:
case X86::BI__builtin_ia32_reduceps256_mask:
case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph128_mask:
+ case X86::BI__builtin_ia32_reduceph256_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
case X86::BI__builtin_ia32_prold512:
case X86::BI__builtin_ia32_prolq512:
case X86::BI__builtin_ia32_prold128:
@@ -4533,8 +4553,12 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_fpclassps256_mask:
case X86::BI__builtin_ia32_fpclassps512_mask:
case X86::BI__builtin_ia32_fpclasspd512_mask:
+ case X86::BI__builtin_ia32_fpclassph128_mask:
+ case X86::BI__builtin_ia32_fpclassph256_mask:
+ case X86::BI__builtin_ia32_fpclassph512_mask:
case X86::BI__builtin_ia32_fpclasssd_mask:
case X86::BI__builtin_ia32_fpclassss_mask:
+ case X86::BI__builtin_ia32_fpclasssh_mask:
case X86::BI__builtin_ia32_pslldqi128_byteshift:
case X86::BI__builtin_ia32_pslldqi256_byteshift:
case X86::BI__builtin_ia32_pslldqi512_byteshift:
@@ -4645,6 +4669,8 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_reducess_mask:
case X86::BI__builtin_ia32_rndscalesd_round_mask:
case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
i = 4; l = 0; u = 255;
break;
}
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
index d4fe44bc259ee..42591662606eb 100644
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -1542,6 +1542,537 @@ __m128i test_mm_cvtsi16_si128(short A) {
return _mm_cvtsi16_si128(A);
}
+__m512h test_mm512_rcp_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
+ return _mm512_rcp_ph(__A);
+}
+
+__m512h test_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
+ return (__m512h)_mm512_mask_rcp_ph(__W, __U, __A);
+}
+
+__m512h test_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
+ return _mm512_maskz_rcp_ph(__U, __A);
+}
+
+__m512h test_mm512_rsqrt_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
+ return _mm512_rsqrt_ph(__A);
+}
+
+__m512h test_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
+ return (__m512h)_mm512_mask_rsqrt_ph(__W, __U, __A);
+}
+
+__m512h test_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
+ return _mm512_maskz_rsqrt_ph(__U, __A);
+}
+
+__m512h test_mm512_getmant_round_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_getmant_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_getmant_round_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_getmant_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_getmant_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_mask_getmant_round_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_getmant_round_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_getmant_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_maskz_getmant_round_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_getmant_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m512h test_mm512_mask_getmant_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m512h test_mm512_maskz_getmant_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
+ return _mm512_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m512h test_mm512_scalef_round_ph(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_scalef_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_scalef_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_scalef_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_scalef_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_mask_scalef_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_scalef_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_scalef_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_maskz_scalef_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_scalef_ph(__m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_scalef_ph(__A, __B);
+}
+
+__m512h test_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_mask_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_mask_scalef_ph(__W, __U, __A, __B);
+}
+
+__m512h test_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ // CHECK-LABEL: @test_mm512_maskz_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
+ return _mm512_maskz_scalef_ph(__U, __A, __B);
+}
+
+__m512h test_mm512_mask_roundscale_ph(__m512h __W, __mmask16 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_mask_roundscale_ph(__W, __U, __A, 1);
+}
+
+__m512h test_mm512_maskz_roundscale_ph(__mmask16 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_maskz_roundscale_ph(__U, __A, 1);
+}
+
+__m512h test_mm512_mask_roundscale_round_ph(__m512h __A, __mmask16 __U, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_roundscale_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_mask_roundscale_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_roundscale_round_ph(__m512h __A, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_maskz_roundscale_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_roundscale_round_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_roundscale_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_roundscale_round_ph(__A, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_roundscale_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
+ return _mm512_roundscale_ph(__A, 3);
+}
+
+__m512h test_mm512_getexp_round_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_getexp_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_getexp_round_ph(__A, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_getexp_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_getexp_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_mask_getexp_round_ph(__W, __U, __A, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_getexp_round_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_getexp_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_maskz_getexp_round_ph(__U, __A, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_getexp_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_getexp_ph(__A);
+}
+
+__m512h test_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_mask_getexp_ph(__W, __U, __A);
+}
+
+__m512h test_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
+ return _mm512_maskz_getexp_ph(__U, __A);
+}
+
+__m512h test_mm512_mask_reduce_ph(__m512h __W, __mmask16 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_mask_reduce_ph(__W, __U, __A, 1);
+}
+
+__m512h test_mm512_maskz_reduce_ph(__mmask16 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_maskz_reduce_ph(__U, __A, 1);
+}
+
+__m512h test_mm512_mask_reduce_round_ph(__m512h __A, __mmask16 __U, __m512h __C) {
+ // CHECK-LABEL: @test_mm512_mask_reduce_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_mask_reduce_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_reduce_round_ph(__m512h __A, __mmask16 __U) {
+ // CHECK-LABEL: @test_mm512_maskz_reduce_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_maskz_reduce_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_reduce_round_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_reduce_round_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_reduce_round_ph(__A, 3, _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_reduce_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
+ return _mm512_reduce_ph(__A, 3);
+}
+__m128h test_mm_rcp_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_rcp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
+ return _mm_rcp_sh(__A, __B);
+}
+
+__m128h test_mm_mask_rcp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_rcp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
+ return _mm_mask_rcp_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_rcp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_rcp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
+ return _mm_maskz_rcp_sh(__U, __A, __B);
+}
+
+__m128h test_mm_rsqrt_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_rsqrt_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
+ return _mm_rsqrt_sh(__A, __B);
+}
+
+__m128h test_mm_mask_rsqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_rsqrt_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
+ return _mm_mask_rsqrt_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_rsqrt_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
+ return _mm_maskz_rsqrt_sh(__U, __A, __B);
+}
+
+__m128h test_mm_getmant_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_getmant_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_getmant_round_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8);
+}
+
+__m128h test_mm_getmant_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_getmant_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_getmant_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+}
+
+__m128h test_mm_mask_getmant_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_getmant_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_mask_getmant_sh(__W, __U, __A, __B, 1, 2);
+}
+
+__m128h test_mm_mask_getmant_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_getmant_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_mask_getmant_round_sh(__W, __U, __A, __B, 1, 2, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_getmant_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_getmant_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_maskz_getmant_sh(__U, __A, __B, 1, 2);
+}
+
+__m128h test_mm_maskz_getmant_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_getmant_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
+ return _mm_maskz_getmant_round_sh(__U, __A, __B, 1, 2, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_getexp_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_getexp_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_getexp_round_sh(__A, __B, 8);
+}
+
+__m128h test_mm_getexp_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_getexp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_getexp_sh(__A, __B);
+}
+
+__m128h test_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_getexp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_mask_getexp_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_mask_getexp_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_getexp_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_mask_getexp_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_getexp_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_maskz_getexp_sh(__U, __A, __B);
+}
+
+__m128h test_mm_maskz_getexp_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_getexp_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
+ return _mm_maskz_getexp_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_scalef_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_scalef_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11)
+ return _mm_scalef_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_scalef_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_scalef_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
+ return _mm_scalef_sh(__A, __B);
+}
+
+__m128h test_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_scalef_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
+ return _mm_mask_scalef_sh(__W, __U, __A, __B);
+}
+
+__m128h test_mm_mask_scalef_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_scalef_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11)
+ return _mm_mask_scalef_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_scalef_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
+ return _mm_maskz_scalef_sh(__U, __A, __B);
+}
+
+__m128h test_mm_maskz_scalef_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_scalef_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11)
+ return _mm_maskz_scalef_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_roundscale_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_roundscale_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_roundscale_round_sh(__A, __B, 3, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_roundscale_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_roundscale_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_roundscale_sh(__A, __B, 3);
+}
+
+__m128h test_mm_mask_roundscale_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_roundscale_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_mask_roundscale_sh(__W, __U, __A, __B, 3);
+}
+
+__m128h test_mm_mask_roundscale_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_roundscale_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_mask_roundscale_round_sh(__W, __U, __A, __B, 3, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_roundscale_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_roundscale_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_maskz_roundscale_round_sh(__U, __A, __B, 3, _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_roundscale_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_roundscale_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
+ return _mm_maskz_roundscale_sh(__U, __A, __B, 3);
+}
+
+__m128h test_mm_reduce_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_reduce_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_reduce_sh(__A, __B, 4);
+}
+
+__m128h test_mm_mask_reduce_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_reduce_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_mask_reduce_sh(__W, __U, __A, __B, 4);
+}
+
+__m128h test_mm_maskz_reduce_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_reduce_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_maskz_reduce_sh(__U, __A, __B, 4);
+}
+
+__m128h test_mm_reduce_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_reduce_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_reduce_round_sh(__A, __B, 4, 8);
+}
+
+__m128h test_mm_mask_reduce_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_reduce_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_mask_reduce_round_sh(__W, __U, __A, __B, 4, 8);
+}
+
+__m128h test_mm_maskz_reduce_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_reduce_round_sh
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
+ return _mm_maskz_reduce_round_sh(__U, __A, __B, 4, 8);
+}
+
+__m512h test_mm512_sqrt_round_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_sqrt_round_ph
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
+ return _mm512_sqrt_round_ph(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_mask_sqrt_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_sqrt_round_ph
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_sqrt_round_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_maskz_sqrt_round_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ph
+ // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
+ return _mm512_maskz_sqrt_round_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m512h test_mm512_sqrt_ph(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_sqrt_ph
+ // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
+ return _mm512_sqrt_ph(__A);
+}
+__m512h test_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_sqrt_ph
+ // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
+ return _mm512_mask_sqrt_ph(__W, __U, __A);
+}
+__m512h test_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_maskz_sqrt_ph
+ // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
+ // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
+ return _mm512_maskz_sqrt_ph(__U, __A);
+}
+
+__m128h test_mm_sqrt_round_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_sqrt_round_sh
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11)
+ return _mm_sqrt_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_mask_sqrt_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_sqrt_round_sh
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11)
+ return _mm_mask_sqrt_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_maskz_sqrt_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_sqrt_round_sh
+ // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11)
+ return _mm_maskz_sqrt_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m128h test_mm_sqrt_sh(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_sqrt_sh
+ // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
+ return _mm_sqrt_sh(__A, __B);
+}
+__m128h test_mm_mask_sqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_sqrt_sh
+ // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
+ return _mm_mask_sqrt_sh(__W, __U, __A, __B);
+}
+__m128h test_mm_maskz_sqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_sqrt_sh
+ // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
+ return _mm_maskz_sqrt_sh(__U, __A, __B);
+}
+
+__mmask32 test_mm512_mask_fpclass_ph_mask(__mmask32 __U, __m512h __A) {
+ // CHECK-LABEL: @test_mm512_mask_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512
+ return _mm512_mask_fpclass_ph_mask(__U, __A, 4);
+}
+
+__mmask32 test_mm512_fpclass_ph_mask(__m512h __A) {
+ // CHECK-LABEL: @test_mm512_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512
+ return _mm512_fpclass_ph_mask(__A, 4);
+}
+
+__mmask8 test_mm_fpclass_sh_mask(__m128h __A) {
+ // CHECK-LABEL: @test_mm_fpclass_sh_mask
+ // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh
+ return _mm_fpclass_sh_mask(__A, 2);
+}
+
+__mmask8 test_mm_mask_fpclass_sh_mask(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_fpclass_sh_mask
+ // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh
+ return _mm_mask_fpclass_sh_mask(__U, __A, 2);
+}
+
__m128h test_mm512_cvt_roundpd_ph(__m512d A) {
// CHECK-LABEL: test_mm512_cvt_roundpd_ph
// CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
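As a usage reference for the new scalar FP16 helpers exercised above, a minimal sketch (illustrative only, not part of the patch; build with -mavx512fp16):

#include <immintrin.h>

// Only lane 0 carries the computed half-precision value; the upper lanes are
// passed through, as with the existing SS/SD scalar forms.
__m128h rcp_then_sqrt(__m128h a, __m128h b, __mmask8 m) {
  __m128h r = _mm_mask_rcp_sh(a, m, a, b); // r[0] = m ? ~1/b[0] : a[0]
  return _mm_sqrt_sh(a, r);                // result[0] = sqrt(r[0])
}
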
diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
index 0d020ccd1452f..cb99d655f21c6 100644
--- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
@@ -1215,6 +1215,320 @@ __mmask8 test_mm_mask_cmp_ph_mask_true_us(__mmask8 m, __m128h a, __m128h b) {
return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US);
}
+__m256h test_mm256_rcp_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256
+ return _mm256_rcp_ph(__A);
+}
+
+__m256h test_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256
+ return (__m256h)_mm256_mask_rcp_ph(__W, __U, __A);
+}
+
+__m256h test_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256
+ return _mm256_maskz_rcp_ph(__U, __A);
+}
+
+__m128h test_mm_rcp_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128
+ return _mm_rcp_ph(__A);
+}
+
+__m128h test_mm_mask_rcp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128
+ return (__m128h)_mm_mask_rcp_ph(__W, __U, __A);
+}
+
+__m128h test_mm_maskz_rcp_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_rcp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128
+ return _mm_maskz_rcp_ph(__U, __A);
+}
+
+__m256h test_mm256_rsqrt_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256
+ return _mm256_rsqrt_ph(__A);
+}
+
+__m256h test_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256
+ return (__m256h)_mm256_mask_rsqrt_ph(__W, __U, __A);
+}
+
+__m256h test_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256
+ return _mm256_maskz_rsqrt_ph(__U, __A);
+}
+
+__m128h test_mm_rsqrt_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128
+ return _mm_rsqrt_ph(__A);
+}
+
+__m128h test_mm_mask_rsqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128
+ return (__m128h)_mm_mask_rsqrt_ph(__W, __U, __A);
+}
+
+__m128h test_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_rsqrt_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128
+ return _mm_maskz_rsqrt_ph(__U, __A);
+}
+
+__m128h test_mm_getmant_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128
+ return _mm_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m128h test_mm_mask_getmant_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128
+ return _mm_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m128h test_mm_maskz_getmant_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128
+ return _mm_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m256h test_mm256_getmant_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256
+ return _mm256_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m256h test_mm256_mask_getmant_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256
+ return _mm256_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m256h test_mm256_maskz_getmant_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_getmant_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256
+ return _mm256_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
+}
+
+__m128h test_mm_getexp_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128
+ return _mm_getexp_ph(__A);
+}
+
+__m128h test_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128
+ return _mm_mask_getexp_ph(__W, __U, __A);
+}
+
+__m128h test_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128
+ return _mm_maskz_getexp_ph(__U, __A);
+}
+
+__m256h test_mm256_getexp_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256
+ return _mm256_getexp_ph(__A);
+}
+
+__m256h test_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256
+ return _mm256_mask_getexp_ph(__W, __U, __A);
+}
+
+__m256h test_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_getexp_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256
+ return _mm256_maskz_getexp_ph(__U, __A);
+}
+
+__m128h test_mm_scalef_ph(__m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128
+ return _mm_scalef_ph(__A, __B);
+}
+
+__m128h test_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_mask_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128
+ return _mm_mask_scalef_ph(__W, __U, __A, __B);
+}
+
+__m128h test_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
+ // CHECK-LABEL: @test_mm_maskz_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128
+ return _mm_maskz_scalef_ph(__U, __A, __B);
+}
+
+__m256h test_mm256_scalef_ph(__m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256
+ return _mm256_scalef_ph(__A, __B);
+}
+
+__m256h test_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_mask_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256
+ return _mm256_mask_scalef_ph(__W, __U, __A, __B);
+}
+
+__m256h test_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ // CHECK-LABEL: @test_mm256_maskz_scalef_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256
+ return _mm256_maskz_scalef_ph(__U, __A, __B);
+}
+
+__m128h test_mm_roundscale_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128
+ return _mm_roundscale_ph(__A, 4);
+}
+
+__m128h test_mm_mask_roundscale_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128
+ return _mm_mask_roundscale_ph(__W, __U, __A, 4);
+}
+
+__m128h test_mm_maskz_roundscale_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128
+ return _mm_maskz_roundscale_ph(__U, __A, 4);
+}
+
+__m256h test_mm256_roundscale_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256
+ return _mm256_roundscale_ph(__A, 4);
+}
+
+__m256h test_mm256_mask_roundscale_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256
+ return _mm256_mask_roundscale_ph(__W, __U, __A, 4);
+}
+
+__m256h test_mm256_maskz_roundscale_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_roundscale_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256
+ return _mm256_maskz_roundscale_ph(__U, __A, 4);
+}
+
+__m128h test_mm_reduce_ph(__m128h __A) {
+ // CHECK-LABEL: @test_mm_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128
+ return _mm_reduce_ph(__A, 4);
+}
+
+__m128h test_mm_mask_reduce_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128
+ return _mm_mask_reduce_ph(__W, __U, __A, 4);
+}
+
+__m128h test_mm_maskz_reduce_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128
+ return _mm_maskz_reduce_ph(__U, __A, 4);
+}
+
+__m256h test_mm256_reduce_ph(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256
+ return _mm256_reduce_ph(__A, 4);
+}
+
+__m256h test_mm256_mask_reduce_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256
+ return _mm256_mask_reduce_ph(__W, __U, __A, 4);
+}
+
+__m256h test_mm256_maskz_reduce_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_reduce_ph
+ // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256
+ return _mm256_maskz_reduce_ph(__U, __A, 4);
+}
+__m128h test_mm_sqrt_ph(__m128h x) {
+ // CHECK-LABEL: test_mm_sqrt_ph
+ // CHECK: call <8 x half> @llvm.sqrt.v8f16(<8 x half> {{.*}})
+ return _mm_sqrt_ph(x);
+}
+
+__m256h test_mm256_sqrt_ph(__m256h A) {
+ // CHECK-LABEL: test_mm256_sqrt_ph
+ // CHECK: call <16 x half> @llvm.sqrt.v16f16(<16 x half> %{{.*}})
+ return _mm256_sqrt_ph(A);
+}
+
+__m128h test_mm_mask_sqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_sqrt_ph
+ // CHECK: @llvm.sqrt.v8f16
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_mask_sqrt_ph(__W, __U, __A);
+}
+
+__m128h test_mm_maskz_sqrt_ph(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_maskz_sqrt_ph
+ // CHECK: @llvm.sqrt.v8f16
+ // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}
+ return _mm_maskz_sqrt_ph(__U, __A);
+}
+
+__m256h test_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_sqrt_ph
+ // CHECK: @llvm.sqrt.v16f16
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_sqrt_ph(__W, __U, __A);
+}
+
+__m256h test_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_maskz_sqrt_ph
+ // CHECK: @llvm.sqrt.v16f16
+ // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_sqrt_ph(__U, __A);
+}
+__mmask8 test_mm_mask_fpclass_ph_mask(__mmask8 __U, __m128h __A) {
+ // CHECK-LABEL: @test_mm_mask_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.128
+ return _mm_mask_fpclass_ph_mask(__U, __A, 2);
+}
+
+__mmask8 test_mm_fpclass_ph_mask(__m128h __A) {
+ // CHECK-LABEL: @test_mm_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.128
+ return _mm_fpclass_ph_mask(__A, 2);
+}
+
+__mmask16 test_mm256_mask_fpclass_ph_mask(__mmask16 __U, __m256h __A) {
+ // CHECK-LABEL: @test_mm256_mask_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.256
+ return _mm256_mask_fpclass_ph_mask(__U, __A, 2);
+}
+
+__mmask16 test_mm256_fpclass_ph_mask(__m256h __A) {
+ // CHECK-LABEL: @test_mm256_fpclass_ph_mask
+ // CHECK: @llvm.x86.avx512fp16.fpclass.ph.256
+ return _mm256_fpclass_ph_mask(__A, 2);
+}
+
__m128h test_mm_cvtpd_ph(__m128d A) {
// CHECK-LABEL: test_mm_cvtpd_ph
// CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.128
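The same pattern for the 128/256-bit forms covered by this file, as a minimal sketch (illustrative only; needs both -mavx512fp16 and -mavx512vl):

#include <immintrin.h>

// Round every half-precision lane of x toward -infinity under mask m (imm 1 =
// round-down, scale field 0, i.e. round to an integral value), then take the
// per-lane reciprocal approximation of the result.
__m256h floor_recip(__m256h src, __mmask16 m, __m256h x) {
  __m256h f = _mm256_mask_roundscale_ph(src, m, x, 1);
  return _mm256_rcp_ph(f);
}
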
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 72e9c3404775d..c79c6118db680 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5541,4 +5541,172 @@ let TargetPrefix = "x86" in {
: GCCBuiltin<"__builtin_ia32_vcvttsh2usi64">,
Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
[ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+
+ def int_x86_avx512fp16_sqrt_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_sqrt_sh
+ : Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_128
+ : GCCBuiltin<"__builtin_ia32_rsqrtph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_256
+ : GCCBuiltin<"__builtin_ia32_rsqrtph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_512
+ : GCCBuiltin<"__builtin_ia32_rsqrtph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_sh
+ : GCCBuiltin<"__builtin_ia32_rsqrtsh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_128
+ : GCCBuiltin<"__builtin_ia32_rcpph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_256
+ : GCCBuiltin<"__builtin_ia32_rcpph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_512
+ : GCCBuiltin<"__builtin_ia32_rcpph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_sh
+ : GCCBuiltin<"__builtin_ia32_rcpsh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_128
+ : GCCBuiltin<"__builtin_ia32_reduceph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_256
+ : GCCBuiltin<"__builtin_ia32_reduceph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_512
+ : GCCBuiltin<"__builtin_ia32_reduceph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_reduce_sh
+ : GCCBuiltin<"__builtin_ia32_reducesh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_128
+ : Intrinsic<[ llvm_v8i1_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_256
+ : Intrinsic<[ llvm_v16i1_ty ], [ llvm_v16f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_512
+ : Intrinsic<[ llvm_v32i1_ty ], [ llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_fpclass_sh
+ : GCCBuiltin<"__builtin_ia32_fpclasssh_mask">,
+ Intrinsic<[ llvm_i8_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_128
+ : GCCBuiltin<"__builtin_ia32_getexpph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_256
+ : GCCBuiltin<"__builtin_ia32_getexpph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_512
+ : GCCBuiltin<"__builtin_ia32_getexpph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_getexp_sh
+ : GCCBuiltin<"__builtin_ia32_getexpsh128_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_128
+ : GCCBuiltin<"__builtin_ia32_getmantph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_256
+ : GCCBuiltin<"__builtin_ia32_getmantph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_512
+ : GCCBuiltin<"__builtin_ia32_getmantph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_getmant_sh
+ : GCCBuiltin<"__builtin_ia32_getmantsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty,
+ llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_128
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_256
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_512
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_sh
+ : GCCBuiltin<"__builtin_ia32_rndscalesh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_128
+ : GCCBuiltin<"__builtin_ia32_scalefph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_256
+ : GCCBuiltin<"__builtin_ia32_scalefph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_512
+ : GCCBuiltin<"__builtin_ia32_scalefph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_scalef_sh
+ : GCCBuiltin<"__builtin_ia32_scalefsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
}
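The interval, sign-control and rounding/SAE operands above are ImmArg operands, so the matching arguments of the C intrinsics must be integer constant expressions. A minimal sketch of legal usage for the scalar getmant form (illustrative only, reusing the selectors from the tests above):

#include <immintrin.h>

// Extract the mantissa of b[0] normalized to [1, 2), keeping the sign of the
// source; both selectors are compile-time constants, as the ImmArg markings on
// int_x86_avx512fp16_mask_getmant_sh require.
__m128h mant_1_2(__m128h a, __m128h b) {
  return _mm_getmant_sh(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
}
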
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3595bc57e4d88..d396d5c0df7e0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1917,6 +1917,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, VT, Legal);
setOperationAction(ISD::FDIV, VT, Legal);
setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+ setOperationAction(ISD::FSQRT, VT, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::LOAD, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);
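With these operations Legal for the FP16 vector types, generic IR such as the llvm.sqrt.v32f16 call emitted for _mm512_sqrt_ph in the builtins tests above selects the native instruction rather than being scalarized or promoted to float. A minimal sketch (illustrative only):

#include <immintrin.h>

// Plain vector arithmetic on __m512h feeding the unprefixed sqrt intrinsic;
// the whole expression can stay in half precision (vmulph/vaddph into vsqrtph).
__m512h dist(__m512h x, __m512h y) {
  return _mm512_sqrt_ph(x * x + y * y);
}
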
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 7a2b6ade1796c..c92abc7e8c95d 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2816,24 +2816,28 @@ multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
}
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, X86SchedWriteWidths sched,
- Predicate prd> {
+ bits<8> opcScalar, X86SchedWriteWidths sched> {
+ defm PH : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f16_info, opcVec,
+ sched, HasFP16>,
+ EVEX_CD8<16, CD8VF>, AVX512PSIi8Base, TA;
+ defm SHZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f16x_info, HasFP16>,
+ EVEX_CD8<16, CD8VT1>, AVX512PSIi8Base, TA;
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- sched, prd>,
- EVEX_CD8<32, CD8VF>;
+ sched, HasDQI>,
+ EVEX_CD8<32, CD8VF>, AVX512AIi8Base;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- sched, prd>,
- EVEX_CD8<64, CD8VF> , VEX_W;
+ sched, HasDQI>,
+ EVEX_CD8<64, CD8VF>, AVX512AIi8Base, VEX_W;
defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
- sched.Scl, f32x_info, prd>, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ sched.Scl, f32x_info, HasDQI>, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>, AVX512AIi8Base;
defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
- sched.Scl, f64x_info, prd>, VEX_LIG,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ sched.Scl, f64x_info, HasDQI>, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>, AVX512AIi8Base, VEX_W;
}
-defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp,
- HasDQI>, AVX512AIi8Base, EVEX;
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp>, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -5957,35 +5961,50 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm PHZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32f16_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>,
+ EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>;
+ }
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, EVEX_CD8<32, CD8VF>, T8PD;
defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
+ EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W, T8PD;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
- EVEX_V128, EVEX_CD8<32, CD8VF>;
+ EVEX_V128, EVEX_CD8<32, CD8VF>, T8PD;
defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
- EVEX_V256, EVEX_CD8<32, CD8VF>;
+ EVEX_V256, EVEX_CD8<32, CD8VF>, T8PD;
defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
- EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
- EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
+ }
+
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8f16x_info>,
+ EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6PD;
+ defm PHZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16f16x_info>,
+ EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6PD;
}
}
defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
- SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
+ SchedWriteFAdd>, NotEVEX2VEXConvertible;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
@@ -9254,10 +9273,11 @@ let Defs = [EFLAGS], Predicates = [HasFP16] in {
}
}
-/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd, rcpsh, rsqrtsh
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -9272,6 +9292,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
+defm VRCPSHZ : avx512_fp14_s<0x4D, "vrcpsh", X86rcp14s, SchedWriteFRcp.Scl,
+ f16x_info, HasFP16>, EVEX_CD8<16, CD8VT1>,
+ T_MAP6PD;
+defm VRSQRTSHZ : avx512_fp14_s<0x4F, "vrsqrtsh", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f16x_info, HasFP16>,
+ EVEX_CD8<16, CD8VT1>, T_MAP6PD;
+let Uses = [MXCSR] in {
defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
f32x_info>, EVEX_CD8<32, CD8VT1>,
T8PD;
@@ -9284,6 +9311,7 @@ defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
SchedWriteFRsqrt.Scl, f64x_info>, VEX_W,
EVEX_CD8<64, CD8VT1>, T8PD;
+}
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -9307,33 +9335,45 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-let Uses = [MXCSR] in
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
- defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
- v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM,
- v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ let Uses = [MXCSR] in {
+ defm 14PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"), OpNode, sched.ZMM,
+ v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm 14PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"), OpNode, sched.ZMM,
+ v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.ZMM,
+ v32f16_info>, EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
// Define only if AVX512VL feature is present.
- let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, sched.XMM, v4f32x_info>,
- EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, sched.YMM, v8f32x_info>,
- EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, sched.XMM, v2f64x_info>,
- EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, sched.YMM, v4f64x_info>,
- EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ let Predicates = [HasVLX], Uses = [MXCSR] in {
+ defm 14PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"),
+ OpNode, sched.XMM, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm 14PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"),
+ OpNode, sched.YMM, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm 14PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"),
+ OpNode, sched.XMM, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm 14PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"),
+ OpNode, sched.YMM, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
+ OpNode, sched.XMM, v8f16x_info>,
+ EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
+ OpNode, sched.YMM, v16f16x_info>,
+ EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
+defm VRSQRT : avx512_fp14_p_vl_all<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>;
+defm VRCP : avx512_fp14_p_vl_all<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -9363,20 +9403,29 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V;
defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD, EVEX_4V;
+}
+
+multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
+ let Predicates = [HasFP16] in
+ defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>,
+ EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V;
}
let Predicates = [HasERI] in {
defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
- SchedWriteFRcp.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRcp.Scl>;
defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
- SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRsqrt.Scl>;
}
defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
- SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRnd.Scl>,
+ avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
+ SchedWriteFRnd.Scl>;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -9440,6 +9489,19 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
}
}
+multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_fp28_p<opc, OpcodeStr#"ph", v32f16_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"ph", v32f16_info, OpNodeSAE, sched.ZMM>,
+ T_MAP6PD, EVEX_V512, EVEX_CD8<16, CD8VF>;
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp28_p<opc, OpcodeStr#"ph", v8f16x_info, OpNode, sched.XMM>,
+ EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_fp28_p<opc, OpcodeStr#"ph", v16f16x_info, OpNode, sched.YMM>,
+ EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ }
+}
let Predicates = [HasERI] in {
defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
SchedWriteFRsqrt>, EVEX;
@@ -9450,6 +9512,8 @@ let Predicates = [HasERI] in {
}
defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
SchedWriteFRnd>,
+ avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
+ SchedWriteFRnd>,
avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp,
SchedWriteFRnd>, EVEX;
@@ -9487,6 +9551,18 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.XMM, v8f16x_info>,
+ EVEX_V128, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.YMM, v16f16x_info>,
+ EVEX_V256, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ }
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -9513,6 +9589,10 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR] in
multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -9522,8 +9602,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Name> {
- let ExeDomain = _.ExeDomain in {
+ X86VectorVTInfo _, string Name, Predicate prd = HasAVX512> {
+ let ExeDomain = _.ExeDomain, Predicates = [prd] in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -9545,7 +9625,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
(i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
- let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -9558,13 +9638,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
}
}
- let Predicates = [HasAVX512] in {
+ let Predicates = [prd] in {
def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)),
(!cast<Instruction>(Name#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
}
- let Predicates = [HasAVX512, OptForSize] in {
+ let Predicates = [prd, OptForSize] in {
def : Pat<(_.EltVT (any_fsqrt (load addr:$src))),
(!cast<Instruction>(Name#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
@@ -9573,6 +9653,8 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>,
+ EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS;
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
@@ -9637,6 +9719,12 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
}
+let Predicates = [HasFP16] in
+defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh",
+ SchedWriteFRnd.Scl, f16x_info>,
+ AVX512PSIi8Base, TA, EVEX_4V,
+ EVEX_CD8<16, CD8VT1>;
+
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
AVX512AIi8Base, EVEX_4V, VEX_LIG,
@@ -9665,6 +9753,9 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
}
}
+defm : avx512_masked_scalar<fsqrt, "SQRTSHZ", X86Movsh,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v8f16x_info,
+ fp16imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasFP16>;
defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
(v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
@@ -10883,24 +10974,26 @@ multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched, Predicate prd>{
+ defm PH : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f16_info,
+ opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, HasFP16>,
+ AVX512PSIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>;
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
- EVEX_CD8<32, CD8VF>;
+ AVX512AIi8Base, EVEX, EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
- EVEX_CD8<64, CD8VF>, VEX_W;
+ AVX512AIi8Base, EVEX, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
X86VReduce, X86VReduce, X86VReduceSAE,
- SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasDQI>;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
X86any_VRndScale, X86VRndScale, X86VRndScaleSAE,
- SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasAVX512>;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
X86VGetMant, X86VGetMant, X86VGetMantSAE,
- SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasAVX512>;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
@@ -10924,6 +11017,9 @@ defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>,
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
@@ -10931,6 +11027,9 @@ defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>,
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
@@ -12193,6 +12292,7 @@ multiclass AVX512_scalar_unary_math_patterns<SDPatternOperator OpNode, string Op
defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSH", X86Movsh, v8f16x_info>;
//===----------------------------------------------------------------------===//
// AES instructions
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 44007b34fcfe2..959c8d4a2d886 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -815,10 +815,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 },
{ X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 },
{ X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 },
+ { X86::VFPCLASSPHZ128rr, X86::VFPCLASSPHZ128rm, 0 },
+ { X86::VFPCLASSPHZ256rr, X86::VFPCLASSPHZ256rm, 0 },
+ { X86::VFPCLASSPHZrr, X86::VFPCLASSPHZrm, 0 },
{ X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 },
{ X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 },
{ X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 },
{ X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSSHZrr, X86::VFPCLASSSHZrm, TB_NO_REVERSE },
{ X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE },
{ X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 },
{ X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
@@ -829,12 +833,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 },
{ X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 },
{ X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 },
+ { X86::VGETEXPPHZ128r, X86::VGETEXPPHZ128m, 0 },
+ { X86::VGETEXPPHZ256r, X86::VGETEXPPHZ256m, 0 },
+ { X86::VGETEXPPHZr, X86::VGETEXPPHZm, 0 },
{ X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 },
{ X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 },
{ X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 },
{ X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 },
{ X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 },
{ X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 },
+ { X86::VGETMANTPHZ128rri, X86::VGETMANTPHZ128rmi, 0 },
+ { X86::VGETMANTPHZ256rri, X86::VGETMANTPHZ256rmi, 0 },
+ { X86::VGETMANTPHZrri, X86::VGETMANTPHZrmi, 0 },
{ X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
{ X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
{ X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
@@ -1161,17 +1171,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VRCP14PSZr, X86::VRCP14PSZm, 0 },
{ X86::VRCP28PDZr, X86::VRCP28PDZm, 0 },
{ X86::VRCP28PSZr, X86::VRCP28PSZm, 0 },
+ { X86::VRCPPHZ128r, X86::VRCPPHZ128m, 0 },
+ { X86::VRCPPHZ256r, X86::VRCPPHZ256m, 0 },
+ { X86::VRCPPHZr, X86::VRCPPHZm, 0 },
{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 },
{ X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 },
{ X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 },
+ { X86::VREDUCEPHZ128rri, X86::VREDUCEPHZ128rmi, 0 },
+ { X86::VREDUCEPHZ256rri, X86::VREDUCEPHZ256rmi, 0 },
+ { X86::VREDUCEPHZrri, X86::VREDUCEPHZrmi, 0 },
{ X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 },
{ X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 },
{ X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 },
{ X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 },
{ X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 },
{ X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 },
+ { X86::VRNDSCALEPHZ128rri, X86::VRNDSCALEPHZ128rmi, 0 },
+ { X86::VRNDSCALEPHZ256rri, X86::VRNDSCALEPHZ256rmi, 0 },
+ { X86::VRNDSCALEPHZrri, X86::VRNDSCALEPHZrmi, 0 },
{ X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 },
{ X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 },
{ X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 },
@@ -1187,6 +1206,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 },
{ X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 },
{ X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 },
+ { X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128m, 0 },
+ { X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256m, 0 },
+ { X86::VRSQRTPHZr, X86::VRSQRTPHZm, 0 },
{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
@@ -1194,6 +1216,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 },
{ X86::VSQRTPDZr, X86::VSQRTPDZm, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPHZ128r, X86::VSQRTPHZ128m, 0 },
+ { X86::VSQRTPHZ256r, X86::VSQRTPHZ256m, 0 },
+ { X86::VSQRTPHZr, X86::VSQRTPHZm, 0 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
{ X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 },
{ X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 },
@@ -1864,26 +1889,38 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 },
{ X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 },
{ X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 },
+ { X86::VFPCLASSPHZ128rrk, X86::VFPCLASSPHZ128rmk, 0 },
+ { X86::VFPCLASSPHZ256rrk, X86::VFPCLASSPHZ256rmk, 0 },
+ { X86::VFPCLASSPHZrrk, X86::VFPCLASSPHZrmk, 0 },
{ X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 },
{ X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 },
{ X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 },
{ X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE },
+ { X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE },
{ X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE },
{ X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 },
{ X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 },
{ X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 },
+ { X86::VGETEXPPHZ128rkz, X86::VGETEXPPHZ128mkz, 0 },
+ { X86::VGETEXPPHZ256rkz, X86::VGETEXPPHZ256mkz, 0 },
+ { X86::VGETEXPPHZrkz, X86::VGETEXPPHZmkz, 0 },
{ X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 },
{ X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 },
{ X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 },
{ X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE },
+ { X86::VGETEXPSHZr, X86::VGETEXPSHZm, TB_NO_REVERSE },
{ X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE },
{ X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 },
{ X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 },
{ X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 },
+ { X86::VGETMANTPHZ128rrikz, X86::VGETMANTPHZ128rmikz, 0 },
+ { X86::VGETMANTPHZ256rrikz, X86::VGETMANTPHZ256rmikz, 0 },
+ { X86::VGETMANTPHZrrikz, X86::VGETMANTPHZrmikz, 0 },
{ X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 },
{ X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 },
{ X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 },
{ X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrri, X86::VGETMANTSHZrmi, TB_NO_REVERSE },
{ X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 },
{ X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 },
@@ -2899,24 +2936,37 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 },
{ X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE },
{ X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE },
+ { X86::VRCPPHZ128rkz, X86::VRCPPHZ128mkz, 0 },
+ { X86::VRCPPHZ256rkz, X86::VRCPPHZ256mkz, 0 },
+ { X86::VRCPPHZrkz, X86::VRCPPHZmkz, 0 },
+ { X86::VRCPSHZrr, X86::VRCPSHZrm, TB_NO_REVERSE },
{ X86::VRCPSSr, X86::VRCPSSm, 0 },
{ X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
{ X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 },
{ X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 },
{ X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 },
+ { X86::VREDUCEPHZ128rrikz, X86::VREDUCEPHZ128rmikz, 0 },
+ { X86::VREDUCEPHZ256rrikz, X86::VREDUCEPHZ256rmikz, 0 },
+ { X86::VREDUCEPHZrrikz, X86::VREDUCEPHZrmikz, 0 },
{ X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 },
{ X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 },
{ X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 },
{ X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE },
+ { X86::VREDUCESHZrri, X86::VREDUCESHZrmi, TB_NO_REVERSE },
{ X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE },
{ X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 },
{ X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 },
{ X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 },
+ { X86::VRNDSCALEPHZ128rrikz, X86::VRNDSCALEPHZ128rmikz, 0 },
+ { X86::VRNDSCALEPHZ256rrikz, X86::VRNDSCALEPHZ256rmikz, 0 },
+ { X86::VRNDSCALEPHZrrikz, X86::VRNDSCALEPHZrmikz, 0 },
{ X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 },
{ X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 },
{ X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 },
{ X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 },
{ X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr, X86::VRNDSCALESHZm, 0 },
+ { X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 },
{ X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE },
{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
@@ -2935,15 +2985,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 },
{ X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE },
{ X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE },
+ { X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mkz, 0 },
+ { X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mkz, 0 },
+ { X86::VRSQRTPHZrkz, X86::VRSQRTPHZmkz, 0 },
+ { X86::VRSQRTSHZrr, X86::VRSQRTSHZrm, TB_NO_REVERSE },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 },
{ X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 },
{ X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 },
+ { X86::VSCALEFPHZ128rr, X86::VSCALEFPHZ128rm, 0 },
+ { X86::VSCALEFPHZ256rr, X86::VSCALEFPHZ256rm, 0 },
+ { X86::VSCALEFPHZrr, X86::VSCALEFPHZrm, 0 },
{ X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 },
{ X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 },
{ X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 },
{ X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrr, X86::VSCALEFSHZrm, TB_NO_REVERSE },
{ X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 },
{ X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 },
@@ -2966,6 +3024,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 },
{ X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 },
{ X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 },
+ { X86::VSQRTPHZ128rkz, X86::VSQRTPHZ128mkz, 0 },
+ { X86::VSQRTPHZ256rkz, X86::VSQRTPHZ256mkz, 0 },
+ { X86::VSQRTPHZrkz, X86::VSQRTPHZmkz, 0 },
{ X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 },
{ X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 },
{ X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 },
@@ -2973,6 +3034,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSHZr, X86::VSQRTSHZm, 0 },
+ { X86::VSQRTSHZr_Int, X86::VSQRTSHZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSSZr, X86::VSQRTSSZm, 0 },
{ X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
@@ -3539,18 +3602,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 },
{ X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 },
{ X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 },
+ { X86::VGETEXPPHZ128rk, X86::VGETEXPPHZ128mk, 0 },
+ { X86::VGETEXPPHZ256rk, X86::VGETEXPPHZ256mk, 0 },
+ { X86::VGETEXPPHZrk, X86::VGETEXPPHZmk, 0 },
{ X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 },
{ X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 },
{ X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 },
{ X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE },
+ { X86::VGETEXPSHZrkz, X86::VGETEXPSHZmkz, TB_NO_REVERSE },
{ X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE },
{ X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 },
{ X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 },
{ X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 },
+ { X86::VGETMANTPHZ128rrik, X86::VGETMANTPHZ128rmik, 0 },
+ { X86::VGETMANTPHZ256rrik, X86::VGETMANTPHZ256rmik, 0 },
+ { X86::VGETMANTPHZrrik, X86::VGETMANTPHZrmik, 0 },
{ X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 },
{ X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 },
{ X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 },
{ X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrrikz, X86::VGETMANTSHZrmikz, TB_NO_REVERSE },
{ X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 },
{ X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 },
@@ -4331,21 +4402,33 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 },
{ X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE },
{ X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE },
+ { X86::VRCPPHZ128rk, X86::VRCPPHZ128mk, 0 },
+ { X86::VRCPPHZ256rk, X86::VRCPPHZ256mk, 0 },
+ { X86::VRCPPHZrk, X86::VRCPPHZmk, 0 },
+ { X86::VRCPSHZrrkz, X86::VRCPSHZrmkz, TB_NO_REVERSE },
{ X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 },
{ X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 },
{ X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 },
+ { X86::VREDUCEPHZ128rrik, X86::VREDUCEPHZ128rmik, 0 },
+ { X86::VREDUCEPHZ256rrik, X86::VREDUCEPHZ256rmik, 0 },
+ { X86::VREDUCEPHZrrik, X86::VREDUCEPHZrmik, 0 },
{ X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 },
{ X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 },
{ X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 },
{ X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE },
+ { X86::VREDUCESHZrrikz, X86::VREDUCESHZrmikz, TB_NO_REVERSE },
{ X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE },
{ X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 },
{ X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 },
{ X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 },
+ { X86::VRNDSCALEPHZ128rrik, X86::VRNDSCALEPHZ128rmik, 0 },
+ { X86::VRNDSCALEPHZ256rrik, X86::VRNDSCALEPHZ256rmik, 0 },
+ { X86::VRNDSCALEPHZrrik, X86::VRNDSCALEPHZrmik, 0 },
{ X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 },
{ X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 },
{ X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 },
{ X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr_Intkz, X86::VRNDSCALESHZm_Intkz, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE },
{ X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 },
{ X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 },
@@ -4359,13 +4442,21 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 },
{ X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE },
{ X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE },
+ { X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mk, 0 },
+ { X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mk, 0 },
+ { X86::VRSQRTPHZrk, X86::VRSQRTPHZmk, 0 },
+ { X86::VRSQRTSHZrrkz, X86::VRSQRTSHZrmkz, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 },
{ X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 },
{ X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 },
+ { X86::VSCALEFPHZ128rrkz, X86::VSCALEFPHZ128rmkz, 0 },
+ { X86::VSCALEFPHZ256rrkz, X86::VSCALEFPHZ256rmkz, 0 },
+ { X86::VSCALEFPHZrrkz, X86::VSCALEFPHZrmkz, 0 },
{ X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 },
{ X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 },
{ X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 },
{ X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrrkz, X86::VSCALEFSHZrmkz, TB_NO_REVERSE },
{ X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 },
{ X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 },
@@ -4384,10 +4475,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 },
{ X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 },
{ X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 },
+ { X86::VSQRTPHZ128rk, X86::VSQRTPHZ128mk, 0 },
+ { X86::VSQRTPHZ256rk, X86::VSQRTPHZ256mk, 0 },
+ { X86::VSQRTPHZrk, X86::VSQRTPHZmk, 0 },
{ X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 },
{ X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 },
{ X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 },
{ X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE },
+ { X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE },
{ X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
@@ -4763,8 +4858,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
+ { X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE },
{ X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE },
{ X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrrik, X86::VGETMANTSHZrmik, TB_NO_REVERSE },
{ X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 },
{ X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 },
@@ -5320,21 +5417,29 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE },
{ X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE },
{ X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE },
+ { X86::VRCPSHZrrk, X86::VRCPSHZrmk, TB_NO_REVERSE },
{ X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE },
+ { X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE },
{ X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE },
{ X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr_Intk, X86::VRNDSCALESHZm_Intk, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE },
{ X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE },
{ X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE },
{ X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE },
{ X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE },
+ { X86::VRSQRTSHZrrk, X86::VRSQRTSHZrmk, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 },
{ X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 },
{ X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 },
+ { X86::VSCALEFPHZ128rrk, X86::VSCALEFPHZ128rmk, 0 },
+ { X86::VSCALEFPHZ256rrk, X86::VSCALEFPHZ256rmk, 0 },
+ { X86::VSCALEFPHZrrk, X86::VSCALEFPHZrmk, 0 },
{ X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 },
{ X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 },
{ X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 },
{ X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrrk, X86::VSCALEFSHZrmk, TB_NO_REVERSE },
{ X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 },
{ X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 },
@@ -5351,6 +5456,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
{ X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
{ X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE },
+ { X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE },
{ X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE },
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
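These additions follow the existing pattern of the X86 memory-fold tables: each row pairs a register-form opcode with its memory-form counterpart so the backend can fold a load (for example a reload of a spilled value) directly into the instruction, and TB_NO_REVERSE marks rows that must not be unfolded back into a separate load, typically because the memory form reads fewer bytes than the full register. A minimal sketch of the effect, not part of the patch, using the new VSQRTPHZr -> VSQRTPHZm entry:

define <32 x half> @sqrt_from_mem(<32 x half>* %p) {
  ; With the fold-table entry the backend may emit "vsqrtph (%rdi), %zmm0"
  ; instead of a separate vector load followed by the register form.
  %v = load <32 x half>, <32 x half>* %p
  %r = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %v)
  ret <32 x half> %r
}
declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)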
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index a57a956e8135e..7f0e151b9eba2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5269,6 +5269,29 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VRCP14SDZrm:
case X86::VRCP14SSZrr:
case X86::VRCP14SSZrm:
+ case X86::VRCPSHZrr:
+ case X86::VRCPSHZrm:
+ case X86::VRSQRTSHZrr:
+ case X86::VRSQRTSHZrm:
+ case X86::VREDUCESHZrmi:
+ case X86::VREDUCESHZrri:
+ case X86::VREDUCESHZrrib:
+ case X86::VGETEXPSHZr:
+ case X86::VGETEXPSHZrb:
+ case X86::VGETEXPSHZm:
+ case X86::VGETMANTSHZrri:
+ case X86::VGETMANTSHZrrib:
+ case X86::VGETMANTSHZrmi:
+ case X86::VRNDSCALESHZr:
+ case X86::VRNDSCALESHZr_Int:
+ case X86::VRNDSCALESHZrb_Int:
+ case X86::VRNDSCALESHZm:
+ case X86::VRNDSCALESHZm_Int:
+ case X86::VSQRTSHZr:
+ case X86::VSQRTSHZr_Int:
+ case X86::VSQRTSHZrb_Int:
+ case X86::VSQRTSHZm:
+ case X86::VSQRTSHZm_Int:
case X86::VRCP28SDZr:
case X86::VRCP28SDZrb:
case X86::VRCP28SDZm:
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index db7e42b20cb14..efc4811084f94 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -989,6 +989,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
X86_INTRINSIC_DATA(avx512fp16_add_ph_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512fp16_div_ph_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512fp16_mask_add_sh_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FADDS, X86ISD::FADDS_RND),
X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
@@ -998,12 +1001,52 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
X86_INTRINSIC_DATA(avx512fp16_mask_div_sh_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_fpclass_sh, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_sh, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_128, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_256, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_sh, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
X86_INTRINSIC_DATA(avx512fp16_mask_max_sh_round, INTR_TYPE_SCALAR_MASK_SAE,
X86ISD::FMAXS, X86ISD::FMAXS_SAE),
X86_INTRINSIC_DATA(avx512fp16_mask_min_sh_round, INTR_TYPE_SCALAR_MASK_SAE,
X86ISD::FMINS, X86ISD::FMINS_SAE),
X86_INTRINSIC_DATA(avx512fp16_mask_mul_sh_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_sh, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_sqrt_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FSUBS, X86ISD::FSUBS_RND),
X86_INTRINSIC_DATA(avx512fp16_mask_vcvtdq2ph_128, TRUNCATE_TO_REG,
@@ -1124,6 +1167,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512fp16_min_ph_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512fp16_min_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512fp16_sqrt_ph_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
/*fp16 scalar convert instruction*/
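Each X86_INTRINSIC_DATA row maps an intrinsic name to a lowering kind plus one or two DAG opcodes; where a second opcode is listed, it is the rounding/SAE variant used when the trailing round-control argument is not the default (4, current direction). As a small illustration, duplicating a test added further below only to tie the table to its effect, avx512fp16_sqrt_ph_512 selects ISD::FSQRT for the default argument and X86ISD::FSQRT_RND otherwise:

define <32 x half> @sqrt_rz(<32 x half> %a) {
  ; Rounding argument 11 requests {rz-sae}, so this goes through FSQRT_RND.
  %r = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a, i32 11)
  ret <32 x half> %r
}
declare <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half>, i32)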
diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
index 194b1c48c3846..e897c195b9068 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
@@ -274,6 +274,174 @@ entry:
ret <32 x i1> %0
}
+define half @fneg(half %x) {
+; CHECK-LABEL: fneg:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = fneg half %x
+ ret half %a
+}
+
+define half @fneg_idiom(half %x) {
+; CHECK-LABEL: fneg_idiom:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = fsub half -0.0, %x
+ ret half %a
+}
+
+define half @fabs(half %x) {
+; CHECK-LABEL: fabs:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.fabs.f16(half %x)
+ ret half %a
+}
+declare half @llvm.fabs.f16(half)
+
+define half @fcopysign(half %x, half %y) {
+; CHECK-LABEL: fcopysign:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call half @llvm.copysign.f16(half %x, half %y)
+ ret half %a
+}
+declare half @llvm.copysign.f16(half, half)
+
+define <8 x half> @fnegv8f16(<8 x half> %x) {
+; CHECK-LABEL: fnegv8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = fneg <8 x half> %x
+ ret <8 x half> %a
+}
+
+define <8 x half> @fneg_idiomv8f16(<8 x half> %x) {
+; CHECK-LABEL: fneg_idiomv8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = fsub <8 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x
+ ret <8 x half> %a
+}
+
+define <8 x half> @fabsv8f16(<8 x half> %x) {
+; CHECK-LABEL: fabsv8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.fabs.v8f16(<8 x half> %x)
+ ret <8 x half> %a
+}
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+
+define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) {
+; CHECK-LABEL: fcopysignv8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y)
+ ret <8 x half> %a
+}
+declare <8 x half> @llvm.copysign.v8f16(<8 x half>, <8 x half>)
+
+define <16 x half> @fnegv16f16(<16 x half> %x) {
+; CHECK-LABEL: fnegv16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %a = fneg <16 x half> %x
+ ret <16 x half> %a
+}
+
+define <16 x half> @fneg_idiomv16f16(<16 x half> %x) {
+; CHECK-LABEL: fneg_idiomv16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %a = fsub <16 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x
+ ret <16 x half> %a
+}
+
+define <16 x half> @fabsv16f16(<16 x half> %x) {
+; CHECK-LABEL: fabsv16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.fabs.v16f16(<16 x half> %x)
+ ret <16 x half> %a
+}
+declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
+
+define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) {
+; CHECK-LABEL: fcopysignv16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y)
+ ret <16 x half> %a
+}
+declare <16 x half> @llvm.copysign.v16f16(<16 x half>, <16 x half>)
+
+define <32 x half> @fnegv32f16(<32 x half> %x) {
+; CHECK-LABEL: fnegv32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = fneg <32 x half> %x
+ ret <32 x half> %a
+}
+
+define <32 x half> @fneg_idiomv32f16(<32 x half> %x) {
+; CHECK-LABEL: fneg_idiomv32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = fsub <32 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x
+ ret <32 x half> %a
+}
+
+define <32 x half> @fabsv32f16(<32 x half> %x) {
+; CHECK-LABEL: fabsv32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.fabs.v32f16(<32 x half> %x)
+ ret <32 x half> %a
+}
+declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
+
+define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) {
+; CHECK-LABEL: fcopysignv32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y)
+ ret <32 x half> %a
+}
+declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>)
+
define <8 x half> @regression_test1(<8 x half> %x, <8 x half> %y) #0 {
; CHECK-LABEL: regression_test1:
; CHECK: ## %bb.0: ## %entry
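The fneg/fabs/copysign tests added to avx512fp16-arith.ll above reduce to integer manipulation of the half sign bit: the broadcast -0.0 constant is the 0x8000 sign mask used by vpxor for negation, the constant that prints as NaN is 0x7fff and is ANDed in to clear the sign for fabs, and vpternlogq $226 performs the bitwise select (magnitude bits from the first value, sign bit from the second) that implements copysign. A hand-written equivalent of the fabs case, shown only to make the bit pattern explicit and not part of the commit:

define half @fabs_by_hand(half %x) {
  %bits = bitcast half %x to i16
  %cleared = and i16 %bits, 32767   ; 0x7fff keeps everything but the sign bit
  %r = bitcast i16 %cleared to half
  ret half %r
}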
diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
index cb31baf9a82ea..170e1ea1a6a92 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@@ -13,6 +13,472 @@ define i32 @test_x86_avx512fp16_ucomi_sh_lt(<8 x half> %a0, <8 x half> %a1) {
ret i32 %res
}
+declare <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half>, i32) nounwind readnone
+
+define <32 x half> @test_sqrt_ph_512(<32 x half> %a0) {
+; CHECK-LABEL: test_sqrt_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ ret <32 x half> %1
+}
+
+define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_sqrt_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %passthru
+ ret <32 x half> %3
+}
+
+define <32 x half> @test_maskz_sqrt_ph_512(<32 x half> %a0, i32 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer
+ ret <32 x half> %3
+}
+
+declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
+
+define <32 x half> @test_sqrt_round_ph_512(<32 x half> %a0) {
+; CHECK-LABEL: test_sqrt_round_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11)
+ ret <32 x half> %1
+}
+
+define <32 x half> @test_mask_sqrt_round_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_sqrt_round_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %passthru
+ ret <32 x half> %3
+}
+
+define <32 x half> @test_maskz_sqrt_round_ph_512(<32 x half> %a0, i32 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_round_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer
+ ret <32 x half> %3
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) nounwind readnone
+
+define <8 x half> @test_sqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_sqrt_sh_r(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sh_r:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtsh {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask, i32 10)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_sqrt_sh_nomask(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+; CHECK-LABEL: test_sqrt_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtsh %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 -1, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_sqrt_sh_z(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sh_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtsh {ru-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 10)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32)
+declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8)
+
+define <32 x half> @test_rsqrt_ph_512(<32 x half> %a0) {
+; CHECK-LABEL: test_rsqrt_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtph %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 -1)
+ ret <32 x half> %res
+}
+
+define <8 x half> @test_rsqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
+; CHECK-LABEL: test_rsqrt_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtsh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> %a2, i8 -1)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_rsqrt_sh_load(<8 x half> %a0, <8 x half>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt_sh_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtsh (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <8 x half>, <8 x half>* %a1ptr
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_rsqrt_sh_maskz(<8 x half> %a0, i8 %mask) {
+; CHECK-LABEL: test_rsqrt_sh_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrsqrtsh %xmm0, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> zeroinitializer, i8 %mask)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_rsqrt_sh_mask(<8 x half> %a0, <8 x half> %b0, <8 x half> %c0, i8 %mask) {
+; CHECK-LABEL: test_rsqrt_sh_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrsqrtsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %b0, <8 x half> %c0, i8 %mask)
+ ret <8 x half> %res
+}
+
+declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32)
+
+define i32 @test_int_x86_avx512_fpclass_ph_512(<32 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfpclassph $2, %zmm0, %k1
+; CHECK-NEXT: vfpclassph $4, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %x0, i32 4)
+ %res1 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %x0, i32 2)
+ %1 = and <32 x i1> %res1, %res
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8)
+
+define i8 @test_int_x86_avx512_mask_fpclass_sh(<8 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfpclasssh $4, %xmm0, %k1
+; CHECK-NEXT: vfpclasssh $2, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 2, i8 -1)
+ %res1 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 4, i8 %res)
+ ret i8 %res1
+}
+
+define i8 @test_int_x86_avx512_mask_fpclass_sh_load(<8 x half>* %x0ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfpclasssh $4, (%rdi), %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %x0 = load <8 x half>, <8 x half>* %x0ptr
+ %res = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 4, i8 -1)
+ ret i8 %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32)
+
+define <32 x half> @test_rcp_ph_512(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
+; CHECK-LABEL: test_rcp_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrcpph %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 %mask)
+ ret <32 x half> %res
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8)
+
+define <8 x half> @test_rcp_sh(<8 x half> %a0) {
+; CHECK-LABEL: test_rcp_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrcpsh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> zeroinitializer, i8 -1)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_rcp_sh_load(<8 x half> %a0, <8 x half>* %a1ptr) {
+; CHECK-LABEL: test_rcp_sh_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrcpsh (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <8 x half>, <8 x half>* %a1ptr
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 -1)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half>@test_int_x86_avx512_mask_reduce_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vreduceph $8, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vreduceph $4, {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4)
+ %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8)
+ %res2 = fadd <32 x half> %res, %res1
+ ret <32 x half> %res2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32, i32)
+
+define <8 x half>@test_int_x86_avx512_mask_reduce_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vreducesh $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_reduce_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vreducesh $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 4, i32 8)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half>@test_int_x86_avx512_mask_rndscale_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrndscaleph $8, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vrndscaleph $4, {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4)
+ %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8)
+ %res2 = fadd <32 x half> %res, %res1
+ ret <32 x half> %res2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32, i32)
+
+define <8 x half>@test_int_x86_avx512_mask_rndscale_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrndscalesh $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_rndscale_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscalesh $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 4, i32 8)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32)
+
+define <32 x half>@test_int_x86_avx512_mask_getexp_ph_512(<32 x half> %x0, <32 x half> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpph %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vgetexpph {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %x0, <32 x half> %x1, i32 %x2, i32 4)
+ %res2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %x0, <32 x half> zeroinitializer, i32 -1, i32 8)
+ %res3 = fadd <32 x half> %res1, %res2
+ ret <32 x half> %res3
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32)
+
+define <8 x half>@test_int_x86_avx512_mask_getexp_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_getexp_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgetexpsh {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 8)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_getexp_sh_load(<8 x half> %x0, <8 x half>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgetexpsh (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <8 x half>, <8 x half>* %x1ptr
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> undef, i8 -1, i32 4)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half>@test_int_x86_avx512_mask_getmant_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetmantph $8, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vgetmantph $4, {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4)
+ %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8)
+ %res2 = fadd <32 x half> %res, %res1
+ ret <32 x half> %res2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
+
+define <8 x half>@test_int_x86_avx512_mask_getmant_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> %x3, i8 %x4, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_getmant_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> %x3, i8 -1, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_getmant_sh_z(<8 x half> %x0, <8 x half> %x1, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> zeroinitializer, i8 %x4, i32 4)
+ ret <8 x half> %res
+}
+
+declare <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half>, <32 x half>, <32 x half>, i32, i32)
+
+define <32 x half>@test_int_x86_avx512_mask_scalef_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefph {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vscalefph {rn-sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddph %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %mask = bitcast i32 %x3 to <32 x i1>
+ %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3, i32 11)
+ %res2 = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> zeroinitializer, i32 -1, i32 8)
+ %res3 = fadd <32 x half> %res1, %res2
+ ret <32 x half> %res3
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32)
+
+define <8 x half>@test_int_x86_avx512_mask_scalef_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_scalef_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh_nomask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vscalefsh {rn-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 8)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_scalef_sh_load(<8 x half> %x0, <8 x half>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vscalefsh (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <8 x half>, <8 x half>* %x1ptr
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> undef, i8 -1, i32 4)
+ ret <8 x half> %res
+}
+
declare <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) {
diff --git a/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll b/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll
new file mode 100644
index 0000000000000..c958b7e86d9f1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512fp16 | FileCheck %s
+
+declare <8 x half> @llvm.ceil.v8f16(<8 x half>)
+declare <16 x half> @llvm.ceil.v16f16(<16 x half>)
+declare <32 x half> @llvm.ceil.v32f16(<32 x half>)
+
+define <8 x half> @ceil_v8f16(<8 x half> %p) {
+; CHECK-LABEL: ceil_v8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $10, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t = call <8 x half> @llvm.ceil.v8f16(<8 x half> %p)
+ ret <8 x half> %t
+}
+
+define <16 x half> @ceil_v16f16(<16 x half> %p) {
+; CHECK-LABEL: ceil_v16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $10, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t = call <16 x half> @llvm.ceil.v16f16(<16 x half> %p)
+ ret <16 x half> %t
+}
+
+define <32 x half> @ceil_v32f16(<32 x half> %p) {
+; CHECK-LABEL: ceil_v32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $10, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %t = call <32 x half> @llvm.ceil.v32f16(<32 x half> %p)
+ ret <32 x half> %t
+}
+
+declare <8 x half> @llvm.floor.v8f16(<8 x half>)
+declare <16 x half> @llvm.floor.v16f16(<16 x half>)
+declare <32 x half> @llvm.floor.v32f16(<32 x half>)
+
+define <8 x half> @floor_v8f16(<8 x half> %p) {
+; CHECK-LABEL: floor_v8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $9, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t = call <8 x half> @llvm.floor.v8f16(<8 x half> %p)
+ ret <8 x half> %t
+}
+
+define <16 x half> @floor_v16f16(<16 x half> %p) {
+; CHECK-LABEL: floor_v16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $9, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t = call <16 x half> @llvm.floor.v16f16(<16 x half> %p)
+ ret <16 x half> %t
+}
+
+define <32 x half> @floor_v32f16(<32 x half> %p) {
+; CHECK-LABEL: floor_v32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $9, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %t = call <32 x half> @llvm.floor.v32f16(<32 x half> %p)
+ ret <32 x half> %t
+}
+
+declare <8 x half> @llvm.trunc.v8f16(<8 x half>)
+declare <16 x half> @llvm.trunc.v16f16(<16 x half>)
+declare <32 x half> @llvm.trunc.v32f16(<32 x half>)
+
+define <8 x half> @trunc_v8f16(<8 x half> %p) {
+; CHECK-LABEL: trunc_v8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $11, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t = call <8 x half> @llvm.trunc.v8f16(<8 x half> %p)
+ ret <8 x half> %t
+}
+
+define <16 x half> @trunc_v16f16(<16 x half> %p) {
+; CHECK-LABEL: trunc_v16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t = call <16 x half> @llvm.trunc.v16f16(<16 x half> %p)
+ ret <16 x half> %t
+}
+
+define <32 x half> @trunc_v32f16(<32 x half> %p) {
+; CHECK-LABEL: trunc_v32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %t = call <32 x half> @llvm.trunc.v32f16(<32 x half> %p)
+ ret <32 x half> %t
+}
+
+declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
+declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>)
+declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>)
+
+define <8 x half> @nearbyint_v8f16(<8 x half> %p) {
+; CHECK-LABEL: nearbyint_v8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $12, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %p)
+ ret <8 x half> %t
+}
+
+define <16 x half> @nearbyint_v16f16(<16 x half> %p) {
+; CHECK-LABEL: nearbyint_v16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $12, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %p)
+ ret <16 x half> %t
+}
+
+define <32 x half> @nearbyint_v32f16(<32 x half> %p) {
+; CHECK-LABEL: nearbyint_v32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $12, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %t = call <32 x half> @llvm.nearbyint.v32f16(<32 x half> %p)
+ ret <32 x half> %t
+}
+
+declare <8 x half> @llvm.rint.v8f16(<8 x half>)
+declare <16 x half> @llvm.rint.v16f16(<16 x half>)
+declare <32 x half> @llvm.rint.v32f16(<32 x half>)
+
+define <8 x half> @rint_v8f16(<8 x half> %p) {
+; CHECK-LABEL: rint_v8f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $4, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t = call <8 x half> @llvm.rint.v8f16(<8 x half> %p)
+ ret <8 x half> %t
+}
+
+define <16 x half> @rint_v16f16(<16 x half> %p) {
+; CHECK-LABEL: rint_v16f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t = call <16 x half> @llvm.rint.v16f16(<16 x half> %p)
+ ret <16 x half> %t
+}
+
+define <32 x half> @rint_v32f16(<32 x half> %p) {
+; CHECK-LABEL: rint_v32f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaleph $4, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %t = call <32 x half> @llvm.rint.v32f16(<32 x half> %p)
+ ret <32 x half> %t
+}
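The immediates in these rndscale tests are the usual VRNDSCALE encoding applied to FP16: imm8[1:0] selects the rounding mode (0 nearest-even, 1 toward -inf, 2 toward +inf, 3 truncate), imm8[2] switches to the MXCSR rounding mode, imm8[3] suppresses precision exceptions, and imm8[7:4] gives the number of fraction bits to keep (0 here, i.e. round to an integer). Hence ceil uses $10, floor $9, trunc $11, nearbyint $12 and rint $4. A decoded restatement of the ceil case, for reference only and not part of the patch:

define <8 x half> @ceil_decoded(<8 x half> %p) {
  ; Same lowering as ceil_v8f16 above: vrndscaleph $10, where
  ; $10 = 0b00001010 -> round toward +inf, ignore MXCSR rounding,
  ; suppress precision exceptions, keep 0 fraction bits.
  %t = call <8 x half> @llvm.ceil.v8f16(<8 x half> %p)
  ret <8 x half> %t
}
declare <8 x half> @llvm.ceil.v8f16(<8 x half>)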
diff --git a/llvm/test/CodeGen/X86/avx512fp16-scalar.ll b/llvm/test/CodeGen/X86/avx512fp16-scalar.ll
new file mode 100644
index 0000000000000..36145e86469aa
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-scalar.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s
+
+declare half @llvm.ceil.f16(half)
+declare half @llvm.floor.f16(half)
+declare half @llvm.trunc.f16(half)
+declare half @llvm.rint.f16(half)
+declare half @llvm.nearbyint.f16(half)
+
+define half @test_ceil(half %a) {
+; CHECK-LABEL: test_ceil:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesh $10, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0a]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %c = call half @llvm.ceil.f16(half %a)
+ ret half %c
+}
+
+define half @test_floor(half %a) {
+; CHECK-LABEL: test_floor:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesh $9, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x09]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %c = call half @llvm.floor.f16(half %a)
+ ret half %c
+}
+
+define half @test_trunc(half %a) {
+; CHECK-LABEL: test_trunc:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0b]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %c = call half @llvm.trunc.f16(half %a)
+ ret half %c
+}
+
+define half @test_rint(half %a) {
+; CHECK-LABEL: test_rint:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesh $4, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x04]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %c = call half @llvm.rint.f16(half %a)
+ ret half %c
+}
+
+define half @test_nearbyint(half %a) {
+; CHECK-LABEL: test_nearbyint:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesh $12, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0c]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %c = call half @llvm.nearbyint.f16(half %a)
+ ret half %c
+}
diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
index 6bfd69ef5db97..93efbace3e759 100644
--- a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
@@ -945,3 +945,377 @@ define <4 x i64> @test_int_x86_avx512_maskz_cvtt_ph2uqq_256(<8 x half> %x0, i8 %
%res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half> %x0, <4 x i64> zeroinitializer, i8 %x2)
ret <4 x i64> %res
}
+
+declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
+declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
+
+define <8 x half> @test_sqrt_ph_128(<8 x half> %a0) {
+; CHECK-LABEL: test_sqrt_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ ret <8 x half> %1
+}
+
+define <8 x half> @test_mask_sqrt_ph_128(<8 x half> %a0, <8 x half> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_sqrt_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> %passthru
+ ret <8 x half> %3
+}
+
+define <8 x half> @test_maskz_sqrt_ph_128(<8 x half> %a0, i8 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> zeroinitializer
+ ret <8 x half> %3
+}
+
+define <16 x half> @test_sqrt_ph_256(<16 x half> %a0) {
+; CHECK-LABEL: test_sqrt_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ ret <16 x half> %1
+}
+
+define <16 x half> @test_mask_sqrt_ph_256(<16 x half> %a0, <16 x half> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_sqrt_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> %passthru
+ ret <16 x half> %3
+}
+
+define <16 x half> @test_maskz_sqrt_ph_256(<16 x half> %a0, i16 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vsqrtph %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> zeroinitializer
+ ret <16 x half> %3
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16)
+
+define <8 x half> @test_rsqrt_ph_128(<8 x half> %a0) {
+; CHECK-LABEL: test_rsqrt_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtph %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 -1)
+ ret <8 x half> %res
+}
+
+define <16 x half> @test_rsqrt_ph_256(<16 x half> %a0) {
+; CHECK-LABEL: test_rsqrt_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtph %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 -1)
+ ret <16 x half> %res
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16)
+
+define <8 x half> @test_rcp_ph_128(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
+; CHECK-LABEL: test_rcp_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrcpph %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %a1, i8 %mask)
+ ret <8 x half> %res
+}
+
+define <16 x half> @test_rcp_ph_256(<16 x half> %a0, <16 x half> %a1, i16 %mask) {
+; CHECK-LABEL: test_rcp_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrcpph %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %a1, i16 %mask)
+ ret <16 x half> %res
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <8 x half>@test_int_x86_avx512_mask_reduce_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vreduceph $8, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vreduceph $4, %xmm0, %xmm0
+; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3)
+ %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1)
+ %res2 = fadd <8 x half> %res, %res1
+ ret <8 x half> %res2
+}
+
+define <16 x half>@test_int_x86_avx512_mask_reduce_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vreduceph $8, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vreduceph $4, %ymm0, %ymm0
+; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %x0, i32 8, <16 x half> %x2, i16 %x3)
+ %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1)
+ %res2 = fadd <16 x half> %res, %res1
+ ret <16 x half> %res2
+}
+
+declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32)
+declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32)
+
+define i8 @test_int_x86_avx512_fpclass_ph_128(<8 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfpclassph $2, %xmm0, %k1
+; CHECK-NEXT: vfpclassph $4, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %res = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %x0, i32 4)
+ %res1 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %x0, i32 2)
+ %1 = and <8 x i1> %res1, %res
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define i16 @test_int_x86_avx512_fpclass_ph_256(<16 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfpclassph $2, %ymm0, %k1
+; CHECK-NEXT: vfpclassph $4, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %x0, i32 4)
+ %res1 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %x0, i32 2)
+ %1 = and <16 x i1> %res1, %res
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16)
+
+define <8 x half>@test_int_x86_avx512_getexp_ph_128(<8 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_getexp_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgetexpph %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> zeroinitializer, i8 -1)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_getexp_ph_128(<8 x half> %x0, <8 x half> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpph %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> %x1, i8 %x2)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_maskz_getexp_ph_128(<8 x half> %x0, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_getexp_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpph %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> zeroinitializer, i8 %x2)
+ ret <8 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_getexp_ph_256(<16 x half> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_getexp_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgetexpph %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> zeroinitializer, i16 -1)
+ ret <16 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_mask_getexp_ph_256(<16 x half> %x0, <16 x half> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpph %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> %x1, i16 %x2)
+ ret <16 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_maskz_getexp_ph_256(<16 x half> %x0, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_getexp_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetexpph %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> zeroinitializer, i16 %x2)
+ ret <16 x half> %res
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <8 x half>@test_int_x86_avx512_mask_getmant_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetmantph $8, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vgetmantph $4, %xmm0, %xmm0
+; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3)
+ %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1)
+ %res2 = fadd <8 x half> %res, %res1
+ ret <8 x half> %res2
+}
+
+define <16 x half>@test_int_x86_avx512_mask_getmant_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vgetmantph $8, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vgetmantph $4, %ymm0, %ymm0
+; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %x0, i32 8, <16 x half> %x2, i16 %x3)
+ %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1)
+ %res2 = fadd <16 x half> %res, %res1
+ ret <16 x half> %res2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <8 x half>@test_int_x86_avx512_mask_rndscale_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrndscaleph $8, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vrndscaleph $4, %xmm0, %xmm0
+; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3)
+ %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1)
+ %res2 = fadd <8 x half> %res, %res1
+ ret <8 x half> %res2
+}
+
+define <16 x half>@test_int_x86_avx512_mask_rndscale_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vrndscaleph $8, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0
+; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %x0, i32 8, <16 x half> %x2, i16 %x3)
+ %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1)
+ %res2 = fadd <16 x half> %res, %res1
+ ret <16 x half> %res2
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half>, <8 x half>, <8 x half>, i8)
+declare <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half>, <16 x half>, <16 x half>, i16)
+
+define <8 x half>@test_int_x86_avx512_scalef_ph_128(<8 x half> %x0, <8 x half> %x1) {
+; CHECK-LABEL: test_int_x86_avx512_scalef_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> zeroinitializer, i8 -1)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_mask_scalef_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %mask = bitcast i8 %x3 to <8 x i1>
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3)
+ ret <8 x half> %res
+}
+
+define <8 x half>@test_int_x86_avx512_maskz_scalef_ph_128(<8 x half> %x0, <8 x half> %x1, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_scalef_ph_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = bitcast i8 %x3 to <8 x i1>
+ %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> zeroinitializer, i8 %x3)
+ ret <8 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_scalef_ph_256(<16 x half> %x0, <16 x half> %x1) {
+; CHECK-LABEL: test_int_x86_avx512_scalef_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> zeroinitializer, i16 -1)
+ ret <16 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_mask_scalef_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %mask = bitcast i16 %x3 to <16 x i1>
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3)
+ ret <16 x half> %res
+}
+
+define <16 x half>@test_int_x86_avx512_maskz_scalef_ph_256(<16 x half> %x0, <16 x half> %x1, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_scalef_ph_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = bitcast i16 %x3 to <16 x i1>
+ %res = call <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> zeroinitializer, i16 %x3)
+ ret <16 x half> %res
+}
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index a2c289f1a26e4..e7e52f153bc35 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -10,6 +10,7 @@ declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
+declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
define half @fadd_f16(half %a, half %b) nounwind strictfp {
; X86-LABEL: fadd_f16:
@@ -173,4 +174,27 @@ define void @fptrunc_double_to_f16(double* %val, half *%ret) nounwind strictfp {
ret void
}
+define void @fsqrt_f16(half* %a) nounwind strictfp {
+; X86-LABEL: fsqrt_f16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsh (%eax), %xmm0
+; X86-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsh %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: fsqrt_f16:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh (%rdi), %xmm0
+; X64-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsh %xmm0, (%rdi)
+; X64-NEXT: retq
+ %1 = load half, half* %a, align 4
+ %res = call half @llvm.experimental.constrained.sqrt.f16(half %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ store half %res, half* %a, align 4
+ ret void
+}
+
attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
new file mode 100644
index 0000000000000..5832301aeb4e5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64
+
+declare half @llvm.experimental.constrained.ceil.f16(half, metadata)
+declare half @llvm.experimental.constrained.floor.f16(half, metadata)
+declare half @llvm.experimental.constrained.trunc.f16(half, metadata)
+declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata)
+
+define half @fceil32(half %f) #0 {
+; X86-LABEL: fceil32:
+; X86: # %bb.0:
+; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: fceil32:
+; X64: # %bb.0:
+; X64-NEXT: vrndscalesh $10, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.ceil.f16(
+ half %f, metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+define half @ffloor32(half %f) #0 {
+; X86-LABEL: ffloor32:
+; X86: # %bb.0:
+; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: ffloor32:
+; X64: # %bb.0:
+; X64-NEXT: vrndscalesh $9, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.floor.f16(
+ half %f, metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+define half @ftrunc32(half %f) #0 {
+; X86-LABEL: ftrunc32:
+; X86: # %bb.0:
+; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: ftrunc32:
+; X64: # %bb.0:
+; X64-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.trunc.f16(
+ half %f, metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+define half @frint32(half %f) #0 {
+; X86-LABEL: frint32:
+; X86: # %bb.0:
+; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: frint32:
+; X64: # %bb.0:
+; X64-NEXT: vrndscalesh $4, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.rint.f16(
+ half %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+define half @fnearbyint32(half %f) #0 {
+; X86-LABEL: fnearbyint32:
+; X86: # %bb.0:
+; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: fnearbyint32:
+; X64: # %bb.0:
+; X64-NEXT: vrndscalesh $12, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call half @llvm.experimental.constrained.nearbyint.f16(
+ half %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+attributes #0 = { strictfp }
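For reference, the vrndscalesh immediates checked in this file ($10, $9, $11, $4, $12) follow the VRNDSCALE imm8 layout, assuming the usual field split: bits 7:4 give the scale M (0 here), bit 3 suppresses precision exceptions, bit 2 defers rounding to MXCSR.RC, and bits 1:0 select the static rounding mode. A minimal C sketch of that encoding, as an illustration only (the helper name rndscale_imm is hypothetical and not part of this patch):

/* Sketch under the assumed imm8 layout:
   imm8 = (M << 4) | (SPE << 3) | (USE_MXCSR << 2) | RC */
#include <stdio.h>

enum { RC_NEAREST = 0, RC_DOWN = 1, RC_UP = 2, RC_TRUNC = 3 };

static unsigned rndscale_imm(unsigned m, int suppress_exc,
                             int use_mxcsr, unsigned rc) {
  return (m << 4) | ((unsigned)suppress_exc << 3) |
         ((unsigned)use_mxcsr << 2) | rc;
}

int main(void) {
  printf("ceil      %u\n", rndscale_imm(0, 1, 0, RC_UP));    /* 10 */
  printf("floor     %u\n", rndscale_imm(0, 1, 0, RC_DOWN));  /*  9 */
  printf("trunc     %u\n", rndscale_imm(0, 1, 0, RC_TRUNC)); /* 11 */
  printf("rint      %u\n", rndscale_imm(0, 0, 1, RC_NEAREST)); /* 4, honors MXCSR and exceptions */
  printf("nearbyint %u\n", rndscale_imm(0, 1, 1, RC_NEAREST)); /* 12, honors MXCSR, quiet */
  return 0;
}

This matches the constants the CHECK lines above expect for ceil/floor/trunc/rint/nearbyint.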
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index 222abba7d2f7c..ef84bf32619e0 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -124,6 +124,153 @@ define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) {
ret <8 x half> %5
}
+define i32 @stack_fold_fpclassph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_fpclassph:
+ ;CHECK: vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
+ %3 = bitcast <32 x i1> %2 to i32
+ ret i32 %3
+}
+declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32)
+
+define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, <32 x i1>* %p) {
+ ;CHECK-LABEL: stack_fold_fpclassph_mask:
+ ;CHECK: vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
+ %mask = load <32 x i1>, <32 x i1>* %p
+ %3 = and <32 x i1> %2, %mask
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_fpclasssh:
+ ;CHECK: vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1)
+ ret i8 %2
+}
+declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8)
+
+define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, i8* %p) {
+ ;CHECK-LABEL: stack_fold_fpclasssh_mask:
+ ;CHECK: vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %p
+ %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask)
+ ret i8 %2
+}
+
+define <32 x half> @stack_fold_getexpph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getexpph:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32)
+
+define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_mask:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_maskz:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_getexpsh:
+ ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
+
+define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_getexpsh_mask:
+ ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_getexpsh_maskz:
+ ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
+ ret <8 x half> %3
+}
+
+define <32 x half> @stack_fold_getmantph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getmantph:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_mask:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_maskz:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_getmantsh:
+ ;CHECK: vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
+
+define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_getmantsh_mask:
+ ;CHECK: vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_getmantsh_maskz:
+ ;CHECK: vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4)
+ ret <8 x half> %3
+}
+
define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
;CHECK-LABEL: stack_fold_maxph_zmm:
;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -541,6 +688,280 @@ define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) {
ret <8 x half> %5
}
+define <32 x half> @stack_fold_rcpph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rcpph:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32)
+
+define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_mask:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_maskz:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_rcpsh:
+ ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8)
+
+define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rcpsh_mask:
+ ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rcpsh_maskz:
+ ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <32 x half> @stack_fold_reduceph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_reduceph:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_mask:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_maskz:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_reducesh:
+ ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
+
+define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_reducesh_mask:
+ ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_reducesh_maskz:
+ ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
+ ret <8 x half> %3
+}
+
+define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rndscaleph:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
+
+define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_mask:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_maskz:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_rndscalesh:
+ ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
+
+define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rndscalesh_mask:
+ ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rndscalesh_maskz:
+ ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
+ ret <8 x half> %3
+}
+
+define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rsqrtph:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32)
+
+define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_mask:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
+ ret <32 x half> %3
+}
+
+define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_maskz:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
+ ret <32 x half> %3
+}
+
+define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_rsqrtsh:
+ ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8)
+
+define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtsh_mask:
+ ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtsh_maskz:
+ ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_sqrtph:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ ret <32 x half> %2
+}
+declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
+
+define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_mask:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <32 x half>, <32 x half>* %passthru
+ %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ %4 = bitcast i32 %mask to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %2
+ ret <32 x half> %5
+}
+
+define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, i32* %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_maskz:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i32, i32* %mask
+ %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
+ %4 = bitcast i32 %2 to <32 x i1>
+ %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> zeroinitializer
+ ret <32 x half> %5
+}
+
+define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) {
+ ;CHECK-LABEL: stack_fold_sqrtsh:
+ ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
+
+define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtsh_mask:
+ ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtsh_maskz:
+ ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
+ ret <8 x half> %3
+}
+
define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
;CHECK-LABEL: stack_fold_subph_zmm
;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index fab7059dd959d..92cb57f27b9ab 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -60,6 +60,156 @@ define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) {
ret <16 x half> %2
}
+define i8 @stack_fold_fpclassph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_fpclassph:
+ ;CHECK: fpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32)
+
+define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, <8 x i1>* %p) {
+ ;CHECK-LABEL: stack_fold_fpclassph_mask:
+  ;CHECK:       vfpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
+ %mask = load <8 x i1>, <8 x i1>* %p
+ %3 = and <8 x i1> %2, %mask
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_fpclassph_ymm:
+  ;CHECK:       vfpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
+ %3 = bitcast <16 x i1> %2 to i16
+ ret i16 %3
+}
+declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32)
+
+define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, <16 x i1>* %p) {
+ ;CHECK-LABEL: stack_fold_fpclassph_mask_ymm:
+  ;CHECK:       vfpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
+ %mask = load <16 x i1>, <16 x i1>* %p
+ %3 = and <16 x i1> %2, %mask
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define <8 x half> @stack_fold_getexpph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getexpph:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8)
+
+define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_mask:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_maskz:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getexpph_ymm:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16)
+
+define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_mask_ymm:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_getexpph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_getexpph_maskz_ymm:
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
+define <8 x half> @stack_fold_getmantph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getmantph:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8)
+
+define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_mask:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_maskz:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_getmantph_ymm:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_mask_ymm:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_getmantph_maskz_ymm:
+ ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
;CHECK-LABEL: stack_fold_maxph
;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -144,6 +294,284 @@ define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) {
ret <16 x half> %2
}
+define <8 x half> @stack_fold_rcpph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rcpph:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8)
+
+define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_mask:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_maskz:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rcpph_ymm:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16)
+
+define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_mask_ymm:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_rcpph_maskz_ymm:
+ ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
+define <8 x half> @stack_fold_reduceph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_reduceph:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8)
+
+define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_mask:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_maskz:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_reduceph_ymm:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_mask_ymm:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_reduceph_maskz_ymm:
+ ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
+define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rndscaleph:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8)
+
+define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_mask:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_maskz:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_ymm:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16)
+
+define <16 x half> @stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_mask_ymm:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm:
+ ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
+define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rsqrtph:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8)
+
+define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_mask:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
+ ret <8 x half> %3
+}
+
+define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_maskz:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
+ ret <8 x half> %3
+}
+
+define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_ymm:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16)
+
+define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_mask_ymm:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
+ ret <16 x half> %3
+}
+
+define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm:
+ ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
+ ret <16 x half> %3
+}
+
+define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_sqrtph:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ ret <8 x half> %2
+}
+declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
+
+define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_mask:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x half>, <8 x half>* %passthru
+ %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %2
+ ret <8 x half> %5
+}
+
+define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, i8* %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_maskz:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
+ %4 = bitcast i8 %2 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> zeroinitializer
+ ret <8 x half> %5
+}
+
+define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) {
+ ;CHECK-LABEL: stack_fold_sqrtph_ymm:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ ret <16 x half> %2
+}
+declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
+
+define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_mask_ymm:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x half>, <16 x half>* %passthru
+ %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %2
+ ret <16 x half> %5
+}
+
+define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, i16* %mask) {
+ ;CHECK-LABEL: stack_fold_sqrtph_maskz_ymm:
+ ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
+ %4 = bitcast i16 %2 to <16 x i1>
+ %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> zeroinitializer
+ ret <16 x half> %5
+}
+
define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) {
;CHECK-LABEL: stack_fold_subph
;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
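For readers following the masked rcp/rsqrt stack-folding tests above, here is a C-level sketch of the corresponding user-facing intrinsics. The names _mm_mask_rcp_ph and _mm256_maskz_rsqrt_ph follow the Intel AVX512-FP16 specification and are assumed, not quoted from this diff; build with -mavx512fp16 -mavx512vl.

#include <immintrin.h>

// Packed fp16 approximate reciprocal with merge masking: lanes where the
// mask bit is 0 keep the corresponding lane of src (vrcpph ... {%k1}).
__m128h masked_recip(__m128h src, __mmask8 k, __m128h a) {
  return _mm_mask_rcp_ph(src, k, a);
}

// Packed fp16 approximate reciprocal square root with zero masking
// (vrsqrtph ... {%k1} {z}): masked-off lanes are zeroed.
__m256h zeroed_rsqrt(__mmask16 k, __m256h a) {
  return _mm256_maskz_rsqrt_ph(k, a);
}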
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index 7171ac32336d3..f73742947b5cc 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -6,6 +6,7 @@ declare <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half>, <8 x ha
declare <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half>, <8 x half>, metadata, metadata)
declare <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half>, <8 x half>, metadata, metadata)
declare <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half>, <8 x half>, metadata, metadata)
+declare <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half>, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata)
@@ -61,6 +62,18 @@ define <8 x half> @f8(<8 x half> %a, <8 x half> %b) #0 {
ret <8 x half> %ret
}
+define <8 x half> @f10(<8 x half> %a) #0 {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %sqrt = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(
+ <8 x half> %a,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+  ret <8 x half> %sqrt
+}
+
define <8 x half> @f11(<2 x double> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
index 8b78a5b5c492c..d5868287823fb 100644
--- a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll
@@ -6,10 +6,16 @@ declare <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half>, <16
declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16 x half>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata)
+declare <16 x half> @llvm.experimental.constrained.sqrt.v16f16(<16 x half>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata)
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata)
declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64(<4 x double>, metadata, metadata)
declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f32(<8 x float>, metadata, metadata)
+declare <16 x half> @llvm.experimental.constrained.ceil.v16f16(<16 x half>, metadata)
+declare <16 x half> @llvm.experimental.constrained.floor.v16f16(<16 x half>, metadata)
+declare <16 x half> @llvm.experimental.constrained.trunc.v16f16(<16 x half>, metadata)
+declare <16 x half> @llvm.experimental.constrained.rint.v16f16(<16 x half>, metadata, metadata)
+declare <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half>, metadata, metadata)
define <16 x half> @f2(<16 x half> %a, <16 x half> %b) #0 {
; CHECK-LABEL: f2:
@@ -55,6 +61,19 @@ define <16 x half> @f8(<16 x half> %a, <16 x half> %b) #0 {
ret <16 x half> %ret
}
+
+define <16 x half> @f10(<16 x half> %a) #0 {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %ret = call <16 x half> @llvm.experimental.constrained.sqrt.v16f16(
+ <16 x half> %a,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+  ret <16 x half> %ret
+}
+
define <4 x double> @f11(<4 x half> %a) #0 {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
@@ -103,4 +122,57 @@ define <8 x half> @f15(<8 x float> %a) #0 {
ret <8 x half> %ret
}
+define <16 x half> @fceilv16f16(<16 x half> %f) #0 {
+; CHECK-LABEL: fceilv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $10, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.ceil.v16f16(
+ <16 x half> %f, metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
+define <16 x half> @ffloorv16f16(<16 x half> %f) #0 {
+; CHECK-LABEL: ffloorv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $9, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.floor.v16f16(
+ <16 x half> %f, metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
+
+define <16 x half> @ftruncv16f16(<16 x half> %f) #0 {
+; CHECK-LABEL: ftruncv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.trunc.v16f16(
+ <16 x half> %f, metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
+define <16 x half> @frintv16f16(<16 x half> %f) #0 {
+; CHECK-LABEL: frintv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.rint.v16f16(
+ <16 x half> %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
+define <16 x half> @fnearbyintv16f16(<16 x half> %f) #0 {
+; CHECK-LABEL: fnearbyintv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $12, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(
+ <16 x half> %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret <16 x half> %res
+}
+
attributes #0 = { strictfp }
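A note on the vrndscaleph immediates in the rounding tests above (and in the 512-bit tests below): the imm8 follows the usual VRNDSCALE encoding, where bits 1:0 select the rounding mode (0 nearest-even, 1 down, 2 up, 3 toward zero), bit 2 means "use the current MXCSR rounding mode", and bit 3 suppresses the precision exception. The small standalone C sketch below just prints that mapping; it is illustrative only and not part of the patch.

#include <stdio.h>

int main(void) {
  // ceil/floor/trunc suppress the inexact exception and force a direction;
  // rint uses the dynamic rounding mode and may raise inexact; nearbyint
  // uses the dynamic mode with the precision exception suppressed.
  struct { const char *op; unsigned imm; } map[] = {
    { "ceil",      0x8 | 0x2 },  /* $10 */
    { "floor",     0x8 | 0x1 },  /* $9  */
    { "trunc",     0x8 | 0x3 },  /* $11 */
    { "rint",      0x4       },  /* $4  */
    { "nearbyint", 0x8 | 0x4 },  /* $12 */
  };
  for (unsigned i = 0; i < sizeof(map) / sizeof(map[0]); ++i)
    printf("%-9s -> vrndscaleph $%u\n", map[i].op, map[i].imm);
  return 0;
}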
diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
index 0a25d1c9d3d01..6273a525b15d6 100644
--- a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll
@@ -6,10 +6,16 @@ declare <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half>, <32
declare <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half>, <32 x half>, metadata, metadata)
declare <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half>, <32 x half>, metadata, metadata)
declare <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half>, <32 x half>, metadata, metadata)
+declare <32 x half> @llvm.experimental.constrained.sqrt.v32f16(<32 x half>, metadata, metadata)
declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata)
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata)
declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64(<8 x double>, metadata, metadata)
declare <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32(<16 x float>, metadata, metadata)
+declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, metadata)
+declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, metadata)
+declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, metadata)
+declare <32 x half> @llvm.experimental.constrained.rint.v32f16(<32 x half>, metadata, metadata)
+declare <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half>, metadata, metadata)
define <32 x half> @f2(<32 x half> %a, <32 x half> %b) #0 {
; CHECK-LABEL: f2:
@@ -55,6 +61,18 @@ define <32 x half> @f8(<32 x half> %a, <32 x half> %b) #0 {
ret <32 x half> %ret
}
+define <32 x half> @f10(<32 x half> %a) #0 {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsqrtph %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %ret = call <32 x half> @llvm.experimental.constrained.sqrt.v32f16(
+ <32 x half> %a,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+  ret <32 x half> %ret
+}
+
define <8 x double> @f11(<8 x half> %a) #0 {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
@@ -102,4 +120,51 @@ define <16 x half> @f15(<16 x float> %a) #0 {
ret <16 x half> %ret
}
+define <32 x half> @strict_vector_fceil_v32f16(<32 x half> %f) #0 {
+; CHECK-LABEL: strict_vector_fceil_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $10, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
+define <32 x half> @strict_vector_ffloor_v32f16(<32 x half> %f) #0 {
+; CHECK-LABEL: strict_vector_ffloor_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $9, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
+define <32 x half> @strict_vector_ftrunc_v32f16(<32 x half> %f) #0 {
+; CHECK-LABEL: strict_vector_ftrunc_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
+define <32 x half> @strict_vector_frint_v32f16(<32 x half> %f) #0 {
+; CHECK-LABEL: strict_vector_frint_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $4, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.rint.v32f16(<32 x half> %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
+define <32 x half> @strict_vector_fnearbyint_v32f16(<32 x half> %f) #0 {
+; CHECK-LABEL: strict_vector_fnearbyint_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrndscaleph $12, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %f,
+ metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret <32 x half> %res
+}
+
attributes #0 = { strictfp }
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
index 8115431808335..67514e50b1e12 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt
@@ -1356,3 +1356,411 @@
# ATT: vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z}
# INTEL: vcvtw2ph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}
0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80
+
+# ATT: vfpclassph $123, %zmm30, %k5
+# INTEL: vfpclassph k5, zmm30, 123
+0x62,0x93,0x7c,0x48,0x66,0xee,0x7b
+
+# ATT: vfpclassphz $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+# INTEL: vfpclassph k5 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123
+0x62,0xb3,0x7c,0x4f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vfpclassph $123, (%r9){1to32}, %k5
+# INTEL: vfpclassph k5, word ptr [r9]{1to32}, 123
+0x62,0xd3,0x7c,0x58,0x66,0x29,0x7b
+
+# ATT: vfpclassphz $123, 8128(%rcx), %k5
+# INTEL: vfpclassph k5, zmmword ptr [rcx + 8128], 123
+0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b
+
+# ATT: vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7}
+# INTEL: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to32}, 123
+0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b
+
+# ATT: vfpclasssh $123, %xmm30, %k5
+# INTEL: vfpclasssh k5, xmm30, 123
+0x62,0x93,0x7c,0x08,0x67,0xee,0x7b
+
+# ATT: vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+# INTEL: vfpclasssh k5 {k7}, word ptr [rbp + 8*r14 + 268435456], 123
+0x62,0xb3,0x7c,0x0f,0x67,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vfpclasssh $123, (%r9), %k5
+# INTEL: vfpclasssh k5, word ptr [r9], 123
+0x62,0xd3,0x7c,0x08,0x67,0x29,0x7b
+
+# ATT: vfpclasssh $123, 254(%rcx), %k5
+# INTEL: vfpclasssh k5, word ptr [rcx + 254], 123
+0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b
+
+# ATT: vfpclasssh $123, -256(%rdx), %k5 {%k7}
+# INTEL: vfpclasssh k5 {k7}, word ptr [rdx - 256], 123
+0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b
+
+# ATT: vgetexpph %zmm29, %zmm30
+# INTEL: vgetexpph zmm30, zmm29
+0x62,0x06,0x7d,0x48,0x42,0xf5
+
+# ATT: vgetexpph {sae}, %zmm29, %zmm30
+# INTEL: vgetexpph zmm30, zmm29, {sae}
+0x62,0x06,0x7d,0x18,0x42,0xf5
+
+# ATT: vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vgetexpph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x7d,0x4f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vgetexpph (%r9){1to32}, %zmm30
+# INTEL: vgetexpph zmm30, word ptr [r9]{1to32}
+0x62,0x46,0x7d,0x58,0x42,0x31
+
+# ATT: vgetexpph 8128(%rcx), %zmm30
+# INTEL: vgetexpph zmm30, zmmword ptr [rcx + 8128]
+0x62,0x66,0x7d,0x48,0x42,0x71,0x7f
+
+# ATT: vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vgetexpph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x7d,0xdf,0x42,0x72,0x80
+
+# ATT: vgetexpsh %xmm28, %xmm29, %xmm30
+# INTEL: vgetexpsh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x43,0xf4
+
+# ATT: vgetexpsh {sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vgetexpsh xmm30, xmm29, xmm28, {sae}
+0x62,0x06,0x15,0x10,0x43,0xf4
+
+# ATT: vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vgetexpsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x43,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vgetexpsh (%r9), %xmm29, %xmm30
+# INTEL: vgetexpsh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x43,0x31
+
+# ATT: vgetexpsh 254(%rcx), %xmm29, %xmm30
+# INTEL: vgetexpsh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x43,0x71,0x7f
+
+# ATT: vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vgetexpsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x43,0x72,0x80
+
+# ATT: vgetmantph $123, %zmm29, %zmm30
+# INTEL: vgetmantph zmm30, zmm29, 123
+0x62,0x03,0x7c,0x48,0x26,0xf5,0x7b
+
+# ATT: vgetmantph $123, {sae}, %zmm29, %zmm30
+# INTEL: vgetmantph zmm30, zmm29, {sae}, 123
+0x62,0x03,0x7c,0x18,0x26,0xf5,0x7b
+
+# ATT: vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vgetmantph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x7c,0x4f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vgetmantph $123, (%r9){1to32}, %zmm30
+# INTEL: vgetmantph zmm30, word ptr [r9]{1to32}, 123
+0x62,0x43,0x7c,0x58,0x26,0x31,0x7b
+
+# ATT: vgetmantph $123, 8128(%rcx), %zmm30
+# INTEL: vgetmantph zmm30, zmmword ptr [rcx + 8128], 123
+0x62,0x63,0x7c,0x48,0x26,0x71,0x7f,0x7b
+
+# ATT: vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vgetmantph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123
+0x62,0x63,0x7c,0xdf,0x26,0x72,0x80,0x7b
+
+# ATT: vgetmantsh $123, %xmm28, %xmm29, %xmm30
+# INTEL: vgetmantsh xmm30, xmm29, xmm28, 123
+0x62,0x03,0x14,0x00,0x27,0xf4,0x7b
+
+# ATT: vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vgetmantsh xmm30, xmm29, xmm28, {sae}, 123
+0x62,0x03,0x14,0x10,0x27,0xf4,0x7b
+
+# ATT: vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vgetmantsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x14,0x07,0x27,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vgetmantsh $123, (%r9), %xmm29, %xmm30
+# INTEL: vgetmantsh xmm30, xmm29, word ptr [r9], 123
+0x62,0x43,0x14,0x00,0x27,0x31,0x7b
+
+# ATT: vgetmantsh $123, 254(%rcx), %xmm29, %xmm30
+# INTEL: vgetmantsh xmm30, xmm29, word ptr [rcx + 254], 123
+0x62,0x63,0x14,0x00,0x27,0x71,0x7f,0x7b
+
+# ATT: vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vgetmantsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123
+0x62,0x63,0x14,0x87,0x27,0x72,0x80,0x7b
+
+# ATT: vrcpph %zmm29, %zmm30
+# INTEL: vrcpph zmm30, zmm29
+0x62,0x06,0x7d,0x48,0x4c,0xf5
+
+# ATT: vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vrcpph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x7d,0x4f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vrcpph (%r9){1to32}, %zmm30
+# INTEL: vrcpph zmm30, word ptr [r9]{1to32}
+0x62,0x46,0x7d,0x58,0x4c,0x31
+
+# ATT: vrcpph 8128(%rcx), %zmm30
+# INTEL: vrcpph zmm30, zmmword ptr [rcx + 8128]
+0x62,0x66,0x7d,0x48,0x4c,0x71,0x7f
+
+# ATT: vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vrcpph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x7d,0xdf,0x4c,0x72,0x80
+
+# ATT: vrcpsh %xmm28, %xmm29, %xmm30
+# INTEL: vrcpsh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x4d,0xf4
+
+# ATT: vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vrcpsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x4d,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vrcpsh (%r9), %xmm29, %xmm30
+# INTEL: vrcpsh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x4d,0x31
+
+# ATT: vrcpsh 254(%rcx), %xmm29, %xmm30
+# INTEL: vrcpsh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x4d,0x71,0x7f
+
+# ATT: vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vrcpsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x4d,0x72,0x80
+
+# ATT: vreduceph $123, %zmm29, %zmm30
+# INTEL: vreduceph zmm30, zmm29, 123
+0x62,0x03,0x7c,0x48,0x56,0xf5,0x7b
+
+# ATT: vreduceph $123, {sae}, %zmm29, %zmm30
+# INTEL: vreduceph zmm30, zmm29, {sae}, 123
+0x62,0x03,0x7c,0x18,0x56,0xf5,0x7b
+
+# ATT: vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vreduceph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x7c,0x4f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vreduceph $123, (%r9){1to32}, %zmm30
+# INTEL: vreduceph zmm30, word ptr [r9]{1to32}, 123
+0x62,0x43,0x7c,0x58,0x56,0x31,0x7b
+
+# ATT: vreduceph $123, 8128(%rcx), %zmm30
+# INTEL: vreduceph zmm30, zmmword ptr [rcx + 8128], 123
+0x62,0x63,0x7c,0x48,0x56,0x71,0x7f,0x7b
+
+# ATT: vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vreduceph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123
+0x62,0x63,0x7c,0xdf,0x56,0x72,0x80,0x7b
+
+# ATT: vreducesh $123, %xmm28, %xmm29, %xmm30
+# INTEL: vreducesh xmm30, xmm29, xmm28, 123
+0x62,0x03,0x14,0x00,0x57,0xf4,0x7b
+
+# ATT: vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vreducesh xmm30, xmm29, xmm28, {sae}, 123
+0x62,0x03,0x14,0x10,0x57,0xf4,0x7b
+
+# ATT: vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vreducesh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x14,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vreducesh $123, (%r9), %xmm29, %xmm30
+# INTEL: vreducesh xmm30, xmm29, word ptr [r9], 123
+0x62,0x43,0x14,0x00,0x57,0x31,0x7b
+
+# ATT: vreducesh $123, 254(%rcx), %xmm29, %xmm30
+# INTEL: vreducesh xmm30, xmm29, word ptr [rcx + 254], 123
+0x62,0x63,0x14,0x00,0x57,0x71,0x7f,0x7b
+
+# ATT: vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vreducesh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123
+0x62,0x63,0x14,0x87,0x57,0x72,0x80,0x7b
+
+# ATT: vrndscaleph $123, %zmm29, %zmm30
+# INTEL: vrndscaleph zmm30, zmm29, 123
+0x62,0x03,0x7c,0x48,0x08,0xf5,0x7b
+
+# ATT: vrndscaleph $123, {sae}, %zmm29, %zmm30
+# INTEL: vrndscaleph zmm30, zmm29, {sae}, 123
+0x62,0x03,0x7c,0x18,0x08,0xf5,0x7b
+
+# ATT: vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vrndscaleph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x7c,0x4f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vrndscaleph $123, (%r9){1to32}, %zmm30
+# INTEL: vrndscaleph zmm30, word ptr [r9]{1to32}, 123
+0x62,0x43,0x7c,0x58,0x08,0x31,0x7b
+
+# ATT: vrndscaleph $123, 8128(%rcx), %zmm30
+# INTEL: vrndscaleph zmm30, zmmword ptr [rcx + 8128], 123
+0x62,0x63,0x7c,0x48,0x08,0x71,0x7f,0x7b
+
+# ATT: vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vrndscaleph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123
+0x62,0x63,0x7c,0xdf,0x08,0x72,0x80,0x7b
+
+# ATT: vrndscalesh $123, %xmm28, %xmm29, %xmm30
+# INTEL: vrndscalesh xmm30, xmm29, xmm28, 123
+0x62,0x03,0x14,0x00,0x0a,0xf4,0x7b
+
+# ATT: vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vrndscalesh xmm30, xmm29, xmm28, {sae}, 123
+0x62,0x03,0x14,0x10,0x0a,0xf4,0x7b
+
+# ATT: vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vrndscalesh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123
+0x62,0x23,0x14,0x07,0x0a,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vrndscalesh $123, (%r9), %xmm29, %xmm30
+# INTEL: vrndscalesh xmm30, xmm29, word ptr [r9], 123
+0x62,0x43,0x14,0x00,0x0a,0x31,0x7b
+
+# ATT: vrndscalesh $123, 254(%rcx), %xmm29, %xmm30
+# INTEL: vrndscalesh xmm30, xmm29, word ptr [rcx + 254], 123
+0x62,0x63,0x14,0x00,0x0a,0x71,0x7f,0x7b
+
+# ATT: vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vrndscalesh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123
+0x62,0x63,0x14,0x87,0x0a,0x72,0x80,0x7b
+
+# ATT: vrsqrtph %zmm29, %zmm30
+# INTEL: vrsqrtph zmm30, zmm29
+0x62,0x06,0x7d,0x48,0x4e,0xf5
+
+# ATT: vrsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vrsqrtph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x7d,0x4f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vrsqrtph (%r9){1to32}, %zmm30
+# INTEL: vrsqrtph zmm30, word ptr [r9]{1to32}
+0x62,0x46,0x7d,0x58,0x4e,0x31
+
+# ATT: vrsqrtph 8128(%rcx), %zmm30
+# INTEL: vrsqrtph zmm30, zmmword ptr [rcx + 8128]
+0x62,0x66,0x7d,0x48,0x4e,0x71,0x7f
+
+# ATT: vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vrsqrtph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x7d,0xdf,0x4e,0x72,0x80
+
+# ATT: vrsqrtsh %xmm28, %xmm29, %xmm30
+# INTEL: vrsqrtsh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x4f,0xf4
+
+# ATT: vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vrsqrtsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x4f,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vrsqrtsh (%r9), %xmm29, %xmm30
+# INTEL: vrsqrtsh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x4f,0x31
+
+# ATT: vrsqrtsh 254(%rcx), %xmm29, %xmm30
+# INTEL: vrsqrtsh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x4f,0x71,0x7f
+
+# ATT: vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vrsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x4f,0x72,0x80
+
+# ATT: vscalefph %zmm28, %zmm29, %zmm30
+# INTEL: vscalefph zmm30, zmm29, zmm28
+0x62,0x06,0x15,0x40,0x2c,0xf4
+
+# ATT: vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30
+# INTEL: vscalefph zmm30, zmm29, zmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x2c,0xf4
+
+# ATT: vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+# INTEL: vscalefph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x47,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vscalefph (%r9){1to32}, %zmm29, %zmm30
+# INTEL: vscalefph zmm30, zmm29, word ptr [r9]{1to32}
+0x62,0x46,0x15,0x50,0x2c,0x31
+
+# ATT: vscalefph 8128(%rcx), %zmm29, %zmm30
+# INTEL: vscalefph zmm30, zmm29, zmmword ptr [rcx + 8128]
+0x62,0x66,0x15,0x40,0x2c,0x71,0x7f
+
+# ATT: vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+# INTEL: vscalefph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32}
+0x62,0x66,0x15,0xd7,0x2c,0x72,0x80
+
+# ATT: vscalefsh %xmm28, %xmm29, %xmm30
+# INTEL: vscalefsh xmm30, xmm29, xmm28
+0x62,0x06,0x15,0x00,0x2d,0xf4
+
+# ATT: vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vscalefsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x06,0x15,0x10,0x2d,0xf4
+
+# ATT: vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vscalefsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x26,0x15,0x07,0x2d,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vscalefsh (%r9), %xmm29, %xmm30
+# INTEL: vscalefsh xmm30, xmm29, word ptr [r9]
+0x62,0x46,0x15,0x00,0x2d,0x31
+
+# ATT: vscalefsh 254(%rcx), %xmm29, %xmm30
+# INTEL: vscalefsh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x66,0x15,0x00,0x2d,0x71,0x7f
+
+# ATT: vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vscalefsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x66,0x15,0x87,0x2d,0x72,0x80
+
+# ATT: vsqrtph %zmm29, %zmm30
+# INTEL: vsqrtph zmm30, zmm29
+0x62,0x05,0x7c,0x48,0x51,0xf5
+
+# ATT: vsqrtph {rn-sae}, %zmm29, %zmm30
+# INTEL: vsqrtph zmm30, zmm29, {rn-sae}
+0x62,0x05,0x7c,0x18,0x51,0xf5
+
+# ATT: vsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+# INTEL: vsqrtph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0x25,0x7c,0x4f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vsqrtph (%r9){1to32}, %zmm30
+# INTEL: vsqrtph zmm30, word ptr [r9]{1to32}
+0x62,0x45,0x7c,0x58,0x51,0x31
+
+# ATT: vsqrtph 8128(%rcx), %zmm30
+# INTEL: vsqrtph zmm30, zmmword ptr [rcx + 8128]
+0x62,0x65,0x7c,0x48,0x51,0x71,0x7f
+
+# ATT: vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+# INTEL: vsqrtph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}
+0x62,0x65,0x7c,0xdf,0x51,0x72,0x80
+
+# ATT: vsqrtsh %xmm28, %xmm29, %xmm30
+# INTEL: vsqrtsh xmm30, xmm29, xmm28
+0x62,0x05,0x16,0x00,0x51,0xf4
+
+# ATT: vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30
+# INTEL: vsqrtsh xmm30, xmm29, xmm28, {rn-sae}
+0x62,0x05,0x16,0x10,0x51,0xf4
+
+# ATT: vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+# INTEL: vsqrtsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456]
+0x62,0x25,0x16,0x07,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vsqrtsh (%r9), %xmm29, %xmm30
+# INTEL: vsqrtsh xmm30, xmm29, word ptr [r9]
+0x62,0x45,0x16,0x00,0x51,0x31
+
+# ATT: vsqrtsh 254(%rcx), %xmm29, %xmm30
+# INTEL: vsqrtsh xmm30, xmm29, word ptr [rcx + 254]
+0x62,0x65,0x16,0x00,0x51,0x71,0x7f
+
+# ATT: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+# INTEL: vsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]
+0x62,0x65,0x16,0x87,0x51,0x72,0x80
diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
index 63acd5be1946f..8f480fc13d82f 100644
--- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
+++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt
@@ -1136,3 +1136,359 @@
# ATT: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z}
# INTEL: vcvtw2ph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80
+
+# ATT: vfpclassph $123, %xmm6, %k5
+# INTEL: vfpclassph k5, xmm6, 123
+0x62,0xf3,0x7c,0x08,0x66,0xee,0x7b
+
+# ATT: vfpclassph $123, %ymm6, %k5
+# INTEL: vfpclassph k5, ymm6, 123
+0x62,0xf3,0x7c,0x28,0x66,0xee,0x7b
+
+# ATT: vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7}
+# INTEL: vfpclassph k5 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x0f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vfpclassph $123, (%ecx){1to8}, %k5
+# INTEL: vfpclassph k5, word ptr [ecx]{1to8}, 123
+0x62,0xf3,0x7c,0x18,0x66,0x29,0x7b
+
+# ATT: vfpclassphx $123, 2032(%ecx), %k5
+# INTEL: vfpclassph k5, xmmword ptr [ecx + 2032], 123
+0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b
+
+# ATT: vfpclassph $123, -256(%edx){1to8}, %k5 {%k7}
+# INTEL: vfpclassph k5 {k7}, word ptr [edx - 256]{1to8}, 123
+0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b
+
+# ATT: vfpclassph $123, (%ecx){1to16}, %k5
+# INTEL: vfpclassph k5, word ptr [ecx]{1to16}, 123
+0x62,0xf3,0x7c,0x38,0x66,0x29,0x7b
+
+# ATT: vfpclassphy $123, 4064(%ecx), %k5
+# INTEL: vfpclassph k5, ymmword ptr [ecx + 4064], 123
+0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b
+
+# ATT: vfpclassph $123, -256(%edx){1to16}, %k5 {%k7}
+# INTEL: vfpclassph k5 {k7}, word ptr [edx - 256]{1to16}, 123
+0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b
+
+# ATT: vgetexpph %xmm5, %xmm6
+# INTEL: vgetexpph xmm6, xmm5
+0x62,0xf6,0x7d,0x08,0x42,0xf5
+
+# ATT: vgetexpph %ymm5, %ymm6
+# INTEL: vgetexpph ymm6, ymm5
+0x62,0xf6,0x7d,0x28,0x42,0xf5
+
+# ATT: vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vgetexpph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x0f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vgetexpph (%ecx){1to8}, %xmm6
+# INTEL: vgetexpph xmm6, word ptr [ecx]{1to8}
+0x62,0xf6,0x7d,0x18,0x42,0x31
+
+# ATT: vgetexpph 2032(%ecx), %xmm6
+# INTEL: vgetexpph xmm6, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x7d,0x08,0x42,0x71,0x7f
+
+# ATT: vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vgetexpph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x7d,0x9f,0x42,0x72,0x80
+
+# ATT: vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vgetexpph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x2f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vgetexpph (%ecx){1to16}, %ymm6
+# INTEL: vgetexpph ymm6, word ptr [ecx]{1to16}
+0x62,0xf6,0x7d,0x38,0x42,0x31
+
+# ATT: vgetexpph 4064(%ecx), %ymm6
+# INTEL: vgetexpph ymm6, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x7d,0x28,0x42,0x71,0x7f
+
+# ATT: vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vgetexpph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x7d,0xbf,0x42,0x72,0x80
+
+# ATT: vgetmantph $123, %ymm5, %ymm6
+# INTEL: vgetmantph ymm6, ymm5, 123
+0x62,0xf3,0x7c,0x28,0x26,0xf5,0x7b
+
+# ATT: vgetmantph $123, %xmm5, %xmm6
+# INTEL: vgetmantph xmm6, xmm5, 123
+0x62,0xf3,0x7c,0x08,0x26,0xf5,0x7b
+
+# ATT: vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vgetmantph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x0f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vgetmantph $123, (%ecx){1to8}, %xmm6
+# INTEL: vgetmantph xmm6, word ptr [ecx]{1to8}, 123
+0x62,0xf3,0x7c,0x18,0x26,0x31,0x7b
+
+# ATT: vgetmantph $123, 2032(%ecx), %xmm6
+# INTEL: vgetmantph xmm6, xmmword ptr [ecx + 2032], 123
+0x62,0xf3,0x7c,0x08,0x26,0x71,0x7f,0x7b
+
+# ATT: vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vgetmantph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}, 123
+0x62,0xf3,0x7c,0x9f,0x26,0x72,0x80,0x7b
+
+# ATT: vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vgetmantph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x2f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vgetmantph $123, (%ecx){1to16}, %ymm6
+# INTEL: vgetmantph ymm6, word ptr [ecx]{1to16}, 123
+0x62,0xf3,0x7c,0x38,0x26,0x31,0x7b
+
+# ATT: vgetmantph $123, 4064(%ecx), %ymm6
+# INTEL: vgetmantph ymm6, ymmword ptr [ecx + 4064], 123
+0x62,0xf3,0x7c,0x28,0x26,0x71,0x7f,0x7b
+
+# ATT: vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vgetmantph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123
+0x62,0xf3,0x7c,0xbf,0x26,0x72,0x80,0x7b
+
+# ATT: vrcpph %xmm5, %xmm6
+# INTEL: vrcpph xmm6, xmm5
+0x62,0xf6,0x7d,0x08,0x4c,0xf5
+
+# ATT: vrcpph %ymm5, %ymm6
+# INTEL: vrcpph ymm6, ymm5
+0x62,0xf6,0x7d,0x28,0x4c,0xf5
+
+# ATT: vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vrcpph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x0f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vrcpph (%ecx){1to8}, %xmm6
+# INTEL: vrcpph xmm6, word ptr [ecx]{1to8}
+0x62,0xf6,0x7d,0x18,0x4c,0x31
+
+# ATT: vrcpph 2032(%ecx), %xmm6
+# INTEL: vrcpph xmm6, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x7d,0x08,0x4c,0x71,0x7f
+
+# ATT: vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vrcpph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x7d,0x9f,0x4c,0x72,0x80
+
+# ATT: vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vrcpph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x2f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vrcpph (%ecx){1to16}, %ymm6
+# INTEL: vrcpph ymm6, word ptr [ecx]{1to16}
+0x62,0xf6,0x7d,0x38,0x4c,0x31
+
+# ATT: vrcpph 4064(%ecx), %ymm6
+# INTEL: vrcpph ymm6, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x7d,0x28,0x4c,0x71,0x7f
+
+# ATT: vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vrcpph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x7d,0xbf,0x4c,0x72,0x80
+
+# ATT: vreduceph $123, %ymm5, %ymm6
+# INTEL: vreduceph ymm6, ymm5, 123
+0x62,0xf3,0x7c,0x28,0x56,0xf5,0x7b
+
+# ATT: vreduceph $123, %xmm5, %xmm6
+# INTEL: vreduceph xmm6, xmm5, 123
+0x62,0xf3,0x7c,0x08,0x56,0xf5,0x7b
+
+# ATT: vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vreduceph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vreduceph $123, (%ecx){1to8}, %xmm6
+# INTEL: vreduceph xmm6, word ptr [ecx]{1to8}, 123
+0x62,0xf3,0x7c,0x18,0x56,0x31,0x7b
+
+# ATT: vreduceph $123, 2032(%ecx), %xmm6
+# INTEL: vreduceph xmm6, xmmword ptr [ecx + 2032], 123
+0x62,0xf3,0x7c,0x08,0x56,0x71,0x7f,0x7b
+
+# ATT: vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vreduceph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}, 123
+0x62,0xf3,0x7c,0x9f,0x56,0x72,0x80,0x7b
+
+# ATT: vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vreduceph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vreduceph $123, (%ecx){1to16}, %ymm6
+# INTEL: vreduceph ymm6, word ptr [ecx]{1to16}, 123
+0x62,0xf3,0x7c,0x38,0x56,0x31,0x7b
+
+# ATT: vreduceph $123, 4064(%ecx), %ymm6
+# INTEL: vreduceph ymm6, ymmword ptr [ecx + 4064], 123
+0x62,0xf3,0x7c,0x28,0x56,0x71,0x7f,0x7b
+
+# ATT: vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vreduceph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123
+0x62,0xf3,0x7c,0xbf,0x56,0x72,0x80,0x7b
+
+# ATT: vrndscaleph $123, %ymm5, %ymm6
+# INTEL: vrndscaleph ymm6, ymm5, 123
+0x62,0xf3,0x7c,0x28,0x08,0xf5,0x7b
+
+# ATT: vrndscaleph $123, %xmm5, %xmm6
+# INTEL: vrndscaleph xmm6, xmm5, 123
+0x62,0xf3,0x7c,0x08,0x08,0xf5,0x7b
+
+# ATT: vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vrndscaleph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x0f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vrndscaleph $123, (%ecx){1to8}, %xmm6
+# INTEL: vrndscaleph xmm6, word ptr [ecx]{1to8}, 123
+0x62,0xf3,0x7c,0x18,0x08,0x31,0x7b
+
+# ATT: vrndscaleph $123, 2032(%ecx), %xmm6
+# INTEL: vrndscaleph xmm6, xmmword ptr [ecx + 2032], 123
+0x62,0xf3,0x7c,0x08,0x08,0x71,0x7f,0x7b
+
+# ATT: vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vrndscaleph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}, 123
+0x62,0xf3,0x7c,0x9f,0x08,0x72,0x80,0x7b
+
+# ATT: vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vrndscaleph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123
+0x62,0xf3,0x7c,0x2f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b
+
+# ATT: vrndscaleph $123, (%ecx){1to16}, %ymm6
+# INTEL: vrndscaleph ymm6, word ptr [ecx]{1to16}, 123
+0x62,0xf3,0x7c,0x38,0x08,0x31,0x7b
+
+# ATT: vrndscaleph $123, 4064(%ecx), %ymm6
+# INTEL: vrndscaleph ymm6, ymmword ptr [ecx + 4064], 123
+0x62,0xf3,0x7c,0x28,0x08,0x71,0x7f,0x7b
+
+# ATT: vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vrndscaleph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123
+0x62,0xf3,0x7c,0xbf,0x08,0x72,0x80,0x7b
+
+# ATT: vrsqrtph %xmm5, %xmm6
+# INTEL: vrsqrtph xmm6, xmm5
+0x62,0xf6,0x7d,0x08,0x4e,0xf5
+
+# ATT: vrsqrtph %ymm5, %ymm6
+# INTEL: vrsqrtph ymm6, ymm5
+0x62,0xf6,0x7d,0x28,0x4e,0xf5
+
+# ATT: vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vrsqrtph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x0f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vrsqrtph (%ecx){1to8}, %xmm6
+# INTEL: vrsqrtph xmm6, word ptr [ecx]{1to8}
+0x62,0xf6,0x7d,0x18,0x4e,0x31
+
+# ATT: vrsqrtph 2032(%ecx), %xmm6
+# INTEL: vrsqrtph xmm6, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x7d,0x08,0x4e,0x71,0x7f
+
+# ATT: vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vrsqrtph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x7d,0x9f,0x4e,0x72,0x80
+
+# ATT: vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vrsqrtph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x7d,0x2f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vrsqrtph (%ecx){1to16}, %ymm6
+# INTEL: vrsqrtph ymm6, word ptr [ecx]{1to16}
+0x62,0xf6,0x7d,0x38,0x4e,0x31
+
+# ATT: vrsqrtph 4064(%ecx), %ymm6
+# INTEL: vrsqrtph ymm6, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x7d,0x28,0x4e,0x71,0x7f
+
+# ATT: vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vrsqrtph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x7d,0xbf,0x4e,0x72,0x80
+
+# ATT: vscalefph %ymm4, %ymm5, %ymm6
+# INTEL: vscalefph ymm6, ymm5, ymm4
+0x62,0xf6,0x55,0x28,0x2c,0xf4
+
+# ATT: vscalefph %xmm4, %xmm5, %xmm6
+# INTEL: vscalefph xmm6, xmm5, xmm4
+0x62,0xf6,0x55,0x08,0x2c,0xf4
+
+# ATT: vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vscalefph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x2f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vscalefph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vscalefph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf6,0x55,0x38,0x2c,0x31
+
+# ATT: vscalefph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vscalefph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf6,0x55,0x28,0x2c,0x71,0x7f
+
+# ATT: vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vscalefph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf6,0x55,0xbf,0x2c,0x72,0x80
+
+# ATT: vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vscalefph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf6,0x55,0x0f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vscalefph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vscalefph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf6,0x55,0x18,0x2c,0x31
+
+# ATT: vscalefph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vscalefph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf6,0x55,0x08,0x2c,0x71,0x7f
+
+# ATT: vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vscalefph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf6,0x55,0x9f,0x2c,0x72,0x80
+
+# ATT: vsqrtph %xmm5, %xmm6
+# INTEL: vsqrtph xmm6, xmm5
+0x62,0xf5,0x7c,0x08,0x51,0xf5
+
+# ATT: vsqrtph %ymm5, %ymm6
+# INTEL: vsqrtph ymm6, ymm5
+0x62,0xf5,0x7c,0x28,0x51,0xf5
+
+# ATT: vsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+# INTEL: vsqrtph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x7c,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vsqrtph (%ecx){1to8}, %xmm6
+# INTEL: vsqrtph xmm6, word ptr [ecx]{1to8}
+0x62,0xf5,0x7c,0x18,0x51,0x31
+
+# ATT: vsqrtph 2032(%ecx), %xmm6
+# INTEL: vsqrtph xmm6, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x7c,0x08,0x51,0x71,0x7f
+
+# ATT: vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+# INTEL: vsqrtph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x7c,0x9f,0x51,0x72,0x80
+
+# ATT: vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+# INTEL: vsqrtph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x7c,0x2f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vsqrtph (%ecx){1to16}, %ymm6
+# INTEL: vsqrtph ymm6, word ptr [ecx]{1to16}
+0x62,0xf5,0x7c,0x38,0x51,0x31
+
+# ATT: vsqrtph 4064(%ecx), %ymm6
+# INTEL: vsqrtph ymm6, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x7c,0x28,0x51,0x71,0x7f
+
+# ATT: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+# INTEL: vsqrtph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80
diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s
index 1ca659f29acea..b358705fbedc8 100644
--- a/llvm/test/MC/X86/avx512fp16.s
+++ b/llvm/test/MC/X86/avx512fp16.s
@@ -1355,3 +1355,411 @@
// CHECK: vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z}
// CHECK: encoding: [0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80]
vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vfpclassph $123, %zmm30, %k5
+// CHECK: encoding: [0x62,0x93,0x7c,0x48,0x66,0xee,0x7b]
+ vfpclassph $123, %zmm30, %k5
+
+// CHECK: vfpclassphz $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+// CHECK: encoding: [0x62,0xb3,0x7c,0x4f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vfpclassphz $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+
+// CHECK: vfpclassph $123, (%r9){1to32}, %k5
+// CHECK: encoding: [0x62,0xd3,0x7c,0x58,0x66,0x29,0x7b]
+ vfpclassph $123, (%r9){1to32}, %k5
+
+// CHECK: vfpclassphz $123, 8128(%rcx), %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b]
+ vfpclassphz $123, 8128(%rcx), %k5
+
+// CHECK: vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b]
+ vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7}
+
+// CHECK: vfpclasssh $123, %xmm30, %k5
+// CHECK: encoding: [0x62,0x93,0x7c,0x08,0x67,0xee,0x7b]
+ vfpclasssh $123, %xmm30, %k5
+
+// CHECK: vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+// CHECK: encoding: [0x62,0xb3,0x7c,0x0f,0x67,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7}
+
+// CHECK: vfpclasssh $123, (%r9), %k5
+// CHECK: encoding: [0x62,0xd3,0x7c,0x08,0x67,0x29,0x7b]
+ vfpclasssh $123, (%r9), %k5
+
+// CHECK: vfpclasssh $123, 254(%rcx), %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b]
+ vfpclasssh $123, 254(%rcx), %k5
+
+// CHECK: vfpclasssh $123, -256(%rdx), %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b]
+ vfpclasssh $123, -256(%rdx), %k5 {%k7}
+
+// CHECK: vgetexpph %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x42,0xf5]
+ vgetexpph %zmm29, %zmm30
+
+// CHECK: vgetexpph {sae}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x7d,0x18,0x42,0xf5]
+ vgetexpph {sae}, %zmm29, %zmm30
+
+// CHECK: vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vgetexpph (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x42,0x31]
+ vgetexpph (%r9){1to32}, %zmm30
+
+// CHECK: vgetexpph 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x42,0x71,0x7f]
+ vgetexpph 8128(%rcx), %zmm30
+
+// CHECK: vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x42,0x72,0x80]
+ vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vgetexpsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x43,0xf4]
+ vgetexpsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vgetexpsh {sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x43,0xf4]
+ vgetexpsh {sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x43,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vgetexpsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x43,0x31]
+ vgetexpsh (%r9), %xmm29, %xmm30
+
+// CHECK: vgetexpsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x43,0x71,0x7f]
+ vgetexpsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x43,0x72,0x80]
+ vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vgetmantph $123, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x26,0xf5,0x7b]
+ vgetmantph $123, %zmm29, %zmm30
+
+// CHECK: vgetmantph $123, {sae}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x26,0xf5,0x7b]
+ vgetmantph $123, {sae}, %zmm29, %zmm30
+
+// CHECK: vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vgetmantph $123, (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x43,0x7c,0x58,0x26,0x31,0x7b]
+ vgetmantph $123, (%r9){1to32}, %zmm30
+
+// CHECK: vgetmantph $123, 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x26,0x71,0x7f,0x7b]
+ vgetmantph $123, 8128(%rcx), %zmm30
+
+// CHECK: vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x26,0x72,0x80,0x7b]
+ vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vgetmantsh $123, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x00,0x27,0xf4,0x7b]
+ vgetmantsh $123, %xmm28, %xmm29, %xmm30
+
+// CHECK: vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x10,0x27,0xf4,0x7b]
+ vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x14,0x07,0x27,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vgetmantsh $123, (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x43,0x14,0x00,0x27,0x31,0x7b]
+ vgetmantsh $123, (%r9), %xmm29, %xmm30
+
+// CHECK: vgetmantsh $123, 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x63,0x14,0x00,0x27,0x71,0x7f,0x7b]
+ vgetmantsh $123, 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x14,0x87,0x27,0x72,0x80,0x7b]
+ vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vrcpph %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x4c,0xf5]
+ vrcpph %zmm29, %zmm30
+
+// CHECK: vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vrcpph (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x4c,0x31]
+ vrcpph (%r9){1to32}, %zmm30
+
+// CHECK: vrcpph 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x4c,0x71,0x7f]
+ vrcpph 8128(%rcx), %zmm30
+
+// CHECK: vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x4c,0x72,0x80]
+ vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vrcpsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x4d,0xf4]
+ vrcpsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x4d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vrcpsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x4d,0x31]
+ vrcpsh (%r9), %xmm29, %xmm30
+
+// CHECK: vrcpsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x4d,0x71,0x7f]
+ vrcpsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x4d,0x72,0x80]
+ vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vreduceph $123, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x56,0xf5,0x7b]
+ vreduceph $123, %zmm29, %zmm30
+
+// CHECK: vreduceph $123, {sae}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x56,0xf5,0x7b]
+ vreduceph $123, {sae}, %zmm29, %zmm30
+
+// CHECK: vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vreduceph $123, (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x43,0x7c,0x58,0x56,0x31,0x7b]
+ vreduceph $123, (%r9){1to32}, %zmm30
+
+// CHECK: vreduceph $123, 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x56,0x71,0x7f,0x7b]
+ vreduceph $123, 8128(%rcx), %zmm30
+
+// CHECK: vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x56,0x72,0x80,0x7b]
+ vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vreducesh $123, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x00,0x57,0xf4,0x7b]
+ vreducesh $123, %xmm28, %xmm29, %xmm30
+
+// CHECK: vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x10,0x57,0xf4,0x7b]
+ vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x14,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vreducesh $123, (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x43,0x14,0x00,0x57,0x31,0x7b]
+ vreducesh $123, (%r9), %xmm29, %xmm30
+
+// CHECK: vreducesh $123, 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x63,0x14,0x00,0x57,0x71,0x7f,0x7b]
+ vreducesh $123, 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x14,0x87,0x57,0x72,0x80,0x7b]
+ vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vrndscaleph $123, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x08,0xf5,0x7b]
+ vrndscaleph $123, %zmm29, %zmm30
+
+// CHECK: vrndscaleph $123, {sae}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x08,0xf5,0x7b]
+ vrndscaleph $123, {sae}, %zmm29, %zmm30
+
+// CHECK: vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vrndscaleph $123, (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x43,0x7c,0x58,0x08,0x31,0x7b]
+ vrndscaleph $123, (%r9){1to32}, %zmm30
+
+// CHECK: vrndscaleph $123, 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x08,0x71,0x7f,0x7b]
+ vrndscaleph $123, 8128(%rcx), %zmm30
+
+// CHECK: vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x08,0x72,0x80,0x7b]
+ vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vrndscalesh $123, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x00,0x0a,0xf4,0x7b]
+ vrndscalesh $123, %xmm28, %xmm29, %xmm30
+
+// CHECK: vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x03,0x14,0x10,0x0a,0xf4,0x7b]
+ vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x23,0x14,0x07,0x0a,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vrndscalesh $123, (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x43,0x14,0x00,0x0a,0x31,0x7b]
+ vrndscalesh $123, (%r9), %xmm29, %xmm30
+
+// CHECK: vrndscalesh $123, 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x63,0x14,0x00,0x0a,0x71,0x7f,0x7b]
+ vrndscalesh $123, 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x63,0x14,0x87,0x0a,0x72,0x80,0x7b]
+ vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vrsqrtph %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x4e,0xf5]
+ vrsqrtph %zmm29, %zmm30
+
+// CHECK: vrsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vrsqrtph (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x4e,0x31]
+ vrsqrtph (%r9){1to32}, %zmm30
+
+// CHECK: vrsqrtph 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x4e,0x71,0x7f]
+ vrsqrtph 8128(%rcx), %zmm30
+
+// CHECK: vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x4e,0x72,0x80]
+ vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vrsqrtsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x4f,0xf4]
+ vrsqrtsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x4f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vrsqrtsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x4f,0x31]
+ vrsqrtsh (%r9), %xmm29, %xmm30
+
+// CHECK: vrsqrtsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x4f,0x71,0x7f]
+ vrsqrtsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x4f,0x72,0x80]
+ vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vscalefph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x40,0x2c,0xf4]
+ vscalefph %zmm28, %zmm29, %zmm30
+
+// CHECK: vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x2c,0xf4]
+ vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x47,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vscalefph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x50,0x2c,0x31]
+ vscalefph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vscalefph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x40,0x2c,0x71,0x7f]
+ vscalefph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x2c,0x72,0x80]
+ vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vscalefsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x2d,0xf4]
+ vscalefsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x06,0x15,0x10,0x2d,0xf4]
+ vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x2d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vscalefsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x46,0x15,0x00,0x2d,0x31]
+ vscalefsh (%r9), %xmm29, %xmm30
+
+// CHECK: vscalefsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x2d,0x71,0x7f]
+ vscalefsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x66,0x15,0x87,0x2d,0x72,0x80]
+ vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vsqrtph %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x51,0xf5]
+ vsqrtph %zmm29, %zmm30
+
+// CHECK: vsqrtph {rn-sae}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x51,0xf5]
+ vsqrtph {rn-sae}, %zmm29, %zmm30
+
+// CHECK: vsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7}
+
+// CHECK: vsqrtph (%r9){1to32}, %zmm30
+// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x51,0x31]
+ vsqrtph (%r9){1to32}, %zmm30
+
+// CHECK: vsqrtph 8128(%rcx), %zmm30
+// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x51,0x71,0x7f]
+ vsqrtph 8128(%rcx), %zmm30
+
+// CHECK: vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x51,0x72,0x80]
+ vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z}
+
+// CHECK: vsqrtsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x00,0x51,0xf4]
+ vsqrtsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x10,0x51,0xf4]
+ vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x16,0x07,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vsqrtsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x45,0x16,0x00,0x51,0x31]
+ vsqrtsh (%r9), %xmm29, %xmm30
+
+// CHECK: vsqrtsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x65,0x16,0x00,0x51,0x71,0x7f]
+ vsqrtsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x16,0x87,0x51,0x72,0x80]
+ vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s
index 466af9663d21a..91c45a56a2e8a 100644
--- a/llvm/test/MC/X86/avx512fp16vl.s
+++ b/llvm/test/MC/X86/avx512fp16vl.s
@@ -1135,3 +1135,359 @@
// CHECK: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z}
// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80]
vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vfpclassph $123, %xmm6, %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0xee,0x7b]
+ vfpclassph $123, %xmm6, %k5
+
+// CHECK: vfpclassph $123, %ymm6, %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0xee,0x7b]
+ vfpclassph $123, %ymm6, %k5
+
+// CHECK: vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7}
+
+// CHECK: vfpclassph $123, (%ecx){1to8}, %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x66,0x29,0x7b]
+ vfpclassph $123, (%ecx){1to8}, %k5
+
+// CHECK: vfpclassphx $123, 2032(%ecx), %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b]
+ vfpclassphx $123, 2032(%ecx), %k5
+
+// CHECK: vfpclassph $123, -256(%edx){1to8}, %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b]
+ vfpclassph $123, -256(%edx){1to8}, %k5 {%k7}
+
+// CHECK: vfpclassph $123, (%ecx){1to16}, %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x66,0x29,0x7b]
+ vfpclassph $123, (%ecx){1to16}, %k5
+
+// CHECK: vfpclassphy $123, 4064(%ecx), %k5
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b]
+ vfpclassphy $123, 4064(%ecx), %k5
+
+// CHECK: vfpclassph $123, -256(%edx){1to16}, %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b]
+ vfpclassph $123, -256(%edx){1to16}, %k5 {%k7}
+
+// CHECK: vgetexpph %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x42,0xf5]
+ vgetexpph %xmm5, %xmm6
+
+// CHECK: vgetexpph %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x42,0xf5]
+ vgetexpph %ymm5, %ymm6
+
+// CHECK: vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vgetexpph (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x42,0x31]
+ vgetexpph (%ecx){1to8}, %xmm6
+
+// CHECK: vgetexpph 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x42,0x71,0x7f]
+ vgetexpph 2032(%ecx), %xmm6
+
+// CHECK: vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x42,0x72,0x80]
+ vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vgetexpph (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x42,0x31]
+ vgetexpph (%ecx){1to16}, %ymm6
+
+// CHECK: vgetexpph 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x42,0x71,0x7f]
+ vgetexpph 4064(%ecx), %ymm6
+
+// CHECK: vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x42,0x72,0x80]
+ vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vgetmantph $123, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x26,0xf5,0x7b]
+ vgetmantph $123, %ymm5, %ymm6
+
+// CHECK: vgetmantph $123, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x26,0xf5,0x7b]
+ vgetmantph $123, %xmm5, %xmm6
+
+// CHECK: vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vgetmantph $123, (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x26,0x31,0x7b]
+ vgetmantph $123, (%ecx){1to8}, %xmm6
+
+// CHECK: vgetmantph $123, 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x26,0x71,0x7f,0x7b]
+ vgetmantph $123, 2032(%ecx), %xmm6
+
+// CHECK: vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x26,0x72,0x80,0x7b]
+ vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vgetmantph $123, (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x26,0x31,0x7b]
+ vgetmantph $123, (%ecx){1to16}, %ymm6
+
+// CHECK: vgetmantph $123, 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x26,0x71,0x7f,0x7b]
+ vgetmantph $123, 4064(%ecx), %ymm6
+
+// CHECK: vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x26,0x72,0x80,0x7b]
+ vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vrcpph %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4c,0xf5]
+ vrcpph %xmm5, %xmm6
+
+// CHECK: vrcpph %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4c,0xf5]
+ vrcpph %ymm5, %ymm6
+
+// CHECK: vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vrcpph (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x4c,0x31]
+ vrcpph (%ecx){1to8}, %xmm6
+
+// CHECK: vrcpph 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4c,0x71,0x7f]
+ vrcpph 2032(%ecx), %xmm6
+
+// CHECK: vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x4c,0x72,0x80]
+ vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vrcpph (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x4c,0x31]
+ vrcpph (%ecx){1to16}, %ymm6
+
+// CHECK: vrcpph 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4c,0x71,0x7f]
+ vrcpph 4064(%ecx), %ymm6
+
+// CHECK: vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x4c,0x72,0x80]
+ vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vreduceph $123, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x56,0xf5,0x7b]
+ vreduceph $123, %ymm5, %ymm6
+
+// CHECK: vreduceph $123, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x56,0xf5,0x7b]
+ vreduceph $123, %xmm5, %xmm6
+
+// CHECK: vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vreduceph $123, (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x56,0x31,0x7b]
+ vreduceph $123, (%ecx){1to8}, %xmm6
+
+// CHECK: vreduceph $123, 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x56,0x71,0x7f,0x7b]
+ vreduceph $123, 2032(%ecx), %xmm6
+
+// CHECK: vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x56,0x72,0x80,0x7b]
+ vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vreduceph $123, (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x56,0x31,0x7b]
+ vreduceph $123, (%ecx){1to16}, %ymm6
+
+// CHECK: vreduceph $123, 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x56,0x71,0x7f,0x7b]
+ vreduceph $123, 4064(%ecx), %ymm6
+
+// CHECK: vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x56,0x72,0x80,0x7b]
+ vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vrndscaleph $123, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x08,0xf5,0x7b]
+ vrndscaleph $123, %ymm5, %ymm6
+
+// CHECK: vrndscaleph $123, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x08,0xf5,0x7b]
+ vrndscaleph $123, %xmm5, %xmm6
+
+// CHECK: vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vrndscaleph $123, (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x08,0x31,0x7b]
+ vrndscaleph $123, (%ecx){1to8}, %xmm6
+
+// CHECK: vrndscaleph $123, 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x08,0x71,0x7f,0x7b]
+ vrndscaleph $123, 2032(%ecx), %xmm6
+
+// CHECK: vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x08,0x72,0x80,0x7b]
+ vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vrndscaleph $123, (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x08,0x31,0x7b]
+ vrndscaleph $123, (%ecx){1to16}, %ymm6
+
+// CHECK: vrndscaleph $123, 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x08,0x71,0x7f,0x7b]
+ vrndscaleph $123, 4064(%ecx), %ymm6
+
+// CHECK: vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x08,0x72,0x80,0x7b]
+ vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vrsqrtph %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4e,0xf5]
+ vrsqrtph %xmm5, %xmm6
+
+// CHECK: vrsqrtph %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4e,0xf5]
+ vrsqrtph %ymm5, %ymm6
+
+// CHECK: vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vrsqrtph (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x4e,0x31]
+ vrsqrtph (%ecx){1to8}, %xmm6
+
+// CHECK: vrsqrtph 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4e,0x71,0x7f]
+ vrsqrtph 2032(%ecx), %xmm6
+
+// CHECK: vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x4e,0x72,0x80]
+ vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vrsqrtph (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x4e,0x31]
+ vrsqrtph (%ecx){1to16}, %ymm6
+
+// CHECK: vrsqrtph 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4e,0x71,0x7f]
+ vrsqrtph 4064(%ecx), %ymm6
+
+// CHECK: vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x4e,0x72,0x80]
+ vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+
+// CHECK: vscalefph %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x2c,0xf4]
+ vscalefph %ymm4, %ymm5, %ymm6
+
+// CHECK: vscalefph %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2c,0xf4]
+ vscalefph %xmm4, %xmm5, %xmm6
+
+// CHECK: vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+
+// CHECK: vscalefph (%ecx){1to16}, %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x2c,0x31]
+ vscalefph (%ecx){1to16}, %ymm5, %ymm6
+
+// CHECK: vscalefph 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x2c,0x71,0x7f]
+ vscalefph 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x2c,0x72,0x80]
+ vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+
+// CHECK: vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+
+// CHECK: vscalefph (%ecx){1to8}, %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2c,0x31]
+ vscalefph (%ecx){1to8}, %xmm5, %xmm6
+
+// CHECK: vscalefph 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2c,0x71,0x7f]
+ vscalefph 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x2c,0x72,0x80]
+ vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+
+// CHECK: vsqrtph %xmm5, %xmm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x51,0xf5]
+ vsqrtph %xmm5, %xmm6
+
+// CHECK: vsqrtph %ymm5, %ymm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x51,0xf5]
+ vsqrtph %ymm5, %ymm6
+
+// CHECK: vsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7}
+
+// CHECK: vsqrtph (%ecx){1to8}, %xmm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x51,0x31]
+ vsqrtph (%ecx){1to8}, %xmm6
+
+// CHECK: vsqrtph 2032(%ecx), %xmm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x51,0x71,0x7f]
+ vsqrtph 2032(%ecx), %xmm6
+
+// CHECK: vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x51,0x72,0x80]
+ vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z}
+
+// CHECK: vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7}
+
+// CHECK: vsqrtph (%ecx){1to16}, %ymm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x51,0x31]
+ vsqrtph (%ecx){1to16}, %ymm6
+
+// CHECK: vsqrtph 4064(%ecx), %ymm6
+// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x51,0x71,0x7f]
+ vsqrtph 4064(%ecx), %ymm6
+
+// CHECK: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80]
+ vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z}
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
index 4b842f9bc622c..36ca110e12e6e 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s
@@ -1227,3 +1227,411 @@
// CHECK: vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x7d,0x72,0x80]
vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+
+// CHECK: vfpclassph k5, zmm6, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0xee,0x7b]
+ vfpclassph k5, zmm6, 123
+
+// CHECK: vfpclassph k5 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vfpclassph k5 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vfpclassph k5, word ptr [ecx]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x66,0x29,0x7b]
+ vfpclassph k5, word ptr [ecx]{1to32}, 123
+
+// CHECK: vfpclassph k5, zmmword ptr [ecx + 8128], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b]
+ vfpclassph k5, zmmword ptr [ecx + 8128], 123
+
+// CHECK: vfpclassph k5 {k7}, word ptr [edx - 256]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b]
+ vfpclassph k5 {k7}, word ptr [edx - 256]{1to32}, 123
+
+// CHECK: vfpclasssh k5, xmm6, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0xee,0x7b]
+ vfpclasssh k5, xmm6, 123
+
+// CHECK: vfpclasssh k5 {k7}, word ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vfpclasssh k5 {k7}, word ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vfpclasssh k5, word ptr [ecx], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x29,0x7b]
+ vfpclasssh k5, word ptr [ecx], 123
+
+// CHECK: vfpclasssh k5, word ptr [ecx + 254], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b]
+ vfpclasssh k5, word ptr [ecx + 254], 123
+
+// CHECK: vfpclasssh k5 {k7}, word ptr [edx - 256], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b]
+ vfpclasssh k5 {k7}, word ptr [edx - 256], 123
+
+// CHECK: vgetexpph zmm6, zmm5
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x42,0xf5]
+ vgetexpph zmm6, zmm5
+
+// CHECK: vgetexpph zmm6, zmm5, {sae}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x42,0xf5]
+ vgetexpph zmm6, zmm5, {sae}
+
+// CHECK: vgetexpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vgetexpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vgetexpph zmm6, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x42,0x31]
+ vgetexpph zmm6, word ptr [ecx]{1to32}
+
+// CHECK: vgetexpph zmm6, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x42,0x71,0x7f]
+ vgetexpph zmm6, zmmword ptr [ecx + 8128]
+
+// CHECK: vgetexpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x42,0x72,0x80]
+ vgetexpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+
+// CHECK: vgetexpsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0xf4]
+ vgetexpsh xmm6, xmm5, xmm4
+
+// CHECK: vgetexpsh xmm6, xmm5, xmm4, {sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x43,0xf4]
+ vgetexpsh xmm6, xmm5, xmm4, {sae}
+
+// CHECK: vgetexpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x43,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vgetexpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vgetexpsh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0x31]
+ vgetexpsh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vgetexpsh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0x71,0x7f]
+ vgetexpsh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vgetexpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x43,0x72,0x80]
+ vgetexpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vgetmantph zmm6, zmm5, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x26,0xf5,0x7b]
+ vgetmantph zmm6, zmm5, 123
+
+// CHECK: vgetmantph zmm6, zmm5, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x26,0xf5,0x7b]
+ vgetmantph zmm6, zmm5, {sae}, 123
+
+// CHECK: vgetmantph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vgetmantph zmm6, word ptr [ecx]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x26,0x31,0x7b]
+ vgetmantph zmm6, word ptr [ecx]{1to32}, 123
+
+// CHECK: vgetmantph zmm6, zmmword ptr [ecx + 8128], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x26,0x71,0x7f,0x7b]
+ vgetmantph zmm6, zmmword ptr [ecx + 8128], 123
+
+// CHECK: vgetmantph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x26,0x72,0x80,0x7b]
+ vgetmantph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+
+// CHECK: vgetmantsh xmm6, xmm5, xmm4, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0xf4,0x7b]
+ vgetmantsh xmm6, xmm5, xmm4, 123
+
+// CHECK: vgetmantsh xmm6, xmm5, xmm4, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x27,0xf4,0x7b]
+ vgetmantsh xmm6, xmm5, xmm4, {sae}, 123
+
+// CHECK: vgetmantsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x27,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vgetmantsh xmm6, xmm5, word ptr [ecx], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0x31,0x7b]
+ vgetmantsh xmm6, xmm5, word ptr [ecx], 123
+
+// CHECK: vgetmantsh xmm6, xmm5, word ptr [ecx + 254], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0x71,0x7f,0x7b]
+ vgetmantsh xmm6, xmm5, word ptr [ecx + 254], 123
+
+// CHECK: vgetmantsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x8f,0x27,0x72,0x80,0x7b]
+ vgetmantsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+
+// CHECK: vrcpph zmm6, zmm5
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4c,0xf5]
+ vrcpph zmm6, zmm5
+
+// CHECK: vrcpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrcpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vrcpph zmm6, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x4c,0x31]
+ vrcpph zmm6, word ptr [ecx]{1to32}
+
+// CHECK: vrcpph zmm6, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4c,0x71,0x7f]
+ vrcpph zmm6, zmmword ptr [ecx + 8128]
+
+// CHECK: vrcpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x4c,0x72,0x80]
+ vrcpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+
+// CHECK: vrcpsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0xf4]
+ vrcpsh xmm6, xmm5, xmm4
+
+// CHECK: vrcpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x4d,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrcpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vrcpsh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0x31]
+ vrcpsh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vrcpsh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0x71,0x7f]
+ vrcpsh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vrcpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x4d,0x72,0x80]
+ vrcpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vreduceph zmm6, zmm5, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x56,0xf5,0x7b]
+ vreduceph zmm6, zmm5, 123
+
+// CHECK: vreduceph zmm6, zmm5, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x56,0xf5,0x7b]
+ vreduceph zmm6, zmm5, {sae}, 123
+
+// CHECK: vreduceph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vreduceph zmm6, word ptr [ecx]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x56,0x31,0x7b]
+ vreduceph zmm6, word ptr [ecx]{1to32}, 123
+
+// CHECK: vreduceph zmm6, zmmword ptr [ecx + 8128], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x56,0x71,0x7f,0x7b]
+ vreduceph zmm6, zmmword ptr [ecx + 8128], 123
+
+// CHECK: vreduceph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x56,0x72,0x80,0x7b]
+ vreduceph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+
+// CHECK: vreducesh xmm6, xmm5, xmm4, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0xf4,0x7b]
+ vreducesh xmm6, xmm5, xmm4, 123
+
+// CHECK: vreducesh xmm6, xmm5, xmm4, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x57,0xf4,0x7b]
+ vreducesh xmm6, xmm5, xmm4, {sae}, 123
+
+// CHECK: vreducesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vreducesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vreducesh xmm6, xmm5, word ptr [ecx], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0x31,0x7b]
+ vreducesh xmm6, xmm5, word ptr [ecx], 123
+
+// CHECK: vreducesh xmm6, xmm5, word ptr [ecx + 254], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0x71,0x7f,0x7b]
+ vreducesh xmm6, xmm5, word ptr [ecx + 254], 123
+
+// CHECK: vreducesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x8f,0x57,0x72,0x80,0x7b]
+ vreducesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+
+// CHECK: vrndscaleph zmm6, zmm5, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x08,0xf5,0x7b]
+ vrndscaleph zmm6, zmm5, 123
+
+// CHECK: vrndscaleph zmm6, zmm5, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x08,0xf5,0x7b]
+ vrndscaleph zmm6, zmm5, {sae}, 123
+
+// CHECK: vrndscaleph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vrndscaleph zmm6, word ptr [ecx]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x08,0x31,0x7b]
+ vrndscaleph zmm6, word ptr [ecx]{1to32}, 123
+
+// CHECK: vrndscaleph zmm6, zmmword ptr [ecx + 8128], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x08,0x71,0x7f,0x7b]
+ vrndscaleph zmm6, zmmword ptr [ecx + 8128], 123
+
+// CHECK: vrndscaleph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x08,0x72,0x80,0x7b]
+ vrndscaleph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123
+
+// CHECK: vrndscalesh xmm6, xmm5, xmm4, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0xf4,0x7b]
+ vrndscalesh xmm6, xmm5, xmm4, 123
+
+// CHECK: vrndscalesh xmm6, xmm5, xmm4, {sae}, 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x0a,0xf4,0x7b]
+ vrndscalesh xmm6, xmm5, xmm4, {sae}, 123
+
+// CHECK: vrndscalesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x0a,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b]
+ vrndscalesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123
+
+// CHECK: vrndscalesh xmm6, xmm5, word ptr [ecx], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0x31,0x7b]
+ vrndscalesh xmm6, xmm5, word ptr [ecx], 123
+
+// CHECK: vrndscalesh xmm6, xmm5, word ptr [ecx + 254], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0x71,0x7f,0x7b]
+ vrndscalesh xmm6, xmm5, word ptr [ecx + 254], 123
+
+// CHECK: vrndscalesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+// CHECK: encoding: [0x62,0xf3,0x54,0x8f,0x0a,0x72,0x80,0x7b]
+ vrndscalesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123
+
+// CHECK: vrsqrtph zmm6, zmm5
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4e,0xf5]
+ vrsqrtph zmm6, zmm5
+
+// CHECK: vrsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vrsqrtph zmm6, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x4e,0x31]
+ vrsqrtph zmm6, word ptr [ecx]{1to32}
+
+// CHECK: vrsqrtph zmm6, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4e,0x71,0x7f]
+ vrsqrtph zmm6, zmmword ptr [ecx + 8128]
+
+// CHECK: vrsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x4e,0x72,0x80]
+ vrsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+
+// CHECK: vrsqrtsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0xf4]
+ vrsqrtsh xmm6, xmm5, xmm4
+
+// CHECK: vrsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x4f,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vrsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vrsqrtsh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0x31]
+ vrsqrtsh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vrsqrtsh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0x71,0x7f]
+ vrsqrtsh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vrsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x4f,0x72,0x80]
+ vrsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vscalefph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x2c,0xf4]
+ vscalefph zmm6, zmm5, zmm4
+
+// CHECK: vscalefph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2c,0xf4]
+ vscalefph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vscalefph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vscalefph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vscalefph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x2c,0x31]
+ vscalefph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vscalefph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x2c,0x71,0x7f]
+ vscalefph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vscalefph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x2c,0x72,0x80]
+ vscalefph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vscalefsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0xf4]
+ vscalefsh xmm6, xmm5, xmm4
+
+// CHECK: vscalefsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2d,0xf4]
+ vscalefsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vscalefsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x2d,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vscalefsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vscalefsh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0x31]
+ vscalefsh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vscalefsh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0x71,0x7f]
+ vscalefsh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vscalefsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x2d,0x72,0x80]
+ vscalefsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vsqrtph zmm6, zmm5
+// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x51,0xf5]
+ vsqrtph zmm6, zmm5
+
+// CHECK: vsqrtph zmm6, zmm5, {rn-sae}
+// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x51,0xf5]
+ vsqrtph zmm6, zmm5, {rn-sae}
+
+// CHECK: vsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vsqrtph zmm6, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x51,0x31]
+ vsqrtph zmm6, word ptr [ecx]{1to32}
+
+// CHECK: vsqrtph zmm6, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x51,0x71,0x7f]
+ vsqrtph zmm6, zmmword ptr [ecx + 8128]
+
+// CHECK: vsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x51,0x72,0x80]
+ vsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}
+
+// CHECK: vsqrtsh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0xf4]
+ vsqrtsh xmm6, xmm5, xmm4
+
+// CHECK: vsqrtsh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x51,0xf4]
+ vsqrtsh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vsqrtsh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0x31]
+ vsqrtsh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vsqrtsh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0x71,0x7f]
+ vsqrtsh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x51,0x72,0x80]
+ vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
index 5c53fc376e1cc..6091599b87d66 100644
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
@@ -1135,3 +1135,359 @@
// CHECK: vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
// CHECK: encoding: [0x62,0x65,0x7e,0xbf,0x7d,0x72,0x80]
vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfpclassph k5, xmm30, 123
+// CHECK: encoding: [0x62,0x93,0x7c,0x08,0x66,0xee,0x7b]
+ vfpclassph k5, xmm30, 123
+
+// CHECK: vfpclassph k5, ymm30, 123
+// CHECK: encoding: [0x62,0x93,0x7c,0x28,0x66,0xee,0x7b]
+ vfpclassph k5, ymm30, 123
+
+// CHECK: vfpclassph k5 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0xb3,0x7c,0x0f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vfpclassph k5 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vfpclassph k5, word ptr [r9]{1to8}, 123
+// CHECK: encoding: [0x62,0xd3,0x7c,0x18,0x66,0x29,0x7b]
+ vfpclassph k5, word ptr [r9]{1to8}, 123
+
+// CHECK: vfpclassph k5, xmmword ptr [rcx + 2032], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b]
+ vfpclassph k5, xmmword ptr [rcx + 2032], 123
+
+// CHECK: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to8}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b]
+ vfpclassph k5 {k7}, word ptr [rdx - 256]{1to8}, 123
+
+// CHECK: vfpclassph k5, word ptr [r9]{1to16}, 123
+// CHECK: encoding: [0x62,0xd3,0x7c,0x38,0x66,0x29,0x7b]
+ vfpclassph k5, word ptr [r9]{1to16}, 123
+
+// CHECK: vfpclassph k5, ymmword ptr [rcx + 4064], 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b]
+ vfpclassph k5, ymmword ptr [rcx + 4064], 123
+
+// CHECK: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to16}, 123
+// CHECK: encoding: [0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b]
+ vfpclassph k5 {k7}, word ptr [rdx - 256]{1to16}, 123
+
+// CHECK: vgetexpph xmm30, xmm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x42,0xf5]
+ vgetexpph xmm30, xmm29
+
+// CHECK: vgetexpph ymm30, ymm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x42,0xf5]
+ vgetexpph ymm30, ymm29
+
+// CHECK: vgetexpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vgetexpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vgetexpph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x42,0x31]
+ vgetexpph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vgetexpph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x42,0x71,0x7f]
+ vgetexpph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vgetexpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x42,0x72,0x80]
+ vgetexpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vgetexpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vgetexpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vgetexpph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x42,0x31]
+ vgetexpph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vgetexpph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x42,0x71,0x7f]
+ vgetexpph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vgetexpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x42,0x72,0x80]
+ vgetexpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vgetmantph ymm30, ymm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x26,0xf5,0x7b]
+ vgetmantph ymm30, ymm29, 123
+
+// CHECK: vgetmantph xmm30, xmm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x26,0xf5,0x7b]
+ vgetmantph xmm30, xmm29, 123
+
+// CHECK: vgetmantph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vgetmantph xmm30, word ptr [r9]{1to8}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x26,0x31,0x7b]
+ vgetmantph xmm30, word ptr [r9]{1to8}, 123
+
+// CHECK: vgetmantph xmm30, xmmword ptr [rcx + 2032], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x26,0x71,0x7f,0x7b]
+ vgetmantph xmm30, xmmword ptr [rcx + 2032], 123
+
+// CHECK: vgetmantph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x26,0x72,0x80,0x7b]
+ vgetmantph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+
+// CHECK: vgetmantph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vgetmantph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vgetmantph ymm30, word ptr [r9]{1to16}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x26,0x31,0x7b]
+ vgetmantph ymm30, word ptr [r9]{1to16}, 123
+
+// CHECK: vgetmantph ymm30, ymmword ptr [rcx + 4064], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x26,0x71,0x7f,0x7b]
+ vgetmantph ymm30, ymmword ptr [rcx + 4064], 123
+
+// CHECK: vgetmantph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x26,0x72,0x80,0x7b]
+ vgetmantph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+
+// CHECK: vrcpph xmm30, xmm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x4c,0xf5]
+ vrcpph xmm30, xmm29
+
+// CHECK: vrcpph ymm30, ymm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x4c,0xf5]
+ vrcpph ymm30, ymm29
+
+// CHECK: vrcpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrcpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vrcpph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x4c,0x31]
+ vrcpph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vrcpph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x4c,0x71,0x7f]
+ vrcpph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vrcpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x4c,0x72,0x80]
+ vrcpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vrcpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrcpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vrcpph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x4c,0x31]
+ vrcpph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vrcpph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x4c,0x71,0x7f]
+ vrcpph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vrcpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x4c,0x72,0x80]
+ vrcpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vreduceph ymm30, ymm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x56,0xf5,0x7b]
+ vreduceph ymm30, ymm29, 123
+
+// CHECK: vreduceph xmm30, xmm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x56,0xf5,0x7b]
+ vreduceph xmm30, xmm29, 123
+
+// CHECK: vreduceph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vreduceph xmm30, word ptr [r9]{1to8}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x56,0x31,0x7b]
+ vreduceph xmm30, word ptr [r9]{1to8}, 123
+
+// CHECK: vreduceph xmm30, xmmword ptr [rcx + 2032], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x56,0x71,0x7f,0x7b]
+ vreduceph xmm30, xmmword ptr [rcx + 2032], 123
+
+// CHECK: vreduceph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x56,0x72,0x80,0x7b]
+ vreduceph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+
+// CHECK: vreduceph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vreduceph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vreduceph ymm30, word ptr [r9]{1to16}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x56,0x31,0x7b]
+ vreduceph ymm30, word ptr [r9]{1to16}, 123
+
+// CHECK: vreduceph ymm30, ymmword ptr [rcx + 4064], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x56,0x71,0x7f,0x7b]
+ vreduceph ymm30, ymmword ptr [rcx + 4064], 123
+
+// CHECK: vreduceph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x56,0x72,0x80,0x7b]
+ vreduceph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+
+// CHECK: vrndscaleph ymm30, ymm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x08,0xf5,0x7b]
+ vrndscaleph ymm30, ymm29, 123
+
+// CHECK: vrndscaleph xmm30, xmm29, 123
+// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x08,0xf5,0x7b]
+ vrndscaleph xmm30, xmm29, 123
+
+// CHECK: vrndscaleph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vrndscaleph xmm30, word ptr [r9]{1to8}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x08,0x31,0x7b]
+ vrndscaleph xmm30, word ptr [r9]{1to8}, 123
+
+// CHECK: vrndscaleph xmm30, xmmword ptr [rcx + 2032], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x08,0x71,0x7f,0x7b]
+ vrndscaleph xmm30, xmmword ptr [rcx + 2032], 123
+
+// CHECK: vrndscaleph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x08,0x72,0x80,0x7b]
+ vrndscaleph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123
+
+// CHECK: vrndscaleph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b]
+ vrndscaleph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123
+
+// CHECK: vrndscaleph ymm30, word ptr [r9]{1to16}, 123
+// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x08,0x31,0x7b]
+ vrndscaleph ymm30, word ptr [r9]{1to16}, 123
+
+// CHECK: vrndscaleph ymm30, ymmword ptr [rcx + 4064], 123
+// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x08,0x71,0x7f,0x7b]
+ vrndscaleph ymm30, ymmword ptr [rcx + 4064], 123
+
+// CHECK: vrndscaleph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x08,0x72,0x80,0x7b]
+ vrndscaleph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123
+
+// CHECK: vrsqrtph xmm30, xmm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x4e,0xf5]
+ vrsqrtph xmm30, xmm29
+
+// CHECK: vrsqrtph ymm30, ymm29
+// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x4e,0xf5]
+ vrsqrtph ymm30, ymm29
+
+// CHECK: vrsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vrsqrtph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x4e,0x31]
+ vrsqrtph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vrsqrtph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x4e,0x71,0x7f]
+ vrsqrtph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vrsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x4e,0x72,0x80]
+ vrsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vrsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vrsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vrsqrtph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x4e,0x31]
+ vrsqrtph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vrsqrtph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x4e,0x71,0x7f]
+ vrsqrtph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vrsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x4e,0x72,0x80]
+ vrsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vscalefph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x2c,0xf4]
+ vscalefph ymm30, ymm29, ymm28
+
+// CHECK: vscalefph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x2c,0xf4]
+ vscalefph xmm30, xmm29, xmm28
+
+// CHECK: vscalefph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x27,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vscalefph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vscalefph ymm30, ymm29, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x46,0x15,0x30,0x2c,0x31]
+ vscalefph ymm30, ymm29, word ptr [r9]{1to16}
+
+// CHECK: vscalefph ymm30, ymm29, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x66,0x15,0x20,0x2c,0x71,0x7f]
+ vscalefph ymm30, ymm29, ymmword ptr [rcx + 4064]
+
+// CHECK: vscalefph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x2c,0x72,0x80]
+ vscalefph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16}
+
+// CHECK: vscalefph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x26,0x15,0x07,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vscalefph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vscalefph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0x2c,0x31]
+ vscalefph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vscalefph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0x2c,0x71,0x7f]
+ vscalefph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vscalefph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0x2c,0x72,0x80]
+ vscalefph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+
+// CHECK: vsqrtph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x51,0xf5]
+ vsqrtph xmm30, xmm29
+
+// CHECK: vsqrtph ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x51,0xf5]
+ vsqrtph ymm30, ymm29
+
+// CHECK: vsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsqrtph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x51,0x31]
+ vsqrtph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vsqrtph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x51,0x71,0x7f]
+ vsqrtph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x51,0x72,0x80]
+ vsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsqrtph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x51,0x31]
+ vsqrtph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vsqrtph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x51,0x71,0x7f]
+ vsqrtph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x51,0x72,0x80]
+ vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
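
For readers skimming the new MC tests above, here is a minimal C sketch (not part of this patch) of how the packed FP16 unary operations being tested are typically reached from source. The intrinsic names below follow the usual AVX512-FP16 convention (_mm512_*_ph operating on __m512h) and are assumptions based on that convention rather than lines taken from this diff.

    #include <immintrin.h>

    /* Illustrative sketch only: intrinsic names are assumed from the usual
     * AVX512-FP16 naming convention and are not copied from this commit.
     */
    __m512h fp16_unary_demo(__m512h v) {
      __m512h sq = _mm512_sqrt_ph(v);   /* expected to lower to vsqrtph  */
      __m512h rc = _mm512_rcp_ph(v);    /* expected to lower to vrcpph   */
      __m512h rs = _mm512_rsqrt_ph(v);  /* expected to lower to vrsqrtph */
      return _mm512_add_ph(_mm512_add_ph(sq, rc), rs);
    }

Compiling something like this with clang -O2 -mavx512fp16 should exercise the zmm forms checked in the tests above; the xmm/ymm variants in the vl test additionally require -mavx512vl.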