[llvm] 3e6b904 - Force insert zero-idiom and break false dependency of dest register for several instructions.
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 21 01:47:24 PDT 2022
Author: gpei-dev
Date: 2022-04-21T16:47:13+08:00
New Revision: 3e6b904f0a5075a3f33683ce38b5a4fd18280e5e
URL: https://github.com/llvm/llvm-project/commit/3e6b904f0a5075a3f33683ce38b5a4fd18280e5e
DIFF: https://github.com/llvm/llvm-project/commit/3e6b904f0a5075a3f33683ce38b5a4fd18280e5e.diff
LOG: Force insert zero-idiom and break false dependency of dest register for several instructions.
The related instructions are:
VPERMD/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH
Differential Revision: https://reviews.llvm.org/D116072
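
Editor's note: on the affected CPUs these instructions only write their destination register, yet the hardware still treats the old value of that register as an input, so an unrelated earlier write can delay them (a false dependency). With the new tunings the compiler breaks the chain by planting a zero idiom (vxorps/vpxord) on the destination first. The C++ sketch below is not part of the patch; the function name, build flags and intrinsic choice are illustrative assumptions, and whether the chosen destination register actually carries a false dependency depends on the surrounding code and register allocation.

#include <immintrin.h>

// Minimal sketch of source whose codegen is affected. Each iteration is
// data-independent, but vgetmantps only writes its destination register
// while the CPU still waits on that register's previous producer, so
// without the inserted zero idiom iterations can serialize on it.
// Assumed build line: clang++ -O2 -march=sapphirerapids getmant_demo.cpp
float sum_mantissas(const float *p, int n) {
  float acc = 0.0f;
  for (int i = 0; i + 4 <= n; i += 4) {
    __m128 v = _mm_loadu_ps(p + i);
    // Lowered to vgetmantps; reads only v (or memory in the folded form).
    __m128 m = _mm_getmant_ps(v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
    float tmp[4];
    _mm_storeu_ps(tmp, m);
    acc += tmp[0] + tmp[1] + tmp[2] + tmp[3];
  }
  return acc;
}

The *-false-deps.ll tests added below set up the same situation directly in LLVM IR: inline asm clobbers every other vector register, so on the ENABLE runs FileCheck must see the dependency-breaking vxorps/vpxor in front of the instruction under test.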
Added:
llvm/test/CodeGen/X86/getmant-false-deps.ll
llvm/test/CodeGen/X86/mulc-false-deps.ll
llvm/test/CodeGen/X86/perm.avx2-false-deps.ll
llvm/test/CodeGen/X86/perm.avx512-false-deps.ll
llvm/test/CodeGen/X86/pmullq-false-deps.ll
llvm/test/CodeGen/X86/range-false-deps.ll
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 60c662517d5d0..0634194b2c4ef 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -457,6 +457,27 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
+def TuningMULCFalseDeps : SubtargetFeature<"false-deps-mulc",
+ "HasMULCFalseDeps", "true",
+ "VF[C]MULCPH/SH has a false dependency on dest register">;
+
+def TuningPERMFalseDeps : SubtargetFeature<"false-deps-perm",
+ "HasPERMFalseDeps", "true",
+ "VPERMD/Q/PS/PD has a false dependency on dest register">;
+
+def TuningRANGEFalseDeps : SubtargetFeature<"false-deps-range",
+ "HasRANGEFalseDeps", "true",
+ "VRANGEPD/PS/SD/SS has a false dependency on dest register">;
+
+def TuningGETMANTFalseDeps : SubtargetFeature<"false-deps-getmant",
+ "HasGETMANTFalseDeps", "true",
+ "VGETMANTSS/SD/SH and VGETMANDPS/PD(memory version) has a"
+ " false dependency on dest register">;
+
+def TuningMULLQFalseDeps : SubtargetFeature<"false-deps-mullq",
+ "HasMULLQFalseDeps", "true",
+ "VPMULLQ has a false dependency on dest register">;
+
def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
"HasSBBDepBreaking", "true",
"SBB with same register has no source dependency">;
@@ -879,7 +900,12 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureUINTR];
- list<SubtargetFeature> SPRTuning = ICXTuning;
+ list<SubtargetFeature> SPRAdditionalTuning = [TuningMULCFalseDeps,
+ TuningPERMFalseDeps,
+ TuningRANGEFalseDeps,
+ TuningGETMANTFalseDeps,
+ TuningMULLQFalseDeps];
+ list<SubtargetFeature> SPRTuning = !listconcat(ICXTuning, SPRAdditionalTuning);
list<SubtargetFeature> SPRFeatures =
!listconcat(ICXFeatures, SPRAdditionalFeatures);
@@ -985,7 +1011,8 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureWAITPKG];
- list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps];
+ list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
list<SubtargetFeature> ADLFeatures =
!listconcat(TRMFeatures, ADLAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 705301e5d3610..2a5f01a83ff86 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4939,6 +4939,255 @@ static bool hasPartialRegUpdate(unsigned Opcode,
case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:
return true;
+ case X86::VFCMULCPHZ128rm:
+ case X86::VFCMULCPHZ128rmb:
+ case X86::VFCMULCPHZ128rmbkz:
+ case X86::VFCMULCPHZ128rmkz:
+ case X86::VFCMULCPHZ128rr:
+ case X86::VFCMULCPHZ128rrkz:
+ case X86::VFCMULCPHZ256rm:
+ case X86::VFCMULCPHZ256rmb:
+ case X86::VFCMULCPHZ256rmbkz:
+ case X86::VFCMULCPHZ256rmkz:
+ case X86::VFCMULCPHZ256rr:
+ case X86::VFCMULCPHZ256rrkz:
+ case X86::VFCMULCPHZrm:
+ case X86::VFCMULCPHZrmb:
+ case X86::VFCMULCPHZrmbkz:
+ case X86::VFCMULCPHZrmkz:
+ case X86::VFCMULCPHZrr:
+ case X86::VFCMULCPHZrrb:
+ case X86::VFCMULCPHZrrbkz:
+ case X86::VFCMULCPHZrrkz:
+ case X86::VFMULCPHZ128rm:
+ case X86::VFMULCPHZ128rmb:
+ case X86::VFMULCPHZ128rmbkz:
+ case X86::VFMULCPHZ128rmkz:
+ case X86::VFMULCPHZ128rr:
+ case X86::VFMULCPHZ128rrkz:
+ case X86::VFMULCPHZ256rm:
+ case X86::VFMULCPHZ256rmb:
+ case X86::VFMULCPHZ256rmbkz:
+ case X86::VFMULCPHZ256rmkz:
+ case X86::VFMULCPHZ256rr:
+ case X86::VFMULCPHZ256rrkz:
+ case X86::VFMULCPHZrm:
+ case X86::VFMULCPHZrmb:
+ case X86::VFMULCPHZrmbkz:
+ case X86::VFMULCPHZrmkz:
+ case X86::VFMULCPHZrr:
+ case X86::VFMULCPHZrrb:
+ case X86::VFMULCPHZrrbkz:
+ case X86::VFMULCPHZrrkz:
+ case X86::VFCMULCSHZrm:
+ case X86::VFCMULCSHZrmkz:
+ case X86::VFCMULCSHZrr:
+ case X86::VFCMULCSHZrrb:
+ case X86::VFCMULCSHZrrbkz:
+ case X86::VFCMULCSHZrrkz:
+ case X86::VFMULCSHZrm:
+ case X86::VFMULCSHZrmkz:
+ case X86::VFMULCSHZrr:
+ case X86::VFMULCSHZrrb:
+ case X86::VFMULCSHZrrbkz:
+ case X86::VFMULCSHZrrkz:
+ return Subtarget.hasMULCFalseDeps();
+ case X86::VPERMDYrm:
+ case X86::VPERMDYrr:
+ case X86::VPERMQYmi:
+ case X86::VPERMQYri:
+ case X86::VPERMPSYrm:
+ case X86::VPERMPSYrr:
+ case X86::VPERMPDYmi:
+ case X86::VPERMPDYri:
+ case X86::VPERMDZ256rm:
+ case X86::VPERMDZ256rmb:
+ case X86::VPERMDZ256rmbkz:
+ case X86::VPERMDZ256rmkz:
+ case X86::VPERMDZ256rr:
+ case X86::VPERMDZ256rrkz:
+ case X86::VPERMDZrm:
+ case X86::VPERMDZrmb:
+ case X86::VPERMDZrmbkz:
+ case X86::VPERMDZrmkz:
+ case X86::VPERMDZrr:
+ case X86::VPERMDZrrkz:
+ case X86::VPERMQZ256mbi:
+ case X86::VPERMQZ256mbikz:
+ case X86::VPERMQZ256mi:
+ case X86::VPERMQZ256mikz:
+ case X86::VPERMQZ256ri:
+ case X86::VPERMQZ256rikz:
+ case X86::VPERMQZ256rm:
+ case X86::VPERMQZ256rmb:
+ case X86::VPERMQZ256rmbkz:
+ case X86::VPERMQZ256rmkz:
+ case X86::VPERMQZ256rr:
+ case X86::VPERMQZ256rrkz:
+ case X86::VPERMQZmbi:
+ case X86::VPERMQZmbikz:
+ case X86::VPERMQZmi:
+ case X86::VPERMQZmikz:
+ case X86::VPERMQZri:
+ case X86::VPERMQZrikz:
+ case X86::VPERMQZrm:
+ case X86::VPERMQZrmb:
+ case X86::VPERMQZrmbkz:
+ case X86::VPERMQZrmkz:
+ case X86::VPERMQZrr:
+ case X86::VPERMQZrrkz:
+ case X86::VPERMPSZ256rm:
+ case X86::VPERMPSZ256rmb:
+ case X86::VPERMPSZ256rmbkz:
+ case X86::VPERMPSZ256rmkz:
+ case X86::VPERMPSZ256rr:
+ case X86::VPERMPSZ256rrkz:
+ case X86::VPERMPSZrm:
+ case X86::VPERMPSZrmb:
+ case X86::VPERMPSZrmbkz:
+ case X86::VPERMPSZrmkz:
+ case X86::VPERMPSZrr:
+ case X86::VPERMPSZrrkz:
+ case X86::VPERMPDZ256mbi:
+ case X86::VPERMPDZ256mbikz:
+ case X86::VPERMPDZ256mi:
+ case X86::VPERMPDZ256mikz:
+ case X86::VPERMPDZ256ri:
+ case X86::VPERMPDZ256rikz:
+ case X86::VPERMPDZ256rm:
+ case X86::VPERMPDZ256rmb:
+ case X86::VPERMPDZ256rmbkz:
+ case X86::VPERMPDZ256rmkz:
+ case X86::VPERMPDZ256rr:
+ case X86::VPERMPDZ256rrkz:
+ case X86::VPERMPDZmbi:
+ case X86::VPERMPDZmbikz:
+ case X86::VPERMPDZmi:
+ case X86::VPERMPDZmikz:
+ case X86::VPERMPDZri:
+ case X86::VPERMPDZrikz:
+ case X86::VPERMPDZrm:
+ case X86::VPERMPDZrmb:
+ case X86::VPERMPDZrmbkz:
+ case X86::VPERMPDZrmkz:
+ case X86::VPERMPDZrr:
+ case X86::VPERMPDZrrkz:
+ return Subtarget.hasPERMFalseDeps();
+ case X86::VRANGEPDZ128rmbi:
+ case X86::VRANGEPDZ128rmbikz:
+ case X86::VRANGEPDZ128rmi:
+ case X86::VRANGEPDZ128rmikz:
+ case X86::VRANGEPDZ128rri:
+ case X86::VRANGEPDZ128rrikz:
+ case X86::VRANGEPDZ256rmbi:
+ case X86::VRANGEPDZ256rmbikz:
+ case X86::VRANGEPDZ256rmi:
+ case X86::VRANGEPDZ256rmikz:
+ case X86::VRANGEPDZ256rri:
+ case X86::VRANGEPDZ256rrikz:
+ case X86::VRANGEPDZrmbi:
+ case X86::VRANGEPDZrmbikz:
+ case X86::VRANGEPDZrmi:
+ case X86::VRANGEPDZrmikz:
+ case X86::VRANGEPDZrri:
+ case X86::VRANGEPDZrrib:
+ case X86::VRANGEPDZrribkz:
+ case X86::VRANGEPDZrrikz:
+ case X86::VRANGEPSZ128rmbi:
+ case X86::VRANGEPSZ128rmbikz:
+ case X86::VRANGEPSZ128rmi:
+ case X86::VRANGEPSZ128rmikz:
+ case X86::VRANGEPSZ128rri:
+ case X86::VRANGEPSZ128rrikz:
+ case X86::VRANGEPSZ256rmbi:
+ case X86::VRANGEPSZ256rmbikz:
+ case X86::VRANGEPSZ256rmi:
+ case X86::VRANGEPSZ256rmikz:
+ case X86::VRANGEPSZ256rri:
+ case X86::VRANGEPSZ256rrikz:
+ case X86::VRANGEPSZrmbi:
+ case X86::VRANGEPSZrmbikz:
+ case X86::VRANGEPSZrmi:
+ case X86::VRANGEPSZrmikz:
+ case X86::VRANGEPSZrri:
+ case X86::VRANGEPSZrrib:
+ case X86::VRANGEPSZrribkz:
+ case X86::VRANGEPSZrrikz:
+ case X86::VRANGESDZrmi:
+ case X86::VRANGESDZrmikz:
+ case X86::VRANGESDZrri:
+ case X86::VRANGESDZrrib:
+ case X86::VRANGESDZrribkz:
+ case X86::VRANGESDZrrikz:
+ case X86::VRANGESSZrmi:
+ case X86::VRANGESSZrmikz:
+ case X86::VRANGESSZrri:
+ case X86::VRANGESSZrrib:
+ case X86::VRANGESSZrribkz:
+ case X86::VRANGESSZrrikz:
+ return Subtarget.hasRANGEFalseDeps();
+ case X86::VGETMANTSSZrmi:
+ case X86::VGETMANTSSZrmikz:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrib:
+ case X86::VGETMANTSSZrribkz:
+ case X86::VGETMANTSSZrrikz:
+ case X86::VGETMANTSDZrmi:
+ case X86::VGETMANTSDZrmikz:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrib:
+ case X86::VGETMANTSDZrribkz:
+ case X86::VGETMANTSDZrrikz:
+ case X86::VGETMANTSHZrmi:
+ case X86::VGETMANTSHZrmikz:
+ case X86::VGETMANTSHZrri:
+ case X86::VGETMANTSHZrrib:
+ case X86::VGETMANTSHZrribkz:
+ case X86::VGETMANTSHZrrikz:
+ case X86::VGETMANTPSZ128rmbi:
+ case X86::VGETMANTPSZ128rmbikz:
+ case X86::VGETMANTPSZ128rmi:
+ case X86::VGETMANTPSZ128rmikz:
+ case X86::VGETMANTPSZ256rmbi:
+ case X86::VGETMANTPSZ256rmbikz:
+ case X86::VGETMANTPSZ256rmi:
+ case X86::VGETMANTPSZ256rmikz:
+ case X86::VGETMANTPSZrmbi:
+ case X86::VGETMANTPSZrmbikz:
+ case X86::VGETMANTPSZrmi:
+ case X86::VGETMANTPSZrmikz:
+ case X86::VGETMANTPDZ128rmbi:
+ case X86::VGETMANTPDZ128rmbikz:
+ case X86::VGETMANTPDZ128rmi:
+ case X86::VGETMANTPDZ128rmikz:
+ case X86::VGETMANTPDZ256rmbi:
+ case X86::VGETMANTPDZ256rmbikz:
+ case X86::VGETMANTPDZ256rmi:
+ case X86::VGETMANTPDZ256rmikz:
+ case X86::VGETMANTPDZrmbi:
+ case X86::VGETMANTPDZrmbikz:
+ case X86::VGETMANTPDZrmi:
+ case X86::VGETMANTPDZrmikz:
+ return Subtarget.hasGETMANTFalseDeps();
+ case X86::VPMULLQZ128rm:
+ case X86::VPMULLQZ128rmb:
+ case X86::VPMULLQZ128rmbkz:
+ case X86::VPMULLQZ128rmkz:
+ case X86::VPMULLQZ128rr:
+ case X86::VPMULLQZ128rrkz:
+ case X86::VPMULLQZ256rm:
+ case X86::VPMULLQZ256rmb:
+ case X86::VPMULLQZ256rmbkz:
+ case X86::VPMULLQZ256rmkz:
+ case X86::VPMULLQZ256rr:
+ case X86::VPMULLQZ256rrkz:
+ case X86::VPMULLQZrm:
+ case X86::VPMULLQZrmb:
+ case X86::VPMULLQZrmbkz:
+ case X86::VPMULLQZrmkz:
+ case X86::VPMULLQZrr:
+ case X86::VPMULLQZrrkz:
+ return Subtarget.hasMULLQFalseDeps();
// GPR
case X86::POPCNT32rm:
case X86::POPCNT32rr:
@@ -5365,6 +5614,28 @@ void X86InstrInfo::breakPartialRegDependency(
.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR128XRegClass.contains(Reg)) {
+ // Only handle VLX targets.
+ if (!Subtarget.hasVLX())
+ return;
+ // Since vxorps requires AVX512DQ, vpxord should be the best choice.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256XRegClass.contains(Reg) ||
+ X86::VR512RegClass.contains(Reg)) {
+ // Only handle VLX targets.
+ if (!Subtarget.hasVLX())
+ return;
+ // Use vpxord to clear the full ymm/zmm register.
+ // It wants to read and write the xmm sub-register.
+ Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
} else if (X86::GR64RegClass.contains(Reg)) {
// Using XOR32rr because it has shorter encoding and zeros up the upper bits
// as well.
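
Editor's note: the breakPartialRegDependency() additions above pick the zero idiom for EVEX vector registers: the VEX vxorps cannot encode xmm16-xmm31, the EVEX vxorps needs AVX512DQ, and the 128-bit EVEX vpxord needs AVX512VL, so vpxord is emitted only on VLX targets, and for ymm/zmm registers only the xmm sub-register is written (which implicitly zeroes the upper bits and still breaks the chain). The standalone C++ sketch below restates that decision without any LLVM API; the enum and function names are made up for illustration, and the GR32/GR64 and VEX-only branches are deliberately left out.

#include <cstdio>

enum class RegClass { VR128X, VR256X, VR512, Other };

// Returns the dependency-breaking idiom for the new EVEX branches, or
// nullptr when the register is left alone (e.g. no AVX512VL, so there is
// no 128-bit EVEX vpxord to use).
const char *zeroIdiomFor(RegClass RC, bool HasVLX) {
  switch (RC) {
  case RegClass::VR128X:
    return HasVLX ? "vpxord %xmmN, %xmmN, %xmmN" : nullptr;
  case RegClass::VR256X:
  case RegClass::VR512:
    // Writing the xmm sub-register implicitly clears the upper bits of the
    // full ymm/zmm register, which is enough to break the dependency.
    return HasVLX ? "vpxord %xmmN, %xmmN, %xmmN  (implicit-def of full reg)"
                  : nullptr;
  default:
    return nullptr; // GPR and VEX cases are not modeled in this sketch.
  }
}

int main() {
  std::printf("%s\n", zeroIdiomFor(RegClass::VR512, /*HasVLX=*/true));
}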
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index d262835dd44a4..4f874783f6989 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -68,6 +68,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningMacroFusion,
X86::TuningPadShortFunctions,
X86::TuningPOPCNTFalseDeps,
+ X86::TuningMULCFalseDeps,
+ X86::TuningPERMFalseDeps,
+ X86::TuningRANGEFalseDeps,
+ X86::TuningGETMANTFalseDeps,
+ X86::TuningMULLQFalseDeps,
X86::TuningSlow3OpsLEA,
X86::TuningSlowDivide32,
X86::TuningSlowDivide64,
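
Editor's note: the X86TargetTransformInfo.h hunk adds the new tuning bits to the list of features ignored when deciding whether two functions have inlining-compatible subtargets; a mismatch in dependency-breaking tunings never changes ABI or correctness, only scheduling. Below is a simplified sketch of such a check, under the assumption that compatibility means the caller's effective feature set covers the callee's; it is not the LLVM implementation, and the names are invented for illustration.

#include <bitset>
#include <cstdio>

using FeatureBits = std::bitset<64>;

bool areInlineCompatible(const FeatureBits &Caller, const FeatureBits &Callee,
                         const FeatureBits &IgnoreList) {
  // Mask out tuning-only bits before comparing caller and callee.
  FeatureBits RealCaller = Caller & ~IgnoreList;
  FeatureBits RealCallee = Callee & ~IgnoreList;
  // The callee may be inlined if the caller supports every remaining
  // feature the callee was compiled with.
  return (RealCaller & RealCallee) == RealCallee;
}

int main() {
  FeatureBits Caller("1011"), Callee("1111"), Ignore("0100");
  std::printf("%d\n", areInlineCompatible(Caller, Callee, Ignore)); // 1
}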
diff --git a/llvm/test/CodeGen/X86/getmant-false-deps.ll b/llvm/test/CodeGen/X86/getmant-false-deps.ll
new file mode 100644
index 0000000000000..8880ae041c7c4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/getmant-false-deps.ll
@@ -0,0 +1,589 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-getmant -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-getmant -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
+
+define <4 x float> @getmantps_mem_128(<4 x float>* %p0) {
+; ENABLE-LABEL: getmantps_mem_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi), %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_mem_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi), %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x float>, <4 x float>* %p0, align 64
+ %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %a0, i32 88, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @getmantps_broadcast_128(float* %p0) {
+; ENABLE-LABEL: getmantps_broadcast_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi){1to4}, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_broadcast_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi){1to4}, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load float, float* %p0, align 4
+ %t0 = insertelement <4 x float> undef, float %v0, i64 0
+ %a0 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %a0, i32 88, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <8 x float> @getmantps_mem_256(<8 x float>* %p0) {
+; ENABLE-LABEL: getmantps_mem_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi), %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_mem_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi), %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x float>, <8 x float>* %p0, align 64
+ %2 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %a0, i32 88, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @getmantps_broadcast_256(float* %p0) {
+; ENABLE-LABEL: getmantps_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi){1to8}, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi){1to8}, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load float, float* %p0, align 4
+ %t0 = insertelement <8 x float> undef, float %v0, i64 0
+ %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %a0, i32 88, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <16 x float> @getmantps_mem_512(<16 x float>* %p0) {
+; ENABLE-LABEL: getmantps_mem_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi), %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_mem_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi), %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x float>, <16 x float>* %p0, align 64
+ %2 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %a0, i32 88, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @getmantps_broadcast_512(float* %p0) {
+; ENABLE-LABEL: getmantps_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantps $88, (%rdi){1to16}, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantps_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantps $88, (%rdi){1to16}, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load float, float* %p0, align 4
+ %t0 = insertelement <16 x float> undef, float %v0, i64 0
+ %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %a0, i32 88, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+
+define <2 x double> @getmantpd_mem_128(<2 x double>* %p0) {
+; ENABLE-LABEL: getmantpd_mem_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi), %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_mem_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi), %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <2 x double>, <2 x double>* %p0, align 64
+ %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %a0, i32 88, <2 x double> undef, i8 -1)
+ ret <2 x double> %2
+}
+
+define <2 x double> @getmantpd_broadcast_128(double* %p0) {
+; ENABLE-LABEL: getmantpd_broadcast_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to2}, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_broadcast_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to2}, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load double, double* %p0, align 4
+ %t0 = insertelement <2 x double> undef, double %v0, i64 0
+ %a0 = shufflevector <2 x double> %t0, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %a0, i32 88, <2 x double> undef, i8 -1)
+ ret <2 x double> %2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <4 x double> @getmantpd_mem_256(<4 x double>* %p0) {
+; ENABLE-LABEL: getmantpd_mem_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi), %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_mem_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi), %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x double>, <4 x double>* %p0, align 64
+ %2 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %a0, i32 88, <4 x double> undef, i8 -1)
+ ret <4 x double> %2
+}
+
+define <4 x double> @getmantpd_broadcast_256(double* %p0) {
+; ENABLE-LABEL: getmantpd_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to4}, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to4}, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load double, double* %p0, align 4
+ %t0 = insertelement <4 x double> undef, double %v0, i64 0
+ %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %a0, i32 88, <4 x double> undef, i8 -1)
+ ret <4 x double> %2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <8 x double> @getmantpd_mem_512(<8 x double>* %p0) {
+; ENABLE-LABEL: getmantpd_mem_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi), %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_mem_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi), %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x double>, <8 x double>* %p0, align 64
+ %2 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %a0, i32 88, <8 x double> undef, i8 -1, i32 4)
+ ret <8 x double> %2
+}
+
+define <8 x double> @getmantpd_broadcast_512(double* %p0) {
+; ENABLE-LABEL: getmantpd_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to8}, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantpd_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to8}, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load double, double* %p0, align 4
+ %t0 = insertelement <8 x double> undef, double %v0, i64 0
+ %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %a0, i32 88, <8 x double> undef, i8 -1, i32 4)
+ ret <8 x double> %2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x half> @getmantsh(<8 x half> %a0, <8 x half> %a1) {
+; ENABLE-LABEL: getmantsh:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vgetmantsh $11, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddph %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddph %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsh:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vgetmantsh $11, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddph %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddph %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> undef, i8 -1, i32 4)
+ %t = fadd <8 x half> %a0, %a1
+ %res = fadd <8 x half> %2, %t
+ ret <8 x half> %res
+}
+
+define <8 x half> @getmantsh_mem(<8 x half> %a0, <8 x half>* %p1) {
+; ENABLE-LABEL: getmantsh_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantsh $11, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsh_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vgetmantsh $11, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x half>, <8 x half>* %p1, align 64
+ %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> undef, i8 -1, i32 4)
+ %res = fadd <8 x half> %2, %a0
+ ret <8 x half> %res
+}
+
+define <8 x half> @getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) {
+; ENABLE-LABEL: getmantsh_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: vaddph %xmm0, %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsh_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; DISABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: vaddph %xmm0, %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> zeroinitializer, i8 %2, i32 4)
+ %t = fadd <8 x half> %a0, %a1
+ %res = fadd <8 x half> %3, %t
+ ret <8 x half> %res
+}
+
+declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
+
+define <4 x float> @getmantss(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: getmantss:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vgetmantss $11, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantss:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vgetmantss $11, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> undef, i8 -1, i32 4)
+ %t = fadd <4 x float> %a0, %a1
+ %res = fadd <4 x float> %2, %t
+ ret <4 x float> %res
+}
+
+define <4 x float> @getmantss_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: getmantss_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantss $11, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantss_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vgetmantss $11, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> undef, i8 -1, i32 4)
+ %res = fadd <4 x float> %2, %a0
+ ret <4 x float> %res
+}
+
+define <4 x float> @getmantss_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: getmantss_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantss_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> zeroinitializer, i8 %2, i32 4)
+ %t = fadd <4 x float> %a0, %a1
+ %res = fadd <4 x float> %3, %t
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
+
+define <2 x double> @getmantsd(<2 x double> %a0, <2 x double> %a1) {
+; ENABLE-LABEL: getmantsd:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vgetmantsd $11, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsd:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vgetmantsd $11, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x double> undef, i8 -1, i32 4)
+ %t = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %2, %t
+ ret <2 x double> %res
+}
+
+define <2 x double> @getmantsd_mem(<2 x double> %a0, <2 x double>* %p1) {
+; ENABLE-LABEL: getmantsd_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vgetmantsd $11, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsd_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vgetmantsd $11, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <2 x double>, <2 x double>* %p1, align 64
+ %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x double> undef, i8 -1, i32 4)
+ %res = fadd <2 x double> %2, %a0
+ ret <2 x double> %res
+}
+
+define <2 x double> @getmantsd_maskz(<2 x double> %a0, <2 x double> %a1, i8* %mask) {
+; ENABLE-LABEL: getmantsd_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: getmantsd_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm2 {%k1} {z}
+; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x double> zeroinitializer, i8 %2, i32 4)
+ %t = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %3, %t
+ ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
diff --git a/llvm/test/CodeGen/X86/mulc-false-deps.ll b/llvm/test/CodeGen/X86/mulc-false-deps.ll
new file mode 100644
index 0000000000000..f4f1563f0fc7a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mulc-false-deps.ll
@@ -0,0 +1,872 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
+
+define <16 x float> @fmulcph(<16 x float> %a0, <16 x float> %a1) {
+; ENABLE-LABEL: fmulcph:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %zmm1, %zmm0, %zmm2
+; ENABLE-NEXT: vmovaps %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcph:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; DISABLE-NEXT: vmovaps %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fmulcph_mem(<16 x float> %a0, <16 x float>* %p1) {
+; ENABLE-LABEL: fmulcph_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi), %zmm0, %zmm1
+; ENABLE-NEXT: vmovaps %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcph_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vfmulcph (%rdi), %zmm0, %zmm1
+; DISABLE-NEXT: vmovaps %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <16 x float>, <16 x float>* %p1, align 64
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fmulcph_broadcast(<16 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fmulcph_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi){1to16}, %zmm0, %zmm1
+; ENABLE-NEXT: vmovaps %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcph_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vfmulcph (%rdi){1to16}, %zmm0, %zmm1
+; DISABLE-NEXT: vmovaps %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <16 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) {
+; ENABLE-LABEL: fmulcph_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovw (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcph_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovw (%rdi), %k1
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
+; DISABLE-NEXT: vmovaps %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+
+define <16 x float> @fcmulcph(<16 x float> %a0, <16 x float> %a1) {
+; ENABLE-LABEL: fcmulcph:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2
+; ENABLE-NEXT: vmovaps %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcph:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; DISABLE-NEXT: vmovaps %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fcmulcph_mem(<16 x float> %a0, <16 x float>* %p1) {
+; ENABLE-LABEL: fcmulcph_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi), %zmm0, %zmm1
+; ENABLE-NEXT: vmovaps %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcph_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vfcmulcph (%rdi), %zmm0, %zmm1
+; DISABLE-NEXT: vmovaps %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <16 x float>, <16 x float>* %p1, align 64
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fcmulcph_broadcast(<16 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fcmulcph_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
+; ENABLE-NEXT: vmovaps %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcph_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
+; DISABLE-NEXT: vmovaps %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <16 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) {
+; ENABLE-LABEL: fcmulcph_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovw (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcph_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovw (%rdi), %k1
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
+; DISABLE-NEXT: vmovaps %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
+ ret <16 x float> %3
+}
+
+define <4 x float> @fmulc(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: fmulc:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %xmm1, %xmm0, %xmm2
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fmulc_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: fmulc_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfmulcph (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fmulc_broadcast(<4 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fmulc_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi){1to4}, %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfmulcph (%rdi){1to4}, %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <4 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fmulc_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
+ ret <4 x float> %3
+}
+
+define <4 x float> @fcmulc(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: fcmulc:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fcmulc_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: fcmulc_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfcmulcph (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fcmulc_broadcast(<4 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fcmulc_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <4 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fcmulc_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
+ ret <4 x float> %3
+}
+
+define <8 x float> @fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
+; ENABLE-LABEL: fmulc_ymm:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %ymm1, %ymm0, %ymm2
+; ENABLE-NEXT: vmovaps %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_ymm:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; DISABLE-NEXT: vmovaps %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fmulc_ymm_mem(<8 x float> %a0, <8 x float>* %p1) {
+; ENABLE-LABEL: fmulc_ymm_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vmovaps %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_ymm_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcph (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vmovaps %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x float>, <8 x float>* %p1, align 64
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fmulc_ymm_broadcast(<8 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fmulc_ymm_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcph (%rdi){1to8}, %ymm0, %ymm1
+; ENABLE-NEXT: vmovaps %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_ymm_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcph (%rdi){1to8}, %ymm0, %ymm1
+; DISABLE-NEXT: vmovaps %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <8 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fmulc_maskz_ymm:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulc_maskz_ymm:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
+; DISABLE-NEXT: vmovaps %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
+ ret <8 x float> %3
+}
+
+define <8 x float> @fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
+; ENABLE-LABEL: fcmulc_ymm:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2
+; ENABLE-NEXT: vmovaps %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_ymm:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; DISABLE-NEXT: vmovaps %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fcmulc_ymm_mem(<8 x float> %a0, <8 x float>* %p1) {
+; ENABLE-LABEL: fcmulc_ymm_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vmovaps %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_ymm_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcph (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vmovaps %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x float>, <8 x float>* %p1, align 64
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fcmulc_ymm_broadcast(<8 x float> %a0, float* %p1) {
+; ENABLE-LABEL: fcmulc_ymm_broadcast:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
+; ENABLE-NEXT: vmovaps %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_ymm_broadcast:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
+; DISABLE-NEXT: vmovaps %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <8 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fcmulc_maskz_ymm:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulc_maskz_ymm:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
+; DISABLE-NEXT: vmovaps %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
+ ret <8 x float> %3
+}
+
+define <4 x float> @fmulcsh(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: fmulcsh:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcsh:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fmulcsh_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: fmulcsh_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfmulcsh (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcsh_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfmulcsh (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fmulcsh_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fmulcsh_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+
+define <4 x float> @fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: fcmulcsh:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcsh:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fcmulcsh_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: fcmulcsh_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vfcmulcsh (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vmovaps %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcsh_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; DISABLE-NEXT: vfcmulcsh (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vmovaps %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) {
+; ENABLE-LABEL: fcmulcsh_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
+; ENABLE-NEXT: vmovaps %xmm2, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: fcmulcsh_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; DISABLE-NEXT: vmovaps %xmm2, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
+ ret <4 x float> %3
+}
+
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
diff --git a/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll b/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll
new file mode 100644
index 0000000000000..33bc951c34c7e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll
@@ -0,0 +1,306 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-ADL
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-SPR
+; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-ADL
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-SPR
+
+define <8 x i32> @permd(<8 x i32> %a0, <8 x i32> %a1) {
+; ENABLE-ADL-LABEL: permd:
+; ENABLE-ADL: # %bb.0:
+; ENABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-ADL-NEXT: #APP
+; ENABLE-ADL-NEXT: nop
+; ENABLE-ADL-NEXT: #NO_APP
+; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
+; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-ADL-NEXT: retq
+;
+; ENABLE-SPR-LABEL: permd:
+; ENABLE-SPR: # %bb.0:
+; ENABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
+; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
+; ENABLE-SPR-NEXT: #APP
+; ENABLE-SPR-NEXT: nop
+; ENABLE-SPR-NEXT: #NO_APP
+; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
+; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
+; ENABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-SPR-NEXT: retq
+;
+; DISABLE-ADL-LABEL: permd:
+; DISABLE-ADL: # %bb.0:
+; DISABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-ADL-NEXT: #APP
+; DISABLE-ADL-NEXT: nop
+; DISABLE-ADL-NEXT: #NO_APP
+; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
+; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-ADL-NEXT: retq
+;
+; DISABLE-SPR-LABEL: permd:
+; DISABLE-SPR: # %bb.0:
+; DISABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
+; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
+; DISABLE-SPR-NEXT: #APP
+; DISABLE-SPR-NEXT: nop
+; DISABLE-SPR-NEXT: #NO_APP
+; DISABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
+; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
+; DISABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-SPR-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
+ %3 = add <8 x i32> %a0, %a1
+ %res = add <8 x i32> %2, %3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @permd_mem(<8 x i32>* %p0, <8 x i32> %a1) {
+; ENABLE-ADL-LABEL: permd_mem:
+; ENABLE-ADL: # %bb.0:
+; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-ADL-NEXT: #APP
+; ENABLE-ADL-NEXT: nop
+; ENABLE-ADL-NEXT: #NO_APP
+; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
+; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-ADL-NEXT: retq
+;
+; ENABLE-SPR-LABEL: permd_mem:
+; ENABLE-SPR: # %bb.0:
+; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
+; ENABLE-SPR-NEXT: #APP
+; ENABLE-SPR-NEXT: nop
+; ENABLE-SPR-NEXT: #NO_APP
+; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
+; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
+; ENABLE-SPR-NEXT: retq
+;
+; DISABLE-ADL-LABEL: permd_mem:
+; DISABLE-ADL: # %bb.0:
+; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-ADL-NEXT: #APP
+; DISABLE-ADL-NEXT: nop
+; DISABLE-ADL-NEXT: #NO_APP
+; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
+; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-ADL-NEXT: retq
+;
+; DISABLE-SPR-LABEL: permd_mem:
+; DISABLE-SPR: # %bb.0:
+; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
+; DISABLE-SPR-NEXT: #APP
+; DISABLE-SPR-NEXT: nop
+; DISABLE-SPR-NEXT: #NO_APP
+; DISABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
+; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
+; DISABLE-SPR-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %a0 = load <8 x i32>, <8 x i32>* %p0, align 64
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
+ %res = add <8 x i32> %2, %a1
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @permq(<4 x i64> %a0) {
+; ENABLE-LABEL: permq:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ %res = add <4 x i64> %2, %a0
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @permq_mem(<4 x i64>* %p0) {
+; ENABLE-LABEL: permq_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %a0 = load <4 x i64>, <4 x i64>* %p0, align 64
+ %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ ret <4 x i64> %2
+}
+
+define <8 x float> @permps(<8 x float> %a0, <8 x i32> %a1) {
+; ENABLE-ADL-LABEL: permps:
+; ENABLE-ADL: # %bb.0:
+; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-ADL-NEXT: #APP
+; ENABLE-ADL-NEXT: nop
+; ENABLE-ADL-NEXT: #NO_APP
+; ENABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
+; ENABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; ENABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; ENABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-ADL-NEXT: retq
+;
+; ENABLE-SPR-LABEL: permps:
+; ENABLE-SPR: # %bb.0:
+; ENABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
+; ENABLE-SPR-NEXT: #APP
+; ENABLE-SPR-NEXT: nop
+; ENABLE-SPR-NEXT: #NO_APP
+; ENABLE-SPR-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
+; ENABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
+; ENABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
+; ENABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-SPR-NEXT: retq
+;
+; DISABLE-ADL-LABEL: permps:
+; DISABLE-ADL: # %bb.0:
+; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-ADL-NEXT: #APP
+; DISABLE-ADL-NEXT: nop
+; DISABLE-ADL-NEXT: #NO_APP
+; DISABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
+; DISABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; DISABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; DISABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-ADL-NEXT: retq
+;
+; DISABLE-SPR-LABEL: permps:
+; DISABLE-SPR: # %bb.0:
+; DISABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
+; DISABLE-SPR-NEXT: #APP
+; DISABLE-SPR-NEXT: nop
+; DISABLE-SPR-NEXT: #NO_APP
+; DISABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
+; DISABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
+; DISABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
+; DISABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-SPR-NEXT: retq
+ %1 = tail call <8 x i32> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
+ %t = sitofp <8 x i32> %1 to <8 x float>
+ %3 = fadd <8 x float> %t, %a0
+ %res = fadd <8 x float> %2, %3
+ ret <8 x float> %res
+}
+
+define <8 x float> @permps_mem(<8 x float>* %p0, <8 x i32> %a1) {
+; ENABLE-LABEL: permps_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %a0 = load <8 x float>, <8 x float>* %p0, align 64
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1)
+ %t = sitofp <8 x i32> %a1 to <8 x float>
+ %res = fadd <8 x float> %2, %t
+ ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x double> @permpd(<4 x double> %a0) {
+; ENABLE-LABEL: permpd:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ %res = fadd <4 x double> %2, %a0
+ ret <4 x double> %res
+}
+
+define <4 x double> @permpd_mem(<4 x double>* %p0) {
+; ENABLE-LABEL: permpd_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %a0 = load <4 x double>, <4 x double>* %p0, align 64
+ %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ ret <4 x double> %2
+}
diff --git a/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll b/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll
new file mode 100644
index 0000000000000..5accc99715746
--- /dev/null
+++ b/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll
@@ -0,0 +1,1161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
+
+define <4 x i64> @permq_ri_256(<4 x i64> %a0) {
+; ENABLE-LABEL: permq_ri_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_ri_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ %res = add <4 x i64> %2, %a0
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @permq_rr_256(<4 x i64> %a0, <4 x i64> %idx) {
+; ENABLE-LABEL: permq_rr_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1
+; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_rr_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1
+; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
+ %t = add <4 x i64> %a0, %idx
+ %res = add <4 x i64> %t, %2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @permq_rm_256(<4 x i64>* %p0, <4 x i64> %idx) {
+; ENABLE-LABEL: permq_rm_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_rm_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x i64>, <4 x i64>* %p0, align 64
+ %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
+ %res = add <4 x i64> %idx, %2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @permq_mi_256(<4 x i64>* %p0) {
+; ENABLE-LABEL: permq_mi_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_mi_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x i64>, <4 x i64>* %p0, align 64
+ %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @permq_broadcast_256(i64* %p0, <4 x i64> %idx) {
+; ENABLE-LABEL: permq_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0
+; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0
+; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load i64, i64* %p0, align 4
+ %t0 = insertelement <4 x i64> undef, i64 %v0, i64 0
+ %a0 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
+ %res = add <4 x i64> %2, %idx
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @permq_maskz_256(<4 x i64> %a0, <4 x i64> %idx, i8* %mask) {
+; ENABLE-LABEL: permq_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1}
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1}
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx, <4 x i64> zeroinitializer, i8 %2)
+ %t = add <4 x i64> %a0, %idx
+ %res = add <4 x i64> %3, %t
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <8 x i64> @permq_rr_512(<8 x i64> %a0, <8 x i64> %idx) {
+; ENABLE-LABEL: permq_rr_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1
+; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_rr_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1
+; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
+ %t = add <8 x i64> %a0, %idx
+ %res = add <8 x i64> %t, %2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @permq_rm_512(<8 x i64>* %p0, <8 x i64> %idx) {
+; ENABLE-LABEL: permq_rm_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1
+; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_rm_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1
+; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x i64>, <8 x i64>* %p0, align 64
+ %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
+ %res = add <8 x i64> %idx, %2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @permq_broadcast_512(i64* %p0, <8 x i64> %idx) {
+; ENABLE-LABEL: permq_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0
+; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0
+; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load i64, i64* %p0, align 4
+ %t0 = insertelement <8 x i64> undef, i64 %v0, i64 0
+ %a0 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
+ %res = add <8 x i64> %2, %idx
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @permq_maskz_512(<8 x i64> %a0, <8 x i64> %idx, i8* %mask) {
+; ENABLE-LABEL: permq_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1}
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permq_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1}
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx, <8 x i64> zeroinitializer, i8 %2)
+ %t = add <8 x i64> %a0, %idx
+ %res = add <8 x i64> %3, %t
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i32> @permd_rr_256(<8 x i32> %a0, <8 x i32> %idx) {
+; ENABLE-LABEL: permd_rr_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1
+; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_rr_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1
+; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
+ %t = add <8 x i32> %a0, %idx
+ %res = add <8 x i32> %t, %2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @permd_rm_256(<8 x i32>* %p0, <8 x i32> %idx) {
+; ENABLE-LABEL: permd_rm_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_rm_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x i32>, <8 x i32>* %p0, align 64
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
+ %res = add <8 x i32> %idx, %2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @permd_broadcast_256(i32* %p0, <8 x i32> %idx) {
+; ENABLE-LABEL: permd_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0
+; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0
+; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load i32, i32* %p0, align 4
+ %t0 = insertelement <8 x i32> undef, i32 %v0, i32 0
+ %a0 = shufflevector <8 x i32> %t0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 -1)
+ %res = add <8 x i32> %2, %idx
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @permd_maskz_256(<8 x i32> %a0, <8 x i32> %idx, i8* %mask) {
+; ENABLE-LABEL: permd_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1}
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1}
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 %2)
+ %t = add <8 x i32> %a0, %idx
+ %res = add <8 x i32> %3, %t
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <16 x i32> @permd_rr_512(<16 x i32> %a0, <16 x i32> %idx) {
+; ENABLE-LABEL: permd_rr_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1
+; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_rr_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1
+; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
+ %t = add <16 x i32> %a0, %idx
+ %res = add <16 x i32> %t, %2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @permd_rm_512(<16 x i32>* %p0, <16 x i32> %idx) {
+; ENABLE-LABEL: permd_rm_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1
+; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_rm_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1
+; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x i32>, <16 x i32>* %p0, align 64
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
+ %res = add <16 x i32> %idx, %2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @permd_broadcast_512(i32* %p0, <16 x i32> %idx) {
+; ENABLE-LABEL: permd_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0
+; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0
+; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load i32, i32* %p0, align 4
+ %t0 = insertelement <16 x i32> undef, i32 %v0, i32 0
+ %a0 = shufflevector <16 x i32> %t0, <16 x i32> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
+ %res = add <16 x i32> %2, %idx
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @permd_maskz_512(<16 x i32> %a0, <16 x i32> %idx, i16* %mask) {
+; ENABLE-LABEL: permd_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2
+; ENABLE-NEXT: kmovw (%rdi), %k1
+; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1}
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permd_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2
+; DISABLE-NEXT: kmovw (%rdi), %k1
+; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1}
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> zeroinitializer, i16 %2)
+ %t = add <16 x i32> %a0, %idx
+ %res = add <16 x i32> %3, %t
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <4 x double> @permpd_ri_256(<4 x double> %a0) {
+; ENABLE-LABEL: permpd_ri_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_ri_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
+; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ %res = fadd <4 x double> %2, %a0
+ ret <4 x double> %res
+}
+
+define <4 x double> @permpd_rr_256(<4 x double> %a0, <4 x i64> %idx) {
+; ENABLE-LABEL: permpd_rr_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd %ymm0, %ymm2
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1
+; ENABLE-NEXT: vcvtqq2pd %ymm0, %ymm0
+; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_rr_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd %ymm0, %ymm2
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; DISABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1
+; DISABLE-NEXT: vcvtqq2pd %ymm0, %ymm0
+; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <4 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %1, <4 x i64> %idx)
+ %a1 = sitofp <4 x i64> %idx to <4 x double>
+ %t = fadd <4 x double> %1, %a1
+ %res = fadd <4 x double> %2, %t
+ ret <4 x double> %res
+}
+
+define <4 x double> @permpd_rm_256(<4 x double>* %p0, <4 x i64> %idx) {
+; ENABLE-LABEL: permpd_rm_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0
+; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_rm_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0
+; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x double>, <4 x double>* %p0, align 64
+ %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
+ %a1 = sitofp <4 x i64> %idx to <4 x double>
+ %res = fadd <4 x double> %2, %a1
+ ret <4 x double> %res
+}
+
+define <4 x double> @permpd_mi_256(<4 x double>* %p0) {
+; ENABLE-LABEL: permpd_mi_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_mi_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <4 x double>, <4 x double>* %p0, align 64
+ %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+ ret <4 x double> %2
+}
+
+define <4 x double> @permpd_broadcast_256(double* %p0, <4 x i64> %idx) {
+; ENABLE-LABEL: permpd_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0
+; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0
+; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load double, double* %p0, align 4
+ %t0 = insertelement <4 x double> undef, double %v0, i64 0
+ %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
+ %a1 = sitofp <4 x i64> %idx to <4 x double>
+ %res = fadd <4 x double> %2, %a1
+ ret <4 x double> %res
+}
+
+define <4 x double> @permpd_maskz_256(<4 x double> %a0, <4 x i64> %idx, i8* %mask) {
+; ENABLE-LABEL: permpd_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
+; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
+; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
+; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> %idx, <4 x double> zeroinitializer, i8 %2)
+ %a1 = sitofp <4 x i64> %idx to <4 x double>
+ %t = fadd <4 x double> %a0, %a1
+ %res = fadd <4 x double> %3, %t
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+define <8 x double> @permpd_rr_512(<8 x double> %a0, <8 x i64> %idx) {
+; ENABLE-LABEL: permpd_rr_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd %zmm0, %zmm2
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1
+; ENABLE-NEXT: vcvtqq2pd %zmm0, %zmm0
+; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_rr_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd %zmm0, %zmm2
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1
+; DISABLE-NEXT: vcvtqq2pd %zmm0, %zmm0
+; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> %idx)
+ %a1 = sitofp <8 x i64> %idx to <8 x double>
+ %t = fadd <8 x double> %1, %a1
+ %res = fadd <8 x double> %2, %t
+ ret <8 x double> %res
+}
+
+define <8 x double> @permpd_rm_512(<8 x double>* %p0, <8 x i64> %idx) {
+; ENABLE-LABEL: permpd_rm_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0
+; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_rm_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0
+; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x double>, <8 x double>* %p0, align 64
+ %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
+ %a1 = sitofp <8 x i64> %idx to <8 x double>
+ %res = fadd <8 x double> %2, %a1
+ ret <8 x double> %res
+}
+
+define <8 x double> @permpd_broadcast_512(double* %p0, <8 x i64> %idx) {
+; ENABLE-LABEL: permpd_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0
+; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0
+; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load double, double* %p0, align 4
+ %t0 = insertelement <8 x double> undef, double %v0, i64 0
+ %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
+ %a1 = sitofp <8 x i64> %idx to <8 x double>
+ %res = fadd <8 x double> %2, %a1
+ ret <8 x double> %res
+}
+
+define <8 x double> @permpd_maskz_512(<8 x double> %a0, <8 x i64> %idx, i8* %mask) {
+; ENABLE-LABEL: permpd_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
+; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permpd_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
+; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
+; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> %idx, <8 x double> zeroinitializer, i8 %2)
+ %a1 = sitofp <8 x i64> %idx to <8 x double>
+ %t = fadd <8 x double> %a0, %a1
+ %res = fadd <8 x double> %3, %t
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+
+define <8 x float> @permps_rr_256(<8 x float> %a0, <8 x i32> %idx) {
+; ENABLE-LABEL: permps_rr_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps %ymm0, %ymm2
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1
+; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_rr_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps %ymm0, %ymm2
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; DISABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1
+; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <8 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %1, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
+ %a1 = sitofp <8 x i32> %idx to <8 x float>
+ %t = fadd <8 x float> %1, %a1
+ %res = fadd <8 x float> %2, %t
+ ret <8 x float> %res
+}
+
+define <8 x float> @permps_rm_256(<8 x float>* %p0, <8 x i32> %idx) {
+; ENABLE-LABEL: permps_rm_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0
+; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_rm_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0
+; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <8 x float>, <8 x float>* %p0, align 64
+ %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
+ %a1 = sitofp <8 x i32> %idx to <8 x float>
+ %res = fadd <8 x float> %2, %a1
+ ret <8 x float> %res
+}
+
+define <8 x float> @permps_broadcast_256(float* %p0, <8 x i32> %idx) {
+; ENABLE-LABEL: permps_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0
+; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0
+; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load float, float* %p0, align 4
+ %t0 = insertelement <8 x float> undef, float %v0, i32 0
+ %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
+ %a1 = sitofp <8 x i32> %idx to <8 x float>
+ %res = fadd <8 x float> %2, %a1
+ ret <8 x float> %res
+}
+
+define <8 x float> @permps_maskz_256(<8 x float> %a0, <8 x i32> %idx, i8* %mask) {
+; ENABLE-LABEL: permps_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
+; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
+; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
+; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i8, i8* %mask
+ %3 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 %2)
+ %a1 = sitofp <8 x i32> %idx to <8 x float>
+ %t = fadd <8 x float> %a0, %a1
+ %res = fadd <8 x float> %3, %t
+ ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+define <16 x float> @permps_rr_512(<16 x float> %a0, <16 x i32> %idx) {
+; ENABLE-LABEL: permps_rr_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps %zmm0, %zmm2
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1
+; ENABLE-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_rr_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps %zmm0, %zmm2
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; DISABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1
+; DISABLE-NEXT: vcvtdq2ps %zmm0, %zmm0
+; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <16 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %1, <16 x i32> %idx)
+ %a1 = sitofp <16 x i32> %idx to <16 x float>
+ %t = fadd <16 x float> %1, %a1
+ %res = fadd <16 x float> %2, %t
+ ret <16 x float> %res
+}
+
+define <16 x float> @permps_rm_512(<16 x float>* %p0, <16 x i32> %idx) {
+; ENABLE-LABEL: permps_rm_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0
+; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_rm_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0
+; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a0 = load <16 x float>, <16 x float>* %p0, align 64
+ %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
+ %a1 = sitofp <16 x i32> %idx to <16 x float>
+ %res = fadd <16 x float> %2, %a1
+ ret <16 x float> %res
+}
+
+define <16 x float> @permps_broadcast_512(float* %p0, <16 x i32> %idx) {
+; ENABLE-LABEL: permps_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0
+; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0
+; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v0 = load float, float* %p0, align 4
+ %t0 = insertelement <16 x float> undef, float %v0, i32 0
+ %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
+ %a1 = sitofp <16 x i32> %idx to <16 x float>
+ %res = fadd <16 x float> %2, %a1
+ ret <16 x float> %res
+}
+
+define <16 x float> @permps_maskz_512(<16 x float> %a0, <16 x i32> %idx, i16* %mask) {
+; ENABLE-LABEL: permps_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovw (%rdi), %k1
+; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ENABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
+; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: permps_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovw (%rdi), %k1
+; DISABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
+; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
+; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load i16, i16* %mask
+ %3 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx, <16 x float> zeroinitializer, i16 %2)
+ %a1 = sitofp <16 x i32> %idx to <16 x float>
+ %t = fadd <16 x float> %a0, %a1
+ %res = fadd <16 x float> %3, %t
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
diff --git a/llvm/test/CodeGen/X86/pmullq-false-deps.ll b/llvm/test/CodeGen/X86/pmullq-false-deps.ll
new file mode 100644
index 0000000000000..7dd4d50340bd9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pmullq-false-deps.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
+
+define <2 x i64> @pmullq_128(<2 x i64> %a0, <2 x i64> %a1) {
+; ENABLE-LABEL: pmullq_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ %3 = add <2 x i64> %a0, %a1
+ %res = add <2 x i64> %2, %3
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @pmullq_mem_128(<2 x i64> %a0, <2 x i64>* %p1) {
+; ENABLE-LABEL: pmullq_mem_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_mem_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <2 x i64>, <2 x i64>* %p1, align 64
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ %res = add <2 x i64> %2, %a0
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @pmullq_broadcast_128(<2 x i64> %a0, i64* %p1) {
+; ENABLE-LABEL: pmullq_broadcast_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1
+; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_broadcast_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1
+; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load i64, i64* %p1, align 4
+ %t0 = insertelement <2 x i64> undef, i64 %v1, i64 0
+ %a1 = shufflevector <2 x i64> %t0, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ %res = add <2 x i64> %2, %a0
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @pmullq_maskz_128(<2 x i64> %a0, <2 x i64> %a1, i8* %pmask) {
+; ENABLE-LABEL: pmullq_maskz_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2
+; ENABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_maskz_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2
+; DISABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
+ %3 = add <2 x i64> %a0, %a1
+ %res = add <2 x i64> %2, %3
+ ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+
+define <4 x i64> @pmullq_256(<4 x i64> %a0, <4 x i64> %a1) {
+; ENABLE-LABEL: pmullq_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1
+; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1
+; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ %3 = add <4 x i64> %a0, %a1
+ %res = add <4 x i64> %2, %3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @pmullq_mem_256(<4 x i64> %a0, <4 x i64>* %p1) {
+; ENABLE-LABEL: pmullq_mem_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1
+; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_mem_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1
+; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x i64>, <4 x i64>* %p1, align 64
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ %res = add <4 x i64> %2, %a0
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @pmullq_broadcast_256(<4 x i64> %a0, i64* %p1) {
+; ENABLE-LABEL: pmullq_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1
+; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1
+; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load i64, i64* %p1, align 4
+ %t0 = insertelement <4 x i64> undef, i64 %v1, i64 0
+ %a1 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ %res = add <4 x i64> %2, %a0
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @pmullq_maskz_256(<4 x i64> %a0, <4 x i64> %a1, i8* %pmask) {
+; ENABLE-LABEL: pmullq_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2
+; ENABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2
+; DISABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
+ %3 = add <4 x i64> %a0, %a1
+ %res = add <4 x i64> %2, %3
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+
+define <8 x i64> @pmullq_512(<8 x i64> %a0, <8 x i64> %a1) {
+; ENABLE-LABEL: pmullq_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1
+; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1
+; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
+ %3 = add <8 x i64> %a0, %a1
+ %res = add <8 x i64> %2, %3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @pmullq_mem_512(<8 x i64> %a0, <8 x i64>* %p1) {
+; ENABLE-LABEL: pmullq_mem_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1
+; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_mem_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1
+; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x i64>, <8 x i64>* %p1, align 64
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
+ %res = add <8 x i64> %2, %a0
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @pmullq_broadcast_512(<8 x i64> %a0, i64* %p1) {
+; ENABLE-LABEL: pmullq_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1
+; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1
+; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load i64, i64* %p1, align 4
+ %t0 = insertelement <8 x i64> undef, i64 %v1, i64 0
+ %a1 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
+ %res = add <8 x i64> %2, %a0
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @pmullq_maskz_512(<8 x i64> %a0, <8 x i64> %a1, i8* %pmask) {
+; ENABLE-LABEL: pmullq_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2
+; ENABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: pmullq_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2
+; DISABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+ %3 = add <8 x i64> %a0, %a1
+ %res = add <8 x i64> %2, %3
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
diff --git a/llvm/test/CodeGen/X86/range-false-deps.ll b/llvm/test/CodeGen/X86/range-false-deps.ll
new file mode 100644
index 0000000000000..e211fb3d67bdd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/range-false-deps.ll
@@ -0,0 +1,984 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-range -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
+; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-range -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
+
+define <4 x float> @rangeps_128(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: rangeps_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1)
+ %3 = fadd <4 x float> %a0, %a1
+ %res = fadd <4 x float> %2, %3
+ ret <4 x float> %res
+}
+
+define <4 x float> @rangeps_mem_128(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: rangeps_mem_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_mem_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1)
+ %res = fadd <4 x float> %2, %a0
+ ret <4 x float> %res
+}
+
+define <4 x float> @rangeps_broadcast_128(<4 x float> %a0, float* %p1) {
+; ENABLE-LABEL: rangeps_broadcast_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi){1to4}, %xmm1, %xmm0
+; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_broadcast_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi){1to4}, %xmm1, %xmm0
+; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <4 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1)
+ %res = fadd <4 x float> %2, %a0
+ ret <4 x float> %res
+}
+
+define <4 x float> @rangeps_maskz_128(<4 x float> %a0, <4 x float> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangeps_maskz_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_maskz_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 %mask)
+ %3 = fadd <4 x float> %a0, %a1
+ %res = fadd <4 x float> %2, %3
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @rangeps_256(<8 x float> %a0, <8 x float> %a1) {
+; ENABLE-LABEL: rangeps_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $88, %ymm2, %ymm0, %ymm1
+; ENABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vrangeps $88, %ymm2, %ymm0, %ymm1
+; DISABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1)
+ %3 = fadd <8 x float> %a0, %a1
+ %res = fadd <8 x float> %2, %3
+ ret <8 x float> %res
+}
+
+define <8 x float> @rangeps_mem_256(<8 x float> %a0, <8 x float>* %p1) {
+; ENABLE-LABEL: rangeps_mem_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi), %ymm1, %ymm0
+; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_mem_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi), %ymm1, %ymm0
+; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x float>, <8 x float>* %p1, align 64
+ %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1)
+ %res = fadd <8 x float> %2, %a0
+ ret <8 x float> %res
+}
+
+define <8 x float> @rangeps_broadcast_256(<8 x float> %a0, float* %p1) {
+; ENABLE-LABEL: rangeps_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi){1to8}, %ymm1, %ymm0
+; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi){1to8}, %ymm1, %ymm0
+; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <8 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1)
+ %res = fadd <8 x float> %2, %a0
+ ret <8 x float> %res
+}
+
+define <8 x float> @rangeps_maskz_256(<8 x float> %a0, <8 x float> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangeps_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $44, %ymm2, %ymm0, %ymm1 {%k1} {z}
+; ENABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vrangeps $44, %ymm2, %ymm0, %ymm1 {%k1} {z}
+; DISABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 44, <8 x float> undef, i8 %mask)
+ %3 = fadd <8 x float> %a0, %a1
+ %res = fadd <8 x float> %2, %3
+ ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) nounwind readnone
+
+define <16 x float> @rangeps_512(<16 x float> %a0, <16 x float> %a1) {
+; ENABLE-LABEL: rangeps_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1
+; ENABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1
+; DISABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4)
+ %3 = fadd <16 x float> %a0, %a1
+ %res = fadd <16 x float> %2, %3
+ ret <16 x float> %res
+}
+
+define <16 x float> @rangeps_mem_512(<16 x float> %a0, <16 x float>* %p1) {
+; ENABLE-LABEL: rangeps_mem_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi), %zmm1, %zmm0
+; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_mem_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi), %zmm1, %zmm0
+; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <16 x float>, <16 x float>* %p1, align 64
+ %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4)
+ %res = fadd <16 x float> %2, %a0
+ ret <16 x float> %res
+}
+
+define <16 x float> @rangeps_broadcast_512(<16 x float> %a0, float* %p1) {
+; ENABLE-LABEL: rangeps_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangeps $88, (%rdi){1to16}, %zmm1, %zmm0
+; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vrangeps $88, (%rdi){1to16}, %zmm1, %zmm0
+; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load float, float* %p1, align 4
+ %t0 = insertelement <16 x float> undef, float %v1, i64 0
+ %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4)
+ %res = fadd <16 x float> %2, %a0
+ ret <16 x float> %res
+}
+
+define <16 x float> @rangeps_maskz_512(<16 x float> %a0, <16 x float> %a1, i16* %pmask) {
+; ENABLE-LABEL: rangeps_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovw (%rdi), %k1
+; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 {%k1} {z}
+; ENABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangeps_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovw (%rdi), %k1
+; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 {%k1} {z}
+; DISABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i16, i16* %pmask
+ %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 %mask, i32 4)
+ %3 = fadd <16 x float> %a0, %a1
+ %res = fadd <16 x float> %2, %3
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32) nounwind readnone
+
+
+define <2 x double> @rangepd_128(<2 x double> %a0, <2 x double> %a1) {
+; ENABLE-LABEL: rangepd_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1)
+ %3 = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %2, %3
+ ret <2 x double> %res
+}
+
+define <2 x double> @rangepd_mem_128(<2 x double> %a0, <2 x double>* %p1) {
+; ENABLE-LABEL: rangepd_mem_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_mem_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <2 x double>, <2 x double>* %p1, align 64
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1)
+ %res = fadd <2 x double> %2, %a0
+ ret <2 x double> %res
+}
+
+define <2 x double> @rangepd_broadcast_128(<2 x double> %a0, double* %p1) {
+; ENABLE-LABEL: rangepd_broadcast_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi){1to2}, %xmm1, %xmm0
+; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_broadcast_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi){1to2}, %xmm1, %xmm0
+; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load double, double* %p1, align 4
+ %t0 = insertelement <2 x double> undef, double %v1, i64 0
+ %a1 = shufflevector <2 x double> %t0, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1)
+ %res = fadd <2 x double> %2, %a0
+ ret <2 x double> %res
+}
+
+define <2 x double> @rangepd_maskz_128(<2 x double> %a0, <2 x double> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangepd_maskz_128:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_maskz_128:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 %mask)
+ %3 = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %2, %3
+ ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @rangepd_256(<4 x double> %a0, <4 x double> %a1) {
+; ENABLE-LABEL: rangepd_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1
+; ENABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1
+; DISABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1)
+ %3 = fadd <4 x double> %a0, %a1
+ %res = fadd <4 x double> %2, %3
+ ret <4 x double> %res
+}
+
+define <4 x double> @rangepd_mem_256(<4 x double> %a0, <4 x double>* %p1) {
+; ENABLE-LABEL: rangepd_mem_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi), %ymm1, %ymm0
+; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_mem_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi), %ymm1, %ymm0
+; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x double>, <4 x double>* %p1, align 64
+ %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1)
+ %res = fadd <4 x double> %2, %a0
+ ret <4 x double> %res
+}
+
+define <4 x double> @rangepd_broadcast_256(<4 x double> %a0, double* %p1) {
+; ENABLE-LABEL: rangepd_broadcast_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi){1to4}, %ymm1, %ymm0
+; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_broadcast_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi){1to4}, %ymm1, %ymm0
+; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load double, double* %p1, align 4
+ %t0 = insertelement <4 x double> undef, double %v1, i64 0
+ %a1 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
+ %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1)
+ %res = fadd <4 x double> %2, %a0
+ ret <4 x double> %res
+}
+
+define <4 x double> @rangepd_maskz_256(<4 x double> %a0, <4 x double> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangepd_maskz_256:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 {%k1} {z}
+; ENABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_maskz_256:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; DISABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 {%k1} {z}
+; DISABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 %mask)
+ %3 = fadd <4 x double> %a0, %a1
+ %res = fadd <4 x double> %2, %3
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) nounwind readnone
+
+define <8 x double> @rangepd_512(<8 x double> %a0, <8 x double> %a1) {
+; ENABLE-LABEL: rangepd_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1
+; ENABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1
+; DISABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4)
+ %3 = fadd <8 x double> %a0, %a1
+ %res = fadd <8 x double> %2, %3
+ ret <8 x double> %res
+}
+
+define <8 x double> @rangepd_mem_512(<8 x double> %a0, <8 x double>* %p1) {
+; ENABLE-LABEL: rangepd_mem_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi), %zmm1, %zmm0
+; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_mem_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi), %zmm1, %zmm0
+; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <8 x double>, <8 x double>* %p1, align 64
+ %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4)
+ %res = fadd <8 x double> %2, %a0
+ ret <8 x double> %res
+}
+
+define <8 x double> @rangepd_broadcast_512(<8 x double> %a0, double* %p1) {
+; ENABLE-LABEL: rangepd_broadcast_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangepd $88, (%rdi){1to8}, %zmm1, %zmm0
+; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_broadcast_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; DISABLE-NEXT: vrangepd $88, (%rdi){1to8}, %zmm1, %zmm0
+; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %v1 = load double, double* %p1, align 4
+ %t0 = insertelement <8 x double> undef, double %v1, i64 0
+ %a1 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4)
+ %res = fadd <8 x double> %2, %a0
+ ret <8 x double> %res
+}
+
+define <8 x double> @rangepd_maskz_512(<8 x double> %a0, <8 x double> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangepd_maskz_512:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 {%k1} {z}
+; ENABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangepd_maskz_512:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; DISABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 {%k1} {z}
+; DISABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 %mask, i32 4)
+ %3 = fadd <8 x double> %a0, %a1
+ %res = fadd <8 x double> %2, %3
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32) nounwind readnone
+
+define <4 x float> @rangess(<4 x float> %a0, <4 x float> %a1) {
+; ENABLE-LABEL: rangess:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangess:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4, i32 4)
+ %3 = fadd <4 x float> %a1, %a0
+ %res = fadd <4 x float> %2, %3
+ ret <4 x float> %res
+}
+
+define <4 x float> @rangess_mem(<4 x float> %a0, <4 x float>* %p1) {
+; ENABLE-LABEL: rangess_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangess $4, (%rdi), %xmm0, %xmm1
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangess_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vrangess $4, (%rdi), %xmm0, %xmm1
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <4 x float>, <4 x float>* %p1, align 64
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4, i32 4)
+ %res = fadd <4 x float> %2, %a0
+ ret <4 x float> %res
+}
+
+define <4 x float> @rangess_maskz(<4 x float> %a0, <4 x float> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangess_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangess_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 %mask, i32 4, i32 4)
+ %3 = fadd <4 x float> %a0, %a1
+ %res = fadd <4 x float> %2, %3
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+
+define <2 x double> @rangesd(<2 x double> %a0, <2 x double> %a1) {
+; ENABLE-LABEL: rangesd:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1
+; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangesd:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1
+; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 -1, i32 4, i32 4)
+ %3 = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %2, %3
+ ret <2 x double> %res
+}
+
+define <2 x double> @rangesd_mem(<2 x double> %a0, <2 x double>* %p1) {
+; ENABLE-LABEL: rangesd_mem:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ENABLE-NEXT: vrangesd $4, (%rdi), %xmm1, %xmm0
+; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangesd_mem:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; DISABLE-NEXT: vrangesd $4, (%rdi), %xmm1, %xmm0
+; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %a1 = load <2 x double>, <2 x double>* %p1, align 64
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 -1, i32 4, i32 4)
+ %res = fadd <2 x double> %2, %a0
+ ret <2 x double> %res
+}
+
+define <2 x double> @rangesd_maskz(<2 x double> %a0, <2 x double> %a1, i8* %pmask) {
+; ENABLE-LABEL: rangesd_maskz:
+; ENABLE: # %bb.0:
+; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ENABLE-NEXT: #APP
+; ENABLE-NEXT: nop
+; ENABLE-NEXT: #NO_APP
+; ENABLE-NEXT: kmovb (%rdi), %k1
+; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ENABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; ENABLE-NEXT: retq
+;
+; DISABLE-LABEL: rangesd_maskz:
+; DISABLE: # %bb.0:
+; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; DISABLE-NEXT: #APP
+; DISABLE-NEXT: nop
+; DISABLE-NEXT: #NO_APP
+; DISABLE-NEXT: kmovb (%rdi), %k1
+; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; DISABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; DISABLE-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %mask = load i8, i8* %pmask
+ %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 %mask, i32 4, i32 4)
+ %3 = fadd <2 x double> %a0, %a1
+ %res = fadd <2 x double> %2, %3
+ ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)